import pandas as pd #importing packages
import os as os
#pd.describe_option() #describe options for customizing
#pd.get_option("display.memory_usage")#setting some options
# Show the directory the session is currently running in.
os.getcwd() #current working directory
'/home/ajay'
# Move into the folder that is expected to hold the data file, then echo
# the working directory to confirm the change took effect.
desktop_dir = '/home/ajay/Desktop'
os.chdir(desktop_dir)
os.getcwd()
'/home/ajay/Desktop'
# Keep the working directory in `a` and list its contents to check that
# the data file is present.
a=os.getcwd()
os.listdir(a)
['adult.data']
# Column labels for the header-less data file, in file order.
names2 = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]
# Sanity check: one label per column (the file has 15 columns).
len(names2)
15
# Load the raw data file; it has no header row, so pandas numbers the
# columns 0..14 until real labels are assigned.
#
# skipinitialspace=True strips the blank that follows each comma.
# NOTE(review): assumes the file uses ", " as separator, as the UCI
# distribution of adult.data does -- harmless if it does not.
adult = pd.read_csv("adult.data", header=None, skipinitialspace=True)

# Older pandas versions turn an empty trailing line into an all-NaN record,
# which inflates the row count by one and forces integer columns to float.
# Dropping fully-empty rows is a no-op on versions that already skip them.
adult = adult.dropna(how="all")

# Number of records actually loaded.
len(adult)
32562
# Inspect the column labels; the file was read with header=None, so these
# are positional integers at this point.
adult.columns
Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='int64')
# Per-column non-null counts and dtypes, to spot missing values and
# columns that were parsed with an unexpected type.
adult.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 32562 entries, 0 to 32561 Data columns (total 15 columns): 0 32561 non-null float64 1 32561 non-null object 2 32561 non-null float64 3 32561 non-null object 4 32561 non-null float64 5 32561 non-null object 6 32561 non-null object 7 32561 non-null object 8 32561 non-null object 9 32561 non-null object 10 32561 non-null float64 11 32561 non-null float64 12 32561 non-null float64 13 32561 non-null object 14 32561 non-null object dtypes: float64(6), object(9)
# Preview the first eight records while the columns are still unnamed.
adult.head(n=8)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
5 | 37 | Private | 284582 | Masters | 14 | Married-civ-spouse | Exec-managerial | Wife | White | Female | 0 | 0 | 40 | United-States | <=50K |
6 | 49 | Private | 160187 | 9th | 5 | Married-spouse-absent | Other-service | Not-in-family | Black | Female | 0 | 0 | 16 | Jamaica | <=50K |
7 | 52 | Self-emp-not-inc | 209642 | HS-grad | 9 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 45 | United-States | >50K |
# Replace the positional column numbers with the descriptive labels,
# then preview the first thirty records under the new names.
adult.columns = names2
adult.head(n=30)
age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | income | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
5 | 37 | Private | 284582 | Masters | 14 | Married-civ-spouse | Exec-managerial | Wife | White | Female | 0 | 0 | 40 | United-States | <=50K |
6 | 49 | Private | 160187 | 9th | 5 | Married-spouse-absent | Other-service | Not-in-family | Black | Female | 0 | 0 | 16 | Jamaica | <=50K |
7 | 52 | Self-emp-not-inc | 209642 | HS-grad | 9 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 45 | United-States | >50K |
8 | 31 | Private | 45781 | Masters | 14 | Never-married | Prof-specialty | Not-in-family | White | Female | 14084 | 0 | 50 | United-States | >50K |
9 | 42 | Private | 159449 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 5178 | 0 | 40 | United-States | >50K |
10 | 37 | Private | 280464 | Some-college | 10 | Married-civ-spouse | Exec-managerial | Husband | Black | Male | 0 | 0 | 80 | United-States | >50K |
11 | 30 | State-gov | 141297 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | India | >50K |
12 | 23 | Private | 122272 | Bachelors | 13 | Never-married | Adm-clerical | Own-child | White | Female | 0 | 0 | 30 | United-States | <=50K |
13 | 32 | Private | 205019 | Assoc-acdm | 12 | Never-married | Sales | Not-in-family | Black | Male | 0 | 0 | 50 | United-States | <=50K |
14 | 40 | Private | 121772 | Assoc-voc | 11 | Married-civ-spouse | Craft-repair | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | ? | >50K |
15 | 34 | Private | 245487 | 7th-8th | 4 | Married-civ-spouse | Transport-moving | Husband | Amer-Indian-Eskimo | Male | 0 | 0 | 45 | Mexico | <=50K |
16 | 25 | Self-emp-not-inc | 176756 | HS-grad | 9 | Never-married | Farming-fishing | Own-child | White | Male | 0 | 0 | 35 | United-States | <=50K |
17 | 32 | Private | 186824 | HS-grad | 9 | Never-married | Machine-op-inspct | Unmarried | White | Male | 0 | 0 | 40 | United-States | <=50K |
18 | 38 | Private | 28887 | 11th | 7 | Married-civ-spouse | Sales | Husband | White | Male | 0 | 0 | 50 | United-States | <=50K |
19 | 43 | Self-emp-not-inc | 292175 | Masters | 14 | Divorced | Exec-managerial | Unmarried | White | Female | 0 | 0 | 45 | United-States | >50K |
20 | 40 | Private | 193524 | Doctorate | 16 | Married-civ-spouse | Prof-specialty | Husband | White | Male | 0 | 0 | 60 | United-States | >50K |
21 | 54 | Private | 302146 | HS-grad | 9 | Separated | Other-service | Unmarried | Black | Female | 0 | 0 | 20 | United-States | <=50K |
22 | 35 | Federal-gov | 76845 | 9th | 5 | Married-civ-spouse | Farming-fishing | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
23 | 43 | Private | 117037 | 11th | 7 | Married-civ-spouse | Transport-moving | Husband | White | Male | 0 | 2042 | 40 | United-States | <=50K |
24 | 59 | Private | 109015 | HS-grad | 9 | Divorced | Tech-support | Unmarried | White | Female | 0 | 0 | 40 | United-States | <=50K |
25 | 56 | Local-gov | 216851 | Bachelors | 13 | Married-civ-spouse | Tech-support | Husband | White | Male | 0 | 0 | 40 | United-States | >50K |
26 | 19 | Private | 168294 | HS-grad | 9 | Never-married | Craft-repair | Own-child | White | Male | 0 | 0 | 40 | United-States | <=50K |
27 | 54 | ? | 180211 | Some-college | 10 | Married-civ-spouse | ? | Husband | Asian-Pac-Islander | Male | 0 | 0 | 60 | South | >50K |
28 | 39 | Private | 367260 | HS-grad | 9 | Divorced | Exec-managerial | Not-in-family | White | Male | 0 | 0 | 80 | United-States | <=50K |
29 | 49 | Private | 193366 | HS-grad | 9 | Married-civ-spouse | Craft-repair | Husband | White | Male | 0 | 0 | 40 | United-States | <=50K |
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the whole table.
adult.describe() #numerical summaries
age | fnlwgt | education-num | capital-gain | capital-loss | hours-per-week | |
---|---|---|---|---|---|---|
count | 32561.000000 | 32561.000000 | 32561.000000 | 32561.000000 | 32561.000000 | 32561.000000 |
mean | 38.581647 | 189778.366512 | 10.080679 | 1077.648844 | 87.303830 | 40.437456 |
std | 13.640433 | 105549.977697 | 2.572720 | 7385.292085 | 402.960219 | 12.347429 |
min | 17.000000 | 12285.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 |
25% | 28.000000 | 117827.000000 | 9.000000 | 0.000000 | 0.000000 | 40.000000 |
50% | 37.000000 | 178356.000000 | 10.000000 | 0.000000 | 0.000000 | 40.000000 |
75% | 48.000000 | 237051.000000 | 12.000000 | 0.000000 | 0.000000 | 45.000000 |
max | 90.000000 | 1484705.000000 | 16.000000 | 99999.000000 | 4356.000000 | 99.000000 |
# Group the records by employment category; the grouped object is reused
# by the aggregations that follow.
workclass=adult.groupby("workclass")
# Number of distinct workclass groups.
len(workclass)
9
# Per-workclass totals of the numeric columns.
# numeric_only=True is explicit because pandas >= 2.0 no longer drops text
# columns by default and would concatenate their strings instead.
workclass.sum(numeric_only=True)
age | fnlwgt | education-num | capital-gain | capital-loss | hours-per-week | |
---|---|---|---|---|---|---|
workclass | ||||||
? | 75203 | 346115997 | 17002 | 1114077 | 111556 | 58604 |
Federal-gov | 40887 | 177812394 | 10535 | 799903 | 107778 | 39724 |
Local-gov | 87385 | 394822919 | 23111 | 1842264 | 229925 | 85777 |
Never-worked | 144 | 1581927 | 52 | 0 | 0 | 199 |
Private | 835158 | 4374974348 | 224230 | 20181687 | 1815878 | 913902 |
Self-emp-inc | 51355 | 196395180 | 12429 | 5441274 | 173135 | 54481 |
Self-emp-not-inc | 114268 | 446221558 | 25985 | 4792483 | 296361 | 112876 |
State-gov | 51188 | 239009324 | 14766 | 910806 | 108067 | 50663 |
Without-pay | 669 | 2439745 | 127 | 6830 | 0 | 458 |
# Number of non-null entries per column within each workclass group
# (applies to every column, not only the numeric ones).
workclass.count()
age | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | income | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
workclass | ||||||||||||||
? | 1836 | 1836 | 1836 | 1836 | 1836 | 1836 | 1836 | 1836 | 1836 | 1836 | 1836 | 1836 | 1836 | 1836 |
Federal-gov | 960 | 960 | 960 | 960 | 960 | 960 | 960 | 960 | 960 | 960 | 960 | 960 | 960 | 960 |
Local-gov | 2093 | 2093 | 2093 | 2093 | 2093 | 2093 | 2093 | 2093 | 2093 | 2093 | 2093 | 2093 | 2093 | 2093 |
Never-worked | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 |
Private | 22696 | 22696 | 22696 | 22696 | 22696 | 22696 | 22696 | 22696 | 22696 | 22696 | 22696 | 22696 | 22696 | 22696 |
Self-emp-inc | 1116 | 1116 | 1116 | 1116 | 1116 | 1116 | 1116 | 1116 | 1116 | 1116 | 1116 | 1116 | 1116 | 1116 |
Self-emp-not-inc | 2541 | 2541 | 2541 | 2541 | 2541 | 2541 | 2541 | 2541 | 2541 | 2541 | 2541 | 2541 | 2541 | 2541 |
State-gov | 1298 | 1298 | 1298 | 1298 | 1298 | 1298 | 1298 | 1298 | 1298 | 1298 | 1298 | 1298 | 1298 | 1298 |
Without-pay | 14 | 14 | 14 | 14 | 14 | 14 | 14 | 14 | 14 | 14 | 14 | 14 | 14 | 14 |
# Summary statistics of the numeric columns, computed separately for each
# workclass group.
workclass.describe()
age | capital-gain | capital-loss | education-num | fnlwgt | hours-per-week | ||
---|---|---|---|---|---|---|---|
workclass | |||||||
? | count | 1836.000000 | 1836.000000 | 1836.000000 | 1836.000000 | 1836.000000 | 1836.000000 |
mean | 40.960240 | 606.795752 | 60.760349 | 9.260349 | 188516.338235 | 31.919390 | |
std | 20.334587 | 5147.323872 | 354.685264 | 2.601986 | 107089.902252 | 14.909903 | |
min | 17.000000 | 0.000000 | 0.000000 | 1.000000 | 12285.000000 | 1.000000 | |
25% | 21.000000 | 0.000000 | 0.000000 | 9.000000 | 117771.250000 | 20.000000 | |
50% | 35.000000 | 0.000000 | 0.000000 | 9.000000 | 175617.000000 | 36.000000 | |
75% | 61.000000 | 0.000000 | 0.000000 | 10.000000 | 234568.500000 | 40.000000 | |
max | 90.000000 | 99999.000000 | 4356.000000 | 16.000000 | 981628.000000 | 99.000000 | |
Federal-gov | count | 960.000000 | 960.000000 | 960.000000 | 960.000000 | 960.000000 | 960.000000 |
mean | 42.590625 | 833.232292 | 112.268750 | 10.973958 | 185221.243750 | 41.379167 | |
std | 11.509171 | 4101.966767 | 453.504623 | 2.113650 | 117502.359524 | 8.838605 | |
min | 17.000000 | 0.000000 | 0.000000 | 3.000000 | 19914.000000 | 4.000000 | |
25% | 34.000000 | 0.000000 | 0.000000 | 9.000000 | 97781.250000 | 40.000000 | |
50% | 43.000000 | 0.000000 | 0.000000 | 10.000000 | 175771.000000 | 40.000000 | |
75% | 51.000000 | 0.000000 | 0.000000 | 13.000000 | 243960.250000 | 40.000000 | |
max | 90.000000 | 99999.000000 | 3683.000000 | 16.000000 | 930948.000000 | 99.000000 | |
Local-gov | count | 2093.000000 | 2093.000000 | 2093.000000 | 2093.000000 | 2093.000000 | 2093.000000 |
mean | 41.751075 | 880.202580 | 109.854276 | 11.042045 | 188639.712852 | 40.982800 | |
std | 12.272856 | 5775.043442 | 439.513203 | 2.552536 | 100254.775314 | 10.771559 | |
min | 17.000000 | 0.000000 | 0.000000 | 1.000000 | 14878.000000 | 2.000000 | |
25% | 32.000000 | 0.000000 | 0.000000 | 9.000000 | 121124.000000 | 40.000000 | |
50% | 41.000000 | 0.000000 | 0.000000 | 11.000000 | 179580.000000 | 40.000000 | |
75% | 50.000000 | 0.000000 | 0.000000 | 13.000000 | 236487.000000 | 44.000000 | |
max | 90.000000 | 99999.000000 | 2444.000000 | 16.000000 | 1125613.000000 | 99.000000 | |
Never-worked | count | 7.000000 | 7.000000 | 7.000000 | 7.000000 | 7.000000 | 7.000000 |
mean | 20.571429 | 0.000000 | 0.000000 | 7.428571 | 225989.571429 | 28.428571 | |
std | 4.613644 | 0.000000 | 0.000000 | 2.299068 | 108135.748347 | 15.186147 | |
min | 17.000000 | 0.000000 | 0.000000 | 4.000000 | 153663.000000 | 4.000000 | |
25% | 18.000000 | 0.000000 | 0.000000 | 6.000000 | 166902.000000 | 20.000000 | |
50% | 18.000000 | 0.000000 | 0.000000 | 7.000000 | 188535.000000 | 35.000000 | |
... | ... | ... | ... | ... | ... | ... | ... |
Self-emp-inc | std | 12.553194 | 17976.548086 | 549.488497 | 2.603210 | 96436.282913 | 13.900417 |
min | 17.000000 | 0.000000 | 0.000000 | 2.000000 | 21626.000000 | 1.000000 | |
25% | 37.000000 | 0.000000 | 0.000000 | 9.000000 | 113539.750000 | 40.000000 | |
50% | 45.000000 | 0.000000 | 0.000000 | 10.000000 | 165667.000000 | 50.000000 | |
75% | 54.000000 | 0.000000 | 0.000000 | 13.000000 | 213722.750000 | 60.000000 | |
max | 84.000000 | 99999.000000 | 2559.000000 | 16.000000 | 1097453.000000 | 99.000000 | |
Self-emp-not-inc | count | 2541.000000 | 2541.000000 | 2541.000000 | 2541.000000 | 2541.000000 | 2541.000000 |
mean | 44.969697 | 1886.061787 | 116.631641 | 10.226289 | 175608.641480 | 44.421881 | |
std | 13.338162 | 10986.233506 | 467.611687 | 2.768132 | 100735.757730 | 16.674958 | |
min | 17.000000 | 0.000000 | 0.000000 | 2.000000 | 20098.000000 | 1.000000 | |
25% | 35.000000 | 0.000000 | 0.000000 | 9.000000 | 104973.000000 | 40.000000 | |
50% | 44.000000 | 0.000000 | 0.000000 | 10.000000 | 168109.000000 | 40.000000 | |
75% | 54.000000 | 0.000000 | 0.000000 | 13.000000 | 227298.000000 | 50.000000 | |
max | 90.000000 | 99999.000000 | 2824.000000 | 16.000000 | 795830.000000 | 99.000000 | |
State-gov | count | 1298.000000 | 1298.000000 | 1298.000000 | 1298.000000 | 1298.000000 | 1298.000000 |
mean | 39.436055 | 701.699538 | 83.256549 | 11.375963 | 184136.613251 | 39.031587 | |
std | 12.431065 | 3777.749185 | 394.469789 | 2.538604 | 111512.980926 | 11.697014 | |
min | 17.000000 | 0.000000 | 0.000000 | 1.000000 | 19395.000000 | 1.000000 | |
25% | 30.000000 | 0.000000 | 0.000000 | 9.000000 | 108903.750000 | 38.000000 | |
50% | 39.000000 | 0.000000 | 0.000000 | 10.000000 | 169402.500000 | 40.000000 | |
75% | 48.000000 | 0.000000 | 0.000000 | 13.000000 | 238532.750000 | 40.000000 | |
max | 81.000000 | 99999.000000 | 3683.000000 | 16.000000 | 1033222.000000 | 99.000000 | |
Without-pay | count | 14.000000 | 14.000000 | 14.000000 | 14.000000 | 14.000000 | 14.000000 |
mean | 47.785714 | 487.857143 | 0.000000 | 9.071429 | 174267.500000 | 32.714286 | |
std | 21.075610 | 1300.780467 | 0.000000 | 1.685426 | 85536.385921 | 17.357900 | |
min | 19.000000 | 0.000000 | 0.000000 | 4.000000 | 27012.000000 | 10.000000 | |
25% | 23.750000 | 0.000000 | 0.000000 | 9.000000 | 138446.500000 | 20.000000 | |
50% | 57.000000 | 0.000000 | 0.000000 | 9.000000 | 171531.500000 | 27.500000 | |
75% | 65.000000 | 0.000000 | 0.000000 | 9.750000 | 209006.500000 | 47.500000 | |
max | 72.000000 | 4416.000000 | 0.000000 | 12.000000 | 344858.000000 | 65.000000 |
72 rows × 6 columns
# Group the records by race; the grouped object is reused below.
race = adult.groupby("race")
# Per-race totals of the numeric columns.
# numeric_only=True is explicit because pandas >= 2.0 no longer drops text
# columns by default and would concatenate their strings instead.
race.sum(numeric_only=True)
age | fnlwgt | education-num | capital-gain | capital-loss | hours-per-week | |
---|---|---|---|---|---|---|
race | ||||||
Amer-Indian-Eskimo | 11561 | 37578487 | 2896 | 194458 | 10629 | 12455 |
Asian-Pac-Islander | 39219 | 166178293 | 11388 | 1536014 | 101014 | 41692 |
Black | 117987 | 712313000 | 29635 | 1905454 | 188643 | 120033 |
Other | 9067 | 53420656 | 2396 | 253293 | 16550 | 10696 |
White | 1078423 | 5209882956 | 281922 | 31200105 | 2525864 | 1131808 |
# Per-race averages of the numeric columns.
# numeric_only=True is required on pandas >= 2.0, where averaging the text
# columns raises a TypeError instead of silently skipping them.
race.mean(numeric_only=True)
age | fnlwgt | education-num | capital-gain | capital-loss | hours-per-week | |
---|---|---|---|---|---|---|
race | ||||||
Amer-Indian-Eskimo | 37.173633 | 120831.147910 | 9.311897 | 625.266881 | 34.176849 | 40.048232 |
Asian-Pac-Islander | 37.746872 | 159940.609240 | 10.960539 | 1478.358037 | 97.222329 | 40.127045 |
Black | 37.767926 | 228013.124200 | 9.486236 | 609.940461 | 60.385083 | 38.422855 |
Other | 33.457565 | 197124.191882 | 8.841328 | 934.660517 | 61.070111 | 39.468635 |
White | 38.769881 | 187298.064280 | 10.135246 | 1121.660375 | 90.806155 | 40.689100 |
# Contingency table: record counts for every race x workclass combination.
pd.crosstab(adult["race"], adult["workclass"])
workclass | ? | Federal-gov | Local-gov | Never-worked | Private | Self-emp-inc | Self-emp-not-inc | State-gov | Without-pay |
---|---|---|---|---|---|---|---|---|---|
race | |||||||||
Amer-Indian-Eskimo | 25 | 19 | 36 | 0 | 190 | 2 | 24 | 15 | 0 |
Asian-Pac-Islander | 65 | 44 | 39 | 0 | 713 | 46 | 73 | 58 | 1 |
Black | 213 | 169 | 288 | 2 | 2176 | 23 | 93 | 159 | 1 |
Other | 23 | 7 | 10 | 0 | 213 | 5 | 9 | 4 | 0 |
White | 1510 | 721 | 1720 | 5 | 19404 | 1040 | 2342 | 1062 | 12 |
# Contingency table: record counts for every race x sex combination.
pd.crosstab(adult["race"], adult["sex"])
sex | Female | Male |
---|---|---|
race | ||
Amer-Indian-Eskimo | 119 | 192 |
Asian-Pac-Islander | 346 | 693 |
Black | 1555 | 1569 |
Other | 109 | 162 |
White | 8642 | 19174 |
# Contingency table: record counts for every income bracket x sex combination.
pd.crosstab(adult["income"], adult["sex"])
sex | Female | Male |
---|---|---|
income | ||
<=50K | 9592 | 15128 |
>50K | 1179 | 6662 |
# Contingency table: record counts for every income bracket x race combination.
pd.crosstab(adult["income"], adult["race"])
race | Amer-Indian-Eskimo | Asian-Pac-Islander | Black | Other | White |
---|---|---|---|---|---|
income | |||||
<=50K | 275 | 763 | 2737 | 246 | 20699 |
>50K | 36 | 276 | 387 | 25 | 7117 |
# Pairwise Pearson correlations between the numeric columns.
# The text columns are excluded up front: pandas >= 2.0 no longer drops
# them automatically and raises when it meets a non-numeric value.
adult.select_dtypes(include="number").corr(method='pearson', min_periods=1)
age | fnlwgt | education-num | capital-gain | capital-loss | hours-per-week | |
---|---|---|---|---|---|---|
age | 1.000000 | -0.076646 | 0.036527 | 0.077674 | 0.057775 | 0.068756 |
fnlwgt | -0.076646 | 1.000000 | -0.043195 | 0.000432 | -0.010252 | -0.018768 |
education-num | 0.036527 | -0.043195 | 1.000000 | 0.122630 | 0.079923 | 0.148123 |
capital-gain | 0.077674 | 0.000432 | 0.122630 | 1.000000 | -0.031615 | 0.078409 |
capital-loss | 0.057775 | -0.010252 | 0.079923 | -0.031615 | 1.000000 | 0.054256 |
hours-per-week | 0.068756 | -0.018768 | 0.148123 | 0.078409 | 0.054256 | 1.000000 |