To accomplish in this problem set
Do pandas imports and set up variables for accessing census data sets
import pandas as pd
from pandas import Series, DataFrame
import os
DATA_DIR = os.path.join(os.pardir, "data")
# see what's in the directory using os.walk
list(os.walk(os.path.join(DATA_DIR, "census")))
[('../data/census', [], ['.DS_Store', 'DataDict.txt', 'DataSet.txt', 'FIPS_CountyName.txt', 'README.txt', 'st99_d00.dbf', 'st99_d00.shp', 'st99_d00.shx'])]
# do shell command for linux/mac -- sorry windows
!ls $DATA_DIR/census/*
../data/census/DataDict.txt ../data/census/FIPS_CountyName.txt ../data/census/st99_d00.dbf ../data/census/st99_d00.shx ../data/census/DataSet.txt ../data/census/README.txt ../data/census/st99_d00.shp
import os
# relative to parent dir
DATA_FILES = {"datadict":"data/census/DataDict.txt",
"dataset":"data/census/DataSet.txt",
"fips": "data/census/FIPS_CountyName.txt"}
def file_path(key):
return os.path.join(os.pardir, DATA_FILES[key])
for file_key in DATA_FILES.keys():
abs_fname = file_path(file_key)
assert os.path.exists(abs_fname) is True
import codecs
from itertools import islice
from pandas import Series, DataFrame
f = codecs.open(file_path("fips"), encoding='iso-8859-1')
fips_list = list()
for row in islice(f, None):
fips_list.append({'fips': row[:5], 'geog_entity': row[6:-1]})
fips_df = DataFrame(fips_list)
# print out first 5 rows
fips_df[:5]
fips | geog_entity | |
---|---|---|
0 | 00000 | UNITED STATES |
1 | 01000 | ALABAMA |
2 | 01001 | Autauga County, AL |
3 | 01003 | Baldwin County, AL |
4 | 01005 | Barbour County, AL |
e.g., 06 fo California
# FILL IN YOUR CODE
fips_df["fips_prefix"] = fips_df.fips.str[:2]
# TEST
assert np.all(fips_df[fips_df.fips == '00000']['fips_prefix'] == '00')
assert np.all(fips_df[fips_df.fips == '06000']['fips_prefix'] == '06')
assert np.all(fips_df[fips_df.fips == '06001']['fips_prefix'] == '06')
# FILL IN YOUR CODE
fips_df["geog_type"] = np.where(fips_df.fips.str[-3:] != '000', 'county', np.where(fips_df.fips.str[:2] == '00', 'country', 'state'))
# TEST
# check specific values
assert fips_df[fips_df.fips == '00000']['geog_type'] == 'country'
assert fips_df[fips_df.fips == '06000']['geog_type'] == 'state'
assert fips_df[fips_df.fips == '06001']['geog_type'] == 'county'
# check the numbers of various geog_type
assert set(fips_df.geog_type.value_counts().iteritems()) == set([('state', 51), ('country', 1), ('county', 3143)])
# let's try pd.read_csv
# how to give hints about data type to pd.read_csv?
import codecs
# let's try object for fips and int for POP010210
dtype = {'fips':'S5', 'POP010210':np.int}
f = codecs.open(file_path("dataset"), encoding='iso-8859-1')
dataset_df = pd.read_csv(f, dtype=dtype)
df = pd.merge(fips_df, dataset_df)
df[['fips', 'geog_entity', 'geog_type', 'POP010210']][:5]
fips | geog_entity | geog_type | POP010210 | |
---|---|---|---|---|
0 | 00000 | UNITED STATES | country | 308745538 |
1 | 01000 | ALABAMA | state | 4779736 |
2 | 01001 | Autauga County, AL | county | 54571 |
3 | 01003 | Baldwin County, AL | county | 182265 |
4 | 01005 | Barbour County, AL | county | 27457 |
Requirement for top_five_states_by_pop
DataFrame:
geog_entity
, POP010210
Hints:
sort_index
# FILL IN YOUR CODE
top_five_states_by_pop = df[df.geog_type=='state'][['geog_entity', 'POP010210']].sort_index(by='POP010210')[::-1][:5].copy()
top_five_states_by_pop
geog_entity | POP010210 | |
---|---|---|
191 | CALIFORNIA | 37253956 |
2567 | TEXAS | 25145561 |
1861 | NEW YORK | 19378102 |
330 | FLORIDA | 18801310 |
609 | ILLINOIS | 12830632 |
# TEST
# 5 rows, column values, and specific values
assert len(top_five_states_by_pop) == 5
assert set(top_five_states_by_pop.columns) == set(['POP010210', 'geog_entity'])
assert list(top_five_states_by_pop[['geog_entity', 'POP010210']].values.flatten()) == [u'CALIFORNIA',
37253956L,
u'TEXAS',
25145561L,
u'NEW YORK',
19378102L,
u'FLORIDA',
18801310L,
u'ILLINOIS',
12830632L]
Purpose: construct state_pop DataFrame to use for plotting
state_pop = df[df.geog_type=='state']['POP010210'].copy()
# use sort to do in-place sort
state_pop.sort()
state_pop[:5]
3171 563626 328 601723 2852 625741 2025 672591 69 710231 Name: POP010210
state_pop.order()[::-1]
state_pop.sum()
308745538
Some specific values to be plotted:
0 0.120662
1 0.202107
2 0.264871
3 0.325766
....
....
47 0.994199
48 0.996226
49 0.998174
50 1.000000
Consider using cumsum in relation to state_pop
.
Your graph should look something like the following:
# some verification of the numbers
from pandas import Series
k = Series(state_pop[::-1].cumsum().div(float(state_pop.sum())).values, index= arange(51))
k
0 0.120662 1 0.202107 2 0.264871 3 0.325766 4 0.367324 5 0.408466 6 0.445831 7 0.477844 8 0.509221 9 0.540106 10 0.568582 11 0.594496 12 0.616277 13 0.637484 14 0.658484 15 0.679188 16 0.699742 17 0.719140 18 0.737840 19 0.756259 20 0.773438 21 0.789727 22 0.805209 23 0.820190 24 0.834873 25 0.848928 26 0.861336 27 0.873487 28 0.885063 29 0.894930 30 0.904540 31 0.913985 32 0.923226 33 0.932178 34 0.940925 35 0.947594 36 0.953596 37 0.959511 38 0.964589 39 0.968994 40 0.973297 41 0.977561 42 0.980970 43 0.984175 44 0.987083 45 0.989720 46 0.992020 47 0.994199 48 0.996226 49 0.998174 50 1.000000 Length: 51
# FILL IN WITH YOUR CODE TO GENERATE PLOT
state_pop[::-1].cumsum().div(float(state_pop.sum())).plot(use_index=False, style="o")
<matplotlib.axes.AxesSubplot at 0x4f05910>
First, consider a simple example bar graph to learn from
import string
num_letter_series = Series(arange(10), index=list(string.lowercase[:10]))
num_letter_series
a 0 b 1 c 2 d 3 e 4 f 5 g 6 h 7 i 8 j 9
num_letter_series.plot(kind="bar")
<matplotlib.axes.AxesSubplot at 0x56c5f50>
# FILL IN WITH YOUR CODE TO GENERATE BAR PLOT
Series(df[df.geog_type=='state']['POP010210'].values, index= df[df.geog_type=='state']['geog_entity'].values).order()[::-1].plot(kind="bar")
<matplotlib.axes.AxesSubplot at 0x4f0d9f0>
state_pop2 = df[df.geog_type=='state'][['geog_entity', 'POP010210']].sort_index(by='POP010210')[::-1]
state_pop2.set_index('geog_entity', inplace=True, drop=True)
/Users/raymondyee/.virtualenvs/epd1/lib/python2.7/site-packages/pandas/core/frame.py:2762: FutureWarning: set_index with inplace=True will return None from pandas 0.11 onward " from pandas 0.11 onward", FutureWarning)
POP010210 | |
---|---|
geog_entity | |
CALIFORNIA | 37253956 |
TEXAS | 25145561 |
NEW YORK | 19378102 |
FLORIDA | 18801310 |
ILLINOIS | 12830632 |
PENNSYLVANIA | 12702379 |
OHIO | 11536504 |
MICHIGAN | 9883640 |
GEORGIA | 9687653 |
NORTH CAROLINA | 9535483 |
NEW JERSEY | 8791894 |
VIRGINIA | 8001024 |
WASHINGTON | 6724540 |
MASSACHUSETTS | 6547629 |
INDIANA | 6483802 |
ARIZONA | 6392017 |
TENNESSEE | 6346105 |
MISSOURI | 5988927 |
MARYLAND | 5773552 |
WISCONSIN | 5686986 |
MINNESOTA | 5303925 |
COLORADO | 5029196 |
ALABAMA | 4779736 |
SOUTH CAROLINA | 4625364 |
LOUISIANA | 4533372 |
KENTUCKY | 4339367 |
OREGON | 3831074 |
OKLAHOMA | 3751351 |
CONNECTICUT | 3574097 |
IOWA | 3046355 |
MISSISSIPPI | 2967297 |
ARKANSAS | 2915918 |
KANSAS | 2853118 |
UTAH | 2763885 |
NEVADA | 2700551 |
NEW MEXICO | 2059179 |
WEST VIRGINIA | 1852994 |
NEBRASKA | 1826341 |
IDAHO | 1567582 |
HAWAII | 1360301 |
MAINE | 1328361 |
NEW HAMPSHIRE | 1316470 |
RHODE ISLAND | 1052567 |
MONTANA | 989415 |
DELAWARE | 897934 |
SOUTH DAKOTA | 814180 |
ALASKA | 710231 |
NORTH DAKOTA | 672591 |
VERMONT | 625741 |
DISTRICT OF COLUMBIA | 601723 |
WYOMING | 563626 |
font = {'family' : 'normal',
'weight' : 'normal',
'size' : 7}
plt.rc('font', **font)
state_pop_total
that sums up the populations for all counties in the state. state_pop_total
must be indexed on fips_prefix
. Hint: do a group_by
operation on df
and then sort bydescending population.
extracted_state_pops
that is indexed by fips_prefix
and holds the total populationas stated for given state in df
. Make extracted_state_pops
sorted by descending population also
# FILL IN WITH YOUR CODE for state_pop_total and extracted_state_pops
state_pop_total = df[df.geog_type == 'county'][['fips_prefix', 'POP010210']].groupby('fips_prefix').sum().sort_index(by='POP010210')[::-1]
extracted_state_pops = df[df.geog_type == 'state'][['fips_prefix', 'POP010210']].set_index('fips_prefix').sort_index(by='POP010210')[::-1]
# TEST
assert isinstance(state_pop_total, DataFrame)
assert map(unicode, list(state_pop_total.index)) == [u'06', u'48', u'36', u'12', u'17', u'42', u'39', u'26',
u'13', u'37', u'34', u'51', u'53', u'25', u'18', u'04', u'47', u'29', u'24', u'55',
u'27', u'08', u'01', u'45', u'22', u'21', u'41', u'40', u'09', u'19', u'28',
u'05', u'20', u'49', u'32', u'35', u'54', u'31', u'16', u'15', u'23', u'33', u'44',
u'30', u'10', u'46', u'02', u'38', u'50', u'11', u'56']
assert set(state_pop_total.columns) == set(['POP010210'])
assert state_pop_total[:5].sum() == 113409561
# compare all the state totals
assert np.all(state_pop_total['POP010210'] == extracted_state_pops['POP010210'])
# Series must be sorted in same order for comparison to work -- the following is False
np.all(state_pop_total['POP010210'].order() == extracted_state_pops['POP010210'])
False
# let's shuffle the index
import random
shuffle_index = list(state_pop_total.index)
random.shuffle(shuffle_index)
print shuffle_index
[u'41', u'02', u'25', u'06', u'30', u'47', u'53', u'54', u'17', u'38', u'40', u'21', u'50', u'28', u'12', u'44', u'01', u'46', u'04', u'24', u'18', u'16', u'20', u'22', u'49', u'32', u'51', u'56', u'13', u'10', u'09', u'37', u'23', u'29', u'48', u'08', u'35', u'39', u'36', u'26', u'42', u'33', u'15', u'19', u'34', u'05', u'45', u'27', u'31', u'55', u'11']
# use reindex to create a shuffled Series
shuffled_series = state_pop_total['POP010210'].reindex(index=shuffle_index)
shuffled_series[:5]
41 3831074 02 710231 25 6547629 06 37253956 30 989415 Name: POP010210
# When you insert these three Series into a common DataFrame, you can do comparisons across the columns
comp = DataFrame(index=extracted_state_pops.index)
comp['extracted'] = extracted_state_pops['POP010210']
comp['total'] = state_pop_total['POP010210']
comp['shuffle'] = shuffled_series
np.all(comp['extracted'] == comp['total'])
True
np.all(comp['shuffle'] == comp['total'])
True