import pandas as pd
# Record the pandas version this analysis was run against.
# The parenthesised form prints the same text on Python 2 and also parses on
# Python 3, unlike the bare print statement.
print(pd.__version__)
0.12.0
/home/aman/Workspace/ENVSYS/lib/python2.7/site-packages/pytz/__init__.py:35: UserWarning: Module numpy was already imported from /usr/local/anaconda/lib/python2.7/site-packages/numpy/__init__.pyc, but /home/aman/Workspace/ENVSYS/lib/python2.7/site-packages is being added to sys.path from pkg_resources import resource_stream
# Peek at the first lines of the raw export before parsing it.
# The leading "!" is an IPython shell escape, so this runs the system `head`.
!head 'National Program of Cancer Registries, 1999-2010 Incidence.txt'
# Disabled look at the last 55 lines of the same file.
# NOTE(review): presumably how the 55-line footer skipped at load time was
# identified -- confirm.
#!tail -n 55 'National Program of Cancer Registries, 1999-2010 Incidence.txt'
# Load the tab-separated export into a DataFrame.
#   * skipfooter drops the last 55 lines of the file (presumably trailing
#     notes rather than data rows -- confirm against the file).
#   * 'Missing' and 'Not Applicable' cells are parsed as NaN.
# pd.read_csv is the public entry point; pd.io.parsers is an internal module
# path and is not guaranteed to stay importable.
raw_df = pd.read_csv(
    'National Program of Cancer Registries, 1999-2010 Incidence.txt',
    sep='\t',
    skipfooter=55,
    # skipfooter is only implemented by the Python parser, so ask for it
    # explicitly instead of relying on an implicit fallback.
    engine='python',
    na_values=['Missing', 'Not Applicable'],
)
# Check the column names; this also confirms the file was read properly.
# Bare expression: the notebook displays its value as the cell output.
raw_df.columns
Index([u'Notes', u'Leading Cancer Sites', u'Leading Cancer Sites Code', u'Year', u'Year Code', u'State', u'State Code', u'Sex', u'Sex Code', u'Count', u'Population', u'Age-Adjusted Rate', u'Age-Adjusted Rate Lower 95% Confidence Interval', u'Age-Adjusted Rate Upper 95% Confidence Interval'], dtype=object)
# Short snake_case replacements for the original column headers, listed in
# the same left-to-right order as the columns of the loaded frame.
col_names = [
    'notes',
    'cancer_site',
    'cancer_site_code',
    'year',
    'year_code',
    'state',
    'state_code',
    'sex',
    'sex_code',
    'cancer_count',
    'population',
    'age_adj_rate',
    'age_adj_rate_lower_95_confidence',
    'age_adj_rate_upper_95_confidence',
]
# Sanity check: there must be exactly one new name per existing column,
# otherwise the positional rename would fail or mislabel columns.
# The two counts are formatted into one string so the printed text
# ("<n> <m>") is identical on Python 2 and Python 3.
print('%d %d' % (len(col_names), len(raw_df.columns)))
14 14
# Swap in the short names. Assigning to .columns renames by position, so the
# list order has to match the existing column order.
raw_df.columns = col_names
# Echo the result to confirm the rename took effect (print() form works on
# both Python 2 and Python 3).
print(raw_df.columns)
Index([u'notes', u'cancer_site', u'cancer_site_code', u'year', u'year_code', u'state', u'state_code', u'sex', u'sex_code', u'cancer_count', u'population', u'age_adj_rate', u'age_adj_rate_lower_95_confidence', u'age_adj_rate_upper_95_confidence'], dtype=object)
# Narrow the frame to the columns of interest. Relative to the full column
# list this leaves out notes, year_code, sex and the two confidence bounds.
desired_cols = [
    'cancer_site',
    'cancer_site_code',
    'year',
    'state',
    'state_code',
    'sex_code',
    'cancer_count',
    'population',
    'age_adj_rate',
]
df = raw_df[desired_cols]
# Separate the combined rows (site code '00') from the rows describing
# individual cancer sites. The mask is computed once and used both ways.
is_combined = df['cancer_site_code'] == '00'
combined_df = df[is_combined]
df = df[~is_combined]
# Number of distinct cancer_site values in df (displayed as the cell output).
len(df['cancer_site'].unique())
22
# Summary statistics for the age-adjusted rate column.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(df['age_adj_rate'].describe())
count 18873.000000 mean 24.193403 std 34.180612 min 0.000000 25% 6.900000 50% 11.300000 75% 22.000000 max 244.800000 dtype: float64
"""
Plotting cancer occurrences in each cancer site, by sex
"""
# Excerpt holding only the columns needed for the per-site plots. Selecting
# with a list of column names yields a new DataFrame, not a view of df.
tt = df[['cancer_site', 'age_adj_rate', 'sex_code']]
for idx, group in tt.groupby(by='cancer_site'):
    # Create the figure's axes first and hand them to pandas. Calling
    # plt.figure() and then boxplot(by=...) without ax= makes pandas open a
    # second figure, leaving one blank figure behind per cancer site.
    ax = plt.figure().gca()
    # Name the plotted column explicitly instead of relying on pandas
    # picking out the numeric columns.
    group.boxplot(column='age_adj_rate', by='sex_code', ax=ax)
    # Fixed y-range so the sites can be compared side by side; values above
    # 100 are cut off.
    ax.set_ylim(0, 100)
    # idx is the groupby key, i.e. the cancer site name.
    ax.set_title(idx)
"""
I'm curious about the shape of the boxplot for Urinary bladders
"""
# Scratch frame for the quick Urinary Bladder plots: keep only that site's
# rows and the four columns used below, with missing values replaced by 0.
# NOTE(review): zero-filling missing rates pulls the plotted distributions
# toward 0 -- confirm this is intended rather than dropping those rows.
bladder_rows = df[df['cancer_site'] == 'Urinary Bladder']
tt = bladder_rows[['age_adj_rate', 'state', 'cancer_site', 'sex_code']].fillna(0)
# Horizontal boxes, one per sex code.
tt.boxplot(by='sex_code', vert=False, figsize=(8, 4))
plt.title("Urinary Bladder, adjusted rate by sex")
<matplotlib.text.Text at 0x4babd10>
# Horizontal boxes, one per state. tt carries age_adj_rate but no count
# column, so the plot is labelled as adjusted rate rather than count.
tt.boxplot(by='state', figsize=(8, 12), vert=False)
plt.title("Urinary Bladder, adjusted rate by state")
<matplotlib.text.Text at 0x4bc5810>
"""
I want those boxplots sorted, damnit.
"""
# Order the states by their summed age-adjusted rate, ascending.
state_totals = tt.groupby(by='state').age_adj_rate.sum()
# Reorder through a NumPy argsort instead of the in-place Series.sort(),
# which newer pandas releases no longer provide; this form works on both old
# and new versions and gives the same ascending order.
sorted_df = state_totals.iloc[state_totals.values.argsort()]
# State labels in plotting order (lowest total first).
sorted_states = list(sorted_df.index)
# One thin subplot per state, stacked vertically in sorted order and sharing
# the x axis so the boxes are directly comparable.
# squeeze=False always returns a 2-D array of axes; without it a single state
# would yield a bare Axes object and axarr[0] would fail.
fig, axgrid = plt.subplots(
    nrows=len(sorted_states),
    sharex=True,
    squeeze=False,
)
axarr = axgrid[:, 0]
axarr[0].set_title("Urinary Bladder occurrence, age adjusted\nby State")

for ipos, state in enumerate(sorted_states):
    mask = (tt.state == state)
    # Name the plotted column explicitly so exactly one box is drawn,
    # matching the single entry in positions.
    tt[mask].boxplot(
        column='age_adj_rate',
        ax=axarr[ipos],
        positions=[ipos + 1],
        vert=False,
        rot=90,
        widths=0.75,
    )
    # Cosmetics (axis): hide ticks and frame, label the row with the state.
    axarr[ipos].set_yticks([])
    axarr[ipos].set_ylabel('{}'.format(state), rotation='horizontal')
    axarr[ipos].set_frame_on(False)

# Cosmetics (figure): tall canvas with no vertical gap between the rows.
fig.set_size_inches(8, 25)
fig.subplots_adjust(hspace=0)
fig.show()