In [1]:

import pandas as pd
# Record the pandas version this analysis was run against
print(pd.__version__)

0.12.0

/home/aman/Workspace/ENVSYS/lib/python2.7/site-packages/pytz/__init__.py:35: UserWarning: Module numpy was already imported from /usr/local/anaconda/lib/python2.7/site-packages/numpy/__init__.pyc, but /home/aman/Workspace/ENVSYS/lib/python2.7/site-packages is being added to sys.path
  from pkg_resources import resource_stream

In [2]:

# Peek at the first lines of the raw data file before parsing it
!head 'National Program of Cancer Registries, 1999-2010 Incidence.txt'
# Disabled alternative: show the last 55 lines of the same file instead.
# NOTE(review): presumably these are the non-data footer lines that the loading
# cell skips - confirm against the file.
#!tail -n 55 'National Program of Cancer Registries, 1999-2010 Incidence.txt'

In [3]:

# Load the tab-separated incidence table.
# The last `footer_line_count` lines of the file are skipped, and the textual
# placeholders 'Missing' / 'Not Applicable' are read as NaN.
incidence_path = 'National Program of Cancer Registries, 1999-2010 Incidence.txt'
footer_line_count = 55
raw_df = pd.read_csv(
    incidence_path,
    sep='\t',
    skipfooter=footer_line_count,
    na_values=['Missing', 'Not Applicable'],
)

In [4]:

# Show the parsed column labels; seeing the expected header names here is
# also a quick check that the file was read properly
raw_df.columns

Out[4]:

Index([u'Notes', u'Leading Cancer Sites', u'Leading Cancer Sites Code', u'Year', u'Year Code', u'State', u'State Code', u'Sex', u'Sex Code', u'Count', u'Population', u'Age-Adjusted Rate', u'Age-Adjusted Rate Lower 95% Confidence Interval', u'Age-Adjusted Rate Upper 95% Confidence Interval'], dtype=object)

In [5]:

# Short snake_case labels, listed in the same order as the raw header
col_names = [
    'notes',
    'cancer_site', 'cancer_site_code',
    'year', 'year_code',
    'state', 'state_code',
    'sex', 'sex_code',
    'cancer_count', 'population',
    'age_adj_rate',
    'age_adj_rate_lower_95_confidence',
    'age_adj_rate_upper_95_confidence',
]

# Sanity check: the two counts must match before the labels are swapped in
print('{} {}'.format(len(col_names), len(raw_df.columns)))

14 14

In [6]:

# Replace the verbose header with the short labels, then echo the new labels
raw_df.columns = col_names
print(raw_df.columns)

Index([u'notes', u'cancer_site', u'cancer_site_code', u'year', u'year_code', u'state', u'state_code', u'sex', u'sex_code', u'cancer_count', u'population', u'age_adj_rate', u'age_adj_rate_lower_95_confidence', u'age_adj_rate_upper_95_confidence'], dtype=object)

In [7]:

# Keep only the columns used in the rest of the analysis
desired_cols = [
    'cancer_site', 'cancer_site_code',
    'year',
    'state', 'state_code',
    'sex_code',
    'cancer_count', 'population', 'age_adj_rate',
]
selected = raw_df[desired_cols]

# Rows with site code '00' hold the combined data; split them off from the
# rows that describe individual cancer sites
site_code = selected.cancer_site_code
combined_df = selected[site_code == '00']
df = selected[site_code != '00']

In [8]:

# Number of distinct cancer_site values among the per-site rows
len(df['cancer_site'].unique())

Out[8]:

In [9]:

# Summary statistics of the age-adjusted rate over all per-site rows
print(df['age_adj_rate'].describe())

count    18873.000000
mean        24.193403
std         34.180612
min          0.000000
25%          6.900000
50%         11.300000
75%         22.000000
max        244.800000
dtype: float64

In [10]:

"""
Plotting cancer occurrences in each cancer site, by sex
"""
#excerpt (this is a view, right?)
tt = df[['cancer_site', 'age_adj_rate', 'sex_code']]

for idx, group in tt.groupby(by='cancer_site', ):
    plt.figure()
    group.boxplot(by='sex_code')
    plt.ylim((0,100))
    plt.title(idx)

In [11]:

"""
I'm curious about the shape of the boxplot for Urinary bladders
"""
#temporary df for convenience plots
tt = df[['age_adj_rate', 'state', 'cancer_site', 'sex_code']].fillna(0)
tt = tt[tt.cancer_site == 'Urinary Bladder']

tt.boxplot(by='sex_code',figsize=(8,4), vert=False)
plt.title("Urinary Bladder, adjusted rate by sex")

Out[11]:

<matplotlib.text.Text at 0x4babd10>

In [12]:

# Same scratch frame, now grouped by state. The only numeric column in `tt`
# is age_adj_rate (there is no count column), so the title names the
# adjusted rate rather than a count.
tt.boxplot(by='state', figsize=(8, 12), vert=False)
plt.title("Urinary Bladder, adjusted rate by state")

Out[12]:

<matplotlib.text.Text at 0x4bc5810>

In [19]:

"""
I want those boxplots sorted, damnit. 
"""
sorted_df = tt.groupby(by='state').age_adj_rate.sum()
sorted_df.sort()
sorted_states = list(sorted_df.index)

fig, axarr = plt.subplots(nrows=len(sorted_states), sharex=True, )
axarr[0].set_title("Urinary Bladder occurrence, age adjusted\nby State")
for ipos, state in enumerate(sorted_states):
    mask = (tt.state == state)
    tt[mask].boxplot(
            ax=axarr[ipos],
            positions = [ipos+1], 
            vert=False, 
            rot=90, 
            widths=0.75,
            )
    
    #Cosmetics (axis)
    axarr[ipos].set_yticks([])
    axarr[ipos].set_ylabel('{}'.format(state), rotation='horizontal')
    axarr[ipos].set_frame_on(False)
    

#Cosmetics (figure)
fig.set_size_inches(8,25)
fig.subplots_adjust(hspace = 0)

fig.show()

In [ ]: