By David Taylor, www.prooffreader.com (blog), www.dtdata.io (hire me!)
There is a blog post about this notebook at (html).
The CDC's Compressed Mortality Files were downloaded from http://wonder.cdc.gov/mortsql.html. They can't be linked to directly, so I'll describe how to download them here.
On the mortsql.html page, complete the steps below for each of the following three links:
I looked through the ICD-8 through ICD-10 code lists at http://www.wolfbane.com/icd/index.html to find unusual causes of death.
import sys
print(sys.version) #I'm using Python 2.7 because I can't get plotly to work in 3.4
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.plotly as py
import plotly.tools as tls
from plotly.graph_objs import *
def _read_credential(path):
    """Return the contents of the text file at ``path`` without surrounding whitespace.

    The file is opened in a ``with`` block so the handle is closed as soon as
    it has been read. Surrounding whitespace is stripped so that a trailing
    newline in the file is not sent to Plotly as part of the credential.
    """
    with open(path, 'r') as handle:
        return handle.read().strip()


# Sign in to Plotly with credentials kept outside the notebook
# (keep moving, nothing to see here).
py.sign_in(_read_credential('Z:/DT/plotly_emanresu.txt'),
           _read_credential('Z:/DT/plotly_yekipa.txt'))
2.7.8 |Anaconda 2.1.0 (64-bit)| (default, Jul 2 2014, 15:12:11) [MSC v.1500 64 bit (AMD64)]
# Load the three tab-separated CDC exports and fold them, oldest first, into a
# single frame with successive outer merges.
mortality_files = [
    'Compressed Mortality, 1968-1978 per year.txt',
    'Compressed Mortality, 1979-1998 per year.txt',
    'Compressed Mortality, 1999-2012 per year.txt',
]
df = None
for mortality_file in mortality_files:
    period = pd.read_csv(mortality_file, sep='\t', encoding="latin-1")
    df = period if df is None else pd.merge(df, period, 'outer')

# Keep only rows that carry a Year value, then discard the columns that are
# not used anywhere below.
df = df.dropna(subset=['Year']).drop(['Notes', 'Year Code'], axis=1)

# Note that causes of death like "Classical, 000.0" are subsets of groups of
# causes of death; the group names are not included in this table.
df.head()
Cause of death | Cause of death Code | Year | Deaths | Population | Crude Rate | |
---|---|---|---|---|---|---|
3107 | Classical | 000.0 | 1968 | 0 | 0 | 0.0 (Unreliable) |
3108 | Classical | 000.0 | 1969 | 0 | 0 | 0.0 (Unreliable) |
3109 | Classical | 000.0 | 1970 | 0 | 0 | 0.0 (Unreliable) |
3110 | Classical | 000.0 | 1971 | 0 | 0 | 0.0 (Unreliable) |
3111 | Classical | 000.0 | 1972 | 0 | 0 | 0.0 (Unreliable) |
**Note: Plotly is good for making quick stacked bar graphs, but its legends are not very customizable (as far as I could determine, in any case). So I have made versions with and without legends, to be assembled later in Photoshop.**
def _deaths_per_year(cause_rows, years):
    """Return one death count per entry of ``years`` for a single cause.

    ``cause_rows`` holds the rows of one cause of death. For each year the
    ``Deaths`` value of the first matching row is used; a year with no row
    contributes 0.
    """
    counts = []
    for year in years:
        matches = cause_rows.loc[cause_rows['Year'] == year, 'Deaths']
        counts.append(matches.iloc[0] if len(matches) else 0)
    return counts


def plot_graphs(df, search_term, title):
    """Show stacked bar charts of yearly deaths for matching causes of death.

    Parameters:
        df: DataFrame with 'Cause of death', 'Year' and 'Deaths' columns.
        search_term: pattern handed to ``Series.str.contains`` to select the
            causes of death to plot.
        title: title of the legend-free chart.

    Every distinct matching cause becomes one bar series covering 1968-2012.
    Two static images are displayed through ``py.image.ishow``: a 650x250
    chart without a legend, and an autosized chart with the legend. They are
    produced separately so they can be combined by hand later. Nothing is
    returned.
    """
    # na=False treats a missing cause name as "no match" instead of letting a
    # NaN end up in the boolean mask used for row selection.
    dfchart = df[df['Cause of death'].str.contains(search_term, na=False)]
    # A concrete list, so the same kind of object is passed to plotly whether
    # range() returns a list (Python 2) or a lazy range (Python 3).
    years = list(range(1968, 2013))

    bars = []
    for cause in dfchart['Cause of death'].unique():
        # Select this cause's rows once rather than once (twice, originally)
        # for every single year.
        cause_rows = dfchart[dfchart['Cause of death'] == cause]
        bars.append(Bar(
            x=years,
            y=_deaths_per_year(cause_rows, years),
            name=cause
        ))
    data = Data(bars)

    # Compact fixed-size layout without a legend.
    layout_nolegend = Layout(
        autosize=False,
        width=650,
        height=250,
        margin=Margin(
            l=50,
            r=10,
            b=20,
            t=40,
            pad=0
        ),
        barmode='stack',
        title=title,
        yaxis=YAxis(
            title='Deaths per year',
            titlefont=Font(
                size=12
            ),
            tickfont=Font(
                size=12,
            )),
        showlegend=False
    )
    # Autosized layout whose only job is to show the legend.
    layout_legend = Layout(
        autosize=True,
        legend=Legend(
            x=0,
            y=0,
        ),
        barmode='stack',
    )

    fig1 = Figure(data=data, layout=layout_nolegend)
    py.image.ishow(fig1)
    fig2 = Figure(data=data, layout=layout_legend)
    py.image.ishow(fig2)
# Chart every cause of death whose name contains the lowercase text 'caries'.
plot_graphs(df, 'caries', 'Deaths due to Dental Caries in CDC database, 1968-2012')
# '[Pp]enis' accepts the word with either a capital or a lowercase initial.
plot_graphs(df, '[Pp]enis', "Causes of death containing the word 'penis' in CDC database, 1968-2012")
# Select (rather than chart) the rows whose cause contains 'transvest' or 'Transvest'.
df[(df['Cause of death'].str.contains('[Tt]ransvest'))]
Cause of death | Cause of death Code | Year | Deaths | Population | Crude Rate | |
---|---|---|---|---|---|---|
19676 | Transvestitism | 302.3 | 1968 | 0 | 0 | 0.0 (Unreliable) |
19677 | Transvestitism | 302.3 | 1969 | 0 | 0 | 0.0 (Unreliable) |
19678 | Transvestitism | 302.3 | 1970 | 0 | 0 | 0.0 (Unreliable) |
19679 | Transvestitism | 302.3 | 1971 | 0 | 0 | 0.0 (Unreliable) |
19680 | Transvestitism | 302.3 | 1972 | 0 | 0 | 0.0 (Unreliable) |
19681 | Transvestitism | 302.3 | 1973 | 0 | 0 | 0.0 (Unreliable) |
19682 | Transvestitism | 302.3 | 1974 | 0 | 0 | 0.0 (Unreliable) |
19683 | Transvestitism | 302.3 | 1975 | 0 | 0 | 0.0 (Unreliable) |
19684 | Transvestitism | 302.3 | 1976 | 0 | 0 | 0.0 (Unreliable) |
19685 | Transvestitism | 302.3 | 1977 | 0 | 0 | 0.0 (Unreliable) |
19686 | Transvestitism | 302.3 | 1978 | 0 | 0 | 0.0 (Unreliable) |
# Matches 'weather'/'Weather', or ' storm' when the character before the space
# is neither 'r' nor 'o'.
# NOTE(review): the [^ro] guard presumably skips phrases such as "... or storm" - confirm.
plot_graphs(df, '([Ww]eather|[^ro] storm)', "Causes of death contain 'weather' or 'storm' in CDC database, 1968-2012")
plot_graphs(df, '[Mm]igraine', "Causes of death containing word 'migraine', CDC database 1968-2012")
# 'cleft ' followed by 'p' or 'l', i.e. the start of 'cleft palate' / 'cleft lip'.
plot_graphs(df, '[Cc]left [pl]', "Causes of death containing words 'cleft palate'/'cleft lip', CDC database 1968-2012")
plot_graphs(df, '[Ff]oreign body', "Causes of death containing the words 'foreign body' in CDC database, 1968-2012")
plot_graphs(df, '[Ee]lbow', "Causes of death containing the word 'elbow' in CDC database, 1968-2012")
plot_graphs(df, '[Ee]nthesopath', "Causes of death containing the word 'enthesopathy' in CDC database, 1968-2012")
# df2 is df without the rows whose cause of death contains 'pedestrian'.
df2 = df[~df['Cause of death'].str.contains('pedestrian')]
plot_graphs(df2, '[Cc]onjunctivitis', "Causes of death containing the word 'conjunctivitis' in CDC database, 1968-2012")
plot_graphs(df, '[Dd]og', "Causes of death containing the word 'dog' in CDC database, 1968-2012")
# Requires a space before 'war' and one further character after it that is not
# 't' or 'f', so a cause name that ends in ' war' is not matched.
# NOTE(review): the [^tf] guard presumably excludes longer words starting with 'wart'/'warf' - confirm.
plot_graphs(df, ' war[^tf]', "Causes of death containing the word 'war' in CDC database, 1968-2012")
plot_graphs(df, '[Pp]soriasis', "Causes of death containing the word 'psoriasis' in CDC database, 1968-2012")
plot_graphs(df, '[Ii]ngrow', "Cause of death by ingrowing nail in CDC database, 1968-2012")
# Select only the ingrowing-nail rows that record at least one death.
df[(df['Cause of death'].str.contains('ngrow'))&(df.Deaths>0)]
Cause of death | Cause of death Code | Year | Deaths | Population | Crude Rate | |
---|---|---|---|---|---|---|
31029 | Ingrowing nail | 703.0 | 1976 | 1 | 217615788 | 0.0 (Unreliable) |
31030 | Ingrowing nail | 703.0 | 1977 | 1 | 219808632 | 0.0 (Unreliable) |
31032 | Ingrowing nail | 703.0 | 1991 | 1 | 253088068 | 0.0 (Unreliable) |
31033 | Ingrowing nail | 703.0 | 1984 | 1 | 235922142 | 0.0 (Unreliable) |
31034 | Ingrowing nail | L60.0 | 2000 | 1 | 281421906 | 0.0 (Unreliable) |
31035 | Ingrowing nail | L60.0 | 2011 | 1 | 311591917 | 0.0 (Unreliable) |
plot_graphs(df, '[Ss]pacecraft', "Causes of death containing the word 'spacecraft' in CDC database, 1968-2012")
plot_graphs(df, '[Aa]nimal', "Causes of death containing the word 'animal' in CDC database, 1968-2012")
# df2 is df without the rows whose cause of death contains 'pedestrian'.
df2 = df[~df['Cause of death'].str.contains('pedestrian')]
# The pattern is a substring match, so '[Bb]ee' and '[Dd]og' also hit any
# longer word that contains those letters.
plot_graphs(df2, '([Aa]nimal|[Bb]ee|[Dd]og)', "Causes of death by 'animal'/'dog'/'bee' in CDC database, 1968-2012")
# No cats: drop causes where 'cat' follows a non-space character (i.e. sits
# inside a longer word), then drop causes containing 'cath', and finally list
# the distinct remaining causes that still contain lowercase 'cat'.
df2 = df[~df['Cause of death'].str.contains('[^ ]cat')]
df2 = df2[~df2['Cause of death'].str.contains('cath')]
df2[df2['Cause of death'].str.contains('cat')]['Cause of death'].unique()
array([u'Congenital cataract', u'Senile cataract', u'Unspecified cataract', u'Congenital cataract and lens anomalies', u'Defects of catalase and peroxidase', u'Organic catatonic disorder', u'Narcolepsy and cataplexy', u'Victim of cataclysmic storm'], dtype=object)
# '[Ss]yndactyl' matches any cause name containing that stem, with either a
# capital or a lowercase initial.
plot_graphs(df, '[Ss]yndactyl', "Causes of death containing the word 'syndactyly' in CDC database, 1968-2012")