#!/usr/bin/env python
# coding: utf-8
#
# WHO Mortality Database (ICD-10) parser: reads the raw WHO mortality and
# country-code files plus UN WPP population spreadsheets, normalizes age
# bands across reporting formats, and emits per-country JSON/zip bundles
# and cause-hierarchy files for a visualization front end.
#
# NOTE(review): this is a Python 2 notebook export (print statements,
# reload(sys)/setdefaultencoding, StringIO, file(), dict.iteritems) and
# will not run under Python 3 without porting.

# In[1]:

import numpy as np, requests, zipfile, StringIO, pandas as pd, json, copy

# In[2]:

#data source WHO Mortality database
#http://www.who.int/healthinfo/statistics/mortality_rawdata/en/

# In[3]:

# you might want to skip this cell, depending on your system configuration
import sys
reload(sys)
sys.setdefaultencoding('utf8')

# In[4]:

#read country codes (WHO numeric country code -> country name)
z = zipfile.ZipFile('country_codes.zip')
cc = pd.read_csv(z.open('country_codes'),low_memory=False).set_index('country')

# In[5]:

def part1():
    """Load part 1 of the WHO ICD-10 mortality table.

    Keeps only country-level rows (no Admin1/SubDiv breakdown) coded with
    condensed cause list '104', and drops the detailed infant-mortality
    columns. Returns the filtered DataFrame.
    """
    #read data
    z = zipfile.ZipFile('morticd10_part1.zip')
    df = pd.read_csv(z.open('Morticd10_part1'),low_memory=False)
    #filter out non-country (subdivision) data
    df = df[np.isnan(df['Admin1'])]
    df = df[np.isnan(df['SubDiv'])]
    df = df.drop(['Admin1','SubDiv'],axis=1)
    #filter out data after ICD10 classification
    df = df[df['List']=='104']
    df = df.drop(['List'],axis=1)
    #filter out non-detailed mortality
    #df = df[df['Frmat']==1]
    #df = df.drop(['Frmat'],axis=1)
    #filter out detailed infant mortality
    #df = df[df['IM_Frmat']==8]
    df = df.drop(['IM_Frmat'],axis=1)
    df=df.drop(['IM_Deaths1'],axis=1).\
        drop(['IM_Deaths2'],axis=1).\
        drop(['IM_Deaths3'],axis=1).\
        drop(['IM_Deaths4'],axis=1)
    return df

# In[6]:

def part2():
    """Load part 2 of the WHO ICD-10 mortality table.

    Same filtering as part1(), except the subdivision filter keeps every
    row whose SubDiv is not 'A30' instead of requiring SubDiv to be NaN.
    NOTE(review): that asymmetry with part1() may be deliberate for this
    file's contents -- confirm against the WHO file documentation.
    """
    #read data
    z = zipfile.ZipFile('morticd10_part2.zip')
    df2 = pd.read_csv(z.open('Morticd10_part2'),low_memory=False)
    #filter out non-country (subdivision) data
    df2 = df2[np.isnan(df2['Admin1'])]
    df2 = df2[df2['SubDiv']!='A30']
    df2 = df2.drop(['Admin1','SubDiv'],axis=1)
    #filter out data after ICD10 classification
    df2 = df2[df2['List']=='104']
    df2 = df2.drop(['List'],axis=1)
    #filter out non-detailed mortality
    #df2 = df2[df2['Frmat']==1]
    #df2 = df2.drop(['Frmat'],axis=1)
    #filter out detailed infant mortality
    #df2 = df2[df2['IM_Frmat']==8]
    df2 = df2.drop(['IM_Frmat'],axis=1)
    df2=df2.drop(['IM_Deaths1'],axis=1).\
        drop(['IM_Deaths2'],axis=1).\
        drop(['IM_Deaths3'],axis=1).\
        drop(['IM_Deaths4'],axis=1)
    return df2

# In[7]:

def load_df(index):
    """Merge both mortality parts, index by *index*, rename the death
    columns to age labels (0..4 single years, then 5-year bands 5..95),
    and harmonize age formats (Frmat 0/1/2) into one frame.

    For Frmat 1 and 2 the open-ended 85+ band is split into 85/90/95 in a
    fixed 6:3:1 ratio; Frmat 2 additionally spreads its combined 1-4 band
    evenly over ages 1,2,3,4.
    """
    #merge the two dataframes
    df = pd.concat([part1(),part2()])
    #set index
    df = df.set_index(index)
    #set column names
    df.columns=['Deaths1']+range(5)+list(np.arange(1,20)*5)+['Deaths26']
    #normalize data
    dg0=df.loc[0].copy()
    dg1=df.loc[1].copy()
    dg1[90]=df.loc[1][85]*3/10.0
    dg1[95]=df.loc[1][85]*1/10.0
    # NOTE(review): the line below reads df.loc[2][85] while every other
    # dg1 line reads df.loc[1][85] -- looks like a copy-paste bug; confirm
    # whether it should be df.loc[1][85]*6/10.0.
    dg1[85]=df.loc[2][85]*6/10.0
    dg2=df.loc[2].copy()
    dg2[90]=df.loc[2][85]*3/10.0
    dg2[95]=df.loc[2][85]*1/10.0
    dg2[85]=df.loc[2][85]*6/10.0
    dg2[2]=df.loc[2][1]*1/4.0
    dg2[3]=df.loc[2][1]*1/4.0
    dg2[4]=df.loc[2][1]*1/4.0
    dg2[1]=df.loc[2][1]*1/4.0
    return pd.concat([dg0,dg1,dg2])

# In[8]:

#Load df for processing
df=load_df(['Frmat','Country','Year','Cause','Sex'])

# In[9]:

#scrape ISO numeric country codes from geonames (Country -> ISONUM)
countries=pd.read_html('http://www.geonames.org/countries/',header=0)[1]
countries.columns=['ISO2','ISO3','ISONUM','FIPS','Country','Capital','Area','Population','Continent']
countries.set_index('Country',drop=True,inplace=True)
countries=countries['ISONUM']
countries.head(5)

# In[10]:

#reverse lookup: ISO numeric code -> country name
ch={}
for i in countries.index:
    try:
        ch[countries.loc[i]]=i
    except:
        pass

# In[11]:

def ccv(f):
    """Map a WHO country name to the geonames spelling used above.

    NOTE(review): u'R\\xc3\\xa9union' is mojibake (UTF-8 bytes of
    'Réunion' inside a unicode literal); it matches the same mojibake key
    used in hun() below, so it is self-consistent -- confirm before
    normalizing.
    """
    if f=='Russian Federation':return 'Russia'
    elif f=='Brunei Darussalam':return 'Brunei'
    elif f=='Reunion':return u'R\xc3\xa9union'
    elif f=='Saint Vincent and Grenadines':return 'Saint Vincent and the Grenadines'
    elif f=='United States of America':return 'United States'
    elif f=='Virgin Islands (USA)':return 'U.S. Virgin Islands'
    elif f=='Hong Kong SAR':return 'Hong Kong'
    elif f=='Republic of Korea':return 'South Korea'
    elif f=='Republic of Moldova':return 'Moldova'
    elif f=='Serbia and Montenegro, Former':return 'Serbia'
    else: return f

#collect WHO country codes with no geonames match (printed for review)
todrop=set()
for i in list(df.index.levels[0]):
    if ccv(cc.loc[i][0]) not in list(countries.index):
        print cc.loc[i][0]
        todrop.add(i)

# In[12]:

#ICD cause-code lookup table (Hungarian-labelled variant)
#icd=pd.read_excel('icd.xlsx').set_index('code')
icd=pd.read_excel('icd_hun.xlsx',).set_index('code')

# In[13]:

#boundaries of the top-level ICD cause groups; 2000 is a sentinel
igroup=[1000, 1001, 1026, 1048, 1051, 1055, 1058, 1062, 1063, 1064, 1072, 1078, 1082, 1083, 1084, 1087, 1092, 1093, 1094, 1095, 2000]#protector dummy

# In[14]:

#create hierarchy of diseases
#numbers first
hierarchy={}
currenti=0
for k in icd.T.iteritems():
    i=k[0]
    if i>igroup[currenti]:
        currenti+=1
    if igroup[currenti] not in hierarchy:
        hierarchy[igroup[currenti]]={'sub':{}}
    # NOTE(review): the source file is truncated here -- the text jumps
    # from "if i" straight into "1985:" of a later cell. The rest of the
    # hierarchy builder, the get_parent/get_group helpers, and the loading
    # of the population frame `pop` and dict `pp` (all referenced below and
    # in later cells) are missing, as is the `try:` matching the trailing
    # `except`. "if i1985:" is not valid syntax; reproduced as found.
    if i1985: #no mortality data before that
        if str(p[0][1]) not in pp[str(country_id)]:pp[str(country_id)][str(p[0][1])]={}
        if str(p[0][1]) not in pp['007']:pp['007'][str(p[0][1])]={}
        if p[0][2]>1:g='f'
        else: g='m'
        if g not in pp[str(country_id)][str(p[0][1])]:pp[str(country_id)][str(p[0][1])][g]={}
        if g not in pp['007'][str(p[0][1])]:pp['007'][str(p[0][1])][g]={}
        for j in pop.columns:
            v=p[1][j]
            if np.isnan(p[1][j]):v=0
            #per-country values stored as strings; '007' accumulates the
            #world total as numbers (converted to str in the next cell)
            pp[str(country_id)][str(p[0][1])][g][str(j)]=str(v)
            if str(j) not in pp['007'][str(p[0][1])][g]:pp['007'][str(p[0][1])][g][str(j)]=0
            pp['007'][str(p[0][1])][g][str(j)]+=v
    except: pass

# In[26]:

#convert the accumulated world totals to str, matching per-country entries
for y in pp['007']:
    for g in pp['007'][y]:
        for a in pp['007'][y][g]:
            pp['007'][y][g][a]=str(pp['007'][y][g][a])

# Alternative population counter

# In[27]:

#load pop data from http://esa.un.org/unpd/wpp/DVD/Files/1_Indicators%20(Standard)/EXCEL_FILES/1_Population/WPP2015_POP_F15_2_ANNUAL_POPULATION_BY_AGE_MALE.XLS
wd=pd.read_excel('WPP2015_POP_F15_2_ANNUAL_POPULATION_BY_AGE_MALE.XLS',skiprows=16)
wd=wd.drop(['Major area, region, country or area *','Index','Variant','Notes','80+'],axis=1)
wd.columns=['Country','Year']+list(np.arange(21)*5)
wd=wd.set_index(['Country','Year'])
#load the matching FEMALE spreadsheet (the URL comment above points at the
#male file; the female one sits alongside it)
wf=pd.read_excel('WPP2015_POP_F15_3_ANNUAL_POPULATION_BY_AGE_FEMALE.XLS',skiprows=16)
wf=wf.drop(['Major area, region, country or area *','Index','Variant','Notes','80+'],axis=1)
wf.columns=['Country','Year']+list(np.arange(21)*5)
wf=wf.set_index(['Country','Year'])

# In[28]:

#fill missing years and aggregates: spread the 0-4 band evenly over ages
#0..4, fold the 100+ column into 95 (order matters: wd[0] is rewritten last)
wd[2]=wd[0]*1/5.0
wd[3]=wd[0]*1/5.0
wd[4]=wd[0]*1/5.0
wd[1]=wd[0]*1/5.0
wd[0]=wd[0]*1/5.0
wd[95]=wd[95]+wd[100]
wd=wd.drop(100,axis=1)
#fill missing years and aggregates
wf[2]=wf[0]*1/5.0
wf[3]=wf[0]*1/5.0
wf[4]=wf[0]*1/5.0
wf[1]=wf[0]*1/5.0
wf[0]=wf[0]*1/5.0
wf[95]=wf[95]+wf[100]
wf=wf.drop(100,axis=1)

# In[29]:

#build wdp: country -> year -> gender -> age -> population (as str,
#scaled x1000 since WPP reports thousands)
wdp={}
for p in wd.T.iteritems():
    try:
        country_id=p[0][0]
        if str(country_id) not in wdp: wdp[str(country_id)]={}
        if p[0][1]>1985: #no mortality data before that
            if str(p[0][1]) not in wdp[str(country_id)]:wdp[str(country_id)][str(p[0][1])]={}
            g='m'
            if g not in wdp[str(country_id)][str(p[0][1])]:wdp[str(country_id)][str(p[0][1])][g]={}
            for j in wd.columns:
                v=p[1][j]
                if np.isnan(p[1][j]):v=0
                wdp[str(country_id)][str(p[0][1])][g][str(j)]=str(v*1000.0)
    except:
        pass
for p in wf.T.iteritems():
    try:
        country_id=p[0][0]
        if str(country_id) not in wdp: wdp[str(country_id)]={}
        if p[0][1]>1985: #no mortality data before that
            if str(p[0][1]) not in wdp[str(country_id)]:wdp[str(country_id)][str(p[0][1])]={}
            g='f'
            if g not in wdp[str(country_id)][str(p[0][1])]:wdp[str(country_id)][str(p[0][1])][g]={}
            #NOTE(review): iterates wd.columns inside the wf loop; harmless
            #only because both frames got identical columns above -- confirm
            for j in wd.columns:
                v=p[1][j]
                if np.isnan(p[1][j]):v=0
                wdp[str(country_id)][str(p[0][1])][g][str(j)]=str(v*1000.0)
    except:
        pass

# Data parser

# In[ ]:

#per-country export at full cause detail: db/<iso>.zip containing
#data.json = [{a:age, c:cause, t:year, s:deaths, g:gender}, ...];
#also builds hierarchy2 (cause -> parent/group) as a side effect
hierarchy2={}
c={}
for country in df.index.get_level_values('Country').unique():
    if ccv(cc.loc[country][0]) not in ['Netherlands Antilles',
                                       'United Kingdom, England and Wales',
                                       'United Kingdom, Scotland',
                                       'United Kingdom, Northern Ireland',
                                       'Rodrigues']:
        try:
            country_id=countries.loc[ccv(cc.loc[country][0])]
            print country_id,ccv(cc.loc[country][0])
            c[country_id]=cc.loc[country][0]
            data={}
            data3=[]
            dk=df.loc[country].drop(['Deaths1','Deaths26'],axis=1)
            dk.columns=range(5)+list(np.arange(1,20)*5)
            for i in dk.stack().iteritems():
                if i[0][2]>1:gender='f'
                else: gender='m'
                cause=i[0][1].strip()
                #substring test: skips the all-causes totals ('AAA' and its
                #prefixes); any longer code passes
                if cause not in 'AAA':
                    key='A'+str(i[0][3])+'C'+str(cause)+'T'+str(i[0][0])
                    if key not in data: data[key]={}
                    data[key]['a']=i[0][3]
                    data[key]['c']=cause
                    data[key]['t']=i[0][0]
                    data[key]['s']=i[1]
                    data[key]['g']=gender
                    if cause not in hierarchy2:
                        hierarchy2[cause]={}
                        cause2=cause[:3]
                        hierarchy2[cause]["cause2"]=cause2
                        #get_parent/get_group come from the truncated cell above
                        hierarchy2[cause]["parent"]=get_parent(cause2)
                        hierarchy2[cause]["group"]=get_group(cause2)
                    data3.append(data[key])
            file('db/data.json','w').write(json.dumps(data3))
            try:
                import zlib
                compression = zipfile.ZIP_DEFLATED
            except:
                compression = zipfile.ZIP_STORED
            zf = zipfile.ZipFile('db/'+str(country_id)+'.zip', mode='w')
            zf.write('db/data.json','data.json',compress_type=compression)
            zf.close()
        except:
            print 'error',country_id,ccv(cc.loc[country][0])

# In[563]:

file('hierarchy.json','w').write(json.dumps(hierarchy2)) #only do once ever, dont overwrite!
# In[76]:

#per-country export aggregated to parent causes: db2/<iso>.zip; records are
#keyed by age+cause+year+gender and deaths are summed across child causes;
#builds hierarchy3 (flat: cause is its own parent) as a side effect
hierarchy3={}
c={}
for country in df.index.get_level_values('Country').unique():
    if ccv(cc.loc[country][0]) not in ['Netherlands Antilles',
                                       'United Kingdom, England and Wales',
                                       'United Kingdom, Scotland',
                                       'United Kingdom, Northern Ireland',
                                       'Rodrigues']:
        try:
            country_id=countries.loc[ccv(cc.loc[country][0])]
            print country_id,ccv(cc.loc[country][0])
            c[country_id]=cc.loc[country][0]
            data={}
            data3=[]
            dk=df.loc[country].drop(['Deaths1','Deaths26'],axis=1)
            dk.columns=range(5)+list(np.arange(1,20)*5)
            for i in dk.stack().iteritems():
                if i[0][2]>1:gender='f'
                else: gender='m'
                cause=i[0][1].strip()
                #skip the all-causes totals ('AAA' and its prefixes)
                if cause not in 'AAA':
                    #roll the detailed code up to its parent (defined in a
                    #truncated cell earlier in the file)
                    cause=get_parent(i[0][1].strip()[:3])
                    key='A'+str(i[0][3])+'C'+str(cause)+'T'+str(i[0][0])+'G'+gender
                    if key not in data: data[key]={}
                    data[key]['a']=i[0][3]
                    data[key]['c']=cause
                    data[key]['g']=gender
                    data[key]['t']=i[0][0]
                    if 's' not in data[key]:data[key]['s']=0
                    data[key]['s']+=i[1]
                    if cause not in hierarchy3:
                        hierarchy3[cause]={}
                        cause2=cause
                        hierarchy3[cause]["cause2"]=cause
                        hierarchy3[cause]["parent"]=cause
                        hierarchy3[cause]["group"]=get_group(cause2)
            for key in data:
                data3.append(data[key])
            file('db2/data.json','w').write(json.dumps(data3))
            try:
                import zlib
                compression = zipfile.ZIP_DEFLATED
            except:
                compression = zipfile.ZIP_STORED
            zf = zipfile.ZipFile('db2/'+str(country_id)+'.zip', mode='w')
            zf.write('db2/data.json','data.json',compress_type=compression)
            zf.close()
        except:
            pass#print 'error',country_id,ccv(cc.loc[country][0])

# In[565]:

file('hierarchy2.json','w').write(json.dumps(hierarchy3)) #only do once ever, dont overwrite!

# In[71]:

#run once!
#save country population for which there is mortality data
mdata=['900']
for country in df.index.get_level_values('Country').unique():
    if ccv(cc.loc[country][0]) not in ['Netherlands Antilles',
                                       'United Kingdom, England and Wales',
                                       'United Kingdom, Scotland',
                                       'United Kingdom, Northern Ireland',
                                       'Rodrigues']:
        try:
            country_id=countries.loc[ccv(cc.loc[country][0])]
            mdata.append(str(country_id)) #append country id to list of available countries
        except: print 'error',country_id,ccv(cc.loc[country][0])
#drop population entries for countries without mortality data
for c in wdp.keys():
    if c not in mdata:
        wdp.pop(c);
file('pop.json','w').write(json.dumps(wdp)) #only do once ever, dont overwrite!
file('wpop.json','w').write(json.dumps(wdp)) #only do once ever, dont overwrite!

# In[94]:

pp['32'].keys()

# In[254]:

#get scaler of world vs. actual data: mean ratio of WHO-covered population
#('007' totals from pp) to the UN world population (wdp['900'])
dr=[]
for y in range(1990,2005): #years with best overall data availability
    yr=str(y)
    for g in ['f','m']:
        for i in pp['007'][yr][g].keys():
            dr.append(float(pp['007'][yr][g][i])/float(wdp['900'][yr][g][i]))
wsc=np.array(dr).mean()

# In[30]:

#Load df for global data (Country innermost so it can be summed out)
df=load_df(['Frmat','Sex', 'Year', 'Cause','Country'])
years=df.loc[1].index.get_level_values('Year').unique()

# In[258]:

#world aggregate for db2 (parent-cause level): stream records into
#db2/data.json as a JSON array, summing over countries and scaling by wsc
c={}
country_id=900 #world
with open("db2/data.json", "w") as data3: data3.write("")
with open("db2/data.json", "a") as data3:
    data3.write("[")
    for y in range(2000,2014):#years:
        print y
        for g in [1,2]:
            try:
                #sum deaths over all countries for this year/sex
                dk=df.loc[g].loc[y].stack().unstack('Country').T.sum().unstack().drop(['Deaths1','Deaths26'],axis=1)
                dk.columns=range(5)+list(np.arange(1,20)*5)
                data={}
                for i in dk.stack().iteritems():
                    if g>1:gender='f'
                    else: gender='m'
                    cause=i[0][0].strip()
                    if cause not in 'AAA':
                        #if i[1]>0:
                        cause=get_parent(i[0][0].strip()[:3])
                        key='A'+str(i[0][1])+'C'+str(cause)+'T'+str(y)+'G'+gender
                        if key not in data: data[key]={}
                        data[key]['a']=i[0][1]
                        data[key]['c']=cause
                        data[key]['g']=gender
                        data[key]['t']=str(y)
                        if 's' not in data[key]:data[key]['s']=0
                        data[key]['s']+=i[1]/wsc #multiply with global scaler
                for key in data:
                    data3.write(json.dumps(data[key])+',')
            except:
                pass#print 'error',country_id,ccv(cc.loc[country][0])

# In[259]:

#remove last comma character
#from here http://stackoverflow.com/questions/1877999/delete-final-line-in-file-via-python
#NOTE(review): `os` is not in the visible import list at the top of the
#file -- presumably imported in the truncated cell; verify.
with open("db2/data.json", "r+") as data3:
    #Move the pointer (similar to a cursor in a text editor) to the end of the file.
    data3.seek(0, os.SEEK_END)
    #This code means the following code skips the very last character in the file -
    #i.e. in the case the last line is null we delete the last line
    #and the penultimate one
    pos = data3.tell() - 1
    data3.seek(pos, os.SEEK_SET)
    data3.truncate()
    data3.close()
with open("db2/data.json", "a") as data3:
    data3.write("]")

# In[260]:

#try this, if it freezes, then just zip manually
try:
    import zlib
    compression = zipfile.ZIP_DEFLATED
except:
    compression = zipfile.ZIP_STORED
zf = zipfile.ZipFile('db2/'+str(country_id)+'.zip', mode='w')
zf.write('db2/data.json','data.json',compress_type=compression)
zf.close()

# In[261]:

#world aggregate for db (full cause detail, no parent roll-up);
#note the year range starts at 2001 here vs. 2000 for db2
c={}
country_id=900 #world
with open("db/data.json", "w") as data3: data3.write("")
with open("db/data.json", "a") as data3:
    data3.write("[")
    for y in range(2001,2014):#years:
        print y
        for g in [1,2]:
            try:
                dk=df.loc[g].loc[y].stack().unstack('Country').T.sum().unstack().drop(['Deaths1','Deaths26'],axis=1)
                dk.columns=range(5)+list(np.arange(1,20)*5)
                data={}
                for i in dk.stack().iteritems():
                    if g>1:gender='f'
                    else: gender='m'
                    cause=i[0][0].strip()
                    if cause not in 'AAA':
                        #if i[1]>0:
                        cause=i[0][0].strip()[:3]
                        key='A'+str(i[0][1])+'C'+str(cause)+'T'+str(y)+'G'+gender
                        if key not in data: data[key]={}
                        data[key]['a']=i[0][1]
                        data[key]['c']=cause
                        data[key]['g']=gender
                        data[key]['t']=str(y)
                        if 's' not in data[key]:data[key]['s']=0
                        data[key]['s']+=i[1]/wsc #multiply with global scaler
                for key in data:
                    data3.write(json.dumps(data[key])+',')
            except:
                pass#print 'error',country_id,ccv(cc.loc[country][0])

# In[262]:

#remove last comma character
#from here http://stackoverflow.com/questions/1877999/delete-final-line-in-file-via-python
with open("db/data.json", "r+") as data3:
    #Move the pointer (similar to a cursor in a text editor) to the end of the file.
    data3.seek(0, os.SEEK_END)
    #This code means the following code skips the very last character in the file -
    #i.e. in the case the last line is null we delete the last line
    #and the penultimate one
    pos = data3.tell() - 1
    data3.seek(pos, os.SEEK_SET)
    data3.truncate()
    data3.close()
with open("db/data.json", "a") as data3:
    data3.write("]")

# In[263]:

#try this, if it freezes, then just zip manually
try:
    import zlib
    compression = zipfile.ZIP_DEFLATED
except:
    compression = zipfile.ZIP_STORED
zf = zipfile.ZipFile('db/'+str(country_id)+'.zip', mode='w')
zf.write('db/data.json','data.json',compress_type=compression)
zf.close()

# In[217]:

#pretty country names, EN+HUN, seeded from a sibling project
cnames=json.loads(file("../szekelyfold lakossag 2/cnames.json").read())
hnames=json.loads(file("../szekelyfold lakossag 2/hnames.json").read())

# In[218]:

#need to run only once
def hun(c):
    """Return the Hungarian display name for a (geonames-style) English
    country name; falls back to the input unchanged."""
    if c=='Antigua and Barbuda': return u'Antigua és Barbuda'
    elif c=='Bahamas': return u'Bahamák'
    elif c=='British Virgin Islands': return u'Brit Virgin-szigetek'
    elif c=='Cayman Islands': return u'Kajmán-szigetek'
    elif c=='Dominica': return u'Dominika'
    #mojibake key -- matches the value returned by ccv() above
    elif c==u'R\xc3\xa9union': return u'Réunion'
    elif c=='French Guiana': return u'Francia Guyana'
    elif c=='Saint Kitts and Nevis': return u'Saint Kitts és Nevis'
    elif c=='Saint Vincent and the Grenadines': return u'Szent Vincent és a Grenadine-szigetek'
    elif c=='Turks and Caicos Islands': return u'Turks és Caicos-szigetek'
    elif c=='U.S. Virgin Islands': return u'U.S. Virgin-szigetek'
    elif c=='Saint Pierre and Miquelon': return u'Saint Pierre és Miquelon'
    elif c=='World': return u'Egész Világ'
    else: return c

#fill in any country present in the data but missing from the name tables
for country in df.index.get_level_values('Country').unique():
    try:
        country_id=countries.loc[ccv(cc.loc[country][0])]
        if str(country_id) not in cnames:
            cnames[str(country_id)]=ccv(cc.loc[country][0])
        if ccv(cc.loc[country][0]) not in hnames:
            print repr(ccv(cc.loc[country][0]))
            hnames[ccv(cc.loc[country][0])]=hun(ccv(cc.loc[country][0]))
    except:pass
cnames[u'900']=u"World"
hnames[u'World']=u"Egész Világ"
file('cnames.json','w').write(json.dumps(cnames))
file('hnames.json','w').write(json.dumps(hnames))

# In[219]:

#pretty country names, EN+HUN (reload the locally saved copies)
cnames=json.loads(file("cnames.json").read())
hnames=json.loads(file("hnames.json").read())

# In[220]:

def hc(c):
    """Map a raw WHO country name to its pretty display spelling
    (subset of the ccv() mapping, without the Brunei/Russia cases)."""
    if c=="Reunion": return u'R\xc3\xa9union'
    if c=='Saint Vincent and Grenadines': return u'Saint Vincent and the Grenadines'
    if c=='Serbia and Montenegro, Former': return "Serbia"
    if c=='United States of America': return "United States"
    if c=='Virgin Islands (USA)': return 'U.S. Virgin Islands'
    if c=='Hong Kong SAR': return 'Hong Kong'
    if c=='Republic of Moldova': return 'Moldova'
    if c=='Republic of Korea': return 'South Korea'
    return c

# In[221]:

#only run if you haven't run country data parser
#recreate country list (ISO numeric id -> raw WHO name)
c={}
for country in df.index.get_level_values('Country').unique():
    if ccv(cc.loc[country][0]) not in ['Netherlands Antilles',
                                       'United Kingdom, England and Wales',
                                       'United Kingdom, Scotland',
                                       'United Kingdom, Northern Ireland',
                                       'Rodrigues']:
        country_id=countries.loc[ccv(cc.loc[country][0])]
        c[country_id]=cc.loc[country][0]

# In[229]:

#country id list sorted by Hungarian collation of the Hungarian names,
#with world ('900') first
#NOTE(review): `locale` is not in the visible import list and no
#setlocale() call is visible -- presumably handled in the truncated cell;
#verify, otherwise strcoll falls back to the C locale.
q={}
for i in c:
    q[hnames[hc(c[i])]]=i
e=[]
itera=q.keys()
itera.sort(cmp=locale.strcoll)
for i in itera:
    e.append(str(q[i]))
file('countries.json','w').write(json.dumps(['900']+e))

# In[236]:

#same list sorted by the English display names
q={}
for i in c:
    q[hc(c[i])]=i
e=[]
itera=q.keys()
itera.sort(cmp=locale.strcoll)
for i in itera:
    e.append(str(q[i]))
file('wcountries.json','w').write(json.dumps(['900']+e))