#!/usr/bin/env python # coding: utf-8 # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') import pandas as pd import numpy as np import matplotlib.pyplot as plt from datetime import datetime # Import data # In[2]: hospitalized = pd.read_csv('data/hospitalized.csv', index_col=0) hospitalized.head() # Breakdown by: # # - month of age (and greater than 1 year) # - sex # - birthweight # - prematurity # - feeding: % with any breastfeeding # - maternal ed # - number in household # - daycare # - smoking exposure: cigarette, maternal during pregnacy, nargalia # - % with comorbidity: past medical history # - nationality: jordanian, palestinean, syrian, egptian, other # - diagnosis: pneumo, bronchiolitis, bronchopnewumonia # # Severity measures: # # - length of stay # - ICU # - O2 # - Mechanical vent. # - days of symptoms before admission # - % with antibiotics prior # - % with antibiotics during # # Columns: # # - RSV, MPV, Rhino, Influenza A/B/C, Adeno, Parinfluenza, Hospitalzied pop, no virus # In[3]: # Positive culture lookup pcr_lookup = {'pcr_result___1': 'RSV', 'pcr_result___2': 'HMPV', 'pcr_result___3': 'flu A', 'pcr_result___4': 'flu B', 'pcr_result___5': 'rhino', 'pcr_result___6': 'PIV1', 'pcr_result___7': 'PIV2', 'pcr_result___8': 'PIV3', 'pcr_result___13': 'H1N1', 'pcr_result___14': 'H3N2', 'pcr_result___15': 'Swine', 'pcr_result___16': 'Swine H1', 'pcr_result___17': 'flu C', 'pcr_result___18': 'Adeno'} # In[4]: hospitalized['RSV'] = hospitalized.pcr_result___1.astype(bool) hospitalized['HMPV'] = hospitalized.pcr_result___2.astype(bool) hospitalized['Rhino'] = hospitalized.pcr_result___5.astype(bool) hospitalized['Influenza'] = hospitalized.pcr_result___3 | hospitalized.pcr_result___4 hospitalized['Adeno'] = hospitalized.pcr_result___18.astype(bool) hospitalized['PIV'] = hospitalized.pcr_result___6 | hospitalized.pcr_result___7 | hospitalized.pcr_result___8 hospitalized['No virus'] = hospitalized[list(pcr_lookup.keys())].sum(1) == 0 hospitalized['All'] = True # In[5]: viruses = ['RSV', 'HMPV', 'Rhino', 'Influenza', 'Adeno', 'PIV', 'No virus', 'All'] # In[6]: pcr_lookup = {'pcr_result___1': 'RSV', 'pcr_result___2': 'HMPV', 'pcr_result___3': 'flu A', 'pcr_result___4': 'flu B', 'pcr_result___5': 'rhino', 'pcr_result___6': 'PIV1', 'pcr_result___7': 'PIV2', 'pcr_result___8': 'PIV3', 'pcr_result___13': 'H1N1', 'pcr_result___14': 'H3N2', 'pcr_result___15': 'Swine', 'pcr_result___16': 'Swine H1', 'pcr_result___17': 'flu C', 'pcr_result___18': 'Adeno'} # In[7]: non_rsv_lookup = pcr_lookup.copy() non_rsv_lookup.pop('pcr_result___1') # In[8]: other_virus_index = (hospitalized[list(non_rsv_lookup.keys())].sum(1) > 0).astype(int) # In[9]: hospitalized['RSV'] = hospitalized['pcr_result___1'] hospitalized['non-RSV virus'] = (hospitalized['pcr_result___1']==0) & other_virus_index hospitalized['no virus'] = (hospitalized['pcr_result___1']==0) & (other_virus_index==0) # In[10]: hospitalized["prev_cond"] = (hospitalized[[c for c in hospitalized.columns if c.endswith('hx') and not c.startswith('no_')]].sum(1) > 0) # In[11]: hospitalized['male'] = (hospitalized.sex=='M').astype(int) age_groups = pd.get_dummies(pd.cut(hospitalized.age_months, [0,1,11,23])) age_groups.index = hospitalized.index age_groups.columns = 'under 2 months', '2-11 months', '12-23 months' hospitalized = hospitalized.join(age_groups) # In[12]: nationality_lookup = {1: 'Jordanian', 2: 'Egyptian', 3: 'Palestinian', 4: 'Iraqi', 5: 'Syrian', 6: 'Sudanese', 7: 'Russian', 8: 'Asian', 9: 'Other'} hospitalized['nationality'] = hospitalized.mother_nationality.replace(nationality_lookup) hospitalized['Jordanian'] = (hospitalized.nationality=='Jordanian').astype(int) hospitalized['Palestinian'] = (hospitalized.nationality=='Palestinian').astype(int) # In[13]: hospitalized['vitamin D < 20'] = (hospitalized.hospitalized_vitamin_d < 20).astype(int) hospitalized.loc[hospitalized.hospitalized_vitamin_d.isnull(), 'vitamin D < 20'] = np.nan hospitalized['vitamin D < 11'] = (hospitalized.hospitalized_vitamin_d < 11).astype(int) hospitalized.loc[hospitalized.hospitalized_vitamin_d.isnull(), 'vitamin D < 11'] = np.nan # In[14]: hospitalized['any_cigarette'] = (hospitalized.cigarette_smokers > 0).astype(int) # In[15]: hospitalized['premature'] = (hospitalized.gest_age < 37).astype(int) # ## Rate Estimation # In[16]: hospitalized.admission_date = pd.to_datetime(hospitalized.admission_date) hospitalized.admission_date.describe() # Age groups: # # - Under 2 months # - 2-5 mo. # - 6-11 mo. # - Over 11 mo. # In[17]: age_groups = pd.get_dummies(pd.cut(hospitalized.age_months, [0,1,5,11,24], include_lowest=True)) age_groups.index = hospitalized.index age_groups.columns = 'age_under_2', 'age_2_5', 'age_6_11', 'age_over_11' hospitalized = hospitalized.join(age_groups) # ### Estimation of age group distribution # In[18]: hosp_age_counts = hospitalized.groupby('admission_date')[age_groups.columns].sum().resample('M', how='sum').values # In[19]: pre_2013 = hospitalized.admission_date < datetime(2013, 1, 1) # In[20]: age_counts = hospitalized[pre_2013].groupby('admission_date')[age_groups.columns].sum().resample('M', how='sum') age_counts # Rates and demographics (via [World Bank](http://databank.worldbank.org)) and 2004 census data (via Jordan [Department of Statistics](http://www.dos.gov.jo/dos_home_e/main/population/census2004/group3/table_31.pdf)). # In[21]: # Jordan population, 2008-2012 population = 5786000, 5915000, 6046000, 6181000, 6318000 # Interpolated population by gender and age, 2010-2012 female_0 = 93649, 95739, 96486 male_0 = 98435, 100525, 101160 female_1 = 87941, 93511, 93067 male_1 = 92548, 98306, 97763 kids_0 = np.array(female_0) + np.array(male_0) kids_1 = np.array(female_1) + np.array(male_1) kids = kids_0 + kids_1 # Proportion in Amman amman_urban_2004 = 1784502. jordan_2004 = 5103639. amman_prop = amman_urban_2004 / jordan_2004 # Birth rates (per 1000) birth_rate = 29.665, 29.322, 28.869, 28.317, 27.699 # Neonatal mortality (per 1000) neonatal_mort = 12.7, 12.4, 12.1, 11.8, 11.5 # Infant mortality (per 1000) infant_mort = 18.4, 17.9, 17.3, 16.8, 16.4 # In[22]: amman_prop # In[23]: admissions_by_month = hospitalized.groupby('admission_date')['child_name'].count().resample('1M', how='sum') # In[24]: admissions_by_month.head() # In[25]: # Dict to return pcr_result variable corresponding to virus virus_lookup = {pcr_lookup[k]: k for k in pcr_lookup.keys()} # In[26]: # Enrolling 5 days per week p_enroll = 5./7 # In[27]: rsv_subset = hospitalized[hospitalized.RSV==1] rsv_subset.shape # In[29]: kids_1 # Import hospitalization data # In[30]: spreadsheets = get_ipython().getoutput('ls data/Al-Bashir*') data = [] col_names = 'date', 'admissions', 'admissions_2', 'resp_admissions', 'resp_admissions_2' for spreadsheet in spreadsheets: this_file = pd.ExcelFile(spreadsheet) # Need to fix data entry error in years > 2011 dot = spreadsheet.find('.') fix_columns = int(spreadsheet[dot-4:dot]) > 2011 for name in this_file.sheet_names: d = this_file.parse(name) d.columns = col_names if fix_columns: total = d['resp_admissions'] + d['resp_admissions_2'] d['resp_admissions_2'] = d['resp_admissions'] d['resp_admissions'] = total data.append(d) hospitalizations = pd.concat(data).set_index('date') # In[31]: hospitalizations.head() # In[32]: under_2_hosp = hospitalizations[hospitalizations.index < datetime(2013, 1, 1)].resample('M', how=sum)['admissions_2'] under_2_hosp.head() # In[33]: rsv_by_date = rsv_subset.groupby('admission_date')[['age_under_2', 'age_2_5', 'age_6_11', 'age_over_11']] rsv_counts = rsv_by_date.sum().resample('1M', how='sum').fillna(0) # In[34]: rsv_counts.head() # In[35]: under2_rsv = pd.concat([rsv_counts, under_2_hosp], axis=1).dropna() under2_rsv.head() # In[36]: def extract_virus(virus): virus_subset = hospitalized[hospitalized[virus]==1] virus_by_date = virus_subset.groupby('admission_date')[['age_under_2', 'age_2_5', 'age_6_11', 'age_over_11']] virus_counts = virus_by_date.sum().resample('1M', how='sum').fillna(0) return pd.concat([virus_counts, under_2_hosp], axis=1).dropna() # In[41]: foo = extract_virus('RSV') # In[44]: adm = foo.pop('admissions_2').values data_subset = foo = foo.values # In[45]: foo.shape # In[39]: [(x*(d/d.sum() if (d.sum()>10) else np.ones(4)*0.25)).astype(int) for x,d in zip(adm,data_subset)] # In[77]: import pymc as pm def hosp_subset(virus): data_subset = extract_virus(virus) admissions = data_subset.pop('admissions_2').values data_subset = data_subset.values n_months, n_ages = data_subset.shape # Estimate age distribution of admissions p_age = pm.Dirichlet('p_age', np.ones(4), value=data_subset.sum(0)[:-1]/data_subset.sum()) counts = pm.Multinomial('counts', data_subset.sum(1), p_age, value=data_subset, observed=True) # Estimate denominators n_hosp = [] for i,ni in enumerate(admissions): d = data_subset[i] n_init = (ni*(d/d.sum() if (d.sum()>10) else np.ones(4)*0.25)).astype(int)[:-1] n_init = np.append(n_init, ni - n_init.sum()) n_hosp.append(pm.Multinomial('n_hosp_{}'.format(i), ni, p_age, value=n_init)) n_hosp_t = pm.Lambda('n_hosp_t', lambda x=n_hosp: np.array([[xij for xij in xi] for xi in x]).T) # Virus rates by age and time mu_alpha = pm.Normal('mu_alpha', 0, 0.01, value=[0]*n_ages) sigma_alpha = pm.Uniform('sigma_alpha', 0, 100, value=[10]*n_ages) rho = pm.Uniform('rho', -1, 1, value=0) mu = [pm.Lambda('mu_{}'.format(i), lambda mu=mu_alpha: np.array([mu[i]]*n_months)) for i in range(n_ages)] off_diag = np.eye(n_months, k=1) Sigma = [pm.Lambda('Sigma_{}'.format(i), lambda s=sigma_alpha, r=rho: (np.eye(n_months) + off_diag*r + off_diag.T*r)*(s[i]**2)) for i in range(n_ages)] alpha = [pm.MvNormalCov('alpha_{}'.format(i), mu[i], Sigma[i], value=[0]*n_months) for i in range(n_ages)] p_virus = [pm.Lambda('p_virus_{}'.format(i), lambda a=a: pm.invlogit(a)) for i,a in enumerate(alpha)] # Viral rate likelihood @pm.observed def rate_like(value=data_subset.T, p=p_virus, n=n_hosp_t): return np.sum([pm.binomial_like(value[i], n[i], p[i]) for i in range(n_ages)]) return(locals()) # In[78]: M = pm.MCMC(hosp_subset('RSV')) # In[79]: M.sample(20000, 10000) # In[81]: pm.Matplot.summary_plot(M.p_age) # In[82]: pm.Matplot.summary_plot(M.p_virus[0]) # In[83]: import seaborn as sb # In[87]: M.p_virus[0].trace().shape # In[95]: # for i in range(4): sb.tsplot(M.p_virus[0].trace()[-1000:], err_style="unit_traces") # In[105]: sb.set(style="white", palette="muted") f, axes = plt.subplots(2, 2, figsize=(10, 8)) for i,axis in enumerate(np.ravel(axes)): sb.tsplot(M.p_virus[i].trace()[-1000:], err_style="unit_traces", ax=axis) axis.set_title(age_groups.columns[i]) # In[106]: M_rhino = pm.MCMC(hosp_subset('Rhino')) M_rhino.sample(20000, 10000) # In[108]: sb.set(style="white", palette="muted") f, axes = plt.subplots(2, 2, figsize=(10, 8)) for i,axis in enumerate(np.ravel(axes)): sb.tsplot(M_rhino.p_virus[i].trace()[-1000:], err_style="unit_traces", ax=axis) axis.set_title(age_groups.columns[i]) f.suptitle('Rhino', fontsize=14) # ### Production runs # In[122]: for virus in viruses: print('\nRunning', virus) M = pm.MCMC(hosp_subset(virus)) M.sample(50000, 40000) sb.set(style="white", palette="muted") f, axes = plt.subplots(2, 2, figsize=(10, 8)) for i,axis in enumerate(np.ravel(axes)): sb.tsplot(M.p_virus[i].trace()[-1000:], err_style="unit_traces", ax=axis) axis.set_title(age_groups.columns[i]) f.suptitle(virus, fontsize=14) f.savefig(virus) M.write_csv(virus, variables=[x.__name__ for x in M.p_virus]) # In[ ]: