%matplotlib inline import matplotlib.pyplot as plt import numpy as np import pandas as pd hospitalized = pd.read_csv('data/hospitalized.csv', index_col=0) hospitalized.dir_icu.mean() hospitalized.dir_icu.sum() hospitalized['icu_any'] = (hospitalized.icu + hospitalized.dir_icu).astype(bool) hospitalized.icu_any.sum() hospitalized.icu_any.mean() hospitalized['icu_later'] = hospitalized.icu_any & (hospitalized.dir_icu.astype(bool) - True) dir_icu_subset = hospitalized[hospitalized.dir_icu.astype(bool)] dir_icu_subset.age_months.hist() (dir_icu_subset.age_months<1).mean() (hospitalized.age_months<1).mean() (dir_icu_subset.age_months<1).mean() (hospitalized.age_months<1).mean() (dir_icu_subset.gest_age<37).mean() dir_icu_subset[dir_icu_subset.days_symptoms<10].boxplot(column='days_symptoms') hospitalized.birth_wt_child.hist(color='red', normed=True) dir_icu_subset.birth_wt_child.hist(color='blue', normed=True, alpha=0.8) (dir_icu_subset.z_score < -1).mean() (hospitalized.z_score < -1).mean() (dir_icu_subset.z_score < -2).mean() (hospitalized.z_score < -2).mean() hospitalized.length_of_stay.hist(bins=30, normed=True, color='darkred') dir_icu_subset.length_of_stay.hist(bins=30, normed=True, alpha=0.75) (dir_icu_subset.death).mean() dir_icu_subset[dir_icu_subset.death].length_of_stay.hist(bins=25) # Extract ICU subset icu_subset = hospitalized[hospitalized.icu_any] icu_subset['icu_direct'] = icu_subset.dir_icu.replace({0:'Later', 1:'Direct'}) # Group by direct to ICU or not by_direct = icu_subset.groupby('icu_direct') by_direct['days_vent'].describe() by_direct['length_of_stay'].describe() by_direct['z_score'].describe() icu_subset['age_months'].hist(by=icu_subset['icu_direct'], sharey=True) hospitalized["prev_cond"] = hospitalized[[c for c in hospitalized.columns if c.endswith('hx') and not c.startswith('no_')]].sum(1) variables = hospitalized[["icu_any", "hospitalized_vitamin_d", "gest_age", "heart_hx", "prev_cond", "cigarette_smokers", "sex_child", "birth_wt_child", "z_score", "age_months", "wheezing" ]] variables['vitamin_d_norm'] = ((variables.hospitalized_vitamin_d - variables.hospitalized_vitamin_d.mean()) / variables.hospitalized_vitamin_d.std()) variables['wt_norm'] = ((variables.birth_wt_child - variables.birth_wt_child.mean()) / variables.birth_wt_child.std()) variables['age_norm'] = ((variables.age_months - variables.age_months.mean()) / variables.age_months.std()) variables.isnull().sum() variables = variables.dropna() variables.shape data_glm = variables.to_dict(outtype='list') from pymc import Model, glm import theano.tensor as t from pymc import sample, Slice, NUTS from pymc import forestplot, traceplot, summary def tinvlogit(x): return t.exp(x) / (1 + t.exp(x)) with Model() as model: formula = 'icu_any ~ vitamin_d_norm + cigarette_smokers + sex_child + prev_cond + z_score + age_norm' glm.glm(formula, data_glm, family=glm.families.Binomial(link=glm.links.Logit)) with model: trace = sample(5000, Slice())[4000:] plt.figure(figsize=(14,8)) forestplot(trace, vars=trace.varnames[1:]) traceplot(trace, vars=['z_score']) hospitalized['hospitalized_vitamin_d'].hist(by=hospitalized['icu_any'], sharey=True)