%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
hospitalized = pd.read_csv('data/hospitalized.csv', index_col=0)
/Library/Python/2.7/site-packages/pandas-0.13.1_213_gc174c3d-py2.7-macosx-10.9-intel.egg/pandas/io/parsers.py:1071: DtypeWarning: Columns (140,142,144,146,148,181,206,207,212,213,261,280,281,282,296,297) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows)
Proportion admitted to ICU
hospitalized.dir_icu.mean()
0.071315872514988957
hospitalized.dir_icu.sum()
226
Number and proportion in ICU at any time
hospitalized['icu_any'] = (hospitalized.icu + hospitalized.dir_icu).astype(bool)
hospitalized.icu_any.sum()
312
hospitalized.icu_any.mean()
0.098453770905648469
Create subset of cases that were admitted to ICU later
hospitalized['icu_later'] = hospitalized.icu_any & (hospitalized.dir_icu.astype(bool) - True)
Subset who were admitted directly to ICU
dir_icu_subset = hospitalized[hospitalized.dir_icu.astype(bool)]
Age distribution of this subset
dir_icu_subset.age_months.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x103d12050>
In the subset admitted directly to ICU, what proportion were <1 month old
(dir_icu_subset.age_months<1).mean()
(hospitalized.age_months<1).mean()
0.12212054275796781
Proportion <1 month in entire dataset
(dir_icu_subset.age_months<1).mean()
(hospitalized.age_months<1).mean()
0.12212054275796781
Gestational age of direct-to-ICU subset
(dir_icu_subset.gest_age<37).mean()
0.19026548672566371
How many days sick before hospitalization
dir_icu_subset[dir_icu_subset.days_symptoms<10].boxplot(column='days_symptoms')
{'boxes': [<matplotlib.lines.Line2D at 0x103880290>], 'caps': [<matplotlib.lines.Line2D at 0x105ce3290>, <matplotlib.lines.Line2D at 0x105ce38d0>], 'fliers': [<matplotlib.lines.Line2D at 0x106062550>], 'means': [], 'medians': [<matplotlib.lines.Line2D at 0x105ce3f10>], 'whiskers': [<matplotlib.lines.Line2D at 0x105ce5550>, <matplotlib.lines.Line2D at 0x105ce5c10>]}
Birth weight of entire hospitalized dataset (red) and direct-to-ICU subset (blue):
hospitalized.birth_wt_child.hist(color='red', normed=True)
dir_icu_subset.birth_wt_child.hist(color='blue', normed=True, alpha=0.8)
<matplotlib.axes._subplots.AxesSubplot at 0x106073e10>
z-scores less than -1 for direct-to-ICU and all hospitalizations:
(dir_icu_subset.z_score < -1).mean()
0.47345132743362833
(hospitalized.z_score < -1).mean()
0.26001893341748183
More than 2 sd below mean:
(dir_icu_subset.z_score < -2).mean()
0.32300884955752213
(hospitalized.z_score < -2).mean()
0.13568949195329758
Days in ICU of all (red) and direct-to-ICU (blue):
hospitalized.length_of_stay.hist(bins=30, normed=True, color='darkred')
dir_icu_subset.length_of_stay.hist(bins=30, normed=True, alpha=0.75)
<matplotlib.axes._subplots.AxesSubplot at 0x104f47e10>
Death in ICU for direct-to-ICU subset:
(dir_icu_subset.death).mean()
0.06637168141592921
Length of stay for direct-to-ICU subset:
dir_icu_subset[dir_icu_subset.death].length_of_stay.hist(bins=25)
<matplotlib.axes._subplots.AxesSubplot at 0x105fd8490>
# Extract ICU subset
icu_subset = hospitalized[hospitalized.icu_any]
icu_subset['icu_direct'] = icu_subset.dir_icu.replace({0:'Later', 1:'Direct'})
# Group by direct to ICU or not
by_direct = icu_subset.groupby('icu_direct')
Days on ventilator by whether patient was sent direct to ICU or later during hospitalization
by_direct['days_vent'].describe()
icu_direct Direct count 34.000000 mean 4.617647 std 4.431394 min 1.000000 25% 2.000000 50% 3.000000 75% 6.750000 max 20.000000 Later count 17.000000 mean 3.941176 std 3.543843 min 1.000000 25% 1.000000 50% 2.000000 75% 5.000000 max 10.000000 dtype: float64
Length of stay
by_direct['length_of_stay'].describe()
icu_direct Direct count 222.000000 mean 9.108108 std 5.280120 min 2.000000 25% 6.000000 50% 8.000000 75% 11.000000 max 42.000000 Later count 61.000000 mean 8.327869 std 6.166364 min 1.000000 25% 4.000000 50% 7.000000 75% 10.000000 max 33.000000 dtype: float64
z-score
by_direct['z_score'].describe()
icu_direct Direct count 225.000000 mean -1.198444 std 2.964529 min -11.190000 25% -2.790000 50% -0.800000 75% 0.570000 max 25.330000 Later count 84.000000 mean -1.163452 std 2.333833 min -7.250000 25% -2.335000 50% -0.840000 75% 0.490000 max 3.810000 dtype: float64
Age distribution of those sent directly to ICU versus later
icu_subset['age_months'].hist(by=icu_subset['icu_direct'], sharey=True)
array([<matplotlib.axes._subplots.AxesSubplot object at 0x106b37b10>, <matplotlib.axes._subplots.AxesSubplot object at 0x106d2aad0>], dtype=object)
Predictive model for ICU admission
hospitalized["prev_cond"] = hospitalized[[c for c in hospitalized.columns if c.endswith('hx') and not c.startswith('no_')]].sum(1)
variables = hospitalized[["icu_any",
"hospitalized_vitamin_d",
"gest_age",
"heart_hx",
"prev_cond",
"cigarette_smokers",
"sex_child",
"birth_wt_child",
"z_score",
"age_months",
"wheezing"
]]
variables['vitamin_d_norm'] = ((variables.hospitalized_vitamin_d - variables.hospitalized_vitamin_d.mean())
/ variables.hospitalized_vitamin_d.std())
variables['wt_norm'] = ((variables.birth_wt_child - variables.birth_wt_child.mean()) /
variables.birth_wt_child.std())
variables['age_norm'] = ((variables.age_months - variables.age_months.mean()) /
variables.age_months.std())
Observations with missing values, by variable
variables.isnull().sum()
icu_any 0 hospitalized_vitamin_d 480 gest_age 0 heart_hx 0 prev_cond 0 cigarette_smokers 1 sex_child 0 birth_wt_child 2 z_score 13 age_months 0 wheezing 0 vitamin_d_norm 480 wt_norm 2 age_norm 0 dtype: int64
variables = variables.dropna()
variables.shape
(2677, 14)
data_glm = variables.to_dict(outtype='list')
from pymc import Model, glm
import theano.tensor as t
from pymc import sample, Slice, NUTS
from pymc import forestplot, traceplot, summary
def tinvlogit(x):
return t.exp(x) / (1 + t.exp(x))
with Model() as model:
formula = 'icu_any ~ vitamin_d_norm + cigarette_smokers + sex_child + prev_cond + z_score + age_norm'
glm.glm(formula, data_glm,
family=glm.families.Binomial(link=glm.links.Logit))
with model:
trace = sample(5000, Slice())[4000:]
[-----------------100%-----------------] 5000 of 5000 complete in 64.5 sec
plt.figure(figsize=(14,8))
forestplot(trace, vars=trace.varnames[1:])
<matplotlib.gridspec.GridSpec at 0x11c778e10>
traceplot(trace, vars=['z_score'])
Vitamin D levels for ICU vs non-ICU
hospitalized['hospitalized_vitamin_d'].hist(by=hospitalized['icu_any'], sharey=True)
array([<matplotlib.axes._subplots.AxesSubplot object at 0x108aa3bd0>, <matplotlib.axes._subplots.AxesSubplot object at 0x107f34190>], dtype=object)