%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
import imp
pm3 = imp.load_module("pymc3", *imp.find_module("pymc", ["/Users/fonnescj/GitHub/pymc"]))
pm.__version__
/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/importlib/_bootstrap.py:1471: UserWarning: theano modules are deprecated and will be removed in release 0.7 _call_with_frames_removed(exec, code, module.__dict__)
'2.3.4'
import qgrid
qgrid.nbinstall()
pm3.__version__
'3.0'
Correlate RSV severity with vitamin D levels (at hospital)
Outcomes:
Covariates:
hospitalized = pd.read_csv('data/hospitalized.csv', index_col=0)
/usr/local/lib/python3.4/site-packages/pandas/io/parsers.py:1152: DtypeWarning: Columns (140,142,144,146,148,181,206,212,213,263,282,283,284,298,299) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows)
hospitalized.child_birth_date = pd.to_datetime(hospitalized.child_birth_date)
hospitalized.enrollment_date = pd.to_datetime(hospitalized.enrollment_date)
hospitalized.admission_date = pd.to_datetime(hospitalized.admission_date)
hospitalized.discharge_date = pd.to_datetime(hospitalized.discharge_date)
hospitalized["prev_cond"] = hospitalized[[c for c in hospitalized.columns if c.endswith('hx') and not c.startswith('no_')]].sum(1)
Plot z-scores by year of admission
hospitalized['year_admission'] = hospitalized.admission_date.apply(lambda x: x.year)
nationality_lookup = {1: 'Jordanian', 2: 'Egyptian', 3: 'Palestinian', 4: 'Iraqi', 5: 'Syrian',
6: 'Sudanese', 7: 'Russian', 8: 'Asian', 9: 'Other'}
hospitalized['nationality'] = hospitalized.mother_nationality.replace(nationality_lookup)
hospitalized['Jordanian'] = (hospitalized.nationality=='Jordanian').astype(int)
hospitalized['Palestinian'] = (hospitalized.nationality=='Palestinian').astype(int)
hospitalized['vitamin D < 20'] = (hospitalized.hospitalized_vitamin_d < 20).astype(int)
hospitalized['vitamin D < 20'][hospitalized.hospitalized_vitamin_d.isnull()] = np.nan
hospitalized['vitamin D < 11'] = (hospitalized.hospitalized_vitamin_d < 11).astype(int)
hospitalized['vitamin D < 11'][hospitalized.hospitalized_vitamin_d.isnull()] = np.nan
/usr/local/lib/python3.4/site-packages/IPython/kernel/__main__.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy from IPython.kernel.zmq import kernelapp as app /usr/local/lib/python3.4/site-packages/IPython/kernel/__main__.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
hospitalized['premature'] = (hospitalized.gest_age < 37).astype(int)
hospitalized.shape
(3169, 423)
hospitalized['respiratory_class'] = 0
hospitalized.loc[(hospitalized.respiratory_rate>30) & (hospitalized.respiratory_rate<=45), 'respiratory_class'] = 1
hospitalized.loc[(hospitalized.respiratory_rate>45) & (hospitalized.respiratory_rate<=60), 'respiratory_class'] = 2
hospitalized.loc[hospitalized.respiratory_rate>60, 'respiratory_class'] = 3
hospitalized.respiratory_class.value_counts()
1 1648 0 767 2 636 3 118 dtype: int64
hospitalized['sats_class'] = 0
hospitalized.loc[(hospitalized.sats_number>=90) & (hospitalized.sats_number<95), 'sats_class'] = 1
hospitalized.loc[(hospitalized.sats_number>=85) & (hospitalized.sats_number<90), 'sats_class'] = 2
hospitalized.loc[hospitalized.sats_number<85, 'sats_class'] = 3
hospitalized.sats_class.value_counts()
0 3020 1 131 2 15 3 3 dtype: int64
hospitalized.sats_range.unique()
array(['85-89', '>95', nan, '90-94', '80-85 ', '<85', '(90-94)%', '>95%', '(85-89)%', '(90-94)', '> 95', '90-95', '<85%', '85-59', '.>95', '> 95 ', '95-98', '85-98', '95-97', '92-95', '93-95'], dtype=object)
hospitalized['sats_score'] = hospitalized.sats_range.replace({'85-89': 2, '90-94': 1, '>95': 0, '<85': 3, '(90-94)%': 1, '>95%': 0, '>60': np.nan,
'90-95': 1, '(85-89)%': 2, '<85%': 3, '85-59': 2, '85-98': 2, '95-98': 0, '92-95': 1,
'93-95': 1})
pd.crosstab(hospitalized.sats_score, hospitalized.oxygen)
oxygen | 0.0 | 1.0 |
---|---|---|
sats_score | ||
0 | 877 | 293 |
1 | 1029 | 550 |
2 | 77 | 118 |
3 | 6 | 11 |
(90-94) | 1 | 0 |
.>95 | 1 | 0 |
80-85 | 0 | 1 |
95-97 | 1 | 1 |
> 95 | 1 | 0 |
> 95 | 2 | 0 |
Month of admission
hospitalized['admission_month'] = hospitalized.admission_date.apply(lambda x: x.month)
hospitalized['admission_week'] = hospitalized.admission_date.apply(lambda x: x.week)
hospitalized['admission_year'] = hospitalized.admission_date.apply(lambda x: x.year)
hospitalized['male'] = hospitalized.sex_child
Convert gestational age to number of weeks less than 37.
hospitalized['premature'] = np.maximum(0, 37 - hospitalized.gest_age)
hospitalized.loc[hospitalized.gest_age.isnull(), 'premature'] = np.nan
Not many preemies -- drop covariate?
hospitalized.premature.hist(bins=20, grid=False)
<matplotlib.axes._subplots.AxesSubplot at 0x10f274358>
# Create indicator for LRTI
hospitalized['ltri'] = ((hospitalized.adm_bronchopneumo + hospitalized.adm_bronchiolitis
+ hospitalized.adm_pneumo + hospitalized.adm_wheezing
+ hospitalized.adm_asthma
+ (hospitalized.wheezing>0) + (hospitalized.flaring>1)) > 1).astype(int)
pcr_lookup = {'pcr_result___1': 'RSV',
'pcr_result___2': 'HMPV',
'pcr_result___3': 'flu A',
'pcr_result___4': 'flu B',
'pcr_result___5': 'rhino',
'pcr_result___6': 'PIV1',
'pcr_result___7': 'PIV2',
'pcr_result___8': 'PIV3',
'pcr_result___13': 'H1N1',
'pcr_result___14': 'H3N2',
'pcr_result___15': 'Swine',
'pcr_result___16': 'Swine H1',
'pcr_result___17': 'flu C',
'pcr_result___18': 'Adeno'}
virus_vars = ['Influenza', 'HMPV', 'Rhino']
hospitalized['RSV'] = hospitalized['pcr_result___1']
hospitalized['Influenza'] = (hospitalized['pcr_result___3'] | hospitalized['pcr_result___4']).astype(int)
hospitalized['HMPV'] = hospitalized['pcr_result___2']
hospitalized['Rhino'] = hospitalized['pcr_result___5']
For interpretability, center vitamin D at 20.
hospitalized['vitamin_d_norm'] = ((hospitalized.hospitalized_vitamin_d - 20)
/ hospitalized.hospitalized_vitamin_d.std())
Center age at sample mean
hospitalized['age_centered'] = hospitalized.age_months - hospitalized.age_months.mean()
Create enrollment season variable
hospitalized['enroll_month'] = [d.month for d in hospitalized.enrollment_date]
hospitalized['enroll_spring'] = hospitalized.enroll_month.isin((3,4,5))
hospitalized['enroll_summer'] = hospitalized.enroll_month.isin((6,7,8))
hospitalized['enroll_autumn'] = hospitalized.enroll_month.isin((9,10,11))
Add interaction between age and vitamin D
hospitalized['age_X_vitamin_d'] = hospitalized.vitamin_d_norm * hospitalized.age_centered
hospitalized['male_X_vitamin_d'] = hospitalized.vitamin_d_norm * hospitalized.male
Remove individuals with missing/QNS vitamin D
hospitalized.hosp_vitd.replace({'<1': 0}).astype(float).notnull().sum()
2689
analysis_subset = hospitalized[hospitalized.qns.isnull() & hospitalized.hosp_vitd.notnull()]
analysis_subset.shape
(2518, 443)
RSV-only subset
rsv_subset = analysis_subset[analysis_subset.pcr_result___1==1]
rsv_subset.shape
(1148, 443)
Several covariates are too sparse to consider
rsv_subset.heart_hx.mean()
0.028745644599303136
rsv_subset.daycare.mean()
0.018292682926829267
rsv_subset.cigarette_preg.mean()
0.075783972125435542
rsv_subset['viral_coinfection'] = (rsv_subset[['pcr_result___2', 'pcr_result___3', 'pcr_result___4', 'pcr_result___5',
'pcr_result___6', 'pcr_result___7', 'pcr_result___8', 'pcr_result___13', 'pcr_result___14',
'pcr_result___15', 'pcr_result___16', 'pcr_result___17', 'pcr_result___18']].sum(1)>0).astype(int)
/usr/local/lib/python3.4/site-packages/IPython/kernel/__main__.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy app.launch_new_instance()
covnames =["vitamin_d_norm",
"prev_cond",
"cigarette_smokers",
"male",
"z_score",
# "ltri",
"premature",
# "viral_coinfection",
# "enroll_spring",
# "enroll_summer",
# "enroll_autumn",
# "admission_week",
"age_X_vitamin_d",
"male_X_vitamin_d",
"age_centered"]
covariates = rsv_subset[covnames]
covariates.isnull().sum()
vitamin_d_norm 0 prev_cond 0 cigarette_smokers 0 male 0 z_score 3 premature 0 age_X_vitamin_d 0 male_X_vitamin_d 0 age_centered 0 dtype: int64
severity = rsv_subset[['oxygen', 'sats_score']]
outcome = 'oxygen'
complete = (covariates.isnull().sum(axis=1).astype(bool)==False) & (severity[outcome].isnull().astype(bool)==False)
covariates_complete = covariates[complete]
y_complete = severity[outcome][complete]
variables = pd.concat([covariates, severity['oxygen']], axis=1)
Drop rows not worth imputing
variables.isnull().sum()
vitamin_d_norm 0 prev_cond 0 cigarette_smokers 0 male 0 z_score 3 premature 0 age_X_vitamin_d 0 male_X_vitamin_d 0 age_centered 0 oxygen 13 dtype: int64
variables = variables.dropna()
assert not variables.isnull().sum().any()
qgrid.show_grid(variables, remote_js=True)
rsv_vars = variables.columns[:-1].values
rsv_vars['vent_or_icu'] = ((rsv_no_hx.vent + rsv_no_hx.icu) > 0).astype(int)
--------------------------------------------------------------------------- IndexError Traceback (most recent call last) <ipython-input-252-fcde3616715a> in <module>() ----> 1 rsv_vars['vent_or_icu'] = ((rsv_no_hx.vent + rsv_no_hx.icu) > 0).astype(int) IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices
glm = pm3.glm.glm
with pm3.Model() as severity_model:
formula = 'oxygen ~ ' + '+'.join(rsv_vars)
glm(formula, variables, family=pm3.glm.families.Binomial())
severity_trace = pm3.sample(2000, pm3.NUTS())
[-----------------100%-----------------] 2000 of 2000 complete in 16.8 sec
/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/importlib/_bootstrap.py:321: RuntimeWarning: numpy.ndarray size changed, may indicate binary incompatibility return f(*args, **kwds)
Forest plot of parameter estimates. Thick line is the interquartile range of the estimate, thin line the 95% posterior interval. Positive values mean an increase in the probability of ending up on oxygen when the value of that parameter increasees by one unit.
So, for example, the z-score main effect is negative, so it decreases the probability of oxygen when it increases.
pm3.forestplot(severity_trace, vars=rsv_vars)
<matplotlib.gridspec.GridSpec at 0x10ef4d198>
from pandas.tools.plotting import scatter_matrix
trace_df = pm3.trace_to_dataframe(severity_trace)
print(trace_df.describe().drop('count').T)
print("\nP(Vitamin D < 0) = {}".format((trace_df['vitamin_d_norm'] < 0).mean()))
mean std min 25% 50% 75% \ Intercept -0.503361 0.131519 -0.940283 -0.587602 -0.503821 -0.420436 age_X_vitamin_d 0.046439 0.019813 -0.022419 0.033992 0.046953 0.059310 age_centered -0.121763 0.016688 -0.187188 -0.132823 -0.121512 -0.110532 cigarette_smokers -0.000451 0.068638 -0.231318 -0.046710 0.000034 0.044452 male -0.148358 0.140683 -0.564303 -0.245916 -0.145783 -0.055139 male_X_vitamin_d 0.164338 0.131361 -0.206424 0.072769 0.161942 0.252866 premature 0.062237 0.055419 -0.117127 0.024272 0.060918 0.096899 prev_cond 0.479694 0.216676 -0.200901 0.325782 0.483269 0.627661 vitamin_d_norm -0.125964 0.110375 -0.514271 -0.197223 -0.124351 -0.052352 z_score -0.035586 0.038875 -0.166133 -0.062835 -0.034477 -0.008353 max Intercept -0.056627 age_X_vitamin_d 0.107831 age_centered -0.059559 cigarette_smokers 0.213562 male 0.345077 male_X_vitamin_d 0.718376 premature 0.259560 prev_cond 1.273045 vitamin_d_norm 0.231789 z_score 0.102323 P(Vitamin D < 0) = 0.878
Odds ratios of parameters
trace_df.drop('Intercept', axis=1).apply(np.exp).describe().drop('count').T
mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|
age_X_vitamin_d | 1.047739 | 0.020730 | 0.977831 | 1.034576 | 1.048073 | 1.061104 | 1.113860 |
age_centered | 0.885481 | 0.014774 | 0.829288 | 0.875620 | 0.885580 | 0.895358 | 0.942180 |
cigarette_smokers | 1.001908 | 0.068997 | 0.793487 | 0.954365 | 1.000034 | 1.045455 | 1.238080 |
male | 0.870701 | 0.123278 | 0.568757 | 0.781988 | 0.864345 | 0.946354 | 1.412098 |
male_X_vitamin_d | 1.188900 | 0.158749 | 0.813488 | 1.075482 | 1.175792 | 1.287710 | 2.051099 |
premature | 1.065855 | 0.059420 | 0.889472 | 1.024569 | 1.062812 | 1.101749 | 1.296360 |
prev_cond | 1.653888 | 0.362039 | 0.817993 | 1.385114 | 1.621366 | 1.873224 | 3.571711 |
vitamin_d_norm | 0.887001 | 0.097410 | 0.597937 | 0.821008 | 0.883070 | 0.948995 | 1.260853 |
z_score | 0.965769 | 0.037542 | 0.846934 | 0.939099 | 0.966110 | 0.991682 | 1.107742 |
variables['O2'] = variables.oxygen + np.random.normal(size=len(variables.oxygen))*0.01
variables.plot(x='vitamin_d_norm', y='O2', kind='scatter', alpha=0.2, yticks=[0,1])
<matplotlib.axes._subplots.AxesSubplot at 0x1174cef98>
variables['vitd_bins'] = pd.cut(variables.vitamin_d_norm, bins=12)
ax = variables.groupby('vitd_bins')['oxygen'].mean().plot(grid=False)
variables.groupby('vitd_bins')['oxygen'].count().plot(secondary_y=True, style='k--')
ax.set_ylabel('Proportion on O2')
ax.right_ax.set_ylabel('Number in bin')
<matplotlib.text.Text at 0x117251a90>
(hospitalized.hospitalized_vitamin_d > 30).mean()
0.12874723887661724
variables.vitamin_d_norm.hist(bins=25)
<matplotlib.axes._subplots.AxesSubplot at 0x1171e97b8>
variables[rsv_vars[1:]].head()
prev_cond | cigarette_smokers | male | z_score | premature | age_X_vitamin_d | male_X_vitamin_d | age_centered | |
---|---|---|---|---|---|---|---|---|
case_id | ||||||||
A0024 | 0 | 0 | 1 | 0.56 | 0 | 6.356025 | 1.110316 | 5.724519 |
A0054 | 0 | 2 | 1 | 0.41 | 0 | 4.390161 | -1.340310 | -3.275481 |
A0062 | 0 | 0 | 1 | 2.44 | 0 | -3.390811 | 0.793083 | -4.275481 |
A0063 | 0 | 1 | 1 | -0.10 | 0 | 7.531009 | -1.427549 | -5.275481 |
A0065 | 0 | 1 | 1 | -1.54 | 6 | 1.025765 | 0.594812 | 1.724519 |
rsv_vars[1:]
array(['prev_cond', 'cigarette_smokers', 'male', 'z_score', 'premature', 'age_X_vitamin_d', 'male_X_vitamin_d', 'age_centered'], dtype=object)
def changepoint_model():
X = variables[['prev_cond', 'cigarette_smokers', 'male', 'z_score', 'premature']].values
change = pm.Uniform('change', variables.vitamin_d_norm.min(), 1.5, value=-0.5)
# Low vitamin D effect
alpha = pm.Normal('alpha', 0, 0.001, value=0)
mu = pm.Normal('mu', 0, 0.001, value=0)
# Covariates for confounders
beta = pm.Normal('beta', 0, 0.001, value=[0]*X.shape[1])
@pm.deterministic
def theta(b=beta, a=alpha, c=change, m=mu):
#import pdb; pdb.set_trace()
return pm.invlogit(m + a*(variables.vitamin_d_norm.values < c) + X.dot(b))
y = pm.Bernoulli('y', p=theta, value=variables.oxygen.values, observed=True)
return(locals())
M_change = pm.MCMC(changepoint_model())
M_change.sample(20000, 10000)
[-----------------100%-----------------] 20000 of 20000 complete in 8.9 sec
pm.Matplot.plot(M_change.change)
Plotting change
M_change.change.summary()
change: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ [[-1.13]] [[ 0.219]] [[ 0.011]] [-1.296 -0.594] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| [[-1.287]] [[-1.229]] [[-1.198]] [[-1.13]] [[-0.57]]
pm.Matplot.plot(M_change.alpha)
Plotting alpha
M_change.alpha.summary()
alpha: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ [[ 0.551]] [[ 0.146]] [[ 0.006]] [ 0.281 0.849] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| [[ 0.254]] [[ 0.459]] [[ 0.555]] [[ 0.656]] [[ 0.827]]
from pymc.gp import Covariance, Mean, GPSubmodel
from pymc.gp.cov_funs import matern
vitD_mesh = np.linspace(variables.vitamin_d_norm.min(), variables.vitamin_d_norm.max())
gp_covs =["prev_cond",
"cigarette_smokers",
"male",
"z_score",
"premature",
"age_centered"]
def gp_model():
X = variables[gp_covs].values
# Mean risk
mu = pm.Normal('mu', 0, 0.001, value=0)
# Covariates for confounders
beta = pm.Normal('beta', 0, 0.001, value=[0]*X.shape[1])
# GP hyperpriors
amp = pm.Exponential('amp', 1, value=1)
scale = pm.Uniform('scale' , 0, 10, value=1)
diff_degree = pm.Uniform('diff_degree', 0, 10, value=1)
@pm.deterministic
def C(diff_degree=diff_degree, amp=amp, scale=scale):
"""
The Matern covariance function
"""
return Covariance(matern.euclidean, diff_degree=diff_degree, amp=amp, scale=scale)
@pm.deterministic
def M():
"""
The mean function is the zero function
"""
return Mean(lambda x: np.zeros(x.shape))
alpha = GPSubmodel('alpha', M, C, mesh=vitD_mesh)
@pm.deterministic
def theta(b=beta, a=alpha.f(variables.vitamin_d_norm.values), m=mu):
return pm.invlogit(m + a + X.dot(b))
y = pm.Bernoulli('y', p=theta, value=variables.oxygen.values, observed=True)
return(locals())
M_gp = pm.MCMC(gp_model())
M_gp.sample(20000, 10000)
plt.figure(figsize=(18, 5))
ax = plt.subplot(1, 2, 1)
for i in range(0, 100):
f = np.random.choice(M_gp.alpha.f.trace()[-1000:])(vitD_mesh)
plt.plot(vitD_mesh, f, 'k-', linewidth=0.5, alpha=0.5)
plt.xlabel('Normalized Vitamin D')
plt.ylabel('Vitamin D effect on oxygen')
import theano.tensor as TT
def tinvlogit(x):
return TT.exp(x) / (1 + TT.exp(x))
with pm3.Model() as severity_lasso_model:
X = variables[rsv_vars].values
alpha = pm3.Exponential('alpha', 1)
beta = pm3.Laplace('beta', 0, alpha, shape=X.shape[1])
mu = pm3.Normal('mu', 0, 0.01)
p = tinvlogit(mu + beta.dot(X.T))
y = pm3.Bernoulli('y', p, observed=variables.oxygen)
with severity_lasso_model:
trace_lasso = pm3.sample(2000, (pm3.NUTS(vars=[beta], scaling=pm3.find_MAP(vars=[beta])), pm3.Slice(vars=[alpha, mu])))
pm3.forestplot(trace_lasso[1000:], vars=['beta'], ylabels=list(variables[rsv_vars].columns))
_ = pm3.traceplot(trace_lasso[1000:], vars=['alpha'])
with pm3.Model() as severity_lasso_model:
b = pm3.Exponential('b', 1)
# Define priors for intercept and regression coefficients.
priors = {v:pm3.Laplace.dist(mu=0, b=b) for v in variables.columns[:-1]}
priors['Intercept'] = pm3.Normal.dist(mu=0, sd=50)
formula = 'oxygen ~ prev_cond + cigarette_smokers + male + z_score'
formula += '+ premature + age_X_vitamin_d + male_X_vitamin_d + age_centered + vitamin_d_norm'
glm(formula, variables, family=pm3.glm.families.Binomial(), priors=priors)
trace_lasso = pm3.sample(2000, pm3.NUTS())
pm3.forestplot(trace_lasso, vars=variables.columns[:-1])
_ = pm3.traceplot(trace_lasso[1000:], vars=['b'])
trace_df = pm3.trace_to_dataframe(trace_lasso)
#scatter_matrix(trace_df, figsize=(8, 8));
print(trace_df.apply(np.exp).describe().drop('count').T)
rsv_no_hx = rsv_subset#[rsv_subset.no_hx.astype(bool) & rsv_subset.oxygen.notnull()]
rsv_no_hx.shape
(1148, 444)
rsv_vars
array(['vitamin_d_norm', 'prev_cond', 'cigarette_smokers', 'male', 'z_score', 'premature', 'age_X_vitamin_d', 'male_X_vitamin_d', 'age_centered'], dtype=object)
rsv_no_hx.icu.mean()
0.082010582010582006
rsv_no_hx.vent.mean()
0.04585537918871252
rsv_no_hx.death.mean()
0.0060975609756097563
rsv_no_hx.oxygen.mean()
0.41585903083700443
rsv_no_hx['vent_or_icu'] = ((rsv_no_hx.vent + rsv_no_hx.icu) > 0).astype(int)
/usr/local/lib/python3.4/site-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy if __name__ == '__main__':
rsv_no_hx['premature_ind'] = (rsv_no_hx.premature>0).astype(int)
rsv_no_hx.adm_pneumo.mean()
0.16069868995633188
no_hx_vars = ['vitamin_d_norm', 'breastfed', 'male', 'age_X_vitamin_d', 'z_score',
'age_centered', 'premature_ind', 'adm_pneumo']
rsv_no_hx[no_hx_vars].head()
vitamin_d_norm | breastfed | male | age_X_vitamin_d | z_score | age_centered | premature_ind | adm_pneumo | |
---|---|---|---|---|---|---|---|---|
case_id | ||||||||
A0024 | 1.110316 | 1 | 1 | 6.356025 | 0.56 | 5.724519 | 0 | 0 |
A0054 | -1.340310 | 1 | 1 | 4.390161 | 0.41 | -3.275481 | 0 | 0 |
A0062 | 0.793083 | 1 | 1 | -3.390811 | 2.44 | -4.275481 | 0 | 0 |
A0063 | -1.427549 | 1 | 1 | 7.531009 | -0.10 | -5.275481 | 0 | 0 |
A0065 | 0.594812 | 1 | 1 | 1.025765 | -1.54 | 1.724519 | 1 | 0 |
rsv_no_hx.premature.value_counts()
0 1000 1 58 3 28 4 17 2 17 6 14 9 4 7 4 5 2 8 1 dtype: int64
rsv_no_hx = rsv_no_hx.dropna(subset=['oxygen'] + no_hx_vars)
rsv_no_hx[no_hx_vars].isnull().sum(0)
vitamin_d_norm 0 breastfed 0 male 0 age_X_vitamin_d 0 z_score 0 age_centered 0 premature_ind 0 adm_pneumo 0 dtype: int64
rsv_no_hx.oxygen.isnull().sum()
0
glm = pm3.glm.glm
with pm3.Model() as oxygen_no_hx:
formula = 'oxygen ~ ' + '+'.join(no_hx_vars)
glm(formula, rsv_no_hx, family=pm3.glm.families.Binomial())
oxygen_trace = pm3.sample(2000, pm3.NUTS())
[-----------------100%-----------------] 2000 of 2000 complete in 16.9 sec
pm3.forestplot(oxygen_trace, vars=no_hx_vars)
<matplotlib.gridspec.GridSpec at 0x12bdb2198>
with pm3.Model() as severity_no_hx:
formula = 'vent_or_icu ~ ' + '+'.join(no_hx_vars)
glm(formula, rsv_no_hx, family=pm3.glm.families.Binomial())
severity_trace = pm3.sample(2000, pm3.NUTS())
[-----------------100%-----------------] 2000 of 2000 complete in 23.8 sec
pm3.forestplot(severity_trace, vars=no_hx_vars)
<matplotlib.gridspec.GridSpec at 0x12ee5aa58>
pm3.forestplot(severity_trace, vars=no_hx_vars)
<matplotlib.gridspec.GridSpec at 0x126ad8160>
pm3.forestplot(severity_trace, vars=no_hx_vars)
<matplotlib.gridspec.GridSpec at 0x11e22a438>
analysis_subset.shape
(2518, 443)
Distribution of vitamin D levels
import seaborn as sns
plt.figure(figsize=(10, 6))
sns.set(style="white", palette="deep")
sns.distplot(analysis_subset.hospitalized_vitamin_d.dropna(), kde=False, color="steelblue",
axlabel='Hospitalized VitaminD (ng/ml)')
sns.despine(trim=True)
analysis_subset.hospitalized_vitamin_d.hist(bins=np.sqrt(1000), normed=True, grid=False, color='grey', figsize=(10,6))
plt.xlabel('Vitamin D (ng/ml)'); plt.ylabel('Relative Frequency')
<matplotlib.text.Text at 0x11d2f1ed0>
Distribution of smoking exposure
analysis_subset.cigarette_smokers.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x11cd26390>
Proportion and number of children exposed to cigarette smoking
(analysis_subset.cigarette_smokers > 0).mean()
0.72398729150119145
(analysis_subset.cigarette_smokers > 0).sum()
1823
Distribution of nargila exposure
analysis_subset.nargila_smokers.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x11c02b350>
Proportion and number of children exposed to nargila
(analysis_subset.nargila_smokers > 0).mean()
0.17911040508339951
(analysis_subset.nargila_smokers > 0).sum()
451
Proportion of children with mothers who smoked during pregnancy
analysis_subset.cigarette_preg.mean()
0.077045274027005561
analysis_subset.nargila_preg.mean()
0.029388403494837172
Vitamin D levels by breastfeeding status
analysis_subset.breastfed = analysis_subset.breastfed.replace({0: 'Not breastfed', 1: 'Breastfed'})
_ = analysis_subset.groupby(['breastfed']).boxplot(column='hospitalized_vitamin_d', grid=False)
/usr/local/lib/python2.7/site-packages/pandas/core/generic.py:1951: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy /usr/local/lib/python2.7/site-packages/pandas/tools/plotting.py:2601: FutureWarning: The default value for 'return_type' will change to 'axes' in a future release. To use the future behavior now, set return_type='axes'. To keep the previous behavior and silence this warning, set return_type='dict'.
RSV count by oxygen status
_ = analysis_subset.groupby('oxygen').boxplot(column='rsv_count', grid=False)
Proportion on oxygen and vitamin D, by month
months = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
groupby_month = analysis_subset.groupby('admission_month')
ax = groupby_month['oxygen'].mean().plot(grid=False, color='r', figsize=(14,8))
groupby_month['hospitalized_vitamin_d'].mean().plot(secondary_y=True, color='b')
ax.set_ylabel('Proportion on oxygen', color='r')
ax.right_ax.set_ylabel('Mean vitamin D', color='b')
ax.set_xlabel('Admission month')
ax.set_xticks(np.arange(12)+1)
ax.set_xticklabels(months);
vitD_O2 = analysis_subset[['oxygen', 'hospitalized_vitamin_d']].set_index(analysis_subset.admission_date)
vitD_O2_by_week = vitD_O2.resample('M', how='mean').fillna(0)
ax = vitD_O2_by_week['oxygen'].plot(grid=False, color='r')
vitD_O2_by_week['hospitalized_vitamin_d'].plot(secondary_y=True, color='b')
ax.set_ylabel('Proportion on oxygen', color='r')
ax.right_ax.set_ylabel('Mean vitamin D', color='blue')
<matplotlib.text.Text at 0x11bcc03d0>
Combined plot of proportion on oxygen, vitamin D and frequencies of RSV hospitalization for entire cohort.
from mpl_toolkits.axes_grid1 import host_subplot
import mpl_toolkits.axisartist as AA
import datetime
plt.figure(figsize=(12,6))
host = host_subplot(111, axes_class=AA.Axes)
plt.subplots_adjust(right=0.75)
par1 = host.twinx()
par2 = host.twinx()
offset = 60
new_fixed_axis = par2.get_grid_helper().new_fixed_axis
par2.axis["right"] = new_fixed_axis(loc="right",
axes=par2,
offset=(offset, 0))
par2.axis["right"].toggle(all=True)
dates = [datetime.datetime.utcfromtimestamp(i.astype(int)*1e-9) for i in vitD_O2_by_week.index.values]
vitD_O2 = analysis_subset[['oxygen', 'hospitalized_vitamin_d']].set_index(analysis_subset.admission_date)
vitD_O2_by_week = vitD_O2.resample('M', how='mean').fillna(0)
host.plot(dates, vitD_O2_by_week['oxygen'], 'k-', label='Oxygen')
par1.plot(dates, vitD_O2_by_week['hospitalized_vitamin_d'], 'k--', label='Vitamin D')
par2.plot(dates, vitD_O2['hospitalized_vitamin_d'].resample('M', how='count'), linestyle='dotted', color='grey', label='RSV')
#host.get_xaxis().get_major_formatter().set_useOffset(False)
host.set_xlabel("Month")
host.set_ylabel("Proportion on oxygen")
par1.set_ylabel("Mean vitamin D")
par2.set_ylabel("RSV hospitalizations")
host.legend(loc='lower right');
Sample size of analysis subset
analysis_subset.oxygen.notnull().sum()
2494
Function for calculating odds ratios and simulation-based intervals (RSV subset only)
se = lambda p, n: np.sqrt(p * (1. - p) / n)
def odds_ratio(x, y, n_sim=10000, alpha=0.05):
try:
n_x, n_y = len(x.dropna()), len(y.dropna())
p_x, p_y = x.mean(), y.mean()
se_x = se(p_x, n_x)
se_y = se(p_y, n_y)
p_x_sim = np.random.normal(p_x, se_x, n_sim)
p_y_sim = np.random.normal(p_y, se_y, n_sim)
ratio = ((p_x_sim / (1. - p_x_sim)) /
(p_y_sim / (1. - p_y_sim)))
interval = np.percentile(ratio, [100*(alpha/2.), 100*(1. - alpha/2.)])
return np.round(np.median(ratio), 2), np.round(interval, 2).tolist(), (n_y, n_x)
except ValueError:
return np.nan, np.nan, np.nan
def calc_or(groupby, var):
data = list(groupby[var])
return odds_ratio(data[1][1], data[0][1])
def make_table(groupby, table_vars, replace_dict={}):
table = np.round(groupby[table_vars].mean(), 2).T
ratios = [calc_or(groupby, v) for v in table.index]
table['OR'] = [r[0] for r in ratios]
table['Interval'] = [r[1] for r in ratios]
table['N'] = [r[2] for r in ratios]
table.rename(columns=replace_dict, inplace=True)
table.columns.name = None
return(table)
table_vars = ['male', 'under_2_months', 'months_2_11', 'months_12_23',
'Jordanian', 'Palestinian', 'vitamin D < 20', 'vitamin D < 11',
'prev_cond', 'heart_hx', 'breastfed', 'premature_ind',
'adm_pneumo', 'adm_bronchopneumo', 'adm_sepsis', 'adm_bronchiolitis']
age_groups = pd.get_dummies(pd.cut(rsv_subset.age_months, [0,1,11,23]))
age_groups.index = rsv_subset.index
age_groups.columns = 'under_2_months', 'months_2_11', 'months_12_23'
rsv_subset = rsv_subset.join(age_groups)
nationality_lookup = {1: 'Jordanian', 2: 'Egyptian', 3: 'Palestinian', 4: 'Iraqi', 5: 'Syrian',
6: 'Sudanese', 7: 'Russian', 8: 'Asian', 9: 'Other'}
rsv_subset['nationality'] = rsv_subset.mother_nationality.replace(nationality_lookup)
rsv_subset['Jordanian'] = (rsv_subset.nationality=='Jordanian').astype(int)
rsv_subset['Palestinian'] = (rsv_subset.nationality=='Palestinian').astype(int)
rsv_subset['vitamin D < 20'] = (rsv_subset.hospitalized_vitamin_d < 20).astype(int)
rsv_subset.loc[rsv_subset.hospitalized_vitamin_d.isnull(), 'vitamin D < 20'] = np.nan
rsv_subset['vitamin D < 11'] = (rsv_subset.hospitalized_vitamin_d < 11).astype(int)
rsv_subset.loc[rsv_subset.hospitalized_vitamin_d.isnull(), 'vitamin D < 11'] = np.nan
rsv_subset['premature_ind'] = (rsv_subset.premature>0).astype(int)
groupby_o2 = rsv_subset.groupby('oxygen')
make_table(groupby_o2, table_vars=table_vars+virus_vars, replace_dict={0.0: 'No Oxygen', 1.0: 'Oxygen'})
No Oxygen | Oxygen | OR | Interval | N | |
---|---|---|---|---|---|
male | 0.61 | 0.55 | 0.80 | [0.63, 1.02] | (663, 472) |
under_2_months | 0.14 | 0.28 | 2.33 | [1.75, 3.15] | (663, 472) |
months_2_11 | 0.63 | 0.54 | 0.69 | [0.54, 0.87] | (663, 472) |
months_12_23 | 0.17 | 0.06 | 0.29 | [0.18, 0.44] | (663, 472) |
Jordanian | 0.90 | 0.89 | 0.87 | [0.6, 1.3] | (663, 472) |
Palestinian | 0.06 | 0.07 | 1.09 | [0.66, 1.74] | (663, 472) |
vitamin D < 20 | 0.59 | 0.65 | 1.31 | [1.03, 1.67] | (663, 472) |
vitamin D < 11 | 0.41 | 0.51 | 1.49 | [1.18, 1.89] | (663, 472) |
prev_cond | 0.07 | 0.08 | 1.24 | [0.78, 1.96] | (663, 472) |
heart_hx | 0.03 | 0.03 | 1.18 | [0.53, 2.47] | (663, 472) |
breastfed | 0.64 | 0.63 | 0.98 | [0.77, 1.26] | (663, 472) |
premature_ind | 0.11 | 0.15 | 1.45 | [1.02, 2.07] | (663, 472) |
adm_pneumo | 0.12 | 0.22 | 2.06 | [1.51, 2.88] | (663, 472) |
adm_bronchopneumo | 0.42 | 0.23 | 0.42 | [0.32, 0.54] | (663, 472) |
adm_sepsis | 0.13 | 0.22 | 1.89 | [1.4, 2.62] | (663, 472) |
adm_bronchiolitis | 0.28 | 0.27 | 0.97 | [0.74, 1.26] | (663, 472) |
Influenza | 0.03 | 0.02 | 0.76 | [0.29, 1.64] | (663, 472) |
HMPV | 0.05 | 0.05 | 0.96 | [0.54, 1.64] | (663, 472) |
Rhino | 0.34 | 0.31 | 0.87 | [0.67, 1.12] | (663, 472) |
groupby_vent = rsv_subset.groupby('vent')
make_table(groupby_vent, table_vars=table_vars+virus_vars,
replace_dict={0.0: 'No Ventilator', 1.0: 'Ventilator'})
No Ventilator | Ventilator | OR | Interval | N | |
---|---|---|---|---|---|
male | 0.59 | 0.56 | 0.89 | [0.51, 1.61] | (1082, 52) |
under_2_months | 0.19 | 0.33 | 2.08 | [1.05, 3.62] | (1082, 52) |
months_2_11 | 0.60 | 0.46 | 0.58 | [0.33, 1.02] | (1082, 52) |
months_12_23 | 0.13 | 0.08 | 0.58 | [0.04, 1.25] | (1082, 52) |
Jordanian | 0.89 | 0.94 | 1.85 | [-9.19, 16.05] | (1082, 52) |
Palestinian | 0.07 | 0.04 | 0.54 | [-0.18, 1.4] | (1082, 52) |
vitamin D < 20 | 0.62 | 0.48 | 0.56 | [0.31, 0.98] | (1082, 52) |
vitamin D < 11 | 0.45 | 0.38 | 0.75 | [0.4, 1.3] | (1082, 52) |
prev_cond | 0.07 | 0.10 | 1.31 | [0.2, 2.76] | (1082, 52) |
heart_hx | 0.03 | 0.00 | NaN | NaN | NaN |
breastfed | 0.64 | 0.52 | 0.61 | [0.35, 1.09] | (1082, 52) |
premature_ind | 0.12 | 0.13 | 1.10 | [0.3, 2.14] | (1082, 52) |
adm_pneumo | 0.16 | 0.27 | 1.98 | [0.94, 3.52] | (1082, 52) |
adm_bronchopneumo | 0.35 | 0.25 | 0.63 | [0.28, 1.12] | (1082, 52) |
adm_sepsis | 0.16 | 0.21 | 1.37 | [0.58, 2.49] | (1082, 52) |
adm_bronchiolitis | 0.28 | 0.27 | 0.97 | [0.45, 1.69] | (1082, 52) |
Influenza | 0.03 | 0.02 | 0.69 | [-0.66, 2.31] | (1082, 52) |
HMPV | 0.05 | 0.06 | 1.12 | [-0.09, 2.68] | (1082, 52) |
Rhino | 0.33 | 0.27 | 0.74 | [0.34, 1.29] | (1082, 52) |
groupby_death = rsv_subset.groupby('icu')
make_table(groupby_death, table_vars=table_vars+virus_vars, replace_dict={False: 'Survived', True: 'Die'})
Survived | Die | OR | Interval | N | |
---|---|---|---|---|---|
male | 0.59 | 0.51 | 0.71 | [0.46, 1.09] | (1041, 93) |
under_2_months | 0.18 | 0.33 | 2.21 | [1.36, 3.45] | (1041, 93) |
months_2_11 | 0.61 | 0.40 | 0.43 | [0.27, 0.65] | (1041, 93) |
months_12_23 | 0.13 | 0.02 | 0.14 | [-0.05, 0.36] | (1041, 93) |
Jordanian | 0.89 | 0.91 | 1.27 | [0.68, 4.05] | (1041, 93) |
Palestinian | 0.07 | 0.04 | 0.61 | [0.04, 1.29] | (1041, 93) |
vitamin D < 20 | 0.61 | 0.69 | 1.41 | [0.91, 2.32] | (1041, 93) |
vitamin D < 11 | 0.44 | 0.53 | 1.39 | [0.91, 2.15] | (1041, 93) |
prev_cond | 0.07 | 0.10 | 1.34 | [0.47, 2.48] | (1041, 93) |
heart_hx | 0.03 | 0.01 | 0.34 | [-0.32, 1.17] | (1041, 93) |
breastfed | 0.64 | 0.60 | 0.86 | [0.56, 1.36] | (1041, 93) |
premature_ind | 0.12 | 0.22 | 2.07 | [1.1, 3.34] | (1041, 93) |
adm_pneumo | 0.14 | 0.38 | 3.61 | [2.24, 5.6] | (1041, 93) |
adm_bronchopneumo | 0.36 | 0.10 | 0.19 | [0.07, 0.33] | (1041, 93) |
adm_sepsis | 0.14 | 0.45 | 5.04 | [3.19, 7.85] | (1041, 93) |
adm_bronchiolitis | 0.28 | 0.19 | 0.61 | [0.32, 0.98] | (1041, 93) |
Influenza | 0.03 | 0.03 | 1.21 | [-0.13, 2.97] | (1041, 93) |
HMPV | 0.05 | 0.06 | 1.28 | [0.27, 2.62] | (1041, 93) |
Rhino | 0.33 | 0.31 | 0.91 | [0.55, 1.4] | (1041, 93) |