In [1]:

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
import imp
pm3 = imp.load_module("pymc3", *imp.find_module("pymc", ["/Users/fonnescj/GitHub/pymc"]))

pm.__version__

/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/importlib/_bootstrap.py:1471: UserWarning: theano modules are deprecated and will be removed in release 0.7
  _call_with_frames_removed(exec, code, module.__dict__)

Out[1]:

'2.3.4'

In [2]:

import qgrid
qgrid.nbinstall()

In [3]:

pm3.__version__

Out[3]:

'3.0'

Correlate RSV severity with vitamin D levels (at hospital)

Outcomes:

oxygen
sat class

Covariates:

vitamin D
coinfection indicators
age
smoking
premature
breastfeeding
prev_cond
sex
LTRI, pneumo, bronch, bronchpneumo, ROS indicators
z-score

In [4]:

hospitalized = pd.read_csv('data/hospitalized.csv', index_col=0)

/usr/local/lib/python3.4/site-packages/pandas/io/parsers.py:1152: DtypeWarning: Columns (140,142,144,146,148,181,206,212,213,263,282,283,284,298,299) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)

In [5]:

hospitalized.child_birth_date = pd.to_datetime(hospitalized.child_birth_date)
hospitalized.enrollment_date = pd.to_datetime(hospitalized.enrollment_date)
hospitalized.admission_date = pd.to_datetime(hospitalized.admission_date)
hospitalized.discharge_date = pd.to_datetime(hospitalized.discharge_date)

In [6]:

hospitalized["prev_cond"] = hospitalized[[c for c in hospitalized.columns if c.endswith('hx') and not c.startswith('no_')]].sum(1)

Plot z-scores by year of admission

In [7]:

hospitalized['year_admission'] = hospitalized.admission_date.apply(lambda x: x.year)

In [8]:

nationality_lookup = {1: 'Jordanian', 2: 'Egyptian', 3: 'Palestinian', 4: 'Iraqi', 5: 'Syrian', 
                      6: 'Sudanese', 7: 'Russian', 8: 'Asian', 9: 'Other'}
hospitalized['nationality'] = hospitalized.mother_nationality.replace(nationality_lookup)
hospitalized['Jordanian'] = (hospitalized.nationality=='Jordanian').astype(int)
hospitalized['Palestinian'] = (hospitalized.nationality=='Palestinian').astype(int)

In [9]:

hospitalized['vitamin D < 20'] = (hospitalized.hospitalized_vitamin_d < 20).astype(int)
hospitalized['vitamin D < 20'][hospitalized.hospitalized_vitamin_d.isnull()] = np.nan
hospitalized['vitamin D < 11'] = (hospitalized.hospitalized_vitamin_d < 11).astype(int)
hospitalized['vitamin D < 11'][hospitalized.hospitalized_vitamin_d.isnull()] = np.nan

/usr/local/lib/python3.4/site-packages/IPython/kernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app
/usr/local/lib/python3.4/site-packages/IPython/kernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [10]:

hospitalized['premature'] = (hospitalized.gest_age < 37).astype(int)

In [11]:

hospitalized.shape

Out[11]:

(3169, 423)

Covariates¶

In [12]:

hospitalized['respiratory_class'] = 0
hospitalized.loc[(hospitalized.respiratory_rate>30) & (hospitalized.respiratory_rate<=45), 'respiratory_class'] = 1
hospitalized.loc[(hospitalized.respiratory_rate>45) & (hospitalized.respiratory_rate<=60), 'respiratory_class'] = 2
hospitalized.loc[hospitalized.respiratory_rate>60, 'respiratory_class'] = 3
hospitalized.respiratory_class.value_counts()

Out[12]:

1    1648
0     767
2     636
3     118
dtype: int64

In [13]:

hospitalized['sats_class'] = 0
hospitalized.loc[(hospitalized.sats_number>=90) & (hospitalized.sats_number<95), 'sats_class'] = 1
hospitalized.loc[(hospitalized.sats_number>=85) & (hospitalized.sats_number<90), 'sats_class'] = 2
hospitalized.loc[hospitalized.sats_number<85, 'sats_class'] = 3
hospitalized.sats_class.value_counts()

Out[13]:

0    3020
1     131
2      15
3       3
dtype: int64

In [14]:

hospitalized.sats_range.unique()

Out[14]:

array(['85-89', '>95', nan, '90-94', '80-85 ', '<85', '(90-94)%', '>95%',
       '(85-89)%', '(90-94)', '> 95', '90-95', '<85%', '85-59', '.>95',
       '> 95 ', '95-98', '85-98', '95-97', '92-95', '93-95'], dtype=object)

In [15]:

hospitalized['sats_score'] = hospitalized.sats_range.replace({'85-89': 2, '90-94': 1, '>95': 0, '<85': 3, '(90-94)%': 1, '>95%': 0, '>60': np.nan,
       '90-95': 1, '(85-89)%': 2, '<85%': 3, '85-59': 2, '85-98': 2, '95-98': 0, '92-95': 1,
       '93-95': 1})

In [16]:

pd.crosstab(hospitalized.sats_score, hospitalized.oxygen)

Out[16]:

oxygen	0.0	1.0
sats_score
0	877	293
1	1029	550
2	77	118
3	6	11
(90-94)	1	0
.>95	1	0
80-85	0	1
95-97	1	1
> 95	1	0
> 95	2	0

Month of admission

In [17]:

hospitalized['admission_month'] = hospitalized.admission_date.apply(lambda x: x.month)

In [18]:

hospitalized['admission_week'] = hospitalized.admission_date.apply(lambda x: x.week)

In [19]:

hospitalized['admission_year'] = hospitalized.admission_date.apply(lambda x: x.year)

In [20]:

hospitalized['male'] = hospitalized.sex_child

Convert gestational age to number of weeks less than 37.

In [21]:

hospitalized['premature'] = np.maximum(0, 37 - hospitalized.gest_age)
hospitalized.loc[hospitalized.gest_age.isnull(), 'premature'] = np.nan

Not many preemies -- drop covariate?

In [22]:

hospitalized.premature.hist(bins=20, grid=False)

Out[22]:

<matplotlib.axes._subplots.AxesSubplot at 0x10f274358>

In [23]:

# Create indicator for LRTI
hospitalized['ltri'] = ((hospitalized.adm_bronchopneumo + hospitalized.adm_bronchiolitis 
                        + hospitalized.adm_pneumo + hospitalized.adm_wheezing 
                        + hospitalized.adm_asthma 
                        + (hospitalized.wheezing>0) + (hospitalized.flaring>1)) > 1).astype(int)

In [24]:

pcr_lookup = {'pcr_result___1': 'RSV',
'pcr_result___2': 'HMPV',
'pcr_result___3': 'flu A',
'pcr_result___4': 'flu B',
'pcr_result___5': 'rhino',
'pcr_result___6': 'PIV1',
'pcr_result___7': 'PIV2',
'pcr_result___8': 'PIV3',
'pcr_result___13': 'H1N1',
'pcr_result___14': 'H3N2',
'pcr_result___15': 'Swine',
'pcr_result___16': 'Swine H1',
'pcr_result___17': 'flu C',
'pcr_result___18': 'Adeno'}

In [314]:

virus_vars = ['Influenza', 'HMPV', 'Rhino']

In [26]:

hospitalized['RSV'] = hospitalized['pcr_result___1']
hospitalized['Influenza'] = (hospitalized['pcr_result___3'] | hospitalized['pcr_result___4']).astype(int)
hospitalized['HMPV'] = hospitalized['pcr_result___2']
hospitalized['Rhino'] = hospitalized['pcr_result___5']

For interpretability, center vitamin D at 20.

In [27]:

hospitalized['vitamin_d_norm'] = ((hospitalized.hospitalized_vitamin_d - 20) 
                                  / hospitalized.hospitalized_vitamin_d.std())

Center age at sample mean

In [28]:

hospitalized['age_centered'] = hospitalized.age_months - hospitalized.age_months.mean()

Create enrollment season variable

In [29]:

hospitalized['enroll_month'] = [d.month for d in hospitalized.enrollment_date]

In [30]:

hospitalized['enroll_spring'] = hospitalized.enroll_month.isin((3,4,5))
hospitalized['enroll_summer'] = hospitalized.enroll_month.isin((6,7,8))
hospitalized['enroll_autumn'] = hospitalized.enroll_month.isin((9,10,11))

Add interaction between age and vitamin D

In [31]:

hospitalized['age_X_vitamin_d'] = hospitalized.vitamin_d_norm * hospitalized.age_centered

In [32]:

hospitalized['male_X_vitamin_d'] = hospitalized.vitamin_d_norm * hospitalized.male

Remove individuals with missing/QNS vitamin D

In [33]:

hospitalized.hosp_vitd.replace({'<1': 0}).astype(float).notnull().sum()

Out[33]:

In [279]:

analysis_subset = hospitalized[hospitalized.qns.isnull() & hospitalized.hosp_vitd.notnull()]
analysis_subset.shape

Out[279]:

(2518, 443)

RSV-only subset

In [280]:

rsv_subset = analysis_subset[analysis_subset.pcr_result___1==1]
rsv_subset.shape

Out[280]:

(1148, 443)

Several covariates are too sparse to consider

In [281]:

rsv_subset.heart_hx.mean()

Out[281]:

0.028745644599303136

In [282]:

rsv_subset.daycare.mean()

Out[282]:

0.018292682926829267

In [283]:

rsv_subset.cigarette_preg.mean()

Out[283]:

0.075783972125435542

In [284]:

rsv_subset['viral_coinfection'] = (rsv_subset[['pcr_result___2', 'pcr_result___3', 'pcr_result___4', 'pcr_result___5', 
              'pcr_result___6', 'pcr_result___7', 'pcr_result___8', 'pcr_result___13', 'pcr_result___14', 
              'pcr_result___15', 'pcr_result___16', 'pcr_result___17', 'pcr_result___18']].sum(1)>0).astype(int)

/usr/local/lib/python3.4/site-packages/IPython/kernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()

In [285]:

covnames =["vitamin_d_norm",
              "prev_cond",
               "cigarette_smokers",
               "male",
               "z_score",
#                "ltri",
               "premature",
#                "viral_coinfection",
#                "enroll_spring",
#                "enroll_summer",
#                "enroll_autumn",
#              "admission_week",
               "age_X_vitamin_d",
               "male_X_vitamin_d",
               "age_centered"]


covariates = rsv_subset[covnames]

In [41]:

covariates.isnull().sum()

Out[41]:

vitamin_d_norm       0
prev_cond            0
cigarette_smokers    0
male                 0
z_score              3
premature            0
age_X_vitamin_d      0
male_X_vitamin_d     0
age_centered         0
dtype: int64

In [42]:

severity = rsv_subset[['oxygen', 'sats_score']]

In [43]:

outcome = 'oxygen'
complete = (covariates.isnull().sum(axis=1).astype(bool)==False) & (severity[outcome].isnull().astype(bool)==False)
covariates_complete = covariates[complete]
y_complete = severity[outcome][complete]

In [44]:

variables = pd.concat([covariates, severity['oxygen']], axis=1)

Drop rows not worth imputing

In [45]:

variables.isnull().sum()

Out[45]:

vitamin_d_norm        0
prev_cond             0
cigarette_smokers     0
male                  0
z_score               3
premature             0
age_X_vitamin_d       0
male_X_vitamin_d      0
age_centered          0
oxygen               13
dtype: int64

In [46]:

variables = variables.dropna()
assert not variables.isnull().sum().any()

In [47]:

qgrid.show_grid(variables, remote_js=True)

Severity model¶

In [48]:

rsv_vars = variables.columns[:-1].values

In [252]:

rsv_vars['vent_or_icu'] = ((rsv_no_hx.vent + rsv_no_hx.icu) > 0).astype(int)

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-252-fcde3616715a> in <module>()
----> 1 rsv_vars['vent_or_icu'] = ((rsv_no_hx.vent + rsv_no_hx.icu) > 0).astype(int)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [49]:

glm = pm3.glm.glm

with pm3.Model() as severity_model:
    
    formula = 'oxygen ~ ' + '+'.join(rsv_vars)
    glm(formula, variables, family=pm3.glm.families.Binomial())
    severity_trace = pm3.sample(2000, pm3.NUTS())

 [-----------------100%-----------------] 2000 of 2000 complete in 16.8 sec

/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/importlib/_bootstrap.py:321: RuntimeWarning: numpy.ndarray size changed, may indicate binary incompatibility
  return f(*args, **kwds)

Forest plot of parameter estimates. Thick line is the interquartile range of the estimate, thin line the 95% posterior interval. Positive values mean an increase in the probability of ending up on oxygen when the value of that parameter increasees by one unit.

So, for example, the z-score main effect is negative, so it decreases the probability of oxygen when it increases.

In [50]:

pm3.forestplot(severity_trace, vars=rsv_vars)

Out[50]:

<matplotlib.gridspec.GridSpec at 0x10ef4d198>

In [52]:

from pandas.tools.plotting import scatter_matrix

trace_df = pm3.trace_to_dataframe(severity_trace)
print(trace_df.describe().drop('count').T)

print("\nP(Vitamin D < 0) = {}".format((trace_df['vitamin_d_norm'] < 0).mean()))

                       mean       std       min       25%       50%       75%  \
Intercept         -0.503361  0.131519 -0.940283 -0.587602 -0.503821 -0.420436   
age_X_vitamin_d    0.046439  0.019813 -0.022419  0.033992  0.046953  0.059310   
age_centered      -0.121763  0.016688 -0.187188 -0.132823 -0.121512 -0.110532   
cigarette_smokers -0.000451  0.068638 -0.231318 -0.046710  0.000034  0.044452   
male              -0.148358  0.140683 -0.564303 -0.245916 -0.145783 -0.055139   
male_X_vitamin_d   0.164338  0.131361 -0.206424  0.072769  0.161942  0.252866   
premature          0.062237  0.055419 -0.117127  0.024272  0.060918  0.096899   
prev_cond          0.479694  0.216676 -0.200901  0.325782  0.483269  0.627661   
vitamin_d_norm    -0.125964  0.110375 -0.514271 -0.197223 -0.124351 -0.052352   
z_score           -0.035586  0.038875 -0.166133 -0.062835 -0.034477 -0.008353   

                        max  
Intercept         -0.056627  
age_X_vitamin_d    0.107831  
age_centered      -0.059559  
cigarette_smokers  0.213562  
male               0.345077  
male_X_vitamin_d   0.718376  
premature          0.259560  
prev_cond          1.273045  
vitamin_d_norm     0.231789  
z_score            0.102323  

P(Vitamin D < 0) = 0.878

Odds ratios of parameters

In [53]:

trace_df.drop('Intercept', axis=1).apply(np.exp).describe().drop('count').T

Out[53]:

	mean	std	min	25%	50%	75%	max
age_X_vitamin_d	1.047739	0.020730	0.977831	1.034576	1.048073	1.061104	1.113860
age_centered	0.885481	0.014774	0.829288	0.875620	0.885580	0.895358	0.942180
cigarette_smokers	1.001908	0.068997	0.793487	0.954365	1.000034	1.045455	1.238080
male	0.870701	0.123278	0.568757	0.781988	0.864345	0.946354	1.412098
male_X_vitamin_d	1.188900	0.158749	0.813488	1.075482	1.175792	1.287710	2.051099
premature	1.065855	0.059420	0.889472	1.024569	1.062812	1.101749	1.296360
prev_cond	1.653888	0.362039	0.817993	1.385114	1.621366	1.873224	3.571711
vitamin_d_norm	0.887001	0.097410	0.597937	0.821008	0.883070	0.948995	1.260853
z_score	0.965769	0.037542	0.846934	0.939099	0.966110	0.991682	1.107742

In [54]:

variables['O2'] = variables.oxygen + np.random.normal(size=len(variables.oxygen))*0.01
variables.plot(x='vitamin_d_norm', y='O2', kind='scatter', alpha=0.2, yticks=[0,1])

Out[54]:

<matplotlib.axes._subplots.AxesSubplot at 0x1174cef98>

In [55]:

variables['vitd_bins'] = pd.cut(variables.vitamin_d_norm, bins=12)
ax = variables.groupby('vitd_bins')['oxygen'].mean().plot(grid=False)
variables.groupby('vitd_bins')['oxygen'].count().plot(secondary_y=True, style='k--')
ax.set_ylabel('Proportion on O2')
ax.right_ax.set_ylabel('Number in bin')

Out[55]:

<matplotlib.text.Text at 0x117251a90>

In [56]:

(hospitalized.hospitalized_vitamin_d > 30).mean()

Out[56]:

0.12874723887661724

In [57]:

variables.vitamin_d_norm.hist(bins=25)

Out[57]:

<matplotlib.axes._subplots.AxesSubplot at 0x1171e97b8>

In [58]:

variables[rsv_vars[1:]].head()

Out[58]:

	prev_cond	cigarette_smokers	male	z_score	premature	age_X_vitamin_d	male_X_vitamin_d	age_centered
case_id
A0024	0	0	1	0.56	0	6.356025	1.110316	5.724519
A0054	0	2	1	0.41	0	4.390161	-1.340310	-3.275481
A0062	0	0	1	2.44	0	-3.390811	0.793083	-4.275481
A0063	0	1	1	-0.10	0	7.531009	-1.427549	-5.275481
A0065	0	1	1	-1.54	6	1.025765	0.594812	1.724519

Changepoint model¶

In [59]:

rsv_vars[1:]

Out[59]:

array(['prev_cond', 'cigarette_smokers', 'male', 'z_score', 'premature',
       'age_X_vitamin_d', 'male_X_vitamin_d', 'age_centered'], dtype=object)

In [60]:

def changepoint_model():
    
    X = variables[['prev_cond', 'cigarette_smokers', 'male', 'z_score', 'premature']].values
    
    change = pm.Uniform('change', variables.vitamin_d_norm.min(), 1.5, value=-0.5)
    
    # Low vitamin D effect
    alpha = pm.Normal('alpha', 0, 0.001, value=0)
    
    mu = pm.Normal('mu', 0, 0.001, value=0)
    
    # Covariates for confounders
    beta = pm.Normal('beta', 0, 0.001, value=[0]*X.shape[1])
    
    @pm.deterministic
    def theta(b=beta, a=alpha, c=change, m=mu):
        #import pdb; pdb.set_trace()
        return pm.invlogit(m + a*(variables.vitamin_d_norm.values < c) + X.dot(b))
    
    y = pm.Bernoulli('y', p=theta, value=variables.oxygen.values, observed=True)
    
    return(locals())

In [61]:

M_change = pm.MCMC(changepoint_model())

In [62]:

M_change.sample(20000, 10000)

 [-----------------100%-----------------] 20000 of 20000 complete in 8.9 sec

In [63]:

pm.Matplot.plot(M_change.change)

Plotting change

In [64]:

M_change.change.summary()

change:
 
	Mean             SD               MC Error        95% HPD interval
	------------------------------------------------------------------
	[[-1.13]]        [[ 0.219]]       [[ 0.011]]       [-1.296 -0.594]
	
	
	Posterior quantiles:
	
	2.5             25              50              75             97.5
	 |---------------|===============|===============|---------------|
	[[-1.287]]       [[-1.229]]      [[-1.198]]     [[-1.13]]     [[-0.57]]

In [65]:

pm.Matplot.plot(M_change.alpha)

Plotting alpha

In [66]:

M_change.alpha.summary()

alpha:
 
	Mean             SD               MC Error        95% HPD interval
	------------------------------------------------------------------
	[[ 0.551]]       [[ 0.146]]       [[ 0.006]]       [ 0.281  0.849]
	
	
	Posterior quantiles:
	
	2.5             25              50              75             97.5
	 |---------------|===============|===============|---------------|
	[[ 0.254]]       [[ 0.459]]      [[ 0.555]]     [[ 0.656]]    [[ 0.827]]

Gaussian Process Model¶

In [ ]:

from pymc.gp import Covariance, Mean, GPSubmodel
from pymc.gp.cov_funs import matern

vitD_mesh = np.linspace(variables.vitamin_d_norm.min(), variables.vitamin_d_norm.max())

gp_covs =["prev_cond",
               "cigarette_smokers",
               "male",
               "z_score",
               "premature",
               "age_centered"]

def gp_model():
    
    X = variables[gp_covs].values
    
    # Mean risk
    mu = pm.Normal('mu', 0, 0.001, value=0)
    
    # Covariates for confounders
    beta = pm.Normal('beta', 0, 0.001, value=[0]*X.shape[1])

    # GP hyperpriors
    amp = pm.Exponential('amp', 1, value=1)
    scale = pm.Uniform('scale' , 0, 10, value=1)
    diff_degree = pm.Uniform('diff_degree', 0, 10, value=1)

    @pm.deterministic
    def C(diff_degree=diff_degree, amp=amp, scale=scale):
        """
        The Matern covariance function
        """
        return Covariance(matern.euclidean, diff_degree=diff_degree, amp=amp, scale=scale)

    
    @pm.deterministic
    def M():
        """
        The mean function is the zero function
        """
        return Mean(lambda x: np.zeros(x.shape))

    alpha = GPSubmodel('alpha', M, C, mesh=vitD_mesh)
        
    @pm.deterministic
    def theta(b=beta, a=alpha.f(variables.vitamin_d_norm.values), m=mu):
        return pm.invlogit(m + a + X.dot(b))
    
    y = pm.Bernoulli('y', p=theta, value=variables.oxygen.values, observed=True)
    
    return(locals())

In [ ]:

M_gp = pm.MCMC(gp_model())

In [ ]:

M_gp.sample(20000, 10000)

In [ ]:

plt.figure(figsize=(18, 5))
ax = plt.subplot(1, 2, 1)
for i in range(0, 100):
    f = np.random.choice(M_gp.alpha.f.trace()[-1000:])(vitD_mesh)
    plt.plot(vitD_mesh, f, 'k-', linewidth=0.5, alpha=0.5)
plt.xlabel('Normalized Vitamin D')
plt.ylabel('Vitamin D effect on oxygen')

Bayesian Lasso Model¶

In [ ]:

import theano.tensor as TT

def tinvlogit(x):
    return TT.exp(x) / (1 + TT.exp(x))

with pm3.Model() as severity_lasso_model:
    
    X = variables[rsv_vars].values
    
    alpha = pm3.Exponential('alpha', 1)
    
    beta = pm3.Laplace('beta', 0, alpha, shape=X.shape[1])
    
    mu = pm3.Normal('mu', 0, 0.01)

    p = tinvlogit(mu + beta.dot(X.T))

    y = pm3.Bernoulli('y', p, observed=variables.oxygen)

In [ ]:

with severity_lasso_model:
    
    trace_lasso = pm3.sample(2000, (pm3.NUTS(vars=[beta], scaling=pm3.find_MAP(vars=[beta])), pm3.Slice(vars=[alpha, mu])))

In [ ]:

pm3.forestplot(trace_lasso[1000:], vars=['beta'], ylabels=list(variables[rsv_vars].columns))

In [ ]:

_ = pm3.traceplot(trace_lasso[1000:], vars=['alpha'])

In [ ]:

with pm3.Model() as severity_lasso_model:
    
    b = pm3.Exponential('b', 1)
    
    # Define priors for intercept and regression coefficients.
    priors = {v:pm3.Laplace.dist(mu=0, b=b) for v in variables.columns[:-1]} 
    priors['Intercept'] = pm3.Normal.dist(mu=0, sd=50)
    
    formula = 'oxygen ~ prev_cond + cigarette_smokers + male + z_score'
    formula += '+ premature + age_X_vitamin_d + male_X_vitamin_d + age_centered + vitamin_d_norm'
    glm(formula, variables, family=pm3.glm.families.Binomial(), priors=priors)
    trace_lasso = pm3.sample(2000, pm3.NUTS())

In [ ]:

pm3.forestplot(trace_lasso, vars=variables.columns[:-1])

In [ ]:

_ = pm3.traceplot(trace_lasso[1000:], vars=['b'])

In [ ]:

trace_df = pm3.trace_to_dataframe(trace_lasso)
#scatter_matrix(trace_df, figsize=(8, 8));
print(trace_df.apply(np.exp).describe().drop('count').T)

Subset with no past medical history¶

In [227]:

rsv_no_hx = rsv_subset#[rsv_subset.no_hx.astype(bool) & rsv_subset.oxygen.notnull()]
rsv_no_hx.shape

Out[227]:

(1148, 444)

In [228]:

rsv_vars

Out[228]:

array(['vitamin_d_norm', 'prev_cond', 'cigarette_smokers', 'male',
       'z_score', 'premature', 'age_X_vitamin_d', 'male_X_vitamin_d',
       'age_centered'], dtype=object)

In [229]:

rsv_no_hx.icu.mean()

Out[229]:

0.082010582010582006

In [230]:

rsv_no_hx.vent.mean()

Out[230]:

0.04585537918871252

In [231]:

rsv_no_hx.death.mean()

Out[231]:

0.0060975609756097563

In [232]:

rsv_no_hx.oxygen.mean()

Out[232]:

0.41585903083700443

In [233]:

rsv_no_hx['vent_or_icu'] = ((rsv_no_hx.vent + rsv_no_hx.icu) > 0).astype(int)

/usr/local/lib/python3.4/site-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [234]:

rsv_no_hx['premature_ind'] = (rsv_no_hx.premature>0).astype(int)

In [240]:

rsv_no_hx.adm_pneumo.mean()

Out[240]:

0.16069868995633188

In [241]:

no_hx_vars = ['vitamin_d_norm', 'breastfed', 'male', 'age_X_vitamin_d', 'z_score',
       'age_centered', 'premature_ind', 'adm_pneumo']

In [242]:

rsv_no_hx[no_hx_vars].head()

Out[242]:

	vitamin_d_norm	breastfed	male	age_X_vitamin_d	z_score	age_centered	premature_ind	adm_pneumo
case_id
A0024	1.110316	1	1	6.356025	0.56	5.724519	0	0
A0054	-1.340310	1	1	4.390161	0.41	-3.275481	0	0
A0062	0.793083	1	1	-3.390811	2.44	-4.275481	0	0
A0063	-1.427549	1	1	7.531009	-0.10	-5.275481	0	0
A0065	0.594812	1	1	1.025765	-1.54	1.724519	1	0

In [243]:

rsv_no_hx.premature.value_counts()

Out[243]:

0    1000
1      58
3      28
4      17
2      17
6      14
9       4
7       4
5       2
8       1
dtype: int64

In [246]:

rsv_no_hx = rsv_no_hx.dropna(subset=['oxygen'] + no_hx_vars)
rsv_no_hx[no_hx_vars].isnull().sum(0)

Out[246]:

vitamin_d_norm     0
breastfed          0
male               0
age_X_vitamin_d    0
z_score            0
age_centered       0
premature_ind      0
adm_pneumo         0
dtype: int64

In [247]:

rsv_no_hx.oxygen.isnull().sum()

Out[247]:

In [248]:

glm = pm3.glm.glm

with pm3.Model() as oxygen_no_hx:
    
    formula = 'oxygen ~ ' + '+'.join(no_hx_vars)
    glm(formula, rsv_no_hx, family=pm3.glm.families.Binomial())
    oxygen_trace = pm3.sample(2000, pm3.NUTS())

 [-----------------100%-----------------] 2000 of 2000 complete in 16.9 sec

In [249]:

pm3.forestplot(oxygen_trace, vars=no_hx_vars)

Out[249]:

<matplotlib.gridspec.GridSpec at 0x12bdb2198>

In [250]:

with pm3.Model() as severity_no_hx:
    
    formula = 'vent_or_icu ~ ' + '+'.join(no_hx_vars)
    glm(formula, rsv_no_hx, family=pm3.glm.families.Binomial())
    severity_trace = pm3.sample(2000, pm3.NUTS())

 [-----------------100%-----------------] 2000 of 2000 complete in 23.8 sec

In [251]:

pm3.forestplot(severity_trace, vars=no_hx_vars)

Out[251]:

<matplotlib.gridspec.GridSpec at 0x12ee5aa58>

In [219]:

pm3.forestplot(severity_trace, vars=no_hx_vars)

Out[219]:

<matplotlib.gridspec.GridSpec at 0x126ad8160>

In [171]:

pm3.forestplot(severity_trace, vars=no_hx_vars)

Out[171]:

<matplotlib.gridspec.GridSpec at 0x11e22a438>

Additional analyses¶

In [55]:

analysis_subset.shape

Out[55]:

(2518, 443)

Distribution of vitamin D levels

In [56]:

import seaborn as sns

plt.figure(figsize=(10, 6))
sns.set(style="white", palette="deep")

sns.distplot(analysis_subset.hospitalized_vitamin_d.dropna(), kde=False, color="steelblue", 
             axlabel='Hospitalized VitaminD (ng/ml)')
sns.despine(trim=True)

In [57]:

analysis_subset.hospitalized_vitamin_d.hist(bins=np.sqrt(1000), normed=True, grid=False, color='grey', figsize=(10,6))
plt.xlabel('Vitamin D (ng/ml)'); plt.ylabel('Relative Frequency')

Out[57]:

<matplotlib.text.Text at 0x11d2f1ed0>

Distribution of smoking exposure

In [58]:

analysis_subset.cigarette_smokers.hist()

Out[58]:

<matplotlib.axes._subplots.AxesSubplot at 0x11cd26390>

Proportion and number of children exposed to cigarette smoking

In [59]:

(analysis_subset.cigarette_smokers > 0).mean()

Out[59]:

0.72398729150119145

In [60]:

(analysis_subset.cigarette_smokers > 0).sum()

Out[60]:

Distribution of nargila exposure

In [61]:

analysis_subset.nargila_smokers.hist()

Out[61]:

<matplotlib.axes._subplots.AxesSubplot at 0x11c02b350>

Proportion and number of children exposed to nargila

In [62]:

(analysis_subset.nargila_smokers > 0).mean()

Out[62]:

0.17911040508339951

In [63]:

(analysis_subset.nargila_smokers > 0).sum()

Out[63]:

Proportion of children with mothers who smoked during pregnancy

In [64]:

analysis_subset.cigarette_preg.mean()

Out[64]:

0.077045274027005561

In [65]:

analysis_subset.nargila_preg.mean()

Out[65]:

0.029388403494837172

Vitamin D levels by breastfeeding status

In [66]:

analysis_subset.breastfed = analysis_subset.breastfed.replace({0: 'Not breastfed', 1: 'Breastfed'})
_ = analysis_subset.groupby(['breastfed']).boxplot(column='hospitalized_vitamin_d', grid=False)

/usr/local/lib/python2.7/site-packages/pandas/core/generic.py:1951: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

/usr/local/lib/python2.7/site-packages/pandas/tools/plotting.py:2601: FutureWarning:


The default value for 'return_type' will change to 'axes' in a future release.
 To use the future behavior now, set return_type='axes'.
 To keep the previous behavior and silence this warning, set return_type='dict'.

RSV count by oxygen status

In [67]:

_ = analysis_subset.groupby('oxygen').boxplot(column='rsv_count', grid=False)

Proportion on oxygen and vitamin D, by month

In [68]:

months = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']

In [69]:

groupby_month = analysis_subset.groupby('admission_month')
ax = groupby_month['oxygen'].mean().plot(grid=False, color='r', figsize=(14,8))
groupby_month['hospitalized_vitamin_d'].mean().plot(secondary_y=True, color='b')
ax.set_ylabel('Proportion on oxygen', color='r')
ax.right_ax.set_ylabel('Mean vitamin D', color='b')
ax.set_xlabel('Admission month')
ax.set_xticks(np.arange(12)+1)
ax.set_xticklabels(months);

In [70]:

vitD_O2 = analysis_subset[['oxygen', 'hospitalized_vitamin_d']].set_index(analysis_subset.admission_date)
vitD_O2_by_week = vitD_O2.resample('M', how='mean').fillna(0)
ax = vitD_O2_by_week['oxygen'].plot(grid=False, color='r')
vitD_O2_by_week['hospitalized_vitamin_d'].plot(secondary_y=True, color='b')
ax.set_ylabel('Proportion on oxygen', color='r')
ax.right_ax.set_ylabel('Mean vitamin D', color='blue')

Out[70]:

<matplotlib.text.Text at 0x11bcc03d0>

Combined plot of proportion on oxygen, vitamin D and frequencies of RSV hospitalization for entire cohort.

In [75]:

from mpl_toolkits.axes_grid1 import host_subplot
import mpl_toolkits.axisartist as AA
import datetime

plt.figure(figsize=(12,6))
host = host_subplot(111, axes_class=AA.Axes)
plt.subplots_adjust(right=0.75)

par1 = host.twinx()
par2 = host.twinx()

offset = 60
new_fixed_axis = par2.get_grid_helper().new_fixed_axis
par2.axis["right"] = new_fixed_axis(loc="right",
                                    axes=par2,
                                    offset=(offset, 0))

par2.axis["right"].toggle(all=True)

dates = [datetime.datetime.utcfromtimestamp(i.astype(int)*1e-9) for i in vitD_O2_by_week.index.values]

vitD_O2 = analysis_subset[['oxygen', 'hospitalized_vitamin_d']].set_index(analysis_subset.admission_date)
vitD_O2_by_week = vitD_O2.resample('M', how='mean').fillna(0)
host.plot(dates, vitD_O2_by_week['oxygen'], 'k-', label='Oxygen')
par1.plot(dates, vitD_O2_by_week['hospitalized_vitamin_d'], 'k--', label='Vitamin D')
par2.plot(dates, vitD_O2['hospitalized_vitamin_d'].resample('M', how='count'), linestyle='dotted', color='grey', label='RSV')

#host.get_xaxis().get_major_formatter().set_useOffset(False)
host.set_xlabel("Month")
host.set_ylabel("Proportion on oxygen")
par1.set_ylabel("Mean vitamin D")
par2.set_ylabel("RSV hospitalizations")

host.legend(loc='lower right');

Sample size of analysis subset

In [77]:

analysis_subset.oxygen.notnull().sum()

Out[77]:

Risk factor tables¶

Function for calculating odds ratios and simulation-based intervals (RSV subset only)

In [255]:

se = lambda p, n: np.sqrt(p * (1. - p) / n)

def odds_ratio(x, y, n_sim=10000, alpha=0.05):

    try:
        n_x, n_y = len(x.dropna()), len(y.dropna())
        p_x, p_y = x.mean(), y.mean()

        se_x = se(p_x, n_x)
        se_y = se(p_y, n_y)

        p_x_sim = np.random.normal(p_x, se_x, n_sim)
        p_y_sim = np.random.normal(p_y, se_y, n_sim)

        ratio = ((p_x_sim / (1. - p_x_sim)) /
                    (p_y_sim / (1. - p_y_sim)))

        interval = np.percentile(ratio, [100*(alpha/2.), 100*(1. - alpha/2.)])

        return np.round(np.median(ratio), 2), np.round(interval, 2).tolist(), (n_y, n_x)
    
    except ValueError:
        
        return np.nan, np.nan, np.nan

In [256]:

def calc_or(groupby, var):
    data = list(groupby[var])
    return odds_ratio(data[1][1], data[0][1])

In [286]:

def make_table(groupby, table_vars, replace_dict={}):
    table = np.round(groupby[table_vars].mean(), 2).T
    ratios = [calc_or(groupby, v) for v in table.index]
    table['OR'] = [r[0] for r in ratios]
    table['Interval'] = [r[1] for r in ratios]
    table['N'] = [r[2] for r in ratios]
    table.rename(columns=replace_dict, inplace=True)
    table.columns.name = None
    
    return(table)

table_vars = ['male', 'under_2_months', 'months_2_11', 'months_12_23', 
                'Jordanian', 'Palestinian', 'vitamin D < 20', 'vitamin D < 11', 
                'prev_cond', 'heart_hx', 'breastfed', 'premature_ind', 
                'adm_pneumo', 'adm_bronchopneumo', 'adm_sepsis', 'adm_bronchiolitis']

In [293]:

age_groups = pd.get_dummies(pd.cut(rsv_subset.age_months, [0,1,11,23]))
age_groups.index = rsv_subset.index
age_groups.columns = 'under_2_months', 'months_2_11', 'months_12_23'
rsv_subset = rsv_subset.join(age_groups)

In [294]:

nationality_lookup = {1: 'Jordanian', 2: 'Egyptian', 3: 'Palestinian', 4: 'Iraqi', 5: 'Syrian', 
                      6: 'Sudanese', 7: 'Russian', 8: 'Asian', 9: 'Other'}
rsv_subset['nationality'] = rsv_subset.mother_nationality.replace(nationality_lookup)
rsv_subset['Jordanian'] = (rsv_subset.nationality=='Jordanian').astype(int)
rsv_subset['Palestinian'] = (rsv_subset.nationality=='Palestinian').astype(int)

In [295]:

rsv_subset['vitamin D < 20'] = (rsv_subset.hospitalized_vitamin_d < 20).astype(int)
rsv_subset.loc[rsv_subset.hospitalized_vitamin_d.isnull(), 'vitamin D < 20'] = np.nan
rsv_subset['vitamin D < 11'] = (rsv_subset.hospitalized_vitamin_d < 11).astype(int)
rsv_subset.loc[rsv_subset.hospitalized_vitamin_d.isnull(), 'vitamin D < 11'] = np.nan

In [298]:

rsv_subset['premature_ind'] = (rsv_subset.premature>0).astype(int)

Oxygen¶

In [296]:

groupby_o2 = rsv_subset.groupby('oxygen')

In [315]:

make_table(groupby_o2, table_vars=table_vars+virus_vars, replace_dict={0.0: 'No Oxygen', 1.0: 'Oxygen'})

Out[315]:

	No Oxygen	Oxygen	OR	Interval	N
male	0.61	0.55	0.80	[0.63, 1.02]	(663, 472)
under_2_months	0.14	0.28	2.33	[1.75, 3.15]	(663, 472)
months_2_11	0.63	0.54	0.69	[0.54, 0.87]	(663, 472)
months_12_23	0.17	0.06	0.29	[0.18, 0.44]	(663, 472)
Jordanian	0.90	0.89	0.87	[0.6, 1.3]	(663, 472)
Palestinian	0.06	0.07	1.09	[0.66, 1.74]	(663, 472)
vitamin D < 20	0.59	0.65	1.31	[1.03, 1.67]	(663, 472)
vitamin D < 11	0.41	0.51	1.49	[1.18, 1.89]	(663, 472)
prev_cond	0.07	0.08	1.24	[0.78, 1.96]	(663, 472)
heart_hx	0.03	0.03	1.18	[0.53, 2.47]	(663, 472)
breastfed	0.64	0.63	0.98	[0.77, 1.26]	(663, 472)
premature_ind	0.11	0.15	1.45	[1.02, 2.07]	(663, 472)
adm_pneumo	0.12	0.22	2.06	[1.51, 2.88]	(663, 472)
adm_bronchopneumo	0.42	0.23	0.42	[0.32, 0.54]	(663, 472)
adm_sepsis	0.13	0.22	1.89	[1.4, 2.62]	(663, 472)
adm_bronchiolitis	0.28	0.27	0.97	[0.74, 1.26]	(663, 472)
Influenza	0.03	0.02	0.76	[0.29, 1.64]	(663, 472)
HMPV	0.05	0.05	0.96	[0.54, 1.64]	(663, 472)
Rhino	0.34	0.31	0.87	[0.67, 1.12]	(663, 472)

Mechanical ventilation¶

In [316]:

groupby_vent = rsv_subset.groupby('vent')

In [317]:

make_table(groupby_vent, table_vars=table_vars+virus_vars, 
           replace_dict={0.0: 'No Ventilator', 1.0: 'Ventilator'})

Out[317]:

	No Ventilator	Ventilator	OR	Interval	N
male	0.59	0.56	0.89	[0.51, 1.61]	(1082, 52)
under_2_months	0.19	0.33	2.08	[1.05, 3.62]	(1082, 52)
months_2_11	0.60	0.46	0.58	[0.33, 1.02]	(1082, 52)
months_12_23	0.13	0.08	0.58	[0.04, 1.25]	(1082, 52)
Jordanian	0.89	0.94	1.85	[-9.19, 16.05]	(1082, 52)
Palestinian	0.07	0.04	0.54	[-0.18, 1.4]	(1082, 52)
vitamin D < 20	0.62	0.48	0.56	[0.31, 0.98]	(1082, 52)
vitamin D < 11	0.45	0.38	0.75	[0.4, 1.3]	(1082, 52)
prev_cond	0.07	0.10	1.31	[0.2, 2.76]	(1082, 52)
heart_hx	0.03	0.00	NaN	NaN	NaN
breastfed	0.64	0.52	0.61	[0.35, 1.09]	(1082, 52)
premature_ind	0.12	0.13	1.10	[0.3, 2.14]	(1082, 52)
adm_pneumo	0.16	0.27	1.98	[0.94, 3.52]	(1082, 52)
adm_bronchopneumo	0.35	0.25	0.63	[0.28, 1.12]	(1082, 52)
adm_sepsis	0.16	0.21	1.37	[0.58, 2.49]	(1082, 52)
adm_bronchiolitis	0.28	0.27	0.97	[0.45, 1.69]	(1082, 52)
Influenza	0.03	0.02	0.69	[-0.66, 2.31]	(1082, 52)
HMPV	0.05	0.06	1.12	[-0.09, 2.68]	(1082, 52)
Rhino	0.33	0.27	0.74	[0.34, 1.29]	(1082, 52)

ICU¶

In [318]:

groupby_death = rsv_subset.groupby('icu')

In [319]:

make_table(groupby_death, table_vars=table_vars+virus_vars, replace_dict={False: 'Survived', True: 'Die'})

Out[319]:

	Survived	Die	OR	Interval	N
male	0.59	0.51	0.71	[0.46, 1.09]	(1041, 93)
under_2_months	0.18	0.33	2.21	[1.36, 3.45]	(1041, 93)
months_2_11	0.61	0.40	0.43	[0.27, 0.65]	(1041, 93)
months_12_23	0.13	0.02	0.14	[-0.05, 0.36]	(1041, 93)
Jordanian	0.89	0.91	1.27	[0.68, 4.05]	(1041, 93)
Palestinian	0.07	0.04	0.61	[0.04, 1.29]	(1041, 93)
vitamin D < 20	0.61	0.69	1.41	[0.91, 2.32]	(1041, 93)
vitamin D < 11	0.44	0.53	1.39	[0.91, 2.15]	(1041, 93)
prev_cond	0.07	0.10	1.34	[0.47, 2.48]	(1041, 93)
heart_hx	0.03	0.01	0.34	[-0.32, 1.17]	(1041, 93)
breastfed	0.64	0.60	0.86	[0.56, 1.36]	(1041, 93)
premature_ind	0.12	0.22	2.07	[1.1, 3.34]	(1041, 93)
adm_pneumo	0.14	0.38	3.61	[2.24, 5.6]	(1041, 93)
adm_bronchopneumo	0.36	0.10	0.19	[0.07, 0.33]	(1041, 93)
adm_sepsis	0.14	0.45	5.04	[3.19, 7.85]	(1041, 93)
adm_bronchiolitis	0.28	0.19	0.61	[0.32, 0.98]	(1041, 93)
Influenza	0.03	0.03	1.21	[-0.13, 2.97]	(1041, 93)
HMPV	0.05	0.06	1.28	[0.27, 2.62]	(1041, 93)
Rhino	0.33	0.31	0.91	[0.55, 1.4]	(1041, 93)