%matplotlib inline import matplotlib.pyplot as plt import numpy as np import pandas as pd import plotly MPV_only = True hospitalized = pd.read_csv('data/hospitalized.csv', index_col=0) hospitalized.head() hospitalized.child_birth_date = pd.to_datetime(hospitalized.child_birth_date) hospitalized.enrollment_date = pd.to_datetime(hospitalized.enrollment_date) hospitalized.admission_date = pd.to_datetime(hospitalized.admission_date) hospitalized.discharge_date = pd.to_datetime(hospitalized.discharge_date) hospitalized["prev_cond"] = hospitalized[[c for c in hospitalized.columns if c.endswith('hx') and not c.startswith('no_')]].sum(1) [c for c in hospitalized.columns if c.endswith('hx') and not c.startswith('no_')] from calendar import monthrange from datetime import timedelta def monthdelta(d1, d2): delta = 0 while True: mdays = monthrange(d1.year, d1.month)[1] d1 += timedelta(days=mdays) if d1 <= d2: delta += 1 else: break return delta if MPV_only: hospitalized = hospitalized[hospitalized.pcr_result___2==1] hospitalized['icu_any'] = (hospitalized.icu + hospitalized.dir_icu).astype(bool) hospitalized.flaring.value_counts() hospitalized.cyanosis[hospitalized.cyanosis==4] = np.nan hospitalized.cyanosis.value_counts() hospitalized.wheezing[hospitalized.wheezing==4] = np.nan hospitalized.wheezing.value_counts() hospitalized.respiratory_rate = hospitalized.respiratory_rate.replace({'-': np.nan}).astype(float) hospitalized['respiratory_class'] = 0 hospitalized.respiratory_class[(hospitalized.respiratory_rate>30) & (hospitalized.respiratory_rate<=45)] = 1 hospitalized.respiratory_class[(hospitalized.respiratory_rate>45) & (hospitalized.respiratory_rate<=60)] = 2 hospitalized.respiratory_class[hospitalized.respiratory_rate>60] = 3 hospitalized.respiratory_class.value_counts() hospitalized['sats_class'] = 0 hospitalized.sats_class[(hospitalized.sats_number>=90) & (hospitalized.sats_number<95)] = 1 hospitalized.sats_class[(hospitalized.sats_number>=85) & (hospitalized.sats_number<90)] = 2 hospitalized.sats_class[hospitalized.sats_number<85] = 3 hospitalized.sats_class.value_counts() hospitalized['severity_score'] = hospitalized[['flaring', 'cyanosis', 'wheezing', 'respiratory_class', 'sats_class']].sum(1) hospitalized['admission_month'] = hospitalized.admission_date.apply(lambda x: x.month) hospitalized.cigarette_smokers.hist() (hospitalized.cigarette_smokers > 0).mean() (hospitalized.cigarette_smokers > 0).sum() hospitalized.nargila_smokers.hist() (hospitalized.nargila_smokers > 0).mean() (hospitalized.nargila_smokers > 0).sum() hospitalized.cigarette_preg.mean() hospitalized.nargila_preg.mean() days_by_sex = hospitalized.groupby('sex_child') _ = days_by_sex.boxplot(column='days_symptoms') hospitalized.breastfed = hospitalized.breastfed.replace({0: 'Not breastfed', 1: 'Breastfed'}) # _ = hospitalized.groupby('breastfed').boxplot(column='hospitalized_vitamin_d', grid=False, fontsize=0) # plt.gca().set_ylabel('Vitamin D levels') for index, group in hospitalized.groupby(['breastfed']): group.boxplot(column='hospitalized_vitamin_d') _ = hospitalized.groupby('breastfed')['oxygen'].hist(alpha=0.3) _ = hospitalized.groupby('oxygen').boxplot(column='rsv_count') hospitalized['male'] = (hospitalized.sex=='M').astype(int) age_groups = pd.get_dummies(pd.cut(hospitalized.age_months, [0,1,11,23])) age_groups.index = hospitalized.index age_groups.columns = 'under 2 months', '2-11 months', '12-23 months' hospitalized = hospitalized.join(age_groups) nationality_lookup = {1: 'Jordanian', 2: 'Egyptian', 3: 'Palestinian', 4: 'Iraqi', 5: 'Syrian', 6: 'Sudanese', 7: 'Russian', 8: 'Asian', 9: 'Other'} hospitalized['nationality'] = hospitalized.mother_nationality.replace(nationality_lookup) hospitalized['Jordanian'] = (hospitalized.nationality=='Jordanian').astype(int) hospitalized['Palestinian'] = (hospitalized.nationality=='Palestinian').astype(int) hospitalized['vitamin D < 20'] = (hospitalized.hospitalized_vitamin_d < 20).astype(int) hospitalized['vitamin D < 20'][hospitalized.hospitalized_vitamin_d.isnull()] = np.nan hospitalized['vitamin D < 11'] = (hospitalized.hospitalized_vitamin_d < 11).astype(int) hospitalized['vitamin D < 11'][hospitalized.hospitalized_vitamin_d.isnull()] = np.nan hospitalized['premature'] = (hospitalized.gest_age < 37).astype(int) hospitalized['breastfed'] = (hospitalized.breastfed=='Breastfed').astype(int) groupby_icu = hospitalized.groupby('icu_any') def make_table(groupby, table_vars, replace_dict={}): table = np.round(groupby[table_vars].mean(), 2).T ratios = [calc_or(groupby, v) for v in table.index] table['OR'] = pd.Series([r[0] for r in ratios], index=table.index) table['Interval'] = [r[1] for r in ratios] table['N'] = [r[2] for r in ratios] table.rename(columns=replace_dict, inplace=True) table.columns.name = None return(table) table_vars = ['male', 'under 2 months', '2-11 months', '12-23 months', 'Jordanian', 'Palestinian', 'vitamin D < 20', 'vitamin D < 11', 'prev_cond', 'heart_hx', 'breastfed', 'premature', 'adm_pneumo', 'adm_bronchopneumo', 'adm_sepsis', 'adm_bronchiolitis'] se = lambda p, n: np.sqrt(p * (1. - p) / n) def odds_ratio(x, y, n_sim=10000, alpha=0.05): try: n_x, n_y = len(x.dropna()), len(y.dropna()) p_x, p_y = x.mean(), y.mean() se_x = se(p_x, n_x) se_y = se(p_y, n_y) p_x_sim = np.random.normal(p_x, se_x, n_sim) p_y_sim = np.random.normal(p_y, se_y, n_sim) ratio = ((p_x_sim / (1. - p_x_sim)) / (p_y_sim / (1. - p_y_sim))) interval = np.percentile(ratio, [100*(alpha/2.), 100*(1. - alpha/2.)]) return np.round(np.median(ratio), 2), np.round(interval, 2).tolist(), (n_y, n_x) except ValueError: return np.nan, np.nan, np.nan def calc_or(groupby, var): data = list(groupby[var]) return odds_ratio(data[1][1], data[0][1]) pcr_lookup = {'pcr_result___1': 'RSV', 'pcr_result___2': 'HMPV', 'pcr_result___3': 'flu A', 'pcr_result___4': 'flu B', 'pcr_result___5': 'rhino', 'pcr_result___6': 'PIV1', 'pcr_result___7': 'PIV2', 'pcr_result___8': 'PIV3', 'pcr_result___13': 'H1N1', 'pcr_result___14': 'H3N2', 'pcr_result___15': 'Swine', 'pcr_result___16': 'Swine H1', 'pcr_result___17': 'flu C', 'pcr_result___18': 'Adeno'} hospitalized['RSV'] = hospitalized['pcr_result___1'] hospitalized['Influenza'] = (hospitalized['pcr_result___3'] | hospitalized['pcr_result___4']).astype(int) hospitalized['HMPV'] = hospitalized['pcr_result___2'] hospitalized['Rhino'] = hospitalized['pcr_result___5'] hospitalized['vitamin_d_norm'] = ((hospitalized.hospitalized_vitamin_d - hospitalized.hospitalized_vitamin_d.mean()) / hospitalized.hospitalized_vitamin_d.std()) hospitalized['age_centered'] = hospitalized.age_months/hospitalized.age_months.mean() pd.__version__ groupby_o2 = hospitalized.groupby('oxygen') virus_vars = ['RSV', 'Influenza', 'HMPV', 'Rhino'] make_table(groupby_o2, table_vars=table_vars+virus_vars, replace_dict={0.0: 'No Oxygen', 1.0: 'Oxygen'}) groupby_vent = hospitalized.groupby('vent') make_table(groupby_vent, table_vars=table_vars+virus_vars, replace_dict={0.0: 'No Ventilator', 1.0: 'Ventilator'}) groupby_death = hospitalized.groupby('death') make_table(groupby_death, table_vars=table_vars+virus_vars, replace_dict={False: 'Survived', True: 'Die'}) groupby_breastfed = hospitalized.groupby('breastfed') table_vars.pop(table_vars.index('breastfed')) make_table(groupby_breastfed, table_vars=table_vars+virus_vars, replace_dict={0: 'Not breastfed', 1: 'Breastfed'}) hospitalized.hospitalized_vitamin_d.notnull().sum() hospitalized.hospitalized_vitamin_d.median() (hospitalized.hospitalized_vitamin_d<20).mean() hospitalized.death.value_counts().rename({False: 'Survived', True: 'Died'}) hospitalized.death.mean() groupby_death = hospitalized.groupby('death') groupby_death['z_score'].median().rename({False: 'Survived', True: 'Died'}) groupby_death['gest_age'].median().rename({False: 'Survived', True: 'Died'}) groupby_death['hospitalized_vitamin_d'].median().rename({False: 'Survived', True: 'Died'}) groupby_death[[c for c in hospitalized.columns if c.endswith('hx') and not c.startswith('no_')]].mean().rename({False: 'Survived', True: 'Died'}) hospitalized[hospitalized.icu_any==1]['adm_pneumo'].sum() hospitalized[hospitalized.icu_any==1]['adm_bronchopneumo'].sum() hospitalized[hospitalized.icu_any==1]['adm_bronchiolitis'].sum()