%pylab inline
Populating the interactive namespace from numpy and matplotlib
cd ../src/
/cellar/users/agross/TCGA_Code/TCGA/src
from Processing.Imports import *
params = pd.read_table('../global_params.txt', header=None, squeeze=True,
index_col=0)
run_path = '{}/Firehose__{}/'.format(params.ix['OUT_PATH'], params.ix['RUN_DATE'])
run = get_run(run_path, 'Run_' + params.ix['VERSION'])
cancer = run.load_cancer(params.ix['CANCER'])
clinical = cancer.load_clinical()
mut = cancer.load_data('Mutation')
mut.uncompress()
cn = cancer.load_data('CN_broad')
cn.uncompress()
rna = pickle.load(open(cancer.path + '/mRNASeq/store/no_hpv.p', 'rb'))
#meth = pickle.load(open(cancer.path + '/Methylation/store/no_hpv.p', 'rb'))
mirna = pickle.load(open(cancer.path + '/miRNASeq/store/no_hpv.p', 'rb'))
clinical_processed = clinical.processed
clinical_processed = clinical_processed.replace('yes', 1.).replace('no', 0.)
clinical_processed['year'] = clinical_processed.year == 'post_2000'
hpv_inferred = clinical_processed.hpv_inferred
surv = clinical.survival.survival_5y
age = clinical.clinical.age.astype(float)
old = pd.Series(1.*(age>=75), name='old')
keepers_o = true_index(hpv_inferred == 0)
keepers_o = keepers_o.intersection(mut.features.columns)
keepers_o = keepers_o.intersection(cn.features.columns)
keepers_o = keepers_o.intersection(surv.unstack().index)
keepers_o = keepers_o.intersection(rna.features.columns)
keepers_o = keepers_o.intersection(mirna.features.columns)
keepers_o = keepers_o.intersection(true_index(age < 85))
p53_mut = mut.features.ix['TP53'].ix[keepers_o].dropna()
del_3p = cn.features.ix['Deletion'].ix['3p14.2'].ix[0].ix[keepers_o].dropna()
combo = combine(p53_mut==1, del_3p==-1)
combo = combo.map({'Lesion':'b', 'neither':'a', 'TP53':'c', 'both':'d'})
two_hit = combo=='d'
survival_and_stats(combo, surv, order=['d','c','b','a'], figsize=(5,4), colors=colors_th)
get_cox_ph(surv, combo=='d', [age, old], print_desc=True, interactions='just_feature');
coef exp(coef) se(coef) z p feature 1.216 3.37 0.2813 4.32 1.5e-05 old 0.263 1.30 0.0873 3.01 2.6e-03 Likelihood ratio test=29.7 on 2 df, p=3.51e-07 n= 251, number of events= 102 (127 observations deleted due to missingness)
get_cox_ph_ms(surv, combo=='d', [age, old], interactions='just_feature')
LR 8.98e-07 feature_p 1.54e-05 fmla Surv(days, event) ~ feature + old\n hazzard 3.37 dtype: object
c2 = colors_st
def st_plot(f):
'''
Global funciton for plotting subtypes in the background of a
clinical variable.
'''
n_unique = len(f.unique())
fig, axs = subplots(1,n_unique, figsize=(n_unique*3+1,3))
for i,g in enumerate(sorted(f.unique())):
ax = axs[i]
draw_survival_curve(two_hit.ix[true_index(f==g)], surv, ax=ax,
colors={0:c2[1], 1: c2[0]})
ax.get_legend().set_visible(False)
ax.set_title(g)
prettify_ax(ax)
axs[-1].legend(loc='upper right', frameon=False)
fig.tight_layout()
print 'Counts for plot below:\n'
print pd.crosstab(f, two_hit).T
print '\n'
def surv_models(f):
lr = get_cox_ph_ms(surv, '_' + two_hit.astype(str), [age, old, f], interactions=None)['LR']
print 'Likelihood ratio of full model compared to background: {0:.1e}\n'.format(lr)
print 'Full model with covariate and age:'
get_cox_ph(surv, '_' + two_hit.astype(str), [age, old, f], print_desc=True, interactions=None);
pd.crosstab(clinical.processed.stage, two_hit).T
stage | Stage i | Stage ii | Stage iii | Stage iv | nx |
---|---|---|---|---|---|
TP53 | |||||
False | 7 | 17 | 12 | 30 | 6 |
True | 7 | 20 | 24 | 106 | 22 |
2 rows × 5 columns
(20 + 7.) / (20 + 7 + 17 + 7)
0.5294117647058824
clin_stage = clinical.processed.stage.ix[keepers_o].dropna()
clin_stage.name = 'Clinical_Stage'
survival_and_stats(clin_stage, surv)
clin_stage.dropna().value_counts()
Stage iv 136 Stage ii 37 Stage iii 36 nx 28 Stage i 14 dtype: int64
rna.df.shape
(20502, 294)
clin_stage = clinical.stage.clinicalstage.ix[keepers_o]
clin_stage = clin_stage.replace({'stage iva': 'Stage III/IV', 'stage ivb':'Stage III/IV', 'stage ivc':'Stage III/IV',
'stage iii': 'Stage III/IV', 'stage i': 'Stage I/II', 'stage ii': 'Stage I/II'})
st_plot(clin_stage)
plt.gca().legend().set_visible(False)
Counts for plot below: clinicalstage Stage I/II Stage III/IV TP53 False 23 49 True 36 143 [2 rows x 2 columns]
(36 + 23.)
59.0
surv_models(clin_stage)
Likelihood ratio of full model compared to background: 9.0e-07 Full model with covariate and age: coef exp(coef) se(coef) z p feature_True 1.216 3.37 0.2813 4.32 1.5e-05 old 0.263 1.30 0.0873 3.01 2.6e-03 Likelihood ratio test=29.7 on 2 df, p=3.51e-07 n= 251, number of events= 102
/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas-0.13.0_247_g82bcbb8-py2.7-linux-x86_64.egg/pandas/core/indexing.py:344: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_index,col_indexer] = value instead self.obj[item] = s
This is only for 232/259 patients
#st=st.astype(int)
f = clinical.stage.pathologicstage.ix[two_hit.index].fillna('Missing')
f = f.replace('stage iva','stage iv')
f = f.replace('stage ivb','stage iv')
f = f.replace('stage ivc','stage iv')
f.name = 'Pathologic_Stage'
path_stage = f
survival_and_stats(path_stage, surv)
st_plot(path_stage)
Counts for plot below: Pathologic_Stage Missing stage i stage ii stage iii stage iv TP53 False 6 7 17 12 30 True 22 7 20 24 106 [2 rows x 5 columns]
surv_models(path_stage)
Likelihood ratio of full model compared to background: 9.4e-06 Full model with covariate and age: coef exp(coef) se(coef) z p feature_True 1.1389 3.123 0.2874 3.962 7.4e-05 old 0.2933 1.341 0.0892 3.287 1.0e-03 Pathologic_Stagestage i -1.2149 0.297 0.7625 -1.593 1.1e-01 Pathologic_Stagestage ii -0.3167 0.729 0.3919 -0.808 4.2e-01 Pathologic_Stagestage iii -0.2114 0.809 0.3830 -0.552 5.8e-01 Pathologic_Stagestage iv -0.0406 0.960 0.3078 -0.132 8.9e-01 Likelihood ratio test=34.2 on 6 df, p=6.01e-06 n= 251, number of events= 102
ecs = clinical.processed.spread
ecs.name = 'ECS'
survival_and_stats(ecs, surv)
st_plot(ecs.dropna())
Counts for plot below: ECS no yes TP53 False 41 5 True 78 46 [2 rows x 2 columns]
surv_models(ecs)
Likelihood ratio of full model compared to background: 3.2e-03 Full model with covariate and age: coef exp(coef) se(coef) z p feature_True 0.977 2.66 0.359 2.72 0.00660 old 0.391 1.48 0.117 3.35 0.00081 ECSyes 0.743 2.10 0.260 2.86 0.00420 Likelihood ratio test=27.7 on 3 df, p=4.28e-06 n= 170, number of events= 69 (69 observations deleted due to missingness)
spread_infered = clinical.processed.spread_inferred.ix[keepers_o].dropna()
spread_infered = spread_infered.map({0:'No ECS', 1:'ECS'})
survival_and_stats(spread_infered, surv)
st_plot(spread_infered)
Counts for plot below: spread_inferred ECS No ECS TP53 False 9 63 True 65 114 [2 rows x 2 columns]
surv_models(spread_infered)
Likelihood ratio of full model compared to background: 9.5e-05 Full model with covariate and age: coef exp(coef) se(coef) z p feature_True 1.027 2.791 0.2890 3.55 0.00038 old 0.281 1.325 0.0875 3.21 0.00130 spread_inferredNo ECS -0.701 0.496 0.2082 -3.37 0.00075 Likelihood ratio test=40.6 on 3 df, p=7.87e-09 n= 251, number of events= 102
lymph_stage = clinical_processed.lymph_stage.replace('n2', 'n2+').replace('n3', 'n2+').dropna()
lymph_stage = lymph_stage.ix[keepers_o].fillna('nx')
st_plot(lymph_stage)
Counts for plot below: lymph_stage n0 n1 n2+ nx TP53 False 31 9 15 17 True 53 20 76 30 [2 rows x 4 columns]
surv_models(lymph_stage)
Likelihood ratio of full model compared to background: 8.5e-06 Full model with covariate and age: coef exp(coef) se(coef) z p feature_True 1.152 3.16 0.2881 3.999 6.4e-05 old 0.274 1.32 0.0881 3.113 1.9e-03 lymph_stagen1 0.125 1.13 0.3935 0.317 7.5e-01 lymph_stagen2+ 0.604 1.83 0.2638 2.288 2.2e-02 lymph_stagenx 0.801 2.23 0.2840 2.820 4.8e-03 Likelihood ratio test=40 on 5 df, p=1.49e-07 n= 251, number of events= 102
f = lymph_stage.replace('n0','n0/n1')
f = f.replace('n1','n0/n1')
f = f.replace('Missing', nan).dropna()
f.name = 'N_stage'
lymph_n0n1 = f
st_plot(lymph_n0n1)
Counts for plot below: N_stage n0/n1 n2+ nx TP53 False 40 15 17 True 73 76 30 [2 rows x 3 columns]
surv_models(lymph_n0n1)
Likelihood ratio of full model compared to background: 8.4e-06 Full model with covariate and age: coef exp(coef) se(coef) z p feature_True 1.152 3.17 0.2881 4.00 6.3e-05 old 0.274 1.32 0.0881 3.11 1.9e-03 N_stagen2+ 0.570 1.77 0.2392 2.38 1.7e-02 N_stagenx 0.767 2.15 0.2616 2.93 3.4e-03 Likelihood ratio test=39.9 on 4 df, p=4.52e-08 n= 251, number of events= 102
lymph_n0n1_noECS =lymph_n0n1.ix[true_index(clinical.processed.spread_inferred==0)].dropna()
st_plot(lymph_n0n1_noECS)
Counts for plot below: N_stage n0/n1 n2+ nx TP53 False 36 11 16 True 59 35 20 [2 rows x 3 columns]
surv_models(lymph_n0n1_noECS)
Likelihood ratio of full model compared to background: 1.7e-04 Full model with covariate and age: coef exp(coef) se(coef) z p feature_True 1.167 3.21 0.331 3.521 0.00043 old 0.218 1.24 0.116 1.875 0.06100 N_stagen2+ 0.247 1.28 0.326 0.759 0.45000 N_stagenx 0.681 1.98 0.311 2.191 0.02800 Likelihood ratio test=20.9 on 4 df, p=0.000328 n= 177, number of events= 59
f = clinical.stage.pathologicn.ix[two_hit.index].fillna('Missing')
f = f.map(str.upper)
f = f.replace('NX',nan)
f = f.replace('N2','n2+')
f = f.replace('N2A','n2+')
f = f.replace('N2B','n2+')
f = f.replace('N2C','n2+')
f = f.replace('N3','n2+')
f = f.fillna('MISSING')
f = f.map(str.lower)
f = f.ix[keepers_o].fillna('nx')
f.name = 'Lymph_Stage_Pathologic'
pathologic_n = f
survival_and_stats(pathologic_n, surv)
st_plot(pathologic_n)
Counts for plot below: Lymph_Stage_Pathologic missing n0 n1 n2+ TP53 False 17 31 9 15 True 30 53 20 76 [2 rows x 4 columns]
surv_models(pathologic_n)
Likelihood ratio of full model compared to background: 8.5e-06 Full model with covariate and age: coef exp(coef) se(coef) z p feature_True 1.152 3.165 0.2881 3.999 6.4e-05 old 0.274 1.316 0.0881 3.113 1.9e-03 Lymph_Stage_Pathologicn0 -0.801 0.449 0.2840 -2.820 4.8e-03 Lymph_Stage_Pathologicn1 -0.676 0.509 0.3854 -1.755 7.9e-02 Lymph_Stage_Pathologicn2+ -0.197 0.821 0.2525 -0.782 4.3e-01 Likelihood ratio test=40 on 5 df, p=1.49e-07 n= 251, number of events= 102
f = pathologic_n.replace('n0','n0/n1')
f = f.replace('n1','n0/n1')
f = f.replace('Missing', nan).dropna()
pathologic_n0n1 = f
pathologic_n0n1.name = 'path_n0n1'
survival_and_stats(pathologic_n0n1.ix[keepers_o].fillna('Missing Pathology'), surv)
st_plot(pathologic_n0n1.ix[keepers_o].fillna('Missing Pathology'))
plt.gca().legend().set_visible(False)
Counts for plot below: path_n0n1 missing n0/n1 n2+ TP53 False 17 40 15 True 30 73 76 [2 rows x 3 columns]
surv_models(pathologic_n0n1)
Likelihood ratio of full model compared to background: 8.4e-06 Full model with covariate and age: coef exp(coef) se(coef) z p feature_True 1.152 3.166 0.2881 4.000 6.3e-05 old 0.274 1.315 0.0881 3.109 1.9e-03 path_n0n1n0/n1 -0.767 0.464 0.2616 -2.934 3.4e-03 path_n0n1n2+ -0.198 0.821 0.2525 -0.783 4.3e-01 Likelihood ratio test=39.9 on 4 df, p=4.52e-08 n= 251, number of events= 102
pathologic_n0n1_noECS = pathologic_n0n1.ix[true_index(clinical.processed.spread_inferred==0)].dropna()
st_plot(pathologic_n0n1_noECS.ix[keepers_o].fillna('ECS / Missing'))
Counts for plot below: path_n0n1 ECS / Missing missing n0/n1 n2+ TP53 False 9 16 36 11 True 65 20 59 35 [2 rows x 4 columns]
surv_models(pathologic_n0n1_noECS)
Likelihood ratio of full model compared to background: 1.7e-04 Full model with covariate and age: coef exp(coef) se(coef) z p feature_True 1.167 3.213 0.331 3.52 0.00043 old 0.218 1.243 0.116 1.87 0.06100 path_n0n1n0/n1 -0.681 0.506 0.311 -2.19 0.02800 path_n0n1n2+ -0.434 0.648 0.355 -1.22 0.22000 Likelihood ratio test=20.9 on 4 df, p=0.000328 n= 177, number of events= 59
st_plot(clinical_processed.spread_inferred.dropna().map({1:'ECS',0:'No ECS'}))
plt.gca().legend().set_visible(False)
Counts for plot below: spread_inferred ECS No ECS TP53 False 9 63 True 65 114 [2 rows x 2 columns]
bsc = pd.read_table('../Extra_Data/HNSC_Followup/biospecimen_tumor_sample_hnsc.txt',
index_col=0, na_values=['[Not Available]'])
tw = bsc.tumor_weight.dropna()
tw = tw.groupby(level=0).first()
tn = bsc.tumor_nuclei_percent.dropna()
tn = tn.groupby(level=0).first()
ts = bsc.tumor_necrosis_percent.dropna()
ts = ts.groupby(level=0).first()
tw.clip_upper(500).hist()
<matplotlib.axes.AxesSubplot at 0x80b9e50>
tumor_weight = (tw >= 200)*1. + (tw > 200)*1
tumor_weight = tumor_weight.map({0:'200-', 1:'200', 2:'200+'})
tumor_weight.name = 'weight'
survival_and_stats(tumor_weight, surv, figsize=(6,4))
draw_survival_curves(combine(clinical.processed.year.dropna() == 'pre_2000', tumor_weight == '200+'), surv)
draw_survival_curves(tumor_weight == '200-', surv, combine(two_hit, mirna.features.ix['binary'].ix['hsa-mir-548k']))
draw_survival_curves(tumor_weight == '200-', surv, combine(two_hit, mirna.features.ix['binary'].ix['hsa-mir-548k']))
st_plot(tumor_weight)
Counts for plot below: weight 200 200+ 200- TP53 False 24 8 37 True 73 29 70 [2 rows x 3 columns]
surv_models(tumor_weight)
Likelihood ratio of full model compared to background: 1.3e-05 Full model with covariate and age: coef exp(coef) se(coef) z p feature_True 1.106 3.022 0.2840 3.89 9.8e-05 old 0.263 1.301 0.0877 3.00 2.7e-03 weight200- -0.587 0.556 0.2479 -2.37 1.8e-02 weight200+ 0.505 1.657 0.2678 1.89 5.9e-02 Likelihood ratio test=43.9 on 4 df, p=6.84e-09 n= 241, number of events= 101 (59 observations deleted due to missingness)
tn.hist()
<matplotlib.axes.AxesSubplot at 0x808f050>
tumor_nec = (tn >= 70)*1. + (tn > 70)*1
tumor_nec = tumor_nec.map({0:'70-', 1:'70', 2:'70+'})
tumor_nec.name = 'tumor_necrosis'
survival_and_stats(tumor_nec, surv, figsize=(6,4))
st_plot(tumor_nec)
Counts for plot below: tumor_necrosis 70 70+ 70- TP53 False 36 33 3 True 120 51 7 [2 rows x 3 columns]
surv_models(tumor_nec)
Likelihood ratio of full model compared to background: 3.1e-06 Full model with covariate and age: coef exp(coef) se(coef) z p feature_True 1.166 3.210 0.2823 4.13 3.6e-05 old 0.254 1.289 0.0882 2.88 4.0e-03 tumor_necrosis70- 0.615 1.851 0.6011 1.02 3.1e-01 tumor_necrosis70+ -1.013 0.363 0.2678 -3.78 1.6e-04 Likelihood ratio test=50.9 on 4 df, p=2.32e-10 n= 250, number of events= 102 (117 observations deleted due to missingness)
weight = tumor_weight!='200-'
weight = weight.ix[keepers_o].fillna(False)
clinical_vars = clinical_processed[['spread_inferred', 'invasion_inferred',
'smoker_inferred', 'drinker_inferred',
'old_age','age','year']]
clinical_vars = clinical_vars.join(clin_stage).join(weight)
clinical_vars = clinical_vars.ix[keepers_o].dropna()
m1 = get_cox_ph(surv, covariates=clinical_vars, print_desc=True, interactions=False);
coef exp(coef) se(coef) z p spread_inferred 0.340 1.405 0.0946 3.59 0.00033 invasion_inferred 0.240 1.271 0.1012 2.37 0.01800 smoker_inferred 0.249 1.282 0.1085 2.29 0.02200 old_ageAge > 75 0.676 1.967 0.2608 2.59 0.00950 year -0.183 0.833 0.0914 -2.00 0.04500 weight 0.274 1.315 0.1299 2.11 0.03500 Likelihood ratio test=53.5 on 6 df, p=9.49e-10 n= 251, number of events= 102
clinical_vars = clinical_processed[['spread_inferred', 'invasion_inferred',
'smoker_inferred', 'drinker_inferred',
'old_age','age','year']]
clinical_vars = clinical_vars.join(clin_stage).join(weight)
clinical_vars = clinical_vars.ix[keepers_o].dropna()
m2 = get_cox_ph(surv, two_hit, covariates=clinical_vars, print_desc=True, interactions=False);
coef exp(coef) se(coef) z p feature 0.947 2.578 0.2892 3.27 0.0011 spread_inferred 0.274 1.315 0.0960 2.85 0.0044 invasion_inferred 0.231 1.260 0.1017 2.27 0.0230 old_ageAge > 75 0.708 2.030 0.2610 2.71 0.0067 year -0.208 0.812 0.0917 -2.27 0.0230 weight 0.267 1.306 0.1293 2.06 0.0390 Likelihood ratio test=60.7 on 6 df, p=3.31e-11 n= 251, number of events= 102
exp(1.056), exp(1.056) - exp(1.056 - .3)
(2.8748485655216069, 0.74510836648250134)
LR_test(m2, m1)
0.0073021175074920782
clin_stage = clin_stage.replace('nx', 'unstaged')
lymph_n0n1 = lymph_n0n1.replace('nx','unstaged')
clinical_vars = clinical_processed[['spread_inferred', 'invasion_inferred',
'smoker_inferred', 'drinker_inferred',
'old_age','age','year']]
clinical_vars = clinical_vars.join(lymph_n0n1).join(weight)
clinical_vars = clinical_vars.ix[keepers_o].dropna()
clinical_vars = clinical_vars.ix[ti(age < 75)].dropna()
m1 = get_cox_ph(surv, covariates=clinical_vars, print_desc=True, interactions=False);
m2 = get_cox_ph(surv, two_hit, covariates=clinical_vars, print_desc=True, interactions=False);
coef exp(coef) se(coef) z p spread_inferred 0.278 1.320 0.1104 2.516 0.01200 invasion_inferred 0.258 1.295 0.1168 2.211 0.02700 smoker_inferred 0.395 1.484 0.1300 3.036 0.00240 year -0.326 0.722 0.0887 -3.671 0.00024 N_stagen2+ 0.277 1.319 0.2868 0.967 0.33000 N_stageunstaged 0.663 1.941 0.3049 2.175 0.03000 Likelihood ratio test=46.5 on 6 df, p=2.41e-08 n= 221, number of events= 83 coef exp(coef) se(coef) z p feature 1.192 3.295 0.3793 3.144 0.00170 spread_inferred 0.194 1.215 0.1108 1.755 0.07900 invasion_inferred 0.261 1.298 0.1176 2.216 0.02700 smoker_inferred 0.268 1.307 0.1320 2.027 0.04300 year -0.335 0.715 0.0899 -3.730 0.00019 N_stagen2+ 0.176 1.192 0.2871 0.612 0.54000 N_stageunstaged 0.706 2.026 0.3031 2.329 0.02000 Likelihood ratio test=58.7 on 7 df, p=2.74e-10 n= 221, number of events= 83
LR_test(m2, m1)
0.00046425482623168018
fig, ax = subplots(figsize=(6,4))
ci = convert_robj(robjects.r.summary(m2)[7])
haz = ci['exp(coef)']
for j,h in enumerate(haz):
ax.scatter(h, j, marker='s', s=100, color='grey',
edgecolors=['black'], zorder=10)
ax.plot(*zip(*((ci.iloc[j]['lower .95'],j), (ci.iloc[j]['upper .95'],j))),
lw=3, ls='-', marker='o', dash_joinstyle='bevel', color='grey')
ax.axvline(1, ls='--', color='black')
ax.set_xscale('log')
ax.set_xbound(.5,3.5)
ax.set_ybound(-.5,len(ci.index) - .5)
ax.set_xticks([.5, 1, 1.5, 2, 4,8])
ax.set_xticklabels([.5, 1, 1.5, 2, 4,8])
ax.set_yticks(range(len(ci.index)))
ax.set_yticklabels(ci.index)
ax.set_xlabel('Hazard Ratio')
prettify_ax(ax)
fig.tight_layout()
fig.savefig('/cellar/users/agross/figures/mv_pa.pdf', tranparent=True)
fmla1 = 'Surv(days, event) ~ weight + spread_inferred'
fmla2 = 'Surv(days, event) ~ feature + weight + spread_inferred'
clinical_vars = clinical_processed[['spread_inferred', 'invasion_inferred',
'smoker_inferred', 'drinker_inferred',
'old_age','age','year']]
clinical_vars = clinical_vars.join(lymph_n0n1).join(weight)
clinical_vars = clinical_vars.ix[keepers_o].dropna()
clinical_vars = clinical_vars.ix[ti(age >= 75)].dropna()
m1 = get_cox_ph(surv, covariates=clinical_vars, formula=fmla1, print_desc=False, interactions=False);
m2 = get_cox_ph(surv, two_hit, covariates=clinical_vars, formula=fmla2, print_desc=False, interactions=False);
LR_test(m2, m1)
0.98754451076085026
fig, ax = subplots(figsize=(6,2.5))
ci = convert_robj(robjects.r.summary(m2)[7])
haz = ci['exp(coef)']
for j,h in enumerate(haz):
ax.scatter(h, j, marker='s', s=100, color='grey',
edgecolors=['black'], zorder=10)
ax.plot(*zip(*((ci.iloc[j]['lower .95'],j), (ci.iloc[j]['upper .95'],j))),
lw=3, ls='-', marker='o', dash_joinstyle='bevel', color='grey')
ax.axvline(1, ls='--', color='black')
ax.set_xscale('log')
ax.set_xbound(.5,3.5)
ax.set_ybound(-.5,len(ci.index) - .5)
ax.set_xticks([.5, 1, 1.5, 2, 4,8])
ax.set_xticklabels([.5, 1, 1.5, 2, 4,8])
ax.set_yticks(range(len(ci.index)))
ax.set_yticklabels(ci.index)
ax.set_xlabel('Hazard Ratio')
prettify_ax(ax)
fig.tight_layout()
fig.savefig('/cellar/users/agross/figures/mv_pb.pdf', tranparent=True)
def st_plot(f):
'''
Global funciton for plotting subtypes in the background of a
clinical variable.
'''
n_unique = len(f.unique())
fig, axs = subplots(1,n_unique, figsize=(7,3), sharey=True)
for i,g in enumerate(sorted(f.unique())):
ax = axs[i]
draw_survival_curve(two_hit.ix[true_index(f==g)], surv, ax=ax,
colors={0:c2[1], 1: c2[0]})
ax.get_legend().set_visible(False)
ax.set_title(g)
prettify_ax(ax)
axs[-1].legend(loc='upper right', frameon=False)
fig.tight_layout()
print 'Counts for plot below:\n'
print pd.crosstab(f, two_hit).T
print '\n'
st_plot(lymph_n0n1)
plt.savefig('/cellar/users/agross/figures/mv_pc.pdf')
Counts for plot below: N_stage n0/n1 n2+ unstaged TP53 False 40 15 17 True 73 76 30 [2 rows x 3 columns]
lymph_n0n1 = lymph_n0n1.replace('nx','unstaged')
clinical_vars = clinical_processed[['spread_inferred', 'invasion_inferred',
'smoker_inferred', 'drinker_inferred',
'old_age','age']]
clinical_vars = clinical_vars.join(lymph_n0n1)
clinical_vars = clinical_vars.ix[keepers_o].dropna()
clinical_vars = clinical_vars.ix[ti(age < 75)].dropna()
m1 = get_cox_ph(surv, covariates=clinical_vars, print_desc=False, interactions=False);
m2 = get_cox_ph(surv, two_hit, covariates=clinical_vars, print_desc=False, interactions=False);
LR_test(m2, m1)
0.00048996118568016376
fig, ax = subplots()
ci = convert_robj(robjects.r.summary(m2)[7])
haz = ci['exp(coef)']
for j,h in enumerate(haz):
ax.scatter(h, j, marker='s', s=100, color='grey',
edgecolors=['black'], zorder=10)
ax.plot(*zip(*((ci.iloc[j]['lower .95'],j), (ci.iloc[j]['upper .95'],j))),
lw=3, ls='-', marker='o', dash_joinstyle='bevel', color='grey')
ax.axvline(1, ls='--', color='black')
ax.set_xscale('log')
ax.set_xbound(.5,3.5)
ax.set_ybound(-.5,len(ci.index) - .5)
ax.set_xticks([.67, 1, 1.5, 2, 4,8])
ax.set_xticklabels([.67, 1, 1.5, 2, 4,8])
ax.set_yticks(range(len(ci.index)))
ax.set_yticklabels(ci.index)
prettify_ax(ax)
Clinical model
clinical_vars = clinical_processed[['spread_inferred', 'invasion_inferred',
'smoker_inferred', 'drinker_inferred',
'old_age','age']]
clinical_vars = clinical_vars.join(lymph_n0n1)
clinical_vars = clinical_vars.ix[keepers_o].dropna()
#clinical_vars = clinical_vars.ix[ti(age < 75)].dropna()
m1 = get_cox_ph(surv, covariates=clinical_vars, print_desc=True, interactions=False);
coef exp(coef) se(coef) z p spread_inferred 0.369 1.45 0.0995 3.71 0.00021 invasion_inferred 0.205 1.23 0.1025 1.99 0.04600 smoker_inferred 0.304 1.36 0.1113 2.73 0.00630 old_ageAge > 75 0.805 2.24 0.2595 3.10 0.00190 N_stagen2+ 0.326 1.38 0.2558 1.27 0.20000 N_stageunstaged 0.743 2.10 0.2625 2.83 0.00460 Likelihood ratio test=46 on 6 df, p=3.03e-08 n= 251, number of events= 102
Combined model
clinical_vars = clinical_processed[['spread_inferred', 'invasion_inferred',
'smoker_inferred', 'drinker_inferred',
'old_age','age']]
clinical_vars = clinical_vars.join(lymph_n0n1)
clinical_vars = clinical_vars.ix[keepers_o].dropna()
#clinical_vars = clinical_vars.ix[ti(age < 75)].dropna()
m2 = get_cox_ph(surv, two_hit, covariates=clinical_vars, print_desc=True, interactions=False);
coef exp(coef) se(coef) z p feature 0.844 2.32 0.306 2.761 0.0058 spread_inferred 0.300 1.35 0.101 2.979 0.0029 invasion_inferred 0.192 1.21 0.103 1.871 0.0610 smoker_inferred 0.203 1.22 0.115 1.761 0.0780 old_ageAge > 75 0.806 2.24 0.261 3.090 0.0020 N_stagen2+ 0.244 1.28 0.256 0.953 0.3400 N_stageunstaged 0.737 2.09 0.263 2.804 0.0051 Likelihood ratio test=54.6 on 7 df, p=1.8e-09 n= 251, number of events= 102
exp(0.844), exp(0.844) - exp(0.844 - .306)
(2.3256510003566726, 0.61307272216932507)
LR_test(m2, m1)
0.0033076347737565608