import NotebookImport
from Imports import *
importing IPython notebook from Imports.ipynb Populating the interactive namespace from numpy and matplotlib changing to source dirctory populating namespace with data
cancers = {c: run.load_cancer(c) for c in run.cancers}
from Data.ProcessClinicalDataPortal import update_clinical_object
clinical = {c: cancer.load_clinical() for c, cancer in cancers.iteritems()}
for cancer,clin in clinical.iteritems():
try:
path = OUT_PATH + '/Followup/' + cancer + '/'
clinical[cancer] = update_clinical_object(clin, path)
except:
print cancer
do_nothing = True
Data/ProcessClinicalDataPortal.py:35: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_index,col_indexer] = value instead f['vitalstatus'] = f['daystodeath'].isnull()
surv = pd.DataFrame({c: v.survival.survival for c,v in clinical.iteritems()})
surv = surv.stack()
codes = pd.Series({s[0]: s[1] for s in surv[:,'days'].index})
codes = codes.groupby(level=0).first()
codes.name = 'codes'
surv.index = surv.index.droplevel(2)
surv = surv.groupby(level=[0,1]).first()
surv = surv.ix[ti(surv[:,'days'] >= 7)]
surv_5y = pd.DataFrame({c: v.survival.survival_5y for c,v in clinical.iteritems()})
surv_5y = surv_5y.stack()
surv_5y.index = surv_5y.index.droplevel(2)
surv_5y = surv_5y.groupby(level=[0,1]).first()
#bad = ((surv_5y.unstack()['days'] < 8) * (surv_5y.unstack()['event'] == 1)) == False
#surv_5y = surv_5y.unstack().ix[bad].stack()
surv_5y = surv_5y.ix[ti(surv[:,'days'] >= 7)]
for c in clinical.values():
c = c.artificially_censor(10)
surv_10y = pd.DataFrame({c: v.survival.survival_10y for c,v in clinical.iteritems()})
surv_10y = surv_10y.stack()
surv_10y.index = surv_10y.index.droplevel(2)
surv_10y = surv_10y.groupby(level=[0,1]).first()
surv_10y = surv_10y.ix[ti(surv[:,'days'] >= 7)]
for c in clinical.values():
c = c.artificially_censor(3)
surv_3y = pd.DataFrame({c: v.survival.survival_3y for c,v in clinical.iteritems()})
surv_3y = surv_3y.stack()
surv_3y.index = surv_3y.index.droplevel(2)
surv_3y = surv_3y.groupby(level=[0,1]).first()
surv_3y = surv_3y.ix[ti(surv[:,'days'] >= 7)]
all_mut = pd.read_csv(OUT_PATH + '/MAFs/mega_maf.csv', index_col=0)
all_mut = all_mut[all_mut.Tumor_Sample_Barcode.apply(lambda s: s[13:16]) == '01A']
non_coding = ['Silent','RNA','IGR',"5'Flank", "3'UTR",'Intron',"5'UTR"]
all_mut = all_mut[all_mut.Variant_Classification.isin(non_coding)==False]
all_mut.Tumor_Sample_Barcode = all_mut.Tumor_Sample_Barcode.map(lambda s: s[:12])
all_mut = all_mut.groupby(['Tumor_Sample_Barcode', 'Hugo_Symbol']).size()
all_mut = all_mut.unstack().T.fillna(0)
mut = {c: cancer.load_data('Mutation') for c, cancer in cancers.iteritems()
if run.sample_matrix.MAF.ix[c] > 0}
hit_mat = pd.concat([m.df.unstack() for c,m in mut.iteritems()], axis=1).fillna(0)
hit_mat = hit_mat.groupby(axis=1, level=0).first()
hit_mat = pd.concat([hit_mat, all_mut], axis=1)
hit_mat = hit_mat.groupby(axis=1, level=0).sum() > 0
cn_all = {}
for c in codes.unique():
try:
cn_all[c] = FH.get_gistic_gene_matrix(run.data_path, c)
except:
print c
cn_all = pd.concat(cn_all.values(), axis=1)
cn_all = cn_all.groupby(axis=1, level=0).first()
Here we are going through a number of clinical variables and finding confounders which may effect a genomic analysis of the data. We opted not to just use everything multivariate modeling of covariates as it would create a very complicated model considering we already have ~15 degrees of freedom with our tissue type variable. The strategy here is to isolate factors which have a large influence on outcome and to filter those out to try and get at a more homogenous set of patients.
non_embargo = ['AML','BRCA','KIRC','COAD','READ','COADREAD','SKCM','GBM',
'HNSC','LUAD','LUSC','OV','STAD','THCA','UCEC']
#codes = codes[codes.isin(non_embargo)]
c2 = codes[codes.isin(ti(codes.value_counts() > 30))]
survival_and_stats(c2, surv_10y, upper_lim=10, figsize=(10,12))
fig, axs = subplots(2,1, figsize=(8,8))
ax = axs[0]
t = get_surv_fit(surv, codes)
t = t.sort([('5y Survival','Surv')])
t['Stats'].plot(kind='bar', ax=ax)
prettify_ax(ax)
ax2 = axs[1]
t = get_surv_fit(surv, codes)
tt = t['5y Survival'].sort('Surv')
b = (tt['Surv']).plot(kind='bar', ax=ax2, color='grey',
yerr=[tt.Surv-tt.Lower, tt.Upper-tt.Surv], ecolor='black')
ax2.set_ylabel('5Y Survival')
ax2.set_yticks([0, .5, 1.])
prettify_ax(ax2)
fig.tight_layout()
fys = get_surv_fit(surv, codes)['5y Survival']['Surv'].order()
nn = codes.isin(true_index((fys > .1) & (fys < .83)))
fys
GBM 0.06 LAML 0.23 OV 0.31 STAD 0.35 LUAD 0.35 LIHC 0.36 BLCA 0.39 HNSC 0.44 LUSC 0.45 READ 0.47 ACC 0.48 SARC 0.58 SKCM 0.61 KIRC 0.62 COAD 0.63 LGG 0.65 CESC 0.65 KIRP 0.68 UCEC 0.79 BRCA 0.81 DLBC 0.84 KICH 0.87 THCA 0.89 PRAD 0.97 ESCA NaN PAAD NaN Name: Surv, dtype: float64
age = pd.concat([c.clinical.daystobirth for c in clinical.values()])
age = -1*np.floor(age / 365.25)
a2 = a2 = pd.concat([c.clinical.age for c in clinical.values()])
age = age.combine_first(a2)
age = age.groupby(level=0).min()
age = age.fillna(age.median())
age.name = 'age'
survival_and_stats((age / 15).round(), surv_5y)
survival_and_stats(age.dropna() >= 85, surv_5y)
get_cox_ph(surv_5y, age >= 85, print_desc=True);
coef exp(coef) se(coef) z p feature 0.723 2.06 0.123 5.89 3.8e-09 Likelihood ratio test=28.1 on 1 df, p=1.18e-07 n= 7294, number of events= 1959
old = age >= 75
old.name = 'age_over_75'
survival_and_stats(old.ix[ti(age < 85)], surv_5y)
exp(.773), exp(.773)-(exp(.773-.139))
(2.1662552812206535, 0.28111921870672885)
year = pd.concat([c.clinical.yearofinitialpathologicdiagnosis for c in clinical.values()])
year = year.groupby(level=0).first().replace('[Discrepancy]', nan).astype(float).dropna()
survival_and_stats(year <= 2000, surv_5y)
get_cox_ph_ms(surv_5y, year < 2000, [codes])
LR 5.92e-09 feature_p NaN fmla Surv(days, event) ~ codes + codes:feature\n hazzard NaN dtype: object
r_status = pd.concat([c.clinical.residualtumor for c in clinical.values() if 'residualtumor'
in c.clinical])
r_status = r_status.replace(['[Not Evaluated]', '[Unknown]', 'RX'])
r_status = r_status.str.lower()
r_status = r_status.groupby(level=0).first()
survival_and_stats(r_status.dropna(), surv_5y)
get_cox_ph(surv_5y, r_status=='r2', print_desc=True);
coef exp(coef) se(coef) z p feature 1.05 2.86 0.18 5.82 5.8e-09 Likelihood ratio test=25.1 on 1 df, p=5.57e-07 n= 3555, number of events= 809
exp(1.05), exp(1.05)-(exp(1.05-.18))
(2.8576511180631639, 0.47074026453888695)
margin_status = pd.concat([c.clinical.marginstatus for c in clinical.values() if 'marginstatus' in c.clinical])
margin_status = margin_status.groupby(level=0).first()
survival_and_stats(margin_status.dropna(), surv_5y)
s = {}
for can,c in clinical.iteritems():
if 'pathologicstage' in c.stage:
s[can] = c.stage.pathologicstage
elif 'clinicalstage' in c.stage:
s[can] = c.stage.clinicalstage
stage = pd.concat(s.values())
stage = stage.groupby(level=0).first()
stage = stage.dropna().map(lambda s: s.replace('A','').replace('B','').replace('C',''))
stage = stage.dropna().map(lambda s: s.replace('1','').replace('2',''))
stage = stage.replace('stage iic','stage ii')
stage = stage.ix[age.index].fillna('missing')
stage = stage.str.lower()
stage = stage.replace(['[unknown]','stage x','stage tis','i or ii nos', 'stage 0'], nan).dropna()
stage.name = 'stage'
survival_and_stats(stage.ix[true_index(nn)].dropna(), surv_5y)
get_cox_ph(surv_5y, stage == 'stage iv', print_desc=True);
coef exp(coef) se(coef) z p feature 0.712 2.04 0.0625 11.4 0 Likelihood ratio test=110 on 1 df, p=0 n= 7263, number of events= 1952
exp(.712), exp(.712)-(exp(.712-.0625))
(2.0380633118599061, 0.1234800138578156)
mets = pd.concat([c.stage.pathologicm for can,c in clinical.iteritems()
if 'pathologicm' in c.stage])
mets = mets.groupby(level=0).first()
mets = pd.concat([c.stage.pathologicm for can,c in clinical.iteritems() if 'pathologicm' in c.stage])
mets = mets.groupby(level=0).first()
mets = mets.dropna().map(lambda s: s.replace('a','').replace('b','').replace('c',''))
mets = mets.replace(['[Unknown]','MX'], nan).dropna()
mets = mets=='M1'
mets.name = 'metastasis'
survival_and_stats(mets, surv_5y)
survival_and_stats(combine(mets, stage=='stage iv'), surv_5y)
new_p53_calls = pd.read_csv('../Extra_Data/p53_calls_pancancer.csv', header=None, index_col=0,
squeeze=True)
p53_new = (hit_mat.ix['TP53'].combine_first(new_p53_calls)) > 0
survival_and_stats(combine(new_p53_calls, hit_mat.ix['TP53']), surv_5y)
filters = pd.concat([age > 85, mets, r_status=='r2', stage=='stage iv', nn==False,
codes.isin(['ACC','ESCA'])], 1)
keepers = true_index((filters > 0).sum(1) == 0)
keepers = keepers.intersection(surv_5y.unstack().index)
keepers = keepers.intersection(p53_new.index).intersection(cn_all.columns)
keepers = keepers.intersection(codes.index)
stage = stage.ix[keepers.union(stage.index)].fillna('missing')
len(keepers)
4583
survival_and_stats(codes.ix[keepers].order(), surv_10y, upper_lim=10, figsize=(10,10))
plt.savefig(FIGDIR + 'pancan_supp.pdf', transparent=True)
p53_all = p53_new > 0
p53_mut = p53_all.ix[keepers]
p53_all.name = 'TP53'
p53_mut.name = 'TP53'
survival_and_stats(p53_mut, surv_5y)
get_cox_ph_ms(surv_5y, p53_mut, [codes, age, stage], interactions=None)
LR 0.12 feature_p 0.121 fmla Surv(days, event) ~ feature + codes + age + st... hazzard 1.14 dtype: object
cn_all2 = cn_all.ix[[i for i in cn_all.index if i[2] in rna.df.index]]
pct_del = (cn_all2 < 0).sum() / (1.*len(cn_all2))
pct_amp = (cn_all2 > 0).sum() / (1.*len(cn_all2))
pct_altered = pct_amp + pct_del
pct_altered.name = 'CIN_pct_altered'
pct_del.name = 'pct_del'
survival_and_stats(to_quants(pct_altered, q=.33), surv)
del_3p_all = cn_all.ix['3p14.2'].median().round()
del_3p_all = del_3p_all < 0
del_3p_all = del_3p_all.map({True:-1, False:0})
del_3p = del_3p_all.ix[keepers].dropna()
del_3p.name = 'del_3p'
del_3p_all.name = 'del_3p'
survival_and_stats(del_3p, surv_5y)
get_cox_ph_ms(surv_5y, del_3p < 0, [codes, age, stage, pct_altered], interactions=None)
LR 0.0594 feature_p 0.0587 fmla Surv(days, event) ~ feature + codes + age + st... hazzard 1.18 dtype: object
old.name = 'age_over_75'
fig, axs = subplots(3, 1, figsize=(3.5,4.5), sharey=True, sharex=True)
fmla = 'Surv(days, event) ~ del_3p + pct_del + age + codes + age_over_75'
fmla2 = 'Surv(days, event) ~ pct_del + age + codes + age_over_75'
k2 = keepers.diff(codes[codes.isin(['HNSC','LUSC'])].index)
pts = [k2, ti(p53_mut>0).intersection(k2),
ti(p53_mut==0).intersection(k2)]
color_list = ['grey', '#a1d99b','#9ecae1']
for i, pt in enumerate(pts):
ax = axs[i]
m1 = get_cox_ph(surv_5y, covariates=[del_3p.ix[pt].dropna() < 0, pct_del, stage, age, old, codes],
formula=fmla, print_desc=False, interactions=False);
m2 = get_cox_ph(surv_5y, covariates=[del_3p.ix[pt].dropna() < 0, pct_del, stage, age, old, codes],
formula=fmla2, print_desc=False, interactions=False);
print LR_test(m1, m2)
ci = convert_robj(robjects.r.summary(m1)[7])
haz = ci['exp(coef)']
for j,h in enumerate(haz):
ax.scatter(h, j, marker='s', s=100, color=color_list[i],
edgecolors=['black'], zorder=10)
ax.plot(*zip(*((ci.iloc[j]['lower .95'],j), (ci.iloc[j]['upper .95'],j))),
lw=3, ls='-', marker='o', dash_joinstyle='bevel', color=color_list[i])
ax.axvline(1, ls='--', color='black')
ax.set_xscale('log')
ax.set_xbound(.8,1.5)
ax.set_ybound(-.5,len(ci.index) - .5)
ax.set_xticks([1])
ax.set_xticklabels([1])
ax.set_yticks(range(len(ci.index)))
ax.set_yticklabels(ci.index)
ax.set_ybound(-.5, 1.5)
prettify_ax(ax)
axs[2].set_xlabel('Hazard Ratio')
fig.tight_layout()
0.0416696152492 0.00662221454448 0.852501276826
combo = combine(del_3p < 0, p53_mut>0)
combo = combo.ix[keepers].dropna()
combo.name = 'TP53 / 3p14'
len(combo), len(combo.ix[codes[codes != 'HNSC'].index].dropna())
(4583, 4404)
venn_pandas(del_3p_all < 0, p53_all>0)
<matplotlib_venn._venn2.Venn2 instance at 0x23a3d998>
len(combine(del_3p_all.ix[del_3p_all.index.diff(true_index(codes=='HNSC'))].dropna() < 0,
p53_all>0))
7081
fisher_exact_test(del_3p_all.ix[del_3p_all.index.diff(true_index(codes=='HNSC'))].dropna() < 0,
p53_all>0)
odds_ratio 2.47e+00 p 3.24e-62 dtype: float64
venn_pandas(del_3p.ix[del_3p.index.diff(true_index(codes=='HNSC'))].dropna() < 0,
p53_mut>0)
<matplotlib_venn._venn2.Venn2 instance at 0x2a61d3b0>
len(combine(del_3p.ix[combo.index.diff(true_index(codes=='HNSC'))] < 0, p53_mut>0))
4404
fisher_exact_test(del_3p.ix[combo.index.diff(true_index(codes=='HNSC'))] < 0, p53_mut>0)
odds_ratio 2.00e+00 p 4.67e-26 dtype: float64
combo_all = combine(p53_all>0, del_3p_all<0)
combo_all.name = 'TP53 / 3p14'
ct = pd.crosstab(combo_all, stage[stage != 'missing']=='stage iv').T
(ct.ix[True] / (1.*ct.sum())).order().plot(kind='bar');
plt.ylabel('% of Patients in Stage IV')
plt.xlabel('');
c = clinical['LGG'].clinical
combo_all = combine(p53_all>0, del_3p_all<0)
fisher_exact_test(combo_all=='both', stage[stage != 'M']=='stage iv')
odds_ratio 2.46e+00 p 2.47e-20 dtype: float64
Univarite model, all patients
get_cox_ph(surv_5y, combo=='both', print_desc=True);
coef exp(coef) se(coef) z p feature 0.505 1.66 0.0767 6.59 4.3e-11 Likelihood ratio test=39.5 on 1 df, p=3.22e-10 n= 4583, number of events= 955
Multivariate model, all patients
get_cox_ph_ms(surv_5y, combo=='both', [codes, age, stage], interactions=None)
LR 0.000325 feature_p 0.000249 fmla Surv(days, event) ~ feature + codes + age + st... hazzard 1.4 dtype: object
pts = true_index(codes != 'HNSC')
Univariate model, HNSCC excluded
get_cox_ph(surv_5y, combo.ix[pts].dropna()=='both', print_desc=True);
coef exp(coef) se(coef) z p feature 0.452 1.57 0.082 5.52 3.4e-08 Likelihood ratio test=27.8 on 1 df, p=1.35e-07 n= 4404, number of events= 904
TP53-3p vs. 3p alone
f = (combo[combo.isin(['del_3p','both'])]=='both').ix[pts].dropna()
cox(f, surv_5y)
hazard exp(coef) 1.73e+00 exp(-coef) 5.78e-01 lower .95 1.39e+00 upper .95 2.15e+00 LR stat 2.43e+01 df 1.00e+00 p 8.11e-07 concordance stat 5.74e-01 se 1.49e-02 dtype: float64
TP53-3p vs. TP53 alone
cox((combo[combo.isin(['TP53','both'])].dropna()=='both').ix[pts].dropna(),
surv_5y).ix['LR'].ix['p']
0.001689913203002158
pts_y = pts.intersection(true_index(age < 75))
log_rank((combo[combo.isin(['TP53','both'])]=='both').ix[pts_y].dropna(), surv_5y).ix['p']
0.0031807460590553751
cox((combo[combo.isin(['TP53','both'])]=='both').ix[pts_y].dropna(), surv_5y)
hazard exp(coef) 1.39 exp(-coef) 0.72 lower .95 1.12 upper .95 1.74 LR stat 8.50 df 1.00 p 0.00 concordance stat 0.56 se 0.02 dtype: float64
Multivariate model, HNSCC excluded
get_cox_ph_ms(surv_5y, (combo=='both').ix[true_index(codes!='HNSC')].dropna(),
[codes, stage, age, old], interactions=False)
LR 0.00513 feature_p 0.00436 fmla Surv(days, event) ~ feature + codes + stage + ... hazzard 1.32 dtype: object
two_hit = combo == 'both'
two_hit.name = 'two_hit'
p53_mut.name = 'TP53'
pct_del.name = 'pct_del'
old = age >= 75
old.name = 'old'
Pan-Cancer HNSCC excluded
pts = keepers.diff(ti(codes=='HNSC'))
fmla0 = 'Surv(days, event) ~ TP53 + del_3p + pct_del + old + age + codes + stage'
fmla1 = 'Surv(days, event) ~ TP53 + del_3p + pct_del + TP53:pct_del + old + age + codes + stage'
fmla2 = 'Surv(days, event) ~ TP53 + del_3p + pct_del + two_hit + TP53:pct_del + old + age + codes + stage'
m0 = get_cox_ph(surv_3y, covariates=[p53_mut.ix[pts], del_3p < 0, two_hit, old,
age, pct_del, codes, stage], formula=fmla0)
m1 = get_cox_ph(surv_3y, covariates=[p53_mut.ix[pts], del_3p < 0, two_hit, old,
age, pct_del, codes, stage], formula=fmla1)
m2 = get_cox_ph(surv_3y, covariates=[p53_mut.ix[pts], del_3p < 0, two_hit, old,
age, pct_del, codes, stage], formula=fmla2)
LR_test(m2,m0), LR_test(m2, m1)
(0.074818360700235967, 0.023396113337244468)
Interestingly lung squamous cell carcinoma behaves differently than the others, if we exclude it from the analysis (along with HNSCC) we get a very significant association as we did for HNSCC. We omitted this from the manuscript as it is a bit ad-hoc, but it very well could be the case that the TP53-3p effect is occuring in LUSC, but the alternate routes to cancer are just more aggressive.
pts = keepers.diff(ti(codes.isin(['HNSC','LUSC'])))
#pts = keepers.diff(ti(codes=='HNSC'))
fmla0 = 'Surv(days, event) ~ TP53 + del_3p + pct_del + codes + stage + age'
fmla1 = 'Surv(days, event) ~ TP53 + del_3p + pct_del + TP53:pct_del + codes + stage + age'
fmla2 = 'Surv(days, event) ~ TP53 + del_3p + pct_del + two_hit + TP53:pct_del + codes + stage + age'
m0 = get_cox_ph(surv_3y, covariates=[p53_mut.ix[pts], del_3p < 0, two_hit, age, pct_del, codes, stage], formula=fmla0)
m1 = get_cox_ph(surv_3y, covariates=[p53_mut.ix[pts], del_3p < 0, two_hit, age, pct_del, codes, stage], formula=fmla1)
m2 = get_cox_ph(surv_3y, covariates=[p53_mut.ix[pts], del_3p < 0, two_hit, age, pct_del, codes, stage], formula=fmla2)
LR_test(m2,m0), LR_test(m2, m1)
(0.0010334523028629353, 0.00023897916452182404)
pts = keepers.diff(ti(codes.isin(['HNSC','LUSC'])))
fmla0 = 'Surv(days, event) ~ TP53 + del_3p'
fmla1 = 'Surv(days, event) ~ TP53 + del_3p + pct_del + TP53:pct_del'
fmla2 = 'Surv(days, event) ~ TP53 + del_3p + pct_del + two_hit + TP53:pct_del'
m0 = get_cox_ph(surv_3y, covariates=[p53_mut.ix[pts], del_3p < 0, two_hit, old, age, pct_del], formula=fmla0)
m1 = get_cox_ph(surv_3y, covariates=[p53_mut.ix[pts], del_3p < 0, two_hit, old, age, pct_del], formula=fmla1)
m2 = get_cox_ph(surv_3y, covariates=[p53_mut.ix[pts], del_3p < 0, two_hit, old, age, pct_del], formula=fmla2)
LR_test(m2,m0), LR_test(m2, m1)
(2.6714203811114096e-05, 1.7904393791766563e-05)
cc = array(colors_th)
fig, ax = subplots(figsize=(5,3))
draw_survival_curve(combo, surv_5y, ax=ax,
colors=cc[[2,0,3,1]], ms=30, alpha=.7)
ax.get_legend().set_visible(False)
ax.set_ybound(0,1)
ax.set_xbound(0,5)
prettify_ax(ax)
combo.ix[true_index(codes!='HNSC')].dropna().ix[surv_5y.unstack().index].dropna().value_counts()
neither 1954 TP53 1047 both 726 del_3p 677 dtype: int64
k2 = combo.index.intersection(true_index(codes!='HNSC'))
cc = array(colors_th)
fig, ax = subplots(figsize=(4,3))
draw_survival_curve(combo.ix[k2], surv_5y, ax=ax,
colors={'both': cc[0], 'TP53': cc[1], 'del_3p': cc[2], 'neither':cc[3]},
ms=30, alpha=.7)
ax.get_legend().set_visible(False)
ax.set_ybound(0,1)
ax.set_xbound(0,5)
prettify_ax(ax)
fig.tight_layout()
fig.savefig(FIGDIR + 'fig3e.pdf', transparent=True)
df = pd.concat([p53_mut, del_3p, combo.ix[k2], surv[:,'days'], surv[:,'event']],
keys=['TP53 Mutation', '3p Deletion', 'TP53 / 3p', 'Days to Death/Censoring', 'Death Indicator'],
axis=1).dropna()
df.to_csv(FIGDIR + 'fig3c.csv')
get_surv_fit(surv, combo.ix[ti(codes != 'HNSCC')], time_cutoff=3)
Stats | Median Survival | 3y Survival | ||||||
---|---|---|---|---|---|---|---|---|
# Patients | # Events | Median | Lower | Upper | Surv | Lower | Upper | |
both | 823 | 246 | 4.78 | 4.11 | 5.72 | 0.62 | 0.58 | 0.67 |
del_3p | 705 | 182 | 8.12 | 6.42 | 9.48 | 0.80 | 0.76 | 0.83 |
neither | 1993 | 433 | 8.20 | 6.96 | 10.60 | 0.77 | 0.75 | 0.80 |
TP53 | 1062 | 272 | 5.56 | 4.85 | 7.17 | 0.75 | 0.72 | 0.79 |
fmla = 'Surv(days, event) ~ feature + age + old + stage + strata(codes)'
m = {'TP53':'c', 'both':'d', 'neither':'a', 'del_3p':'b'}
cm = combo.map(m).ix[true_index(codes != 'HNSC')]
f = get_cox_ph(surv_3y, cm, interactions=False)
ci = convert_robj(robjects.r.summary(f)[7])
ci.index = ['3p','TP53','both']
n = ci.ix[0]*0 +1
n.name = 'neither'
ci = ci.append(n)
ci = ci.ix[['neither','3p', 'TP53','both']]
ci_uni = ci
f = get_cox_ph(surv_3y, cm, [codes, age, old, stage],
formula=fmla, interactions=False)
ci = convert_robj(robjects.r.summary(f)[7])
ci = ci.ix[:3]
ci.index = ['3p','TP53','both']
n = ci.ix[0]*0 +1
n.name = 'neither'
ci = ci.append(n)
ci = ci.ix[['neither','3p', 'TP53','both']]
ci_full = ci
fig, ax = subplots(1,1, figsize=(3,2.5))
ci = ci_full
haz = ci['exp(coef)']
b = haz.plot(kind='bar', ax=ax, color=cc[[3,2,1,0]],
yerr=[haz - ci['lower .95'], ci['upper .95'] - haz], ecolor='black')
prettify_ax(ax)
ax.set_ylabel('hazard')
ax.set_ylim(0)
ax.axhline(y=1, ls='--', lw=3, color='black', alpha=.5)
fig.tight_layout()
fig, ax = subplots(1,1, figsize=(2.3,3))
ci = ci_full.ix[::-1]
haz = ci['exp(coef)']
color_list = colors_th
for j,h in enumerate(haz):
ax.scatter(h, j, marker='s', s=100, color=color_list[j],
edgecolors=['black'], zorder=10, )
ax.plot(*zip(*((ci.iloc[j]['lower .95'],j), (ci.iloc[j]['upper .95'],j))),
lw=3, ls='-', marker='o', dash_joinstyle='bevel', color=color_list[j])
ax.axvline(1, ls='--', color='black')
ax.set_xscale('log')
ax.set_xbound(.7,2.)
ax.set_ybound(-.5,len(ci.index) - .5)
ax.set_xticks([.75, 1, 1.25, 1.5, 2])
ax.set_xticklabels([.75, 1, 1.25, 1.5, 2])
ax.set_yticks(range(len(ci.index)))
ax.set_yticklabels('')
ax.set_ybound(-.5, 3.5)
prettify_ax(ax)
ax.set_xlabel('3 Year Hazard Ratio')
fig.tight_layout()
fig.savefig(FIGDIR + 'fig3f.pdf', transparent=True)
fig, axs = subplots(1,2, figsize=(5,2.5), sharey=True)
for i,ci in enumerate([ci_uni, ci_full]):
ax = axs[i]
haz = ci['exp(coef)']
b = haz.plot(kind='bar', ax=ax, color=cc[[3,2,1,0]],
yerr=[haz - ci['lower .95'], ci['upper .95'] - haz], ecolor='black')
prettify_ax(ax)
ax.set_ylabel('hazard')
ax.set_ylim(0)
ax.axhline(y=1, ls='--', lw=3, color='black', alpha=.5)
fig.tight_layout()
from itertools import combinations
c = list(combinations(cm.unique(),2))[0]
cm = combo.ix[true_index(codes!='HNSC')].dropna()
sig = pd.Series({c: get_cox_ph_ms(surv_5y, cm[cm.isin(c)], interactions=False)['LR']
for c in combinations(cm.unique(),2)})
sig
TP53 both 1.69e-03 del_3p 1.28e-02 neither 3.15e-02 both del_3p 8.11e-07 neither 1.64e-07 neither del_3p 6.53e-01 dtype: float64
cm = combo.ix[true_index(codes!='HNSC')].dropna()
sig = pd.Series({c: get_cox_ph_ms(surv_3y, cm[cm.isin(c)], interactions=False)['LR']
for c in combinations(cm.unique(),2)})
sig
TP53 both 2.23e-05 del_3p 1.49e-01 neither 9.25e-01 both del_3p 2.22e-07 neither 3.87e-06 neither del_3p 1.19e-01 dtype: float64
cm = combo.ix[true_index(codes!='HNSC')].dropna()
sig = pd.Series({c: get_cox_ph_ms(surv_3y, cm[cm.isin(c)], [codes, age], interactions=False)['LR']
for c in combinations(cm.unique(),2)})
sig
TP53 both 1.49e-03 del_3p 8.57e-01 neither 4.18e-01 both del_3p 7.11e-02 neither 2.78e-04 neither del_3p 8.42e-01 dtype: float64
cm = combo.ix[true_index(codes!='HNSC')].dropna()
sig = pd.Series({c: get_cox_ph_ms(surv_5y, cm[cm.isin(c)], [codes, age, stage], interactions=False)['LR']
for c in combinations(cm.unique(),2)})
sig
TP53 both 5.03e-03 del_3p 4.52e-01 neither 7.22e-01 both del_3p 3.31e-01 neither 2.96e-04 neither del_3p 8.06e-01 dtype: float64
pts = keepers.diff(ti(codes=='HNSC'))
cm = combo.ix[pts].dropna()
sig = pd.Series({c: get_cox_ph_ms(surv_3y, cm[cm.isin(c)], [codes, age, old, stage],
interactions=False)['LR']
for c in combinations(cm.unique(),2)})
sig
TP53 both 2.33e-03 del_3p 8.95e-01 neither 3.91e-01 both del_3p 5.88e-02 neither 9.21e-04 neither del_3p 7.22e-01 dtype: float64
df = pd.concat([p53_mut, del_3p, codes, age, old, stage.ix[keepers].fillna('missing'), combo.ix[k2], surv[:,'days'], surv[:,'event']],
keys=['TP53 Mutation', '3p Deletion',
'Tissue Type', 'Age', 'Age > 75', 'Stage'
'TP53 / 3p', 'Days to Death/Censoring', 'Death Indicator'],
axis=1).ix[keepers]
df = pd.concat([p53_mut, del_3p, codes, age, old, stage.ix[keepers].fillna('M'), combo.ix[k2], surv[:,'days'], surv[:,'event']],
keys=['TP53 Mutation', '3p Deletion',
'Tissue Type', 'Age', 'Age > 75', 'Stage',
'TP53 / 3p', 'Days to Death/Censoring', 'Death Indicator'],
axis=1).dropna()
df.to_csv(FIGDIR + 'fig3c.csv')
f = {}
for cancer in codes.ix[combo.index].unique():
try:
c = combo[combo.isin(['both','TP53'])] == 'both'
c = c.ix[ti(codes==cancer)].dropna()
f[cancer] = get_cox_ph_ms(surv_3y, c, [age, old], interactions=False)
except:
print 'fail'
fail
pd.DataFrame(f).T.sort('LR')
LR | feature_p | fmla | hazzard | |
---|---|---|---|---|
OV | 0.0167 | 0.0137 | Surv(days, event) ~ feature + age\n | 1.75 |
LUAD | 0.0579 | 0.063 | Surv(days, event) ~ feature + old\n | 1.79 |
LGG | 0.123 | 0.0339 | Surv(days, event) ~ feature\n | 4.48 |
LIHC | 0.216 | 0.163 | Surv(days, event) ~ feature\n | 5.53 |
BRCA | 0.248 | 0.251 | Surv(days, event) ~ feature + age\n | 1.64 |
KIRC | 0.28 | 0.999 | Surv(days, event) ~ feature\n | 2.64e+08 |
COAD | 0.324 | 0.17 | Surv(days, event) ~ feature\n | 2.56 |
LAML | 0.422 | 0.421 | Surv(days, event) ~ feature + old\n | 1.7 |
STAD | 0.514 | 0.478 | Surv(days, event) ~ feature\n | 1.35 |
HNSC | 0.527 | 0.175 | Surv(days, event) ~ feature\n | 2.06 |
READ | 0.572 | 0.549 | Surv(days, event) ~ feature + age\n | 2.01 |
UCEC | 0.88 | 0.88 | Surv(days, event) ~ feature + age\n | 0.919 |
BLCA | 1 | 0.655 | Surv(days, event) ~ feature\n | 0.745 |
CESC | 1 | 1 | Surv(days, event) ~ feature\n | 1.62e+09 |
LUSC | 1 | 0.317 | Surv(days, event) ~ feature\n | 0.681 |
SARC | 1 | 1 | Surv(days, event) ~ feature\n | 1 |
SKCM | 1 | 0.702 | Surv(days, event) ~ feature\n | 0.665 |
cc = ti(codes.ix[combo_all.index].value_counts() > 50)
ct = pd.DataFrame({c: fisher_exact_test(p53_all.ix[ti(codes==c)].dropna()>0, del_3p_all<0)
for c in cc}).T
ct.dropna().sort('odds_ratio', ascending=False)
odds_ratio | p | |
---|---|---|
LAML | inf | 7.40e-08 |
UCEC | 28.35 | 3.46e-21 |
HNSC | 9.38 | 9.74e-18 |
LGG | 5.67 | 2.29e-03 |
KIRP | 4.75 | 2.73e-01 |
COAD | 3.94 | 4.21e-05 |
BRCA | 3.47 | 7.22e-17 |
STAD | 3.46 | 9.20e-06 |
LIHC | 2.60 | 1.54e-01 |
LUSC | 2.06 | 4.92e-02 |
LUAD | 1.96 | 5.50e-04 |
BLCA | 1.51 | 2.49e-01 |
GBM | 1.45 | 3.98e-01 |
OV | 1.42 | 3.36e-01 |
PAAD | 1.39 | 7.64e-01 |
SKCM | 1.35 | 4.89e-01 |
KICH | 1.14 | 1.00e+00 |
READ | 0.93 | 1.00e+00 |
KIRC | 0.82 | 7.81e-01 |
CESC | 0.66 | 1.00e+00 |
SARC | 0.28 | 4.35e-01 |
PRAD | 0.00 | 1.00e+00 |
THCA | 0.00 | 1.00e+00 |
fig, axs = subplots(1, 7, figsize=(10,3), sharey=True)
cc = array(colors_th)
for i,c in enumerate(['HNSC','BRCA','LUAD', 'OV','KIRC','LGG','LUSC']):
draw_survival_curve(combo.ix[true_index(codes==c)].dropna(),
surv_3y, ax=axs[i],
colors={'both': cc[0], 'TP53': cc[1], 'del_3p': cc[2], 'neither':cc[3]})
axs[i].get_legend().set_visible(False)
axs[i].set_ylim(.4,1.05)
axs[i].set_xticks([0,1,2,3])
prettify_ax(axs[i])
axs[i].annotate(c, (2, .42))
fig.tight_layout()
fig, axs = subplots(1, 7, figsize=(10,3), sharey=True)
cc = array(colors_th)
for i,c in enumerate(['HNSC','BRCA','LUAD', 'OV','KIRC','LGG','LUSC']):
f = combo[combo.isin(['both','TP53'])].ix[true_index(codes==c)].dropna()
#f = f.ix[ti(age < 75)].dropna()
draw_survival_curve(f, surv_5y, ax=axs[i],
colors={'both': cc[0], 'TP53': cc[1], 'del_3p': cc[2], 'neither':cc[3]})
axs[i].get_legend().set_visible(False)
axs[i].set_ylim(.4,1.05)
axs[i].set_xticks([0,1,2,3,4,5])
prettify_ax(axs[i])
axs[i].annotate(c, (2, .42))
fig.tight_layout()