import NotebookImport
from Imports import *
importing IPython notebook from Imports.ipynb Populating the interactive namespace from numpy and matplotlib changing to source dirctory populating namespace with data
Extract genomic variables of interest
p53_mut = mut.features.ix['TP53'].ix[keepers_o].dropna() > 0
del_3p = cn.features.ix['Deletion'].ix['3p14.2'].ix[0].ix[keepers_o].dropna()
p53_mut.name = 'TP53'
del_3p.name = 'del_3p'
combo = combine(p53_mut==1, del_3p==-1)
two_hit = combo=='both'
two_hit.name = 'two_hit'
old = age > 75
old.name = 'old'
Calculate deletion and amplification rates
cn_by_gene = FH.get_gistic_gene_matrix(run.data_path, cancer.name)
cn_by_gene = cn_by_gene.ix[[i for i in cn_by_gene.index if i[2] in rna.df.index]]
pct_del = (cn_by_gene < 0).sum() / (1.*len(cn_by_gene))
pct_amp = (cn_by_gene > 0).sum() / (1.*len(cn_by_gene))
pct_altered = pct_amp + pct_del
pct_altered.name = 'CIN'
pct_del.name = 'pct_del'
pct_altered.name = 'pct_altered'
Define competing models
f0 = robjects.Formula('event ~ old')
f1 = robjects.Formula('event ~ TP53 + old')
f2 = robjects.Formula('event ~ del_3p + old')
f3 = robjects.Formula('event ~ TP53 + del_3p + old')
f8 = robjects.Formula('event ~ TP53 + del_3p + pct_del + old')
f4 = robjects.Formula('event ~ TP53 + del_3p + two_hit + old')
f5 = robjects.Formula('event ~ two_hit + old')
f6 = robjects.Formula('event ~ TP53 + del_3p + pct_del + TP53:pct_del + old')
f7 = robjects.Formula('event ~ TP53 + del_3p + pct_del + two_hit + TP53:pct_del + old')
formulas = [f0, f1, f2, f3, f8, f7]
To convert the survival data into a binary classification problem, we organized patients into two classes depending on whether they were surviving or deceased at T years after surgery. In this analysis, the ratio of deceased to surviving patients is artificially high due to the ability to observe a death in a shorter followup than the full time interval required to annotate a patient as surviving (i.e. the basis of the Cox censorship problem). To reduce this bias, we removed patients with an observed death but a time of surgery after a set year (2013 – (T – 1)).
year = clinical.clinical.yearofinitialpathologicdiagnosis.astype(float)
year.max()
2013.0
def surv_to_binary(surv, t, year):
last_collection = year.max()
ss = surv.unstack()
ss = ss[(ss.days >= (365*t)) | (ss.event == 1)]
ss = ss.ix[ti(year < (last_collection - (t-1)))]
event = ss.ix[ss.index.intersection(keepers_o)].dropna().days >= (365*t)
event = event.astype(float)
event.name = 'event'
return event
Sample size for each time point
e2 = pd.concat({t: surv_to_binary(clinical.survival.survival, t, year).value_counts()
for t in range(1,6)}, axis=1)
e2.index = ['Deceased', 'Alive']
fig, ax = subplots(figsize=(3,2))
e2.T.plot(kind='bar', stacked=True, ax=ax, rot=0, fontsize=11)
ax.legend(frameon=False)
prettify_ax(ax)
ax.set_ylabel('# of Patients')
ax.set_xlabel('Time Cutoff (Years)')
fig.tight_layout()
fig.savefig(FIGDIR + '/CV_sample_size.pdf')
from sklearn.metrics import roc_curve, auc, precision_recall_curve
def as_FV(s):
'''
Convert a Series to R FloatVector object.
'''
fv = robjects.FloatVector(s)
fv.names = list(s.index)
return fv
accuracy = {}
predictions = {}
error = {}
roc_area = {}
aupur = {}
for t in [1,2,3,4,5]:
event = surv_to_binary(clinical.survival.survival, t, year)
df = pd.concat([p53_mut, del_3p<0, event, old, two_hit, pct_del, age], axis=1).dropna()
df_r = convert_to_r_dataframe(df)
outcome = event.map(len(event) - event.value_counts())
outcome = outcome / sum(outcome)
prediction = {}
for i,p in enumerate(df.index):
pt = df.index[[i]]
df_r = convert_to_r_dataframe(df.ix[df.index.diff(pt)])
o2 = outcome.ix[outcome.index.diff(pt)]
models = [robjects.r.glm(f, family='binomial', weights=as_FV(o2),
data=df_r)
for f in formulas]
df_p = convert_to_r_dataframe(df.ix[pt])
prediction[p] = [robjects.r.predict(m, df_p, type='response')[0] for m
in models]
prediction = pd.DataFrame(prediction).T
prediction.columns = ['1','TP53','3p',
'TP53 + 3p',
'TP53 + 3p + TP53:3p',
'TP53 + 3p + pct_del + TP53:3p + TP53:pct_del']
accuracy[t] = ((((prediction.T >= .5)) == (df.event == 1)) * outcome).sum(1)
predictions[t] = prediction
error[t] = ((prediction.T - df.event).abs() * outcome).sum(1)
fpr = {}
tpr = {}
roc_auc = {}
aupur_auc = {}
for m,v in prediction.iteritems():
a,b = match_series(v, df.event)
fpr, tpr, thresh = roc_curve(b, a)
roc_auc[m] = auc(fpr, tpr)
precision, recall, thresh = precision_recall_curve(b, a)
aupur_auc[m] = auc(recall, precision)
roc_area[t] = roc_auc
aupur[t] = aupur_auc
roc_area = pd.DataFrame(roc_area).T
aupur = pd.DataFrame(aupur).T
fig, axs = subplots(1,2, figsize=(10,3.5))
ax = axs[0]
roc_area.columns = ['m0','m1','m2','m3','m4','m5']
roc_area.plot(kind='bar', legend=False, ax=ax)
ax.legend(loc='lower left')
ax.set_xticklabels([1,2,3,4,5], rotation=0)
#ax.set_ylim(0.35,0.52)
ax.set_ylabel('AUROC')
ax.set_xlabel('Time (years)')
prettify_ax(ax)
ax = axs[1]
aupur.columns = ['m0','m1','m2','m3','m4','m5']
aupur.plot(kind='bar', legend=False, ax=ax)
#ax.legend(loc='lower right')
ax.set_xticklabels([1,2,3,4,5], rotation=0)
#ax.set_ylim(0.5,0.72)
ax.set_ylabel('AUPUR')
ax.set_xlabel('Time (years)')
prettify_ax(ax)
fig.tight_layout()
fig.savefig(FIGDIR + 'CV_supp_AUC.pdf')
pd.concat([roc_area, aupur], axis=0, keys=['AUROC','AUPUR'])
m0 | m1 | m2 | m3 | m4 | m5 | ||
---|---|---|---|---|---|---|---|
AUROC | 1 | 0.19 | 0.36 | 0.32 | 0.43 | 0.43 | 0.66 |
2 | 0.13 | 0.34 | 0.34 | 0.43 | 0.49 | 0.62 | |
3 | 0.14 | 0.39 | 0.36 | 0.49 | 0.68 | 0.69 | |
4 | 0.16 | 0.42 | 0.40 | 0.52 | 0.69 | 0.70 | |
5 | 0.19 | 0.41 | 0.41 | 0.51 | 0.71 | 0.73 | |
AUPUR | 1 | 0.59 | 0.74 | 0.72 | 0.78 | 0.79 | 0.86 |
2 | 0.29 | 0.48 | 0.46 | 0.55 | 0.57 | 0.66 | |
3 | 0.20 | 0.39 | 0.34 | 0.47 | 0.56 | 0.61 | |
4 | 0.17 | 0.34 | 0.30 | 0.43 | 0.50 | 0.54 | |
5 | 0.16 | 0.28 | 0.26 | 0.34 | 0.43 | 0.52 |
fig, axs = subplots(1,2, figsize=(10,3.5))
ax = axs[0]
ct = pd.concat(error, axis=1).T
ct.columns = ['m0','m1','m2','m3','m4','m5']
ct.plot(kind='bar', legend=False, ax=ax)
ax.legend(loc='lower left')
ax.set_xticklabels([1,2,3,4,5], rotation=0)
ax.set_ylim(0.35,0.52)
ax.set_ylabel('Linear Prediction Error')
ax.set_xlabel('Time (years)')
prettify_ax(ax)
ax = axs[1]
ct = pd.concat(accuracy, axis=1).T
ct.columns = ['m0','m1','m2','m3','m4','m5']
ct.plot(kind='bar', legend=False, ax=ax)
#ax.legend(loc='lower right')
ax.set_xticklabels([1,2,3,4,5], rotation=0)
ax.set_ylim(0.5,0.72)
ax.set_ylabel('Weighted Accuracy')
ax.set_xlabel('Time (years)')
prettify_ax(ax)
fig.tight_layout()
fig.savefig(FIGDIR + 'CV_supp.pdf')
ct1= pd.concat(accuracy, axis=1)
ct2 = pd.concat(error, axis=1)
ct = pd.concat([ct1, ct2], keys=['Weighted Accuracy', 'Linear Prediction Error'], axis=0)
ct.columns = ['1y','2y','3y','4y','5y']
ct.columns.name = 'Time Cutoff'
ct
Time Cutoff | 1y | 2y | 3y | 4y | 5y | |
---|---|---|---|---|---|---|
Weighted Accuracy | 1 | 0.56 | 0.52 | 0.53 | 0.54 | 0.58 |
TP53 | 0.20 | 0.60 | 0.60 | 0.62 | 0.63 | |
3p | 0.58 | 0.60 | 0.63 | 0.64 | 0.62 | |
TP53 + 3p | 0.61 | 0.63 | 0.67 | 0.68 | 0.67 | |
TP53 + 3p + TP53:3p | 0.56 | 0.63 | 0.66 | 0.64 | 0.66 | |
TP53 + 3p + pct_del + TP53:3p + TP53:pct_del | 0.63 | 0.62 | 0.69 | 0.68 | 0.71 | |
Linear Prediction Error | 1 | 0.49 | 0.50 | 0.50 | 0.50 | 0.48 |
TP53 | 0.49 | 0.48 | 0.47 | 0.46 | 0.45 | |
3p | 0.46 | 0.47 | 0.44 | 0.44 | 0.45 | |
TP53 + 3p | 0.46 | 0.46 | 0.43 | 0.42 | 0.44 | |
TP53 + 3p + TP53:3p | 0.47 | 0.46 | 0.43 | 0.42 | 0.43 | |
TP53 + 3p + pct_del + TP53:3p + TP53:pct_del | 0.45 | 0.45 | 0.42 | 0.41 | 0.40 |
First we test two hypotheses:
fmla0 = 'Surv(days, event) ~ TP53 + del_3p + old'
fmla1 = 'Surv(days, event) ~ TP53 + del_3p + pct_del + TP53:pct_del + old'
fmla2 = 'Surv(days, event) ~ TP53 + del_3p + pct_del + two_hit + TP53:pct_del + old'
m0 = get_cox_ph(surv, covariates=[p53_mut, del_3p < 0, two_hit, old, age, pct_del], formula=fmla0)
m1 = get_cox_ph(surv, covariates=[p53_mut, del_3p < 0, two_hit, old, age, pct_del], formula=fmla1)
m2 = get_cox_ph(surv, covariates=[p53_mut, del_3p < 0, two_hit, old, age, pct_del], formula=fmla2)
LR_test(m2,m0), LR_test(m2, m1)
(0.016999296949315183, 0.022793238361218802)
Here is the full model
get_cox_ph(surv, covariates=[p53_mut, del_3p < 0, two_hit, old, age, pct_del],
formula=fmla2, print_desc=True);
coef exp(coef) se(coef) z p TP53 -0.568 0.567 0.3940 -1.44 0.1500 del_3p -0.693 0.500 0.5359 -1.29 0.2000 pct_del 0.222 1.249 0.1434 1.55 0.1200 two_hit 1.411 4.100 0.6665 2.12 0.0340 old 0.281 1.325 0.0868 3.24 0.0012 TP53:pct_del -0.631 0.532 0.2130 -2.96 0.0030 Likelihood ratio test=41.3 on 6 df, p=2.48e-07 n= 250, number of events= 102
Forest plots of the models
fig, axs = subplots(3,1, figsize=(6,5), sharex=True)
ax = axs[0]
ci = convert_robj(robjects.r.summary(m0)[7])
haz = ci['exp(coef)']
for j,h in enumerate(haz):
ax.scatter(h, j, marker='s', s=100, color='grey',
edgecolors=['black'], zorder=10)
ax.plot(*zip(*((ci.iloc[j]['lower .95'],j), (ci.iloc[j]['upper .95'],j))),
lw=3, ls='-', marker='o', dash_joinstyle='bevel', color='grey')
ax.axvline(1, ls='--', color='black')
ax.set_xbound(.5,3.5)
ax.set_ybound(-.5,len(ci.index) - .5)
ax.set_yticks(range(len(ci.index)))
ax.set_yticklabels(ci.index)
ax = axs[1]
ci = convert_robj(robjects.r.summary(m1)[7])
ci = ci.ix[['TP53','del_3p','pct_del','TP53:pct_del']][::-1]
haz = ci['exp(coef)']
for j,h in enumerate(haz):
ax.scatter(h, j, marker='s', s=100, color='grey',
edgecolors=['black'], zorder=10)
ax.plot(*zip(*((ci.iloc[j]['lower .95'],j), (ci.iloc[j]['upper .95'],j))),
lw=3, ls='-', marker='o', dash_joinstyle='bevel', color='grey')
ax.axvline(1, ls='--', color='black')
ax.set_xbound(.5,3.5)
ax.set_ybound(-.5,len(ci.index) - .5)
ax.set_yticks(range(len(ci.index)))
ax.set_yticklabels(ci.index)
ax = axs[2]
ci = convert_robj(robjects.r.summary(m2)[7])
ci = ci.ix[['TP53','del_3p','pct_del','TP53:pct_del','two_hit']][::-1]
haz = ci['exp(coef)']
for j,h in enumerate(haz):
ax.scatter(h, j, marker='s', s=100, color='grey',
edgecolors=['black'], zorder=10)
ax.plot(*zip(*((ci.iloc[j]['lower .95'],j), (ci.iloc[j]['upper .95'],j))),
lw=3, ls='-', marker='o', dash_joinstyle='bevel', color='grey')
ax.axvline(1, ls='--', color='black')
ax.set_xbound(.5,3.5)
ax.set_ybound(-.5,len(ci.index) - .5)
ax.set_yticks(range(len(ci.index)))
ax.set_yticklabels(ci.index)
for ax in axs:
prettify_ax(ax)
ax.set_xscale('log')
ax.set_xticks([.25, .5, 1, 1.5, 2, 4,8,16])
ax.set_xticklabels([.25, .5, 1, 1.5, 2, 4,8,16])
ax.set_xlabel('Hazard Ratio')
ax.set_xbound(.15, 17)
fig.tight_layout()