HNSCC HPV- Cohort Statistical Analysis of TP53-3p Interaction

Import Data and Packages¶

For full list of data and packages imported see the Imports notebook.

In [1]:

import NotebookImport
from Imports import *

importing IPython notebook from Imports.ipynb
Populating the interactive namespace from numpy and matplotlib
changing to source dirctory
populating namespace with data

Extract genomic variables of interest

In [2]:

p53_mut = mut.features.ix['TP53'].ix[keepers_o].dropna() > 0
del_3p = cn.features.ix['Deletion'].ix['3p14.2'].ix[0].ix[keepers_o].dropna()
p53_mut.name = 'TP53'
del_3p.name = 'del_3p'

combo = combine(p53_mut==1, del_3p==-1)
two_hit = combo=='both'
two_hit.name = 'two_hit'

old = age > 75
old.name = 'old'

Calculate deletion and amplification rates

In [3]:

cn_by_gene = FH.get_gistic_gene_matrix(run.data_path, cancer.name)
cn_by_gene = cn_by_gene.ix[[i for i in cn_by_gene.index if i[2] in rna.df.index]]
pct_del = (cn_by_gene < 0).sum() / (1.*len(cn_by_gene))
pct_amp = (cn_by_gene > 0).sum() / (1.*len(cn_by_gene))
pct_altered = pct_amp + pct_del
pct_altered.name = 'CIN'

pct_del.name = 'pct_del'
pct_altered.name = 'pct_altered'

Define competing models

In [4]:

f0 = robjects.Formula('event ~ old')
f1 = robjects.Formula('event ~ TP53 + old')
f2 = robjects.Formula('event ~ del_3p + old')
f3 = robjects.Formula('event ~ TP53 + del_3p + old')
f8 = robjects.Formula('event ~ TP53 + del_3p + pct_del + old')
f4 = robjects.Formula('event ~ TP53 + del_3p + two_hit + old')
f5 = robjects.Formula('event ~ two_hit + old')
f6 = robjects.Formula('event ~ TP53 + del_3p + pct_del + TP53:pct_del + old')
f7 = robjects.Formula('event ~ TP53 + del_3p + pct_del + two_hit + TP53:pct_del + old')

formulas = [f0, f1, f2, f3, f8, f7]

Preform Leave on out cross-validation on various logistic models to test for the TP53-3p interaction.¶

To convert the survival data into a binary classification problem, we organized patients into two classes depending on whether they were surviving or deceased at T years after surgery. In this analysis, the ratio of deceased to surviving patients is artificially high due to the ability to observe a death in a shorter followup than the full time interval required to annotate a patient as surviving (i.e. the basis of the Cox censorship problem). To reduce this bias, we removed patients with an observed death but a time of surgery after a set year (2013 – (T – 1)).

In [5]:

year = clinical.clinical.yearofinitialpathologicdiagnosis.astype(float)
year.max()

Out[5]:

2013.0

In [6]:

def surv_to_binary(surv, t, year):
    last_collection = year.max()
    ss = surv.unstack()
    ss = ss[(ss.days >= (365*t)) | (ss.event == 1)]
    ss = ss.ix[ti(year < (last_collection - (t-1)))]
    event = ss.ix[ss.index.intersection(keepers_o)].dropna().days >= (365*t)
    event = event.astype(float)
    event.name = 'event'
    return event

Sample size for each time point

In [7]:

e2 = pd.concat({t: surv_to_binary(clinical.survival.survival, t, year).value_counts() 
                for t in range(1,6)}, axis=1)
e2.index = ['Deceased', 'Alive']

In [8]:

fig, ax = subplots(figsize=(3,2))
e2.T.plot(kind='bar', stacked=True, ax=ax, rot=0, fontsize=11)
ax.legend(frameon=False)
prettify_ax(ax)
ax.set_ylabel('# of Patients')
ax.set_xlabel('Time Cutoff (Years)')
fig.tight_layout()
fig.savefig(FIGDIR + '/CV_sample_size.pdf')

In [9]:

from sklearn.metrics import roc_curve, auc, precision_recall_curve

def as_FV(s):
    '''
    Convert a Series to R FloatVector object.
    '''
    fv = robjects.FloatVector(s)
    fv.names = list(s.index)
    return fv

In [10]:

accuracy = {}
predictions = {}
error = {}
roc_area = {}
aupur = {}

for t in [1,2,3,4,5]:
    event = surv_to_binary(clinical.survival.survival, t, year)

    df = pd.concat([p53_mut, del_3p<0, event, old, two_hit, pct_del, age], axis=1).dropna()
    df_r = convert_to_r_dataframe(df)

    outcome = event.map(len(event) - event.value_counts())
    outcome = outcome / sum(outcome)

    prediction = {}
    for i,p in enumerate(df.index):
        pt = df.index[[i]]
        df_r = convert_to_r_dataframe(df.ix[df.index.diff(pt)])
        o2 = outcome.ix[outcome.index.diff(pt)]
        models = [robjects.r.glm(f, family='binomial', weights=as_FV(o2), 
                                 data=df_r) 
                  for f in formulas]
        df_p = convert_to_r_dataframe(df.ix[pt])
        prediction[p] = [robjects.r.predict(m, df_p, type='response')[0] for m 
                         in models]
    prediction = pd.DataFrame(prediction).T
    prediction.columns = ['1','TP53','3p',
                          'TP53 + 3p',
                          'TP53 + 3p + TP53:3p',
                          'TP53 + 3p + pct_del + TP53:3p + TP53:pct_del']
    
    accuracy[t] = ((((prediction.T >= .5)) == (df.event == 1)) * outcome).sum(1)
    predictions[t] = prediction

    error[t] = ((prediction.T - df.event).abs() * outcome).sum(1)
    
    fpr = {}
    tpr = {}
    roc_auc = {}
    aupur_auc = {}
    for m,v in prediction.iteritems():
        a,b = match_series(v, df.event)
        fpr, tpr, thresh = roc_curve(b, a)
        roc_auc[m] = auc(fpr, tpr)
        precision, recall, thresh = precision_recall_curve(b, a)
        aupur_auc[m] = auc(recall, precision)
    roc_area[t] = roc_auc
    aupur[t] = aupur_auc
roc_area = pd.DataFrame(roc_area).T
aupur = pd.DataFrame(aupur).T

In [11]:

fig, axs = subplots(1,2, figsize=(10,3.5))
ax = axs[0]
roc_area.columns = ['m0','m1','m2','m3','m4','m5']
roc_area.plot(kind='bar', legend=False, ax=ax)
ax.legend(loc='lower left')
ax.set_xticklabels([1,2,3,4,5], rotation=0)
#ax.set_ylim(0.35,0.52)
ax.set_ylabel('AUROC')
ax.set_xlabel('Time (years)')
prettify_ax(ax)

ax = axs[1]
aupur.columns = ['m0','m1','m2','m3','m4','m5']
aupur.plot(kind='bar', legend=False, ax=ax)
#ax.legend(loc='lower right')
ax.set_xticklabels([1,2,3,4,5], rotation=0)
#ax.set_ylim(0.5,0.72)
ax.set_ylabel('AUPUR')
ax.set_xlabel('Time (years)')
prettify_ax(ax)
fig.tight_layout()
fig.savefig(FIGDIR + 'CV_supp_AUC.pdf')

In [12]:

pd.concat([roc_area, aupur], axis=0, keys=['AUROC','AUPUR'])

Out[12]:

		m0	m1	m2	m3	m4	m5
AUROC	1	0.19	0.36	0.32	0.43	0.43	0.66
	2	0.13	0.34	0.34	0.43	0.49	0.62
	3	0.14	0.39	0.36	0.49	0.68	0.69
	4	0.16	0.42	0.40	0.52	0.69	0.70
	5	0.19	0.41	0.41	0.51	0.71	0.73
AUPUR	1	0.59	0.74	0.72	0.78	0.79	0.86
	2	0.29	0.48	0.46	0.55	0.57	0.66
	3	0.20	0.39	0.34	0.47	0.56	0.61
	4	0.17	0.34	0.30	0.43	0.50	0.54
	5	0.16	0.28	0.26	0.34	0.43	0.52

In [13]:

fig, axs = subplots(1,2, figsize=(10,3.5))
ax = axs[0]
ct = pd.concat(error, axis=1).T
ct.columns = ['m0','m1','m2','m3','m4','m5']
ct.plot(kind='bar', legend=False, ax=ax)
ax.legend(loc='lower left')
ax.set_xticklabels([1,2,3,4,5], rotation=0)
ax.set_ylim(0.35,0.52)
ax.set_ylabel('Linear Prediction Error')
ax.set_xlabel('Time (years)')
prettify_ax(ax)

ax = axs[1]
ct = pd.concat(accuracy, axis=1).T
ct.columns = ['m0','m1','m2','m3','m4','m5']
ct.plot(kind='bar', legend=False, ax=ax)
#ax.legend(loc='lower right')
ax.set_xticklabels([1,2,3,4,5], rotation=0)
ax.set_ylim(0.5,0.72)
ax.set_ylabel('Weighted Accuracy')
ax.set_xlabel('Time (years)')
prettify_ax(ax)
fig.tight_layout()
fig.savefig(FIGDIR + 'CV_supp.pdf')

In [14]:

ct1= pd.concat(accuracy, axis=1)
ct2 = pd.concat(error, axis=1)
ct = pd.concat([ct1, ct2], keys=['Weighted Accuracy', 'Linear Prediction Error'], axis=0)
ct.columns = ['1y','2y','3y','4y','5y']
ct.columns.name = 'Time Cutoff'
ct

Out[14]:

	Time Cutoff	1y	2y	3y	4y	5y
Weighted Accuracy	1	0.56	0.52	0.53	0.54	0.58
	TP53	0.20	0.60	0.60	0.62	0.63
	3p	0.58	0.60	0.63	0.64	0.62
	TP53 + 3p	0.61	0.63	0.67	0.68	0.67
	TP53 + 3p + TP53:3p	0.56	0.63	0.66	0.64	0.66
	TP53 + 3p + pct_del + TP53:3p + TP53:pct_del	0.63	0.62	0.69	0.68	0.71
Linear Prediction Error	1	0.49	0.50	0.50	0.50	0.48
	TP53	0.49	0.48	0.47	0.46	0.45
	3p	0.46	0.47	0.44	0.44	0.45
	TP53 + 3p	0.46	0.46	0.43	0.42	0.44
	TP53 + 3p + TP53:3p	0.47	0.46	0.43	0.42	0.43
	TP53 + 3p + pct_del + TP53:3p + TP53:pct_del	0.45	0.45	0.42	0.41	0.40

Analysis of top model with full censored data¶

First we test two hypotheses:

Does the model imporove on an additive model of TP53, 3p and age (we would include deletion rate as well but this was dropped in model selection)
Does the addition of the TP53-3p interaction term improve the fit of the model (drop-1 test)

In [15]:

fmla0 = 'Surv(days, event) ~ TP53 + del_3p + old'
fmla1 = 'Surv(days, event) ~ TP53 + del_3p + pct_del + TP53:pct_del + old'
fmla2 = 'Surv(days, event) ~ TP53 + del_3p + pct_del + two_hit + TP53:pct_del + old'

m0 = get_cox_ph(surv, covariates=[p53_mut, del_3p < 0, two_hit, old, age, pct_del], formula=fmla0)
m1 = get_cox_ph(surv, covariates=[p53_mut, del_3p < 0, two_hit, old, age, pct_del], formula=fmla1)
m2 = get_cox_ph(surv, covariates=[p53_mut, del_3p < 0, two_hit, old, age, pct_del], formula=fmla2)
LR_test(m2,m0), LR_test(m2, m1)

Out[15]:

(0.016999296949315183, 0.022793238361218802)

Here is the full model

In [16]:

get_cox_ph(surv, covariates=[p53_mut, del_3p < 0, two_hit, old, age, pct_del], 
           formula=fmla2, print_desc=True);

               coef exp(coef) se(coef)     z      p
TP53         -0.568     0.567   0.3940 -1.44 0.1500
del_3p       -0.693     0.500   0.5359 -1.29 0.2000
pct_del       0.222     1.249   0.1434  1.55 0.1200
two_hit       1.411     4.100   0.6665  2.12 0.0340
old           0.281     1.325   0.0868  3.24 0.0012
TP53:pct_del -0.631     0.532   0.2130 -2.96 0.0030

Likelihood ratio test=41.3  on 6 df, p=2.48e-07  n= 250, number of events= 102

Forest plots of the models

In [17]:

fig, axs = subplots(3,1, figsize=(6,5), sharex=True)

ax = axs[0]
ci = convert_robj(robjects.r.summary(m0)[7])
haz = ci['exp(coef)']
for j,h in enumerate(haz):
    ax.scatter(h, j, marker='s', s=100, color='grey', 
               edgecolors=['black'], zorder=10)
    ax.plot(*zip(*((ci.iloc[j]['lower .95'],j), (ci.iloc[j]['upper .95'],j))), 
            lw=3, ls='-', marker='o', dash_joinstyle='bevel', color='grey')
ax.axvline(1, ls='--', color='black')
ax.set_xbound(.5,3.5)
ax.set_ybound(-.5,len(ci.index) - .5)

ax.set_yticks(range(len(ci.index)))
ax.set_yticklabels(ci.index)

ax = axs[1]
ci = convert_robj(robjects.r.summary(m1)[7])
ci = ci.ix[['TP53','del_3p','pct_del','TP53:pct_del']][::-1]
haz = ci['exp(coef)']
for j,h in enumerate(haz):
    ax.scatter(h, j, marker='s', s=100, color='grey', 
               edgecolors=['black'], zorder=10)
    ax.plot(*zip(*((ci.iloc[j]['lower .95'],j), (ci.iloc[j]['upper .95'],j))), 
            lw=3, ls='-', marker='o', dash_joinstyle='bevel', color='grey')
ax.axvline(1, ls='--', color='black')
ax.set_xbound(.5,3.5)
ax.set_ybound(-.5,len(ci.index) - .5)
ax.set_yticks(range(len(ci.index)))
ax.set_yticklabels(ci.index)

ax = axs[2]
ci = convert_robj(robjects.r.summary(m2)[7])
ci = ci.ix[['TP53','del_3p','pct_del','TP53:pct_del','two_hit']][::-1]
haz = ci['exp(coef)']
for j,h in enumerate(haz):
    ax.scatter(h, j, marker='s', s=100, color='grey', 
               edgecolors=['black'], zorder=10)
    ax.plot(*zip(*((ci.iloc[j]['lower .95'],j), (ci.iloc[j]['upper .95'],j))), 
            lw=3, ls='-', marker='o', dash_joinstyle='bevel', color='grey')
ax.axvline(1, ls='--', color='black')
ax.set_xbound(.5,3.5)
ax.set_ybound(-.5,len(ci.index) - .5)

ax.set_yticks(range(len(ci.index)))
ax.set_yticklabels(ci.index)

for ax in axs:
    prettify_ax(ax)
    ax.set_xscale('log')
    ax.set_xticks([.25, .5, 1, 1.5, 2, 4,8,16])
    ax.set_xticklabels([.25, .5, 1, 1.5, 2, 4,8,16])
ax.set_xlabel('Hazard Ratio')
ax.set_xbound(.15, 17)    
fig.tight_layout()