import NotebookImport
from Imports import *
importing IPython notebook from Imports.ipynb Populating the interactive namespace from numpy and matplotlib changing to source dirctory populating namespace with data
from Processing.Screen import *
def surv_test(s, surv, cov_df):
s = s.dropna()
try:
return get_cox_ph_ms(surv, s, cov_df, return_val='LR', interactions='just_feature')
except:
return pd.Series(index=['LR','feature_p', 'fmla', 'hazzard'])
def run_screen(screen, filters, covariates):
cov_df = pd.concat(covariates, axis=1)
keepers_o = screen.get_patient_set(filters)
cutoff = max(np.ceil(len(keepers_o) * .05), 10)
df = screen.get_data(keepers_o, cutoff)
univariate = cox_screen(df, surv)
vec = univariate.LR.p.sort_index()
univariate = pd.concat([univariate['hazard'], corrections(vec)],
keys=['hazard', 'p'], axis=1)
#hits = univariate[univariate['q_bh'] < .2].index
hits = univariate.index
full = df.ix[hits].apply(surv_test, args=(surv, cov_df,), axis=1)
vec = full.LR.ix[univariate.index].sort_index()
full = pd.concat([full[['fmla']], corrections(vec)],
keys=['fmla','p'], axis=1)
hits = true_index(full.p.bh_all.order() < .1)
res = full.sort([('p','uncorrected')]).head()
return res, full, univariate, keepers_o, df
screen = Screen(mut, cn, rna, mirna, clinical.binary_df, surv, keepers_o)
res, full, univariate, keepers_o, df = run_screen(screen, filters=[hpv],
covariates=[old, age])
(mut.df.ix[:, keepers_o].sum(1) > 0).value_counts()
True 12088 False 1070 dtype: int64
df.shape
(878, 250)
df.groupby(level=0).size()
clinical 13 cna 74 mirna 238 mutation 121 rna 432 dtype: int64
len([i for i in df.ix['mutation'].index if i not in run.gene_sets])
33
len([i for i in df.ix['mutation'].index if i in run.gene_sets])
88
len([i for i in df.ix['rna'].index if i not in run.gene_sets])
294
len([i for i in df.ix['rna'].index if i in run.gene_sets])
138
(full.p.bh_all < .1).groupby(level=0).apply(pd.value_counts).unstack()
False | True | |
---|---|---|
clinical | 5 | 8 |
cna | 62 | 12 |
mirna | 226 | 12 |
mutation | 107 | 14 |
rna | 376 | 56 |
5 rows × 2 columns
(full.p.bh_all < .1).value_counts()
False 776 True 102 dtype: int64
rr = cox_screen(df, surv)
haz = rr['hazard'][['exp(coef)','lower .95','upper .95']]
p = rr.LR.p
uni = haz.join(p)
uni['PASS'] = uni.p < .05
multi = full.reset_index().sort(['level_0',('p','uncorrected')]).set_index(['level_0','level_1'])
multi.index.names = ['data_type','event']
multi['p'] = multi['p'].clip_upper(1.)
multi.columns = multi.columns.droplevel(0)
multi['PASS'] = multi.bh_all < .1
f = pd.concat([uni, multi], keys=['Univariate','Multivariate'], axis=1)
f[('BOTH','PASS')] = f.Univariate.PASS & f.Multivariate.PASS
f = f.sort([('BOTH','PASS'),('Multivariate','uncorrected')], ascending=[False, True])
fd = f.copy()
fd.index = pd.MultiIndex.from_tuples([(i[0], i[1].replace('_',' ')) for i in fd.index])
fd[('Multivariate','fmla')] = fd.Multivariate.astype(str).fmla.str.replace('\n','')
fd.to_csv(FIGDIR + 'supplemental_table1.csv', float_format='%.2e')
(f.Univariate.PASS & f.Multivariate.PASS).value_counts()
False 796 True 82 dtype: int64
hits = full.ix[ti((f.Univariate.PASS & f.Multivariate.PASS))]
hits = hits.sort([('p','uncorrected')])
get_cox_ph(surv, df.ix['mutation'].ix['TP53'], print_desc=True);
coef exp(coef) se(coef) z p feature 1.07 2.91 0.333 3.2 0.0014 Likelihood ratio test=13.6 on 1 df, p=0.000228 n= 250, number of events= 102
exp(1.07), exp(1.07) - exp(1.07 - .333)
(2.9153794999769969, 0.82572236974094038)
hits.ix['mutation'].ix['TP53']
fmla fmla Surv(days, event) ~ feature + old\n p uncorrected 0.00017 bh_within 0.0103 bh_all 0.00875 bonf_all 0.149 bonf_within 0.0206 two_step 0.0514 Name: TP53, dtype: object
hits.ix['mutation']['p'].sort('uncorrected').head(4)
uncorrected | bh_within | bh_all | bonf_all | bonf_within | two_step | |
---|---|---|---|---|---|---|
TP53 | 1.70e-04 | 0.01 | 0.01 | 0.15 | 0.02 | 0.05 |
MUC5B | 6.34e-04 | 0.02 | 0.02 | 0.56 | 0.08 | 0.12 |
KEGG_CELL_CYCLE | 8.16e-04 | 0.02 | 0.02 | 0.72 | 0.10 | 0.12 |
REACTOME_SIGNALLING_TO_RAS | 2.62e-03 | 0.06 | 0.05 | 2.30 | 0.32 | 0.30 |
4 rows × 6 columns
get_cox_ph(surv, df.ix['cna'].ix['del_3p14.2'], print_desc=True);
coef exp(coef) se(coef) z p feature 1.26 3.54 0.369 3.43 0.00061 Likelihood ratio test=16.7 on 1 df, p=4.32e-05 n= 250, number of events= 102
exp(1.26), exp(1.26) - exp(1.26 - .369)
(3.5254214873653824, 1.0878554884534357)
hits.ix['cna']['p'].sort('uncorrected').head(4)
uncorrected | bh_within | bh_all | bonf_all | bonf_within | two_step | |
---|---|---|---|---|---|---|
del_3p14.2 | 5.74e-06 | 3.28e-04 | 0 | 0.01 | 4.25e-04 | 0 |
del_3p25.3 | 8.86e-06 | 3.28e-04 | 0 | 0.01 | 6.56e-04 | 0 |
del_3p14.3 | 3.05e-05 | 6.10e-04 | 0 | 0.03 | 2.26e-03 | 0 |
del_3p12.2 | 3.30e-05 | 6.10e-04 | 0 | 0.03 | 2.44e-03 | 0 |
4 rows × 6 columns
hits.ix['cna']['p'].iloc[0]
uncorrected 5.74e-06 bh_within 3.28e-04 bh_all 1.68e-03 bonf_all 5.04e-03 bonf_within 4.25e-04 two_step 1.64e-03 Name: del_3p14.2, dtype: float64
survival_and_stats(df.ix['cna'].ix['del_3p14.2'], surv, figsize=(6,4))
hits.ix['rna']['p'].sort('uncorrected').head(3)
uncorrected | bh_within | bh_all | bonf_all | bonf_within | two_step | |
---|---|---|---|---|---|---|
SIG_PIP3_SIGNALING_IN_B_LYMPHOCYTES | 8.84e-07 | 3.82e-04 | 7.77e-04 | 7.77e-04 | 3.82e-04 | 0.00 |
P4HA1 | 1.79e-05 | 2.91e-03 | 2.90e-03 | 1.57e-02 | 7.72e-03 | 0.01 |
REACTOME_THROMBOXANE_SIGNALLING_THROUGH_TP_RECEPTOR | 2.26e-05 | 2.91e-03 | 2.90e-03 | 1.99e-02 | 9.78e-03 | 0.01 |
3 rows × 6 columns
survival_and_stats(df.ix['rna'].ix['SIG_PIP3_SIGNALING_IN_B_LYMPHOCYTES'], surv)
hits.ix['mirna']['p'].sort('uncorrected').head(4)
uncorrected | bh_within | bh_all | bonf_all | bonf_within | two_step | |
---|---|---|---|---|---|---|
hsa-mir-3170 | 3.24e-05 | 0.01 | 0.00 | 0.03 | 0.01 | 0.04 |
hsa-mir-411 | 2.04e-04 | 0.02 | 0.01 | 0.18 | 0.05 | 0.12 |
hsa-mir-100 | 3.30e-04 | 0.02 | 0.01 | 0.29 | 0.08 | 0.12 |
hsa-mir-629 | 4.04e-04 | 0.02 | 0.01 | 0.35 | 0.10 | 0.12 |
4 rows × 6 columns
survival_and_stats(df.ix['mirna'].ix['hsa-mir-3170'].dropna(), surv, figsize=(6,4))
hits.ix['clinical']['p'].sort('uncorrected').head(4)
uncorrected | bh_within | bh_all | bonf_all | bonf_within | two_step | |
---|---|---|---|---|---|---|
recent_smoker | 4.70e-06 | 6.11e-05 | 0.00 | 0.00 | 6.11e-05 | 3.06e-04 |
pre_2000 | 7.78e-05 | 5.06e-04 | 0.01 | 0.07 | 1.01e-03 | 2.53e-03 |
spread | 1.79e-04 | 7.77e-04 | 0.01 | 0.16 | 2.33e-03 | 3.88e-03 |
lymph_n2+ | 4.52e-04 | 1.47e-03 | 0.02 | 0.40 | 5.87e-03 | 7.34e-03 |
4 rows × 6 columns
It has previously been shown that genetic alterations often act in redundant or synergistic mechanisms to confer a growth advantage in the tumor. Under the hypothesis that such prognostic biomarkers might act in concert, we next examined the 78 primary events for pairwise association.
df = df.replace({'no': False, 'yes':True})
hit_df = df.ix[hits.index].astype(float)
fet_within = pd.DataFrame({(a, b): fisher_exact_test(v1, v2)
for a, v1 in hit_df.iterrows()
for b, v2 in hit_df.iterrows()
if a[0] == b[0]
and a[1] > b[1]}).T
fet_within.sort('p').head()
odds_ratio | p | |
---|---|---|
((cna, del_3p14.3), (cna, del_3p14.2)) | 8976.00 | 9.28e-47 |
((cna, del_11q23.1), (cna, del_11q14.2)) | 147.54 | 7.00e-39 |
((cna, del_3p25.3), (cna, del_3p14.3)) | 468.22 | 7.23e-37 |
((cna, del_3p25.3), (cna, del_3p14.2)) | 273.00 | 2.03e-34 |
((cna, del_3p14.2), (cna, del_3p12.2)) | 225.17 | 4.28e-33 |
5 rows × 2 columns
Here we tag these redundant features and remove them from the test space.
s = {b for i, (a, v1) in enumerate(hit_df.iterrows())
for j, (b, v2) in enumerate(hit_df.iterrows())
if (i < j)
and a[0] == b[0]
and np.log2(fisher_exact_test(v1, v2)['odds_ratio']) > 4}
hit_df = hit_df.ix[[i for i in hit_df.index if i not in s]] #to keep sorted order
There are 5 choose 2 = 10 different combinations of data-types. We are looking for associations which can not be explained trivially, for example a copy number event associated with expression of a nearby gene or mirna isn't very interesting.
hit_df.groupby(level=0).size()
clinical 6 cna 7 mirna 9 mutation 5 rna 30 dtype: int64
import itertools as itertools
fet= {}
for dtypes in itertools.combinations(hit_df.index.levels[0], 2):
fet[dtypes] = pd.DataFrame({(a[1], b[1]): fisher_exact_test(v1, v2)
for a, v1 in hit_df.iterrows()
for b, v2 in hit_df.iterrows()
if a[0] != b[0]
if a[0] == dtypes[0]
if b[0] == dtypes[1]}).T
fet = pd.concat(fet)
fet.index = pd.MultiIndex.from_tuples([(i[0], i[2][0], i[1], i[2][1]) for i in fet.index])
fet['p bonf.'] = fet['p'] * len(fet)
fet['PASS'] = (fet['p bonf.'] < .01)
fet = fet.sort(['PASS','p'], ascending=[0,1])
fet.PASS.value_counts()
False 1046 True 33 dtype: int64
clinical.binary_df.ix['stage_iv'].value_counts(0)
False 222 dtype: int64
df.shape
(878, 250)
pd.crosstab(combo, clinical.binary_df.ix['stage_iv'])
stage_iv | False |
---|---|
TP53 | |
TP53 | 22 |
both | 157 |
del_3p14.2 | 22 |
neither | 21 |
4 rows × 1 columns
fet.to_csv(FIGDIR + 'supplemental_table2.csv', float_format='%.2e')
f3 = fet.groupby(level=[0,2]).apply(lambda s: s.sort('p').head(1))
f3.index = f3.index.droplevel([0,1])
f3['p_corrected'] = f3['p'] * len(fet)
f3.index = pd.MultiIndex.from_tuples([tuple((i.replace('_',' ') for i in s))
for s in f3.index])
f3.sort('p')
odds_ratio | p | p bonf. | PASS | p_corrected | ||||
---|---|---|---|---|---|---|---|---|
cna | del 3p14.2 | rna | BIOCARTA AKAP13 PATHWAY | 15.94 | 4.53e-14 | 4.88e-11 | True | 4.88e-11 |
clinical | recent smoker | cna | del 3p14.2 | 7.38 | 1.72e-08 | 1.85e-05 | True | 1.85e-05 |
rna | ST GAQ PATHWAY | 4.73 | 7.95e-08 | 8.58e-05 | True | 8.58e-05 | ||
mirna | hsa-mir-3170 | rna | SIG PIP3 SIGNALING IN B LYMPHOCYTES | 0.20 | 2.11e-07 | 2.27e-04 | True | 2.27e-04 |
cna | del 3p14.2 | mutation | TP53 | 6.59 | 3.56e-07 | 3.84e-04 | True | 3.84e-04 |
mutation | TP53 | rna | QARS | 0.17 | 7.12e-07 | 7.68e-04 | True | 7.68e-04 |
cna | del 3p14.2 | mirna | hsa-mir-3170 | 5.43 | 5.15e-06 | 5.56e-03 | True | 5.56e-03 |
clinical | invasion | mirna | hsa-mir-656 | 4.28 | 5.02e-05 | 5.42e-02 | False | 5.42e-02 |
mirna | hsa-mir-570 | mutation | REACTOME SIGNALLING TO RAS | 0.23 | 4.63e-04 | 4.99e-01 | False | 4.99e-01 |
clinical | lymph n2+ | mutation | TP53 | 3.38 | 3.14e-03 | 3.39e+00 | False | 3.39e+00 |
10 rows × 5 columns
3p14.2 Deletion is associated with change of the AKAP13 pathay. Included is RHOA and PRKAR2A which are both on 3p. Not very interesting...
fig, axs = subplots(1,2, figsize=(10,4))
violin_plot_pandas(df.ix['cna'].ix['del_3p14.2'], rna.pathways.ix['BIOCARTA_AKAP13_PATHWAY'],
ax=axs[0])
rna.loadings.ix['BIOCARTA_AKAP13_PATHWAY'].order().plot(kind='bar', ax=axs[1])
for ax in axs:
prettify_ax(ax)
pd.crosstab(df.ix['clinical'].ix['recent_smoker'], df.ix['cna'].ix['del_3p14.2'])
del_3p14.2 | 0.0 | 1.0 |
---|---|---|
recent_smoker | ||
False | 32 | 52 |
True | 12 | 144 |
2 rows × 2 columns
Interesting trend between smoking and GAQ signaling, but looks more like a phenotype of smoking rather than having to do with tumor progression.
fig, axs = subplots(1,2, figsize=(10,4))
violin_plot_pandas(df.ix['clinical'].ix['recent_smoker'],
rna.pathways.ix['SIG_PIP3_SIGNALING_IN_B_LYMPHOCYTES'], ax=axs[0])
rna.loadings.ix['SIG_PIP3_SIGNALING_IN_B_LYMPHOCYTES'].order().plot(kind='bar', ax=axs[1])
for ax in axs:
prettify_ax(ax)
venn_pandas(df.ix['mutation'].ix['TP53'], df.ix['cna'].ix['del_3p14.2']);
len(keepers_o)
250
Adjusting for the test space we get a p-value of ~10-4.
fisher_exact_test(df.ix['mutation'].ix['TP53'], df.ix['cna'].ix['del_3p14.2'])
odds_ratio 6.59e+00 p 3.56e-07 dtype: float64
fisher_exact_test(df.ix['mutation'].ix['TP53'], df.ix['cna'].ix['del_3p14.2']).p * len(fet)
0.00038427707515341651
combo = combine(df.ix['mutation'].ix['TP53'] > 0, df.ix['cna'].ix['del_3p14.2'])
survival_and_stats(combo, clinical.survival.survival_5y, figsize=(6,4))
get_surv_fit_lr(surv, combo)
Stats | Median Survival | 5y Survival | Log-Rank | |||||||
---|---|---|---|---|---|---|---|---|---|---|
# Patients | # Events | Median | Lower | Upper | Surv | Lower | Upper | chi2 | p | |
19.3 | 0.000236 | |||||||||
both | 179 | 87 | 1.9 | 1.5 | 2.84 | 0.339 | 0.255 | 0.45 | ||
del_3p14.2 | 26 | 7 | NaN | 2.96 | NaN | 0.621 | 0.419 | 0.921 | ||
TP53 | 23 | 5 | NaN | 4.49 | NaN | 0.62 | 0.374 | 1 | ||
neither | 22 | 3 | NaN | 4.71 | NaN | 0.709 | 0.444 | 1 |
5 rows × 10 columns
There is a lot of literature out there implicating TP53 with chromosomal instability... are we picking up an artifact of this?
two_hit = combo == 'both'
p53_mut = df.ix['mutation'].ix['TP53']
del_3p = df.ix['cna'].ix['del_3p14.2']
two_hit.name = 'TP53_3p'
survival_and_stats(two_hit, surv)
m1 = get_cox_ph(surv, two_hit, [age, old], interactions='None', print_desc=True)
coef exp(coef) se(coef) z p feature 1.203 3.33 0.2814 4.28 1.9e-05 old 0.262 1.30 0.0873 3.00 2.7e-03 Likelihood ratio test=29.1 on 2 df, p=4.9e-07 n= 250, number of events= 102 (128 observations deleted due to missingness)
m2 = get_cox_ph(surv, p53_mut, [age, old], interactions='None', print_desc=True)
coef exp(coef) se(coef) z p feature 1.09 2.97 0.334 3.26 0.0011 old 0.23 1.26 0.087 2.64 0.0082 Likelihood ratio test=19.6 on 2 df, p=5.44e-05 n= 250, number of events= 102 (128 observations deleted due to missingness)
LR_test(m1, m2, df=1)
0.0021463124501954031
m3 = get_cox_ph(surv, del_3p, [age, old], interactions='None', print_desc=True)
coef exp(coef) se(coef) z p feature 1.403 4.07 0.3726 3.76 0.00017 old 0.294 1.34 0.0879 3.35 0.00082 Likelihood ratio test=26.1 on 2 df, p=2.18e-06 n= 250, number of events= 102 (128 observations deleted due to missingness)
LR_test(m1, m3, df=1)
0.084060129965569844
cox(two_hit.ix[true_index(p53_mut==1)].ix[ti(old==False)], surv)
hazard exp(coef) 5.58 exp(-coef) 0.18 lower .95 1.37 upper .95 22.74 LR stat 10.37 df 1.00 p 0.00 concordance stat 0.55 se 0.02 dtype: float64
surv_test(two_hit.ix[true_index(p53_mut==1)], surv, [age, old])
LR 0.00239 feature_p 0.0109 fmla Surv(days, event) ~ feature + old\n hazzard 3.25 dtype: object
surv_test(two_hit.ix[true_index(del_3p==1)], surv, [age, old])
LR 0.0186 feature_p 0.05 fmla Surv(days, event) ~ feature + old + feature:old\n hazzard 2.25 dtype: object
r = screen_feature(two_hit, fisher_exact_test, df.ix['mutation'])
r.head()
odds_ratio | p | q | |
---|---|---|---|
TP53 | inf | 3.26e-34 | 3.94e-32 |
KEGG_CELL_CYCLE | inf | 2.92e-13 | 1.77e-11 |
REACTOME_SOS_MEDIATED_SIGNALLING | 0.11 | 2.50e-06 | 1.01e-04 |
REACTOME_SHC_MEDIATED_SIGNALLING | 0.12 | 7.72e-06 | 2.34e-04 |
CASP8 | 0.13 | 2.32e-05 | 5.61e-04 |
5 rows × 3 columns
len(r)
121
r.ix['REACTOME_SOS_MEDIATED_SIGNALLING'].p * len(r)
0.00030230943613944141
r.ix['CASP8'].p * len(r)
0.0028054320178764786