%pylab inline
Populating the interactive namespace from numpy and matplotlib
cd ../src
/cellar/users/agross/TCGA_Code/TCGA/src
from Processing.Imports import *
from Figures.Survival import survival_and_stats
from Processing.Screen import *
params = pd.read_table('../global_params.txt', header=None, squeeze=True,
index_col=0)
run_path = '{}/Firehose__{}/'.format(params.ix['OUT_PATH'], params.ix['RUN_DATE'])
run = get_run(run_path, 'Run_' + params.ix['VERSION'])
cancer = run.load_cancer(params.ix['CANCER'])
clinical = cancer.load_clinical()
mut = cancer.load_data('Mutation')
mut.uncompress()
cn = cancer.load_data('CN_broad')
cn.uncompress()
rppa = cancer.load_data('RPPA')
rna = pickle.load(open(cancer.path + '/mRNASeq/store/no_hpv.p', 'rb'))
mirna = pickle.load(open(cancer.path + '/miRNASeq/store/no_hpv.p', 'rb'))
#meth = cancer.load_data('Methylation')
clinical_processed = clinical.processed
#clinical_processed = clinical_processed.replace('yes', 1.).replace('no', 0.)
clinical_processed['year'] = clinical_processed.year == 'post_2000'
hpv_inferred = clinical_processed.hpv_inferred.astype(int)
surv = clinical.survival.survival_5y
age = clinical.clinical.age.astype(float)
old = pd.Series(1.*(age>=75), name='old')
keepers_o = true_index(hpv_inferred == 0)
keepers_o = keepers_o.intersection(mut.features.columns)
keepers_o = keepers_o.intersection(cn.features.columns)
keepers_o = keepers_o.intersection(surv.unstack().index)
keepers_o = keepers_o.intersection(rna.features.columns)
keepers_o = keepers_o.intersection(mirna.features.columns)
keepers_o = keepers_o.intersection(true_index(age < 85))
n0 = clinical_processed.lymph_stage == 'n0'
n0.name = 'lymph_n0'
s4 = clinical_processed.stage == 'stge iv'
s4.name = 'Stage_IV'
oc = clinical_processed.tumor_subdivision == 'oral cavity'
oc.name = 'oral_cavity'
lx = clinical_processed.tumor_subdivision == 'larynx'
lx.name = 'larynx'
ox = clinical_processed.tumor_subdivision == 'oropharynx'
ox.name = 'oropharynx'
year = clinical_processed.year
white = clinical.clinical.race == 'white'
white.name = 'race_white'
gender = clinical.clinical.gender == 'male'
gender.name = 'gender_male'
inferred = clinical_processed[['drinker_inferred','invasion_inferred',
'smoker_inferred','spread_inferred']]
clinical_df = pd.concat([n0, s4, oc, lx, ox, year, white, gender], 1)
clinical_df = pd.concat([clinical_df, inferred], 1)
from Processing.Screen import *
def test(s, surv, cov_df):
s = s.dropna()
try:
return get_cox_ph_ms(surv, s, cov_df, return_val='LR', interactions='just_feature')
except:
return pd.Series(index=['LR','feature_p', 'fmla', 'hazzard'])
def run_screen(screen, filters, covariates, save=True):
cov_df = pd.concat(covariates, axis=1)
keepers_o = screen.get_patient_set(filters)
cutoff = max(np.ceil(len(keepers_o) * .05), 12)
df = screen.get_data(keepers_o, cutoff)
univariate = cox_screen(df, surv)
vec = univariate.LR.p.sort_index()
univariate = pd.concat([univariate['hazard'], corrections(vec)],
keys=['hazard', 'p'], axis=1)
#hits = univariate[univariate['q_bh'] < .2].index
hits = univariate.index
full = df.ix[hits].apply(test, args=(surv, cov_df,), axis=1)
vec = full.LR.ix[univariate.index].sort_index()
full = pd.concat([full[['fmla']], corrections(vec)],
keys=['fmla','p'], axis=1)
hits = true_index(full.p.bh_all.order() < .1)
res = full.sort([('p','uncorrected')]).head()
return res, full, univariate, keepers_o, df
rna
mRNASeq dataset
screen = Screen(mut, cn, rna, mirna, clinical_df, surv)
p53_mut = mut.features.ix['TP53'].ix[keepers_o]
del_3p = cn.features.ix[('Deletion', '3p14.2', 'Lesion')].ix[keepers_o]
two_hit = combine(p53_mut > 0, del_3p < 0) == 'both'
res, full, univariate, keepers_o, df = run_screen(screen, filters=[hpv_inferred==1, two_hit==0, age>=85],
covariates=[old, age])
(full.p.bh_all < .1).groupby(level=0).apply(pd.value_counts).unstack()
False | True | |
---|---|---|
clinical | 9 | 1 |
cna | 70 | NaN |
mirna | 227 | 1 |
mutation | 175 | 4 |
rna | 423 | 2 |
5 rows × 2 columns
(full.p.bh_all < .1).value_counts()
False 904 True 8 dtype: int64
muc5b = df.ix['mutation'].ix['MUC5B']>0
survival_and_stats(df.ix['mutation'].ix['MUC5B']>0, surv)
res, full, univariate, keepers_o, df = run_screen(screen, filters=[hpv_inferred==1, two_hit==0],
covariates=[old, age, year])
df.shape
(912, 179)
rr = cox_screen(df.ix[full.p.bh_all < .1], surv)
hits = full.ix[true_index(rr.LR.p < .05)]
hits = hits.sort([('p','uncorrected')])
len(hits)
7
hits.p.head(4)
uncorrected | bh_within | bh_all | bonf_all | bonf_within | two_step | ||
---|---|---|---|---|---|---|---|
mirna | hsa-mir-548k | 3.40e-05 | 0.01 | 0.02 | 0.03 | 0.01 | 0.03 |
hsa-mir-629 | 5.23e-05 | 0.01 | 0.02 | 0.05 | 0.01 | 0.03 | |
mutation | MUC5B | 3.71e-04 | 0.02 | 0.05 | 0.34 | 0.07 | 0.11 |
clinical | smoker_inferred | 5.69e-04 | 0.01 | 0.06 | 0.52 | 0.01 | 0.03 |
4 rows × 6 columns
mir548k = df.ix['mirna'].ix['hsa-mir-548k']
survival_and_stats(mir548k, surv)
survival_and_stats(df.ix['mutation'].ix['MUC5B'], surv)
rr = cox_screen(df, surv)
haz = rr['hazard'][['exp(coef)','lower .95','upper .95']]
p = rr.LR.p
uni = haz.join(p)
uni['PASS'] = uni.p < .05
multi = full.reset_index().sort(['level_0',('p','uncorrected')]).set_index(['level_0','level_1'])
multi.index.names = ['data_type','event']
multi['p'] = multi['p'].clip_upper(1.)
multi.columns = multi.columns.droplevel(0)
multi['PASS'] = multi.bh_all < .1
f = pd.concat([uni, multi], keys=['Univariate','Multivariate'], axis=1)
f[('BOTH','PASS')] = f.Univariate.PASS & f.Multivariate.PASS
f = f.sort([('BOTH','PASS'),('Multivariate','uncorrected')], ascending=[False, True])
f.to_csv('/cellar/users/agross/figures/supplemental_table2.csv', float_format='%.2e')