import NotebookImport from Imports import * meta = pd.read_csv('../Extra_Data/UPMC_cohort/meta.csv', index_col=0) meta.index = pd.MultiIndex.from_tuples([('-'.join(i.split('-')[:-1]), i.split('-')[-1]) for i in meta.index]) clin = pd.read_csv('../Extra_Data/UPMC_cohort/pitt_broad_data.csv', index_col=0) clin.Tumor_type = clin.Tumor_type.map(str.strip) surv = pd.concat([clin.os_5yr, clin.os_5yr_mons*30.5], keys=['event','days'], axis=1).stack() clin2 = pd.read_csv('../Extra_Data/UPMC_cohort/pitt_broad_data_update.csv', index_col=0) surv2 = pd.concat([clin2.os_5yr, clin2.os_5yr_mons*30.5], keys=['event','days'], axis=1).stack() surv = surv2.combine_first(surv) keepers = ti(clin.HPV == 'Negative') cc = clin.ix[true_index(meta.Final_74_exome_analysis[:,'Tumor']>0)] cc = cc.ix[:,cc.apply(pd.value_counts).count().isin([2,3,4])] cc = cc.dropna(1) cox_screen(cc.T, surv)[7:].head(10) f = clin.Primary_site.ix[keepers] f = f.replace('Hypopharynx','Other').replace('Sinonasal','Other') get_surv_fit_lr(surv, f) f = clin.Nodal_stage.ix[keepers].fillna('.') f = f.replace('N2A','N2').replace('N2B','N2').replace('N2C','N2') get_surv_fit_lr(surv, f) survival_and_stats(f.isin(['N2','N3']), surv) f = clin.clinical_stage.ix[keepers] f = f.replace('IVa','IV').replace('IVb','IV').replace('.', 'Unstaged').fillna('Unstaged') get_surv_fit_lr(surv, f) f = clin.Alcohol_amt.fillna(0).ix[keepers].replace('.', 0).astype(float) > 13 f2 = clin.Alcohol_amt.fillna(0).ix[keepers].replace('.', 0).astype(float) < 7 get_surv_fit_lr(surv, 1.*f - 1.*f2) get_surv_fit_lr(surv, clin.Tob_history.ix[keepers]) get_surv_fit_lr(surv, clin.PNI.ix[keepers]) survival_and_stats(clin['Chemo '], surv, figsize=(5,4), title='Chemo') get_surv_fit_lr(surv, clin.Date_Dx.map(lambda s: s[-2:]).ix[keepers].astype(int)>10) survival_and_stats(clin.EPS, surv, figsize=(5,4), title='EPS') survival_and_stats(clin.HPV, surv, figsize=(5,4), title='HPV') survival_and_stats(to_quants(clin.Age_Dx, q=.33, labels=True), surv, figsize=(5,4), title='Age') maf = pd.read_csv('../Extra_Data/UPMC_cohort/maf.csv') maf = maf.dropna(how='all', axis=[0,1]) maf['Tumor_Sample_Barcode'] = maf['Tumor_Sample_Barcode'].str.replace('-Tumor','').copy() maf = maf.set_index(['Hugo_Symbol','Tumor_Sample_Barcode']) maf = maf[maf.Validation_Status != 'Wildtype'] non_silent = maf[maf.Variant_Classification != 'Synonymous'] non_silent['counter'] = 1 hit_matrix = non_silent.counter.groupby(level=[0,1]).sum().unstack() hit_matrix.columns = map(lambda s: s.replace('-Tumor',''), hit_matrix.columns) hit_matrix = hit_matrix.fillna(0) fig, axs = subplots(1,3, figsize=(8,3), sharey=True) c = pd.concat([(mut.df > 0).sum(), (hit_matrix > 0).sum(0)], keys=['TCGA','UPMC'], axis=1).stack() violin_plot_series(c.clip_upper(500), ax=axs[0]) hh = hit_matrix.ix[:, true_index(clin.HPV == 'Negative')].sum() yy = mut.df.ix[:, true_index(clinical.hpv.dropna() == False)].sum() c = pd.concat([yy,hh], keys=['TCGA','UPMC'], axis=1).stack() violin_plot_series(c.clip_upper(500), ax=axs[1]) hh = hit_matrix.ix[:, true_index(clin.HPV == 'Positive')].sum() yy = mut.df.ix[:, true_index(clinical.hpv.dropna())].sum() c = pd.concat([yy,hh], keys=['TCGA','UPMC'], axis=1).stack() violin_plot_series(c.clip_upper(500), ax=axs[2]) for ax in axs: prettify_ax(ax) ax.set_ylabel('') ax.set_xlabel('Study') axs[0].set_ylabel('# of Mutations') fig.tight_layout() fig.savefig(FIGDIR + 'mutation_rate_comparison.pdf') hh = hit_matrix.ix[:, true_index(clin.HPV == 'Negative')].sum() yy = mut.df.ix[:, true_index(clinical.hpv.dropna() == False)].sum() c = pd.concat([yy,hh], keys=['TCGA','UPMC'], axis=1).stack() stats.mannwhitneyu(hh.dropna(), yy.dropna()) yy = mut.df.ix[:, true_index(clinical.hpv.dropna() == False)].sum() yy.median() hh = hit_matrix.ix[:, true_index(clin.HPV == 'Negative')].sum() hh.median() p53_mut = hit_matrix.ix['TP53'].ix[true_index(clin.HPV=='Negative')] > 0 survival_and_stats(p53_mut, surv) import re as re get_nums = lambda s: re.findall(r'\d+', s) def is_disruptive(v): c = v.Variant_Classification if c != 'Missense': if 'Ins' in c or 'Del' in c: return 'InDel' else: return v.Variant_Classification.split('_')[0] else: s = v.Protein_Change aa = int(get_nums(s)[0]) if int(aa) in range(163,196): return 'L2' if int(aa) in range(236, 252): return 'L3' return 'other' p53 = maf.ix['TP53'].reset_index() dd = p53.apply(is_disruptive, 1) dd = dd.replace('Synonymous',nan).dropna() others = hit_matrix.columns.diff(p53.Tumor_Sample_Barcode.ix[dd.index]) dd.index = p53.Tumor_Sample_Barcode.ix[dd.index] dd = pd.concat([pd.Series('WT', others), dd]) dd = dd.ix[keepers] s2 = surv.unstack().ix[dd.index] s2.index = range(len(dd)) s2 = s2.stack() dd.index = range(len(dd)) survival_and_stats(dd, s2, colors=colors[:6] + ['grey'] + colors[6:], figsize=(7,6)) v = maf.ix['TP53'].Validation_Status == 'Valid' v = v.groupby(level=0).sum() > 0 v.name = 'valid' v = v.ix[hit_matrix.columns] v = v.fillna('WT') v.value_counts() draw_survival_curve(v.ix[true_index(clin.HPV=='Negative')], surv) survival_and_stats(hit_matrix.ix['MUC5B'].ix[true_index(clin.HPV=='Negative')]>0, surv) cn_tumor = meta.xs('Tumor', level=1)['SNP array ID'].dropna() lu = pd.Series({v:i for i,v in cn_tumor.iteritems()}) path = '../Extra_Data/UPMC_cohort/' gistic = pd.read_table(path + 'all_thresholded.by_genes.txt', index_col=[0,1, 2]) gistic.index = gistic.index.droplevel(1) cn_upmc = pd.DataFrame({i: gistic[t] for i,t in cn_tumor.iteritems() if t in gistic}) pct_del = (cn_upmc < 0).sum() / (1.*len(cn_upmc)) pct_amp = (cn_upmc > 0).sum() / (1.*len(cn_upmc)) pct_altered = pct_amp + pct_del pct_altered.name = 'CIN_pct_altered' survival_and_stats(to_quants(pct_altered, q=.33, labels=True), surv) del_3p = cn_upmc.xs('3p14.2',0,1).median().ix[true_index(clin.HPV=='Negative')].dropna() < 0 b = combine(p53_mut, del_3p).index keepers = meta['SNP array ID'].notnull() & (meta['Final_74_exome_analysis'] == 1) keepers = keepers.groupby(level=0).sum() keepers = (keepers > 0) & (clin.HPV == 'Negative') keepers = keepers.ix[surv.index.get_level_values(0)].dropna() keepers = keepers.groupby(level=0).last() keepers.value_counts() survival_and_stats(del_3p, surv) fisher_exact_test(p53_mut, del_3p, alternative='greater') combo = combine(p53_mut, del_3p) survival_and_stats(combo, surv, figsize=(5,4), title='PITT') pd.crosstab(clin.Alcohol, clin.Alcohol_amt.fillna(0)) alch = clin.Alcohol_amt.fillna(0).replace('.', nan).astype(float) alch.hist() get_surv_fit_lr(surv, (alch.ix[combo.index].dropna() > 10).fillna('M')) get_surv_fit_lr(surv, clin.Smoking_history.ix[combo.index]) p2 = p53_mut p2.name = 'TP53' p2 = p2.map({True:'Mut.',False:'WT'}) draw_survival_curves(del_3p.map({0:'3p +/+', 1:'3p +/-'}), surv, p2, legend=True) get_cox_ph(surv, del_3p.ix[true_index(p53_mut)], print_desc=True, interactions=False); exp(1.92), exp(1.92) - exp(1.92 - 1.03) pd.crosstab(p53_mut, del_3p.ix[p53_mut.index].fillna('M')) two_hit_both = combine(p53_mut>0, del_3p) == 'both' no_cna = true_index(del_3p.ix[true_index(p53_mut==False)].isnull()) no_cna = pd.Series(False, index=no_cna) two_hit = two_hit_both.append(no_cna).ix[keepers].dropna() survival_and_stats(two_hit.ix[keepers].dropna(), surv) %load_ext rmagic coin = robjects.packages.importr('coin') th2 = two_hit[true_index(clin.Age_Dx < 75)].dropna() time = robjects.r.c(*list(surv.unstack().days.ix[th2.index])) event = robjects.r.c(*list(surv.unstack().event.ix[th2.index])) g = robjects.r.c(*list(th2*1.)) %%R -i time,event,g exdata <- data.frame(time = time, event = event, group = factor(g)) surv_test(Surv(time, event) ~ group, data=exdata, distribution=exact(), alternative='less') f = del_3p.ix[true_index(p53_mut)].dropna()*1. f = f.ix[clin.Age_Dx < 77] g = robjects.r.c(*list(f)) time = robjects.r.c(*list(surv.unstack().days.ix[f.index])) event = robjects.r.c(*list(surv.unstack().event.ix[f.index])) cox(f, surv) %%R -i time,event,g exdata <- data.frame(time = time, event = event, group = factor(g)) surv_test(Surv(time, event) ~ group, data=exdata, distribution=exact(), alternative='less') f = p53_mut.ix[true_index(del_3p)].dropna()*1. g = robjects.r.c(*list(f)) time = robjects.r.c(*list(surv.unstack().days.ix[f.index])) event = robjects.r.c(*list(surv.unstack().event.ix[f.index])) %%R -i time,event,g exdata <- data.frame(time = time, event = event, group = factor(g)) surv_test(Surv(time, event) ~ group, data=exdata, distribution=exact(), alternative='less') cn_upmc = cn_upmc.ix[:, two_hit.index] survival_and_stats(cn_upmc.ix['MIR548K'].ix[0].dropna()>0, surv) st = (two_hit + 1).add(2*(cn_upmc.ix['MIR548K'].ix[0] > 0)).replace(3,0).replace(4,3) st = st.dropna() mir = cn_upmc.ix['MIR548K'].ix[0].ix[two_hit.index] eq = cn_upmc.ix['MIR548K'].ix[0].ix[two_hit.index] #eq = cn_upmc.xs('11q13.3',0,1).median() st = (two_hit + 1).add(2*(eq > 1)).replace(3,1).replace(4,3) st = st.dropna() survival_and_stats(st, surv) get_cox_ph(surv, st, print_desc=True, interactions=False); cn_upmc = pd.DataFrame({i: gistic[t] for i,t in cn_tumor.iteritems() if t in gistic}) cn_upmc = pd.DataFrame({i: gistic[t] for i,t in cn_tumor.iteritems() if t in gistic}) del_3p_upmc = cn_upmc.xs('3p14.2',0,1).median().ix[true_index(clin.HPV=='Positive')].dropna() surv_combined = pd.concat([surv, clinical.survival.survival_5y]) cn_2 = cn_upmc.groupby(level=1).median().ix[:, true_index(clin.HPV=='Positive')].dropna(1, how='all') cn_3 = cn.features.copy().ix[:, ti(clinical.hpv)] cn_3.index = cn_3.index.get_level_values(1) cn_4 = cn_2.join(cn_3) cn_4 = cn_4.dropna(1, how='all').dropna(how='any') cn_d = (cn_4 < 0) cn_d = cn_d[cn_d.sum(1) > 0] cn_a = (cn_4 > 0) cn_a = cn_a[cn_a.sum(1) > 0] cn_all = pd.concat([cn_a, cn_d], keys=['Amplification','Deletion']) cox_screen(cn_all, surv_combined).sort([('LR','p')]).head() del_3p_all = cn_all.ix['Deletion'].ix['3p14.2'] p53_upmc = hit_matrix.ix['TP53', true_index(clin.HPV=='Positive')].dropna() p53_tcga = mut.features.ix['TP53', true_index(clinical.hpv==True)] p53_all = (mut.features.ix['TP53'].append(hit_matrix.ix['TP53']>0)).map({True:'mut',False:'wt'}) hpv_all = clinical.hpv.append(clin.HPV=='Positive') pd.crosstab(hpv_all, p53_all) 1 / fisher_exact_test(hpv_all, p53_all)['odds_ratio'] fisher_exact_test(hit_matrix.ix['TP53']>0, clin.HPV=='Positive') print len(combine(hit_matrix.ix['TP53']>0, clin.HPV=='Positive')) pd.crosstab(hit_matrix.ix['TP53']>0, clin.HPV) fisher_exact_test(hpv_all, p53_all) survival_and_stats(del_3p_all, surv_combined) log_rank(del_3p_all, surv_combined).ix['p'] get_cox_ph(surv_combined, del_3p_all, print_desc=True); exp(1.71), exp(1.71) - exp(1.71 - .663) p53_mut.name = 'TP53' del_3p.name = '3p' combo = combine(p53_mut, del_3p) fig, ax = subplots(figsize=(3,2.5)) c = {'TP53': colors_th[1], 'both': colors_th[0]} draw_survival_curve(combo[combo.isin(['both','TP53'])], surv, colors=c, ax=ax) ax.legend(loc='lower left', frameon=False) prettify_ax(ax) fig.tight_layout() fig.savefig(FIGDIR + 'fig3a.pdf') c = combo[combo.isin(['both','TP53'])] c = c.replace({'both': 'TP53 and 3p'}) df = pd.concat([c, surv[:,'days'], surv[:,'event']], keys=['Molecular Status', 'Days to Death/Censoring', 'Death Indicator'], axis=1).dropna() df.to_csv(FIGDIR + 'fig3a.csv') fig, ax = subplots(figsize=(3,2.5)) surv_fit = get_surv_fit(surv, combo) tt = surv_fit['5y Survival'] b = (tt['Surv']).plot(kind='bar', ax=ax, color=[colors_th[2], colors_th[0], colors_th[3], colors_th[1]], yerr=[tt.Surv-tt.Lower, tt.Upper-tt.Surv], ecolor='black') ax.set_ylabel('5Y Survival') ax.set_yticks([0, .5, 1.0]) bar_labels = ['{}\n\n{}'.format(j,int(v)) for j,v in surv_fit['Stats']['# Patients'].iteritems()] ax.set_xticklabels(bar_labels, rotation=0); prettify_ax(ax) fig.tight_layout() fig, ax = subplots(figsize=(3,2.5)) draw_survival_curve(del_3p_all.astype(int), surv_combined, ax=ax, colors=[colors_th[3], colors_th[2]]) ax.legend().set_visible(False) prettify_ax(ax) fig.tight_layout() fig.savefig(FIGDIR + 'fig3d.pdf') df = pd.concat([del_3p_all, surv_combined[:,'days'], surv_combined[:,'event']], keys=['3p Deletion', 'Days to Death/Censoring', 'Death Indicator'], axis=1).dropna() df.to_csv(FIGDIR + 'fig3ba.csv')