import NotebookImport from Imports import * rppa = cancer.load_data('RPPA') cn_by_gene = FH.get_gistic_gene_matrix(run.data_path, cancer.name) #cn.features.index = cn.features.index.droplevel(2) cin = (cn_by_gene.groupby(level='Cytoband').median().abs() >= 1).sum() cn_by_gene.index = map(lambda s: s.split('.')[0], cn_by_gene.index.get_level_values(0)) cin = (cn_by_gene.groupby(level=0).median().abs() >= 1).sum() cn_by_gene = FH.get_gistic_gene_matrix(run.data_path, cancer.name) cn_by_gene = cn_by_gene.ix[[i for i in cn_by_gene.index if i[2] in rna.df.index]] pct_del = (cn_by_gene < 0).sum() / (1.*len(cn_by_gene)) pct_amp = (cn_by_gene > 0).sum() / (1.*len(cn_by_gene)) pct_altered = pct_amp + pct_del pct_altered.name = 'CIN' draw_survival_curves(pct_altered, surv, hpv.dropna()) violin_plot_pandas(hpv, pct_altered) df = mut.df.ix[(mut.df.sum(1).order() > 5)] tt = lambda a,b: kruskal_pandas(b,a) s = screen_feature(cin, tt, df.clip_upper(1.)) s.head(10) df = mut.df.ix[(mut.df.sum(1).order() > 5)] tt = lambda a,b: kruskal_pandas(b,a) s = screen_feature(pct_altered, tt, df.clip_upper(1.)) s.head(10) df = cn.features.ix['Deletion'] df.index = df.index.get_level_values(0) screen_feature(pct_altered.ix[keepers_o], rev_kruskal, df < 0).head() df = cn.features.ix['Amplification'] df.index = df.index.get_level_values(0) screen_feature(pct_altered.ix[keepers_o], tt, df > 0).head() fig, axs = subplots(1,2, figsize=(12,4)) violin_plot_pandas(cn.features.ix[('Deletion','5q11.2'), keepers_o].iloc[0], pct_altered, ax=axs[0]) violin_plot_pandas(cn.features.ix[('Deletion','3p14.2'), keepers_o].iloc[0], pct_altered, ax=axs[1]) del_3p = cn.features.ix[('Deletion','3p14.2')].iloc[0] < 0 del_3p = del_3p.ix[keepers_o] del_3p.name = 'del_3p' old.name = 'age_over_75' s = screen_feature(del_3p, fisher_exact_test, cn.features.ix['Deletion'] < 0) s.head(10) args = [run.data_path] + [cancer.name] f = '{}/analyses/{}/CopyNumber_Gistic2/all_lesions.conf_99.txt'.format(*args) lesions = pd.read_table(f, index_col=0) lesions = lesions.set_index('Descriptor').select(lambda s: s.startswith('TCGA'), 1) lesions = lesions.groupby(lambda s: s[:12],1).first() lesions.index = map(str.strip, lesions.index) levels = lesions.ix['3p14.2'].ix[2].ix[keepers_o] calls = lesions.ix['3p14.2'].ix[0].ix[keepers_o] calls.name = 'del_3p' #del_3p = lesions.ix['3p14.2'].ix[0] > 0 p53_mut = mut.features.ix['TP53']>0 fig, axs = subplots(1, 2, figsize=(12,4)) levels.hist(bins=50, ax=axs[0]) s = pd.DataFrame({c: log_rank(levels < c, surv) for c in levels.order()[2:-2]}) s.ix['p'].plot(ax=axs[1]) axs[0].set_xlabel('Copy Number') axs[1].set_xlabel('Copy Number') axs[1].set_ylabel('p-value') axs[1].set_yscale('log') colors = rcParams['axes.color_cycle'] f = '../Extra_Data/FH_HNSC__4_16_all_data_thresholded_by_genes.txt' gistic = pd.read_table(f, index_col=[2, 1, 0], low_memory=False) gistic = FH.fix_barcode_columns(gistic, tissue_code='01') cn_by_gene = gistic coords = pd.read_csv('../Extra_Data/gene_coords.csv', index_col=0) #cn_by_gene = FH.get_gistic_gene_matrix(run.data_path, cancer.name) cn_by_gene = cn_by_gene.reset_index(level=[0,1]) cn_by_gene = cn_by_gene.join(coords).set_index(['chrom','start', 'Cytoband','Locus ID'], append=True) cn_by_gene = cn_by_gene.ix[:,keepers_o].dropna(axis=1) ch_3 = cn_by_gene.xs('3', level='chrom') p_arm = pd.Series(ch_3.index.get_level_values('Cytoband').map(lambda s: 'p' in s), ch_3.index) p_arm = ch_3[p_arm].sortlevel('start') p_arm = p_arm.ix[:-1] #df = rna.df.xs('01', 1, 1).ix[p_arm.index.get_level_values(0)].dropna()# #df = rna.features.ix['real'].ix[p_arm.index.get_level_values(0)].dropna() df = rna.df.xs('01', 1, 1).ix[p_arm.index.get_level_values(0)].dropna() #df = df.ix[screen_feature(del_3p, kruskal_pandas, df).p < .05] df = df.ix[:,keepers_o].dropna(1) rr = screen_feature(p53_mut, fisher_exact_test, p_arm < 0) rr2 = screen_feature(rna.df.ix['TP53'][:,'01'], pearson_pandas, df) rr2 = screen_feature(rna.df.ix['TP53'][:,'01'], pearson_pandas, df) rr2 = rr2.ix[df.index].sort_index() rr2.index = p_arm.sort_index().ix[rr2.index].index rr2.ix['FHIT'] fig = plt.figure(figsize=(8,4)) ax1 = plt.subplot2grid((7,1), (0,0), rowspan=3, colspan=1) ax2 = plt.subplot2grid((7,1), (3,0), rowspan=3, colspan=1) ax3 = plt.subplot2grid((7,1), (6,0), rowspan=1, colspan=1) bands = p_arm.reset_index().groupby('Cytoband') bands = bands.first().start.order() starts = p_arm.index.get_level_values('start') for i,(idx,v) in enumerate(bands.iteritems()): try: r = plt.Rectangle((v,0), bands.iloc[i+1]-v, 1, lw=2, alpha=.5, fc=colors[i%2]) except: r = plt.Rectangle((v,0), starts[-1] - v, 1, lw=2, alpha=.5, fc=colors[i%2]) ax3.add_artist(r) ax3.set_xticks(bands) b = [{0:s,1:''}[(i%2)] for i,s in enumerate(bands.index)] ax3.set_xticklabels(b, rotation=270+45, ha='left') ax1.scatter(starts, (p_arm < -.1).sum(1), s=40, c=colors[2], edgecolor='grey', lw=.5, alpha=.5) ax2.scatter(rr2.index.get_level_values('start'), rr2.rho, s=40, c=colors[4], edgecolor='grey', alpha=1) #ax1.set_ybound(215,255) ax1.set_ylabel('# of Patients\nwith Deletion\n\n', ha='center', va='center') #ax2.set_ybound(-.5,4.) ax2.set_ylabel('mRNA Survival\nAssociation\n(Log10 p)\n\n\n', ha='center', va='center') ax3.set_yticks([]) fig.tight_layout() for ax in [ax1, ax2, ax3]: ax.set_xbound(starts[0], starts[-1]) ax.set_xticks(bands) for n, ss in exp2.ix[['FHIT']].iteritems(): ax2.annotate(n[0], (n[1], ss), xytext=(15, -5), textcoords='offset points', arrowprops=dict(arrowstyle="->")) ax1.set_xticklabels('') ax2.set_xticklabels('') #ax2.set_yticks([2,3,4,5,6,7,8]) fig.tight_layout() #fig.savefig(FIGDIR + 'supp_2a.pdf', transparent=True) fig = plt.figure(figsize=(8,4)) ax1 = plt.subplot2grid((7,1), (0,0), rowspan=3, colspan=1) ax2 = plt.subplot2grid((7,1), (3,0), rowspan=3, colspan=1) ax3 = plt.subplot2grid((7,1), (6,0), rowspan=1, colspan=1) bands = p_arm.reset_index().groupby('Cytoband') bands = bands.first().start.order() starts = p_arm.index.get_level_values('start') for i,(idx,v) in enumerate(bands.iteritems()): try: r = plt.Rectangle((v,0), bands.iloc[i+1]-v, 1, lw=2, alpha=.5, fc=colors[i%2]) except: r = plt.Rectangle((v,0), starts[-1] - v, 1, lw=2, alpha=.5, fc=colors[i%2]) ax3.add_artist(r) ax3.set_xticks(bands) b = [{0:s,1:''}[(i%2)] for i,s in enumerate(bands.index)] ax3.set_xticklabels(b, rotation=270+45, ha='left') ax1.scatter(starts, (p_arm < -.1).sum(1), s=40, c=colors[2], edgecolor='grey', lw=.5, alpha=.5) ax2.scatter(starts, rr.odds_ratio.ix[p_arm.index], s=40, c=colors[4], edgecolor='grey', alpha=.7) #ax1.set_ybound(215,255) ax1.set_ylabel('# of Patients\nwith Deletion\n\n', ha='center', va='center') #ax2.set_ybound(-.5,4.) ax2.set_ylabel('mRNA Survival\nAssociation\n(Log10 p)\n\n\n', ha='center', va='center') ax3.set_yticks([]) fig.tight_layout() for ax in [ax1, ax2, ax3]: ax.set_xbound(starts[0], starts[-1]) ax.set_xticks(bands) for n, ss in exp2.ix[['FHIT']].iteritems(): ax2.annotate(n[0], (n[1], ss), xytext=(15, -5), textcoords='offset points', arrowprops=dict(arrowstyle="->")) ax1.set_xticklabels('') ax2.set_xticklabels('') ax2.set_yticks([2,3,4,5,6,7,8]) fig.tight_layout() #fig.savefig(FIGDIR + 'supp_2a.pdf', transparent=True) levels = lesions.ix['3p14.2'].ix[2] calls = lesions.ix['3p14.2'].ix[0] calls.name = 'del_3p' cn_by_gene = FH.get_gistic_gene_matrix(run.data_path, cancer.name) cn_by_gene = cn_by_gene.reset_index(level=[0,1]) cn_by_gene = cn_by_gene.join(coords).set_index(['chrom','start', 'Cytoband','Locus ID'], append=True) cn_by_gene = cn_by_gene.ix[:,true_index(hpv==True)].dropna(axis=1) ch_3 = cn_by_gene.xs('3', level='chrom') p_arm = pd.Series(ch_3.index.get_level_values('Cytoband').map(lambda s: 'p' in s), ch_3.index) p_arm = ch_3[p_arm].sortlevel('start') p_arm = p_arm.ix[:-1] #bad annotation rna_all = cancer.load_data('mRNASeq') #df = rna_all.features.ix['real'].ix[p_arm.index.get_level_values(0)].dropna() df = rna_all.df.xs('01', 1, 1).ix[p_arm.index.get_level_values(0)].dropna() df = df.ix[:,true_index(hpv.dropna())] #df = df.ix[screen_feature(del_3p, kruskal_pandas, df).p < .05] test = lambda s: get_cox_ph_ms(surv, s, [age, old], interactions='just_first') exp = df.apply(test, axis=1) exp = exp.sort_index() exp.index = p_arm.sort_index().ix[exp.index].index exp2 = -exp.LR.map(log10) fig = plt.figure(figsize=(8,4)) ax1 = plt.subplot2grid((7,1), (0,0), rowspan=3, colspan=1) ax2 = plt.subplot2grid((7,1), (3,0), rowspan=3, colspan=1) ax3 = plt.subplot2grid((7,1), (6,0), rowspan=1, colspan=1) bands = p_arm.reset_index().groupby('Cytoband') bands = bands.first().start.order() starts = p_arm.index.get_level_values('start') for i,(idx,v) in enumerate(bands.iteritems()): try: r = plt.Rectangle((v,0), bands.iloc[i+1]-v, 1, lw=2, alpha=.5, fc=colors[i%2]) except: r = plt.Rectangle((v,0), starts[-1] - v, 1, lw=2, alpha=.5, fc=colors[i%2]) ax3.add_artist(r) ax3.set_xticks(bands) b = [{0:s,1:''}[(i%2)] for i,s in enumerate(bands.index)] ax3.set_xticklabels(b, rotation=270+45, ha='left') ax1.scatter(starts, (p_arm < -.1).sum(1), s=40, c=colors[2], edgecolor='grey', alpha=1) ax2.scatter(exp2.index.get_level_values('start'), exp2, s=40, c=colors[4], edgecolor='grey', alpha=1) #ax1.set_ybound(215,255) ax1.set_ylabel('# of Patients\nwith Deletion\n\n', ha='center', va='center') ax2.set_ybound(-.5,3) ax2.set_ylabel('mRNA Survival\nAssociation\n(Log10 p)\n\n\n', ha='center', va='center') ax3.set_yticks([]) fig.tight_layout() for ax in [ax1, ax2, ax3]: ax.set_xbound(starts[0], starts[-1]) ax.set_xticks(bands) for n, ss in exp2.order().tail(2).iteritems(): ax2.annotate(n[0], (n[1], ss), xytext=(15, -5), textcoords='offset points', arrowprops=dict(arrowstyle="->")) ax1.set_xticklabels('') ax2.set_xticklabels('') ax2.set_yticks([0,1,2,3,4,5]) fig.tight_layout() fig.savefig(FIGDIR + 'supp_2b.pdf', transparent=True) t1 = screen_feature(pct_del.ix[keepers_o], rev_kruskal, mut.df.ix[mut.df.ix[:,keepers_o].sum(1) > 5]>0) t1.head(10) len(t1) t1 = screen_feature(pct_altered.ix[keepers_o], rev_kruskal, cn.features.ix['Deletion'] < 0) t2 = screen_feature(pct_altered.ix[keepers_o], rev_kruskal, cn.features.ix['Amplification'] > 0) del t1['q'] t1['p_bonf'] = t1.p * len(t1) del t2['q'] t2['p_bonf'] = t2.p * len(t2) pd.concat([t1.head(6), t2.head(6)], keys=['Deletion', 'Amplification']) len(t2) len(t1) t3 = screen_feature(p53_mut.ix[keepers_o], fisher_exact_test, cn.features.ix['Deletion'] < 0).head(5) t4 = screen_feature(p53_mut.ix[keepers_o], fisher_exact_test, cn.features.ix['Amplification'] > 0).head() pd.concat([t3, t4], keys=['Deletion', 'Amplification']) kruskal_pandas(p53_mut.ix[keepers_o], pct_del) fig, axs = subplots(2,1, figsize=(4,6)) p53_mut = mut.features.ix['TP53'].ix[keepers_o].map({0:'TP53_WT', 1:'TP53_mut'}) violin_plot_pandas(p53_mut, pct_del, ax=axs[0]) axs[0].set_ylabel('% of Genes Deleted') violin_plot_pandas(p53_mut, pct_amp, ax=axs[1]) axs[1].set_ylabel('% of Genes Amplified') for ax in axs: prettify_ax(ax) ax.set_xlabel('') fig.tight_layout() fig.savefig(FIGDIR + 'p53_cin_supp.pdf', transparent=True) violin_plot_pandas(del_3p, pct_altered) del_3p.name = 'del_3p' pct_del.name = 'CIN' fmla = 'Surv(days, event) ~ del_3p + CIN + age_over_75 + age' m1 = get_cox_ph(surv, covariates=[del_3p, pct_del, old, age], formula=fmla, print_desc=True, interactions=False); fmla = 'Surv(days, event) ~ CIN + age_over_75 + age' m2 = get_cox_ph(surv, covariates=[del_3p, pct_del, old, age], formula=fmla, print_desc=True, interactions=False); LR_test(m1, m2) del_3p.name = 'del_3p' p53_mut = mut.features.ix['TP53'].ix[keepers_o] fmla = 'Surv(days, event) ~ del_3p + CIN + age_over_75 + age' m1 = get_cox_ph(surv, covariates=[del_3p.ix[ti(p53_mut==1)], pct_del, old, age], formula=fmla, print_desc=True, interactions=False); fmla = 'Surv(days, event) ~ CIN + age_over_75 + age' m2 = get_cox_ph(surv, covariates=[del_3p.ix[ti(p53_mut==1)], pct_del, old, age], formula=fmla, print_desc=True, interactions=False); LR_test(m1, m2) fmla = 'Surv(days, event) ~ del_3p + CIN + age_over_75 + age' m1 = get_cox_ph(surv, covariates=[del_3p.ix[ti(p53_mut==0)], pct_del, old, age], formula=fmla, print_desc=True, interactions=False); fmla = 'Surv(days, event) ~ CIN + age_over_75 + age' m2 = get_cox_ph(surv, covariates=[del_3p.ix[ti(p53_mut==0)], pct_del, old, age], formula=fmla, print_desc=True, interactions=False); LR_test(m1, m2) pct_del.name = 'CIN' fig, axs = subplots(3, 1, figsize=(3.5,3.5), sharey=True, sharex=False) fmla = 'Surv(days, event) ~ del_3p + CIN + age_over_75 + age' pts = [keepers_o, ti(p53_mut==1), ti(p53_mut==0)] color_list = ['grey', '#a1d99b','#9ecae1'] for i, pt in enumerate(pts): ax = axs[i] m1 = get_cox_ph(surv, covariates=[del_3p.ix[pt], pct_del, old, age], formula=fmla, print_desc=False, interactions=False); ci = convert_robj(robjects.r.summary(m1)[7]) ci = ci.ix[['del_3p','CIN']] ci.index = ['3p', 'CIN'] ci = ci.ix[['CIN', '3p']] haz = ci['exp(coef)'] for j,h in enumerate(haz): ax.scatter(h, j, marker='s', s=100, color=color_list[i], edgecolors=['black'], zorder=10) ax.plot(*zip(*((ci.iloc[j]['lower .95'],j), (ci.iloc[j]['upper .95'],j))), lw=3, ls='-', marker='o', dash_joinstyle='bevel', color=color_list[i]) ax.axvline(1, ls='--', color='black') ax.set_xscale('log') ax.set_ybound(-.5,1.5) ax.set_xticks([.25,.5, .8, 1, 1.5, 2,4, 8]) ax.set_xticklabels([.25,.5, .8, 1, 1.5, 2, 4, 8]) ax.set_yticks([0,1]) ax.set_yticklabels(['CIN', '3p']) prettify_ax(ax) axs[0].set_xbound(.66,2.5) axs[1].set_xbound(.66,2.5) axs[2].set_xbound(.25,20) axs[2].set_xticks([.5, 1, 2,4, 8]) axs[2].set_xticklabels([.5, 1, 2, 4, 8]) axs[2].set_xlabel('Hazard Ratio') fig.tight_layout() fig.savefig(FIGDIR + 'fig2c.pdf', transparent=True) df1 = pd.concat([del_3p, pct_del, old, age, surv[:,'days'], surv[:,'event']], keys=['3p','CIN','age_over_75','age', 'Days to Death/Censoring', 'Death Indicator'], axis=1).dropna() df1.to_csv(FIGDIR + 'fig2c_all.csv') df1.ix[ti(p53_mut>0)].to_csv(FIGDIR + 'fig2c_TP53_mut.csv') df1.ix[ti(p53_mut==0)].to_csv(FIGDIR + 'fig2c_TP53_WT.csv') fig, axs = subplots(3, 1, figsize=(3.5,4.5), sharey=True, sharex=True) fmla = 'Surv(days, event) ~ del_3p + CIN + age_over_75' fmla2 = 'Surv(days, event) ~ CIN + age_over_75' pts = [keepers_o, ti(p53_mut==1), ti(p53_mut==0)] color_list = ['grey', '#a1d99b','#9ecae1'] for i, pt in enumerate(pts): ax = axs[i] m1 = get_cox_ph(surv, covariates=[del_3p.ix[pt], pct_del, old, age], formula=fmla, print_desc=False, interactions=False); m2 = get_cox_ph(surv, covariates=[del_3p.ix[pt], pct_del, old, age], formula=fmla2, print_desc=False, interactions=False); print LR_test(m1, m2) ci = convert_robj(robjects.r.summary(m1)[7]) #ci.index = ['3p14.2', 'CIN', 'age > 75', 'age','year','spread'] #ci = ci.ix[['year', 'spread', 'age','age > 75', 'CIN', '3p14.2']] haz = ci['exp(coef)'] for j,h in enumerate(haz): ax.scatter(h, j, marker='s', s=100, color=color_list[i], edgecolors=['black'], zorder=10) ax.plot(*zip(*((ci.iloc[j]['lower .95'],j), (ci.iloc[j]['upper .95'],j))), lw=3, ls='-', marker='o', dash_joinstyle='bevel', color=color_list[i]) ax.axvline(1, ls='--', color='black') ax.set_xscale('log') ax.set_xbound(.5,3.5) ax.set_ybound(-.5,len(ci.index) - .5) ax.set_xticks([.67, 1, 1.5, 2,3]) ax.set_xticklabels([.67, 1, 1.5, 2, 3]) ax.set_yticks(range(len(ci.index))) ax.set_yticklabels(ci.index) prettify_ax(ax) axs[2].set_xlabel('Hazard Ratio') fig.tight_layout() fig.savefig(FIGDIR + 'fig2c_alt.pdf', transparent=True) year = clinical.binary_df.ix['pre_2000'] year.name = 'year' fig, axs = subplots(3, 1, figsize=(3.5,4.5), sharey=True, sharex=True) fmla = 'Surv(days, event) ~ del_3p + CIN + age_over_75 + age + year' fmla2 = 'Surv(days, event) ~ CIN + age_over_75 + age + year' pts = [keepers_o, ti(p53_mut==1), ti(p53_mut==0)] color_list = ['grey', '#a1d99b','#9ecae1'] for i, pt in enumerate(pts): ax = axs[i] m1 = get_cox_ph(surv, covariates=[del_3p.ix[pt], pct_del, old, age, year == False], formula=fmla, print_desc=False, interactions=False); m2 = get_cox_ph(surv, covariates=[del_3p.ix[pt], pct_del, old, age, year == False], formula=fmla2, print_desc=False, interactions=False); print LR_test(m1, m2) ci = convert_robj(robjects.r.summary(m1)[7]) #ci.index = ['3p14.2', 'CIN', 'age > 75', 'age','year','spread'] #ci = ci.ix[['year', 'spread', 'age','age > 75', 'CIN', '3p14.2']] haz = ci['exp(coef)'] for j,h in enumerate(haz): ax.scatter(h, j, marker='s', s=100, color=color_list[i], edgecolors=['black'], zorder=10) ax.plot(*zip(*((ci.iloc[j]['lower .95'],j), (ci.iloc[j]['upper .95'],j))), lw=3, ls='-', marker='o', dash_joinstyle='bevel', color=color_list[i]) ax.axvline(1, ls='--', color='black') ax.set_xscale('log') ax.set_xbound(.5,3.5) ax.set_ybound(-.5,len(ci.index) - .5) ax.set_xticks([.67, 1, 1.5, 2,3]) ax.set_xticklabels([.67, 1, 1.5, 2, 3]) ax.set_yticks(range(len(ci.index))) ax.set_yticklabels(ci.index) prettify_ax(ax) axs[2].set_xlabel('Hazard Ratio') fig.tight_layout() fig.savefig(FIGDIR + 'fig2c_alt.pdf', transparent=True)