import NotebookImport
from Imports import *
importing IPython notebook from Imports.ipynb Populating the interactive namespace from numpy and matplotlib changing to source dirctory
populating namespace with data
pearson_pandas(rna.df.ix['TP53'], rna.df.ix['FHIT'])
rho 3.05e-01 p 1.03e-07 dtype: float64
ax.annotate?
rcParams['font.size'] = 15
def linear_regression(a,b):
a,b = match_series(a,b)
res = stats.linregress(a,b)
return pd.Series({'slope':res[0], 'intercept':res[1], 'r-value':res[2],
'p-value':res[3], 'stderr':res[4]})
def line_me(slope, intercept, start=0, end=100, ax=None):
if ax is None:
ax = plt.gca()
line = lambda x: slope*x + intercept
ax.plot([start, end], [line(start), line(end)], lw=3, alpha=.8,
color=colors[0], ls='--')
def remove_leading_zero(f, places=2):
f = round(f, places)
if abs(f - 1) < .01:
return '1.0'
elif abs(f) < .01:
return ''
elif abs(f) > 1:
f = str(f)
elif f > 0:
f = str(f)[1:]
else:
f = '-' + str(f)[2:]
return f
def regression_string(reg):
r_value = round(reg['r-value'],2)
r_value = str(r_value)[1:] if r_value != 1 else '1.0'
r_value = '$r={}$'.format(r_value)
slope = remove_leading_zero(reg['slope'])
intercept = remove_leading_zero(reg['intercept'])
if (len(intercept) != 0) and (intercept[0] != '-'):
intercept = '+' + intercept
line = '$y={}x {}$'.format(slope, intercept)
return '\n'.join([r_value, line])
def plot_regression(x, y, ax=None):
x,y = match_series(x,y)
fig, ax = init_ax(ax, figsize=(5,5))
series_scatter(x,y, ax=ax, ann=None)
reg = linear_regression(x, y)
ax.annotate(regression_string(reg), (.5,.05),
xycoords='axes fraction', size=18)
line_me(reg['slope'], reg['intercept'], start=x.min(),
end=x.max(), ax=ax)
line_me(1, 0, start=x.min(),
end=x.max(), ax=ax)
xy = x.append(y)
ax.set_xbound(xy.min() - 3, xy.max() + 3)
ax.set_ybound(xy.min() - 3, xy.max() + 3)
prettify_ax(ax)
reg = linear_regression(rna.df.ix['TP53'][:,'01'], rna.df.ix['FHIT'][:,'01'])
reg
intercept -1.18e+00 p-value 6.87e-06 r-value 2.76e-01 slope 3.43e-01 stderr 7.46e-02 dtype: float64
FH.read_rnaSeq('/ce)
fig, ax = subplots(figsize=(6,4))
series_scatter(rna.df.ix['TP53'][:,'01'], rna.df.ix['FHIT'][:,'01'], ax=ax,
ann=None)
ax.annotate('$r=.28$', (11.5, -3), size=20)
line_me(reg['slope'], reg['intercept'], 5.5, 13, ax)
prettify_ax(ax)
ax.set_xlabel('Log TP53 mRNA Expression')
ax.set_ylabel('Log FHIT mRNA Expression')
fig.tight_layout()
fig.savefig('/cellar/users/agross/figures/TP53_FHIT.png', dpi=300)
from Figures.MemoPlot import pathway_plot
p = 'REACTOME_SOS_MEDIATED_SIGNALLING'
fig = plt.figure(figsize=(6,3.7))
ax = plt.subplot2grid((1,4), (0,0), colspan=3)
df = pathway_plot(mut.df.ix[run.gene_sets[p], keepers_o]>0,
bar=None, ax=ax)
ax = plt.subplot2grid((1,4), (0,3), colspan=1)
dd = mut.df.ix[run.gene_sets[p], keepers_o]
dd = dd.ix[dd.sum(1).dropna().order().index[::-1]]
vec = dd.apply(lambda s: s.idxmax() if s.max() > 0 else 'WT')
mm = pd.concat([vec[vec == 'WT'], vec[vec!='WT']], keys=['WT','Mut.']).unstack()
mm = mm.apply(pd.value_counts, 1).fillna(0)
mm = mm.ix[:,vec.value_counts().index]
mm.plot(kind='bar', stacked=True, color=['grey'] + ['green'] * 10, ax=ax, lw=.5,
width=.75)
ax.get_legend().set_visible(False)
ax.set_ylabel('Num. Patients')
prettify_ax(ax)
fig.tight_layout()
fig.savefig(FIGDIR + 'fig1bc.pdf', transparent=True)
p = 'REACTOME_SOS_MEDIATED_SIGNALLING'
df = mut.df.ix[run.gene_sets[p], keepers_o].dropna().T > 0
df[p] = df.sum(1) > 0
df = df.ix[df.sum(1).order().index[::-1]]
df.to_csv(FIGDIR + 'fig1bc.csv')
f = rna.pathways.ix['SIG_PIP3_SIGNALING_IN_B_LYMPHOCYTES'].ix[keepers_o]
f.groupby(binarize(f)).min()
SIG_PIP3_SIGNALING_IN_B_LYMPHOCYTES False -2.84 True 0.60 Name: SIG_PIP3_SIGNALING_IN_B_LYMPHOCYTES, dtype: float64
p = 'SIG_PIP3_SIGNALING_IN_B_LYMPHOCYTES'
fig, axs = subplots(1,2, figsize=(6,3.7))
rna.loadings[p].order().plot(kind='barh', ax=axs[0], color=colors[0],
width=.75)
axs[0].set_xticks([-.4, -.2, 0, .2, .4])
axs[0].set_xlabel('Gene Loading')
axs[0].set_ylabel('PIP3 Signaling')
h = rna.pathways.ix[p].ix[keepers_o].hist(ax=axs[1], bins=15, color=colors[1])
[b.set_facecolor('grey') for b in h.patches[:-7]]
axs[1].set_ylabel('Num. Patients')
axs[1].set_xlabel('Consensus Pathway Value')
for ax in axs:
prettify_ax(ax)
fig.tight_layout()
fig.savefig(FIGDIR + 'fig1de.pdf', transparent=True)
rna.loadings['SIG_PIP3_SIGNALING_IN_B_LYMPHOCYTES'].order().to_csv(FIGDIR + 'fig1d.csv')
f = rna.pathways.ix['SIG_PIP3_SIGNALING_IN_B_LYMPHOCYTES'].ix[keepers_o]
df = pd.concat([f,binarize(f)],
keys=['Consensus Pathway Value','Pathway Induction Indicator'],
axis=1).sort('Consensus Pathway Value', ascending=False)
df.to_csv(FIGDIR + 'fig1e.csv')
p53_mut = mut.features.ix['TP53'].ix[keepers_o].dropna()
del_3p = cn.features.ix['Deletion'].ix['3p14.2'].ix[0].ix[keepers_o].dropna()
combo = combine(p53_mut==1, del_3p==-1)
combo = combo.map({'Lesion':'b', 'neither':'a', 'TP53':'c', 'both':'d'})
two_hit = combo=='d'
get_surv_fit_lr(clinical.survival.survival_5y, two_hit)
Stats | Median Survival | 5y Survival | Log-Rank | |||||||
---|---|---|---|---|---|---|---|---|---|---|
# Patients | # Events | Median | Lower | Upper | Surv | Lower | Upper | chi2 | p | |
18.6 | 1.59e-05 | |||||||||
1 | 179 | 87 | 1.9 | 1.5 | 2.84 | 0.339 | 0.255 | 0.45 | ||
0 | 71 | 15 | NaN | NaN | NaN | 0.658 | 0.512 | 0.846 |
p53_mut = mut.features.ix['TP53'].ix[keepers_o]
del_3p = cn.features.ix['Deletion'].ix['3p14.2'].ix[0].ix[keepers_o]
mir_3170 = mirna.features.ix['binary'].ix['hsa-mir-3170'].ix[keepers_o]
#ras = binarize(rna.pathways.ix['REACTOME_SOS_MEDIATED_SIGNALLING'].ix[keepers_o])
pip3 = binarize(rna.pathways.ix['SIG_PIP3_SIGNALING_IN_B_LYMPHOCYTES'].ix[keepers_o])
smoker = clinical.binary_df.ix['recent_smoker'].ix[keepers_o].dropna()
fig, axs = subplots(1,5, figsize=(6.5, 2.5), sharey=True)
var_list = [p53_mut.map({1: 'mutated',0:'wild-type'}),
del_3p.map({-1: 'deleted', 0: 'wild-type'}),
mir_3170.map({0: 'absent', 1: 'present'}),
pip3.map({0:'low', 1:'high'}),
smoker.map({1: 'yes', 0 : 'no'})
]
labels = ['TP53','3p14.2','mir-3170','PIP3 Pathway\nExpression','Recent\nSmoking']
color_list = array(colors)
n_rows = len(var_list)
for i,var in enumerate(var_list):
ax = axs[i]
surv_fit = get_surv_fit(surv, var)
surv_fit = surv_fit.sort([('5y Survival', 'Surv')])
t = surv_fit['5y Survival']
b = (t['Surv']).plot(kind='bar', ax=ax, color=colors[i],
yerr=[t.Surv-t.Lower, t.Upper-t.Surv], ecolor='black', alpha=.7,
width=.75)
bar_labels = ['{}'.format(j) for j,v in
surv_fit['Stats']['# Patients'].iteritems()]
counts = [ax.annotate('{}'.format(int(v)), (j, .05), ha='center')
for j,v in enumerate(surv_fit['Stats']['# Patients'])]
ax.set_xticklabels(bar_labels, rotation=340)
ax.set_xlabel(labels[i])
prettify_ax(ax)
axs[0].set_ylabel('5 Year Survival')
axs[0].set_yticks([0, .5, 1.])
fig.tight_layout(w_pad=.2)
fig.savefig(FIGDIR + 'fig2a.pdf', transparent=True)
labels = ['TP53','3p14.2','mir-3170','PIP3 Pathway Expression','Recent Smoking',
'Days to Death/Censoring', 'Death Indicator']
df = pd.concat(var_list + [surv[:,'days'] , surv[:,'event']], keys=labels,
axis=1).dropna()
df.to_csv(FIGDIR + 'fig2a.csv')
get_surv_fit_lr(surv, two_hit)
Stats | Median Survival | 5y Survival | Log-Rank | |||||||
---|---|---|---|---|---|---|---|---|---|---|
# Patients | # Events | Median | Lower | Upper | Surv | Lower | Upper | chi2 | p | |
18.6 | 1.59e-05 | |||||||||
1 | 179 | 87 | 1.9 | 1.5 | 2.84 | 0.339 | 0.255 | 0.45 | ||
0 | 71 | 15 | NaN | NaN | NaN | 0.658 | 0.512 | 0.846 |
fig, ax = subplots(figsize=(5,2.75))
c = {'a':colors_th[3], 'b': colors_th[2], 'c': colors_th[1], 'd': colors_th[0]}
draw_survival_curve(combo, surv, colors=c, ax=ax)
ax.legend().set_visible(False)
prettify_ax(ax)
fig.tight_layout()
plt.savefig(FIGDIR + 'fig2e.pdf', transparent=True)
survival_stat_plot(get_surv_fit(surv, combo).ix[['d','c','b','a']], colors=colors_th)
get_cox_ph(surv, combo=='d', print_desc=True);
coef exp(coef) se(coef) z p feature 1.15 3.15 0.28 4.09 4.3e-05 Likelihood ratio test=21.4 on 1 df, p=3.69e-06 n= 250, number of events= 102
exp(1.15), exp(1.15) - exp(1.15 - .28)
(3.1581929096897672, 0.77128205616549073)
cc = combo.map({'a': 'Neither', 'b':'3p only', 'c':'TP53 only', 'd': 'TP53 and 3p'})
df = pd.concat([p53_mut, del_3p, cc, surv[:,'days'], surv[:,'event']],
keys=['TP53', '3p', 'Combination', 'Days to Death/Censoring', 'Death Indicator'],
axis=1).dropna()
df.to_csv(FIGDIR + 'fig2d.csv')
from matplotlib_venn import venn2
fig, axs = subplots(1,2, figsize=(4,2), sharey=True)
p53_mut.map({1:'Mut.',0:'WT'}).value_counts().plot(kind='bar', ax=axs[0],
rot=0, alpha=.7, width=.75)
axs[0].set_xlabel('TP53')
axs[0].set_ylabel('# of Patients')
del_3p.map({-1:'+/-',0:'+/+'}).value_counts().plot(kind='bar', ax=axs[1], rot=0,
color=colors[1], alpha=.7,
width=.75)
axs[1].set_xlabel('3p Arm')
for ax in axs:
prettify_ax(ax)
fig.tight_layout()
fig.savefig(FIGDIR + 'p53_3p_freq.pdf')
fig, ax = subplots(figsize=(2.,2.))
a,b = p53_mut == 1, del_3p==-1
b.name = ' '
venn_pandas(a,b, colors=['white','white','white'], alpha=1)
venn_pandas(a,b, colors=np.array(colors_th)[[2,1,0]], alpha=.8)
r = plt.Rectangle((-.65,-.6), 1.28, 1.2, fill=True, alpha=.7, zorder=-10, facecolor=colors_th[3],
edgecolor='black', lw=3)
ax.add_artist(r);
plt.savefig(FIGDIR + 'fig2d.pdf', transparent=True)
mir548k = mirna.features.ix['binary'].ix['hsa-mir-548k']
muc5b = mut.features.ix['MUC5B']
fig, axs = subplots(1,2, figsize=(7,3))
st_1 = combine(mir548k, two_hit)
st_1 = st_1.map({'hsa-mir-548k':1, 'neither':1, 'TP53':2, 'both': 3})
draw_survival_curve(st_1, surv, colors={1:colors_st[2], 2:colors_st[1], 3:colors_st[0]},
ax=axs[0])
st_2 = combine(muc5b, two_hit)
st_2 = st_2.map({'MUC5B':1, 'neither':1, 'TP53':2, 'both': 3})
draw_survival_curve(st_2, surv, colors={1:colors_st[2], 2:colors_st[1], 3:colors_st[0]},
ax=axs[1])
for ax in axs:
prettify_ax(ax)
ax.get_legend().set_visible(False)
fig.tight_layout()
fig.savefig(FIGDIR + 'fig4ab.pdf', transparent=True)
cc = combo.map({'a': 'Neither', 'b':'3p only', 'c':'TP53 only', 'd': 'TP53 and 3p'})
df = pd.concat([p53_mut, del_3p, cc, mir548k, muc5b, surv[:,'days'], surv[:,'event']],
keys=['TP53', '3p', 'Combination', 'mir548k', 'muc5b', 'Days to Death/Censoring', 'Death Indicator'],
axis=1).dropna()
df.to_csv(FIGDIR + 'fig3ab.csv')
mir-548k
survival_stat_plot(get_surv_fit(surv, st_1), colors=[colors_st[2], colors_st[1], colors_st[0]])
MUC5B
survival_stat_plot(get_surv_fit(surv, st_2), colors=[colors_st[2], colors_st[1], colors_st[0]])
Visualize main survival associations in the HNSCC cohort in ever-smokers under 75.
smoker = clinical.clinical.tobaccosmokinghistory
pts = ti(smoker.dropna() != 'lifelong non-smoker').intersection(ti(clinical.clinical.age < 75))
pts = pts.intersection(keepers_o)
fig, axs = subplots(1,3, figsize=(10,3))
c = {'a':colors_th[3], 'b': colors_th[2], 'c': colors_th[1], 'd': colors_th[0]}
draw_survival_curve(combo.ix[pts], surv, colors=c, ax=axs[0])
st_1 = combine(mir548k, two_hit).ix[pts]
st_1 = st_1.map({'hsa-mir-548k':1, 'neither':1, 'TP53':2, 'both': 3})
draw_survival_curve(st_1, surv, colors={1:colors_st[2], 2:colors_st[1], 3:colors_st[0]},
ax=axs[1])
st_2 = combine(muc5b, two_hit).ix[pts]
st_2 = st_2.map({'MUC5B':1, 'neither':1, 'TP53':2, 'both': 3})
draw_survival_curve(st_2, surv, colors={1:colors_st[2], 2:colors_st[1], 3:colors_st[0]},
ax=axs[2])
for ax in axs:
prettify_ax(ax)
ax.get_legend().set_visible(False)
fig.tight_layout()
fig.savefig(FIGDIR + 'supp_fig4ab.pdf', transparent=True)
TP53/3p Combinations
survival_stat_plot(get_surv_fit(surv, combo.ix[pts]), colors=colors_th[::-1])
mir-548k
survival_stat_plot(get_surv_fit(surv, st_1), colors=[colors_st[2], colors_st[1], colors_st[0]])
MUC5B
survival_stat_plot(get_surv_fit(surv, st_2), colors=[colors_st[2], colors_st[1], colors_st[0]])
Here we are visualizing the amplified region on 11q13 to show that the effect may indeed be centered at mir-548k verses FADD and CCND1. We are limiting this analysis to the TP53/3p positive patients as this is where this association occured in the analysis pipeline.
patients = true_index(two_hit)
Read in copy-number data at gene-by-gene granularity.
coords = pd.read_csv('../Extra_Data/gene_coords.csv', index_col=0)
coords['start'] = coords['start'].astype(float)
cn_by_gene = FH.get_gistic_gene_matrix(run.data_path, cancer.name)
cn_by_gene = cn_by_gene.reset_index(level=[0,1])
cn_by_gene = cn_by_gene.join(coords).set_index(['chrom','start', 'Cytoband','Locus ID'], append=True)
cn_by_gene = cn_by_gene.ix[:,patients].dropna(axis=1)
ch_3 = cn_by_gene.xs('11', level='chrom')
p_arm = pd.Series(ch_3.index.get_level_values('Cytoband').map(lambda s: '11q13' in s), ch_3.index)
p_arm = ch_3[p_arm].sortlevel('start')
p_arm = p_arm[1:]
For survival analysis we are using age, age > 75, and year of diagnosis > 2000 covariates.
test = lambda s: get_cox_ph_ms(surv, s, [age, old, clinical.processed.year],
interactions=None)
df = rna.df.xs('01', 1, 1).ix[p_arm.index.get_level_values(0)].dropna()
df = df.ix[:,patients].dropna(1)
df = df[(df == -3).sum(1) != len(df.T)]
exp = df.apply(test, axis=1)
exp = exp.sort_index()
exp.index = p_arm.sort_index().ix[exp.index].index
exp2 = -exp.LR.map(log10)
Calculate survival associations for mirna in this region.
mir = pd.read_table('ftp://mirbase.smith.man.ac.uk/pub/mirbase/12.0/genomes/hsa.gff',
skiprows=7, header=None)
mir.index = mir.apply(lambda s: s[8].split('"')[-2], 1)
mir = mir[[0,3]]
mir.columns = ['Chromosome', 'start']
r = p_arm.index.get_level_values('start')[[0,-1]]
p_range = lambda s: (s > r[0]) * (s < r[1])
loc = mir[mir.Chromosome == '11'].sort('start')
loc = loc[loc['start'].apply(p_range)]
df = mirna.df.ix[loc.index].dropna().xs('01',1,1)
df = df.ix[:,patients].dropna(1)
df = df[(df == -3).sum(1) != len(df.T)]
mir_surv = df.apply(test, axis=1)
loc = loc.set_index(['Chromosome', 'start'], drop=False, append=True)
mir_surv.index = loc.index
mir2 = -mir_surv.LR.map(log10)
Visualize the chromosomal region with copy number frequency and survival association tracks.
fig = plt.figure(figsize=(7,4))
ax1 = plt.subplot2grid((7,1), (0,0), rowspan=3, colspan=1)
ax2 = plt.subplot2grid((7,1), (3,0), rowspan=3, colspan=1)
ax3 = plt.subplot2grid((7,1), (6,0), rowspan=1, colspan=1)
bands = p_arm.reset_index().groupby('Cytoband')
bands = bands.first().start.order()
starts = p_arm.index.get_level_values('start')
for i,(idx,v) in enumerate(bands.iteritems()):
try:
r = plt.Rectangle((v,0), bands.iloc[i+1]-v, 1, lw=2,
alpha=.5, fc=colors[i%2])
except:
r = plt.Rectangle((v,0), starts[-1] - v, 1, lw=2,
alpha=.5, fc=colors[i%2])
ax3.add_artist(r)
ax3.set_xticks(bands)
b = [s for i,s in enumerate(bands.index)]
ax3.set_xticklabels(b, rotation=270+45, ha='left')
ax1.scatter(starts, (p_arm > 1).sum(1), s=40, c=colors[2],
edgecolor='grey', alpha=1)
ax2.scatter(exp2.index.get_level_values('start'), exp2, s=40,
c=colors[4], edgecolor='grey', alpha=1)
ax2.scatter(mir2.index.get_level_values('start'), mir2, s=40,
c=colors[0], edgecolor='grey', alpha=1)
#ax1.set_ybound(215,255)
ax1.set_ylabel('# of Patients\nwith Amp.\n\n', ha='center', va='center')
ax2.set_ybound(-.5,0)
ax2.set_ylabel('mRNA/miRNA\nCox Surv.\n(Log10 p)\n\n\n', ha='center', va='center')
ax3.set_yticks([])
fig.tight_layout()
for ax in [ax1, ax2, ax3]:
ax.set_xbound(starts[0],
starts[-1])
ax.set_xticks(bands)
for n, ss in mir2.order().tail(1).iteritems():
ax2.annotate(n[0], (n[2], ss), xytext=(15, -5),
textcoords='offset points',
arrowprops=dict(arrowstyle="->"))
for n, ss in exp2.ix[['CCND1', 'FADD']].iteritems():
ax2.annotate(n[0], (n[1], ss), xytext=(15, -5),
textcoords='offset points',
arrowprops=dict(arrowstyle="->"))
ax1.set_xticklabels('')
ax2.set_xticklabels('')
ax2.set_yticks([0,1,2,3,4])
fig.tight_layout()
fig.savefig(FIGDIR + 'fig4c.pdf', transparent=True)
(p_arm > 1).sum(1).to_csv(FIGDIR + 'fig3c_count.csv')
exp2.to_csv(FIGDIR + 'fig3c_p.csv')
mir2.to_csv(FIGDIR + 'fig3c_p_mir.csv')
patients = true_index(two_hit)
mir_level = mirna.df.ix['hsa-mir-548k'].ix[patients]
mir_level.name = 'exp'
cn_level = cn_by_gene.xs('MIR548K', level=0).iloc[0]
cn_level.name = 'cn'
mir_level.groupby(level=1).size()
01 179 11 23 dtype: int64
fig, axs = subplots(2,1, figsize=(4,5))
ax=axs[0]
violin_plot_series(mir_level.clip_lower(-3), ax=ax)
ax.axhline(-1, ls='--', lw=2, color='grey')
ax.set_xlabel('Tissue Type')
ax.set_ylabel('mir-548k Exp.')
#ax.set_xticklabels(['Tumor','Normal'])
prettify_ax(ax)
ax = axs[1]
violin_plot_pandas(cn_level, mir_level[:,'01'], order=[-1,0,1,2], ax=ax)
ax.axhline(-1, ls='--', lw=2, color='grey')
ax.set_xlabel('mir-548k Copy Number')
ax.set_ylabel('mir-548k Exp.')
prettify_ax(ax)
fig.tight_layout()
fig.savefig(FIGDIR + 'supp_fig4ab.pdf', tranparent=True)
fig, ax = subplots(figsize=(4.5, 3.5))
cb = combine(cn_level==2, mir_level[:,'01'] > -1)
draw_survival_curve(cb, surv, ax=ax)
ax.legend(loc='lower left', frameon=False)
prettify_ax(ax)
fig.savefig(FIGDIR + 'supp_fig4c.pdf', tranparent=True)
survival_stat_plot(get_surv_fit(surv, cb))