ipython notebook --pylab=inline
or
ipython notebook --profile=my_profile (preset configuration file)
You can italicize, boldface
format LaTex:
and embed code meant for illustration instead of execution in Python:
def f(x):
"""a docstring"""
return x**2
or other languages:
if (i=0; i<n; i++) {
printf("hello %d\n", i);
x += 4;
}
from IPython.display import Latex
Latex(r"""\begin{eqnarray}
\nabla \times \vec{\mathbf{B}} -\, \frac1c\, \frac{\partial\vec{\mathbf{E}}}{\partial t} & = \frac{4\pi}{c}\vec{\mathbf{j}} \\
\nabla \cdot \vec{\mathbf{E}} & = 4 \pi \rho \\
\nabla \times \vec{\mathbf{E}}\, +\, \frac1c\, \frac{\partial\vec{\mathbf{B}}}{\partial t} & = \vec{\mathbf{0}} \\
\nabla \cdot \vec{\mathbf{B}} & = 0
\end{eqnarray}""")
!pwd
/cellar/users/agross/TCGA_Code/TCGA/Pathway_Merge/src
!whereis python
python: /usr/bin/python2.7 /usr/bin/python2.7-dbg /usr/bin/python /usr/bin/python3.2 /usr/bin/python2.7-config /usr/bin/python3.2mu /usr/bin/python2.7-dbg-config /etc/python2.7 /etc/python /etc/python3.2 /usr/lib/python2.6 /usr/lib/python2.7 /usr/lib/python3.3 /usr/lib/python3.2 /usr/bin/X11/python2.7 /usr/bin/X11/python2.7-dbg /usr/bin/X11/python /usr/bin/X11/python3.2 /usr/bin/X11/python2.7-config /usr/bin/X11/python3.2mu /usr/bin/X11/python2.7-dbg-config /usr/local/lib/python2.7 /usr/local/lib/python3.2 /usr/include/python2.7 /usr/include/python2.7_d /usr/include/python3.2mu /usr/share/python /usr/share/man/man1/python.1.gz
%load http://matplotlib.sourceforge.net/mpl_examples/pylab_examples/integral_demo.py
plot(
plot??
Data Processing Hub
Run standard analyses in versioned runs across data as it is generated
data_path = ('http://gdac.broadinstitute.org/runs/analyses__2012_12_21/' +
'data/KIRC/20121221/' +
'gdac.broadinstitute.org_KIRC.MutSigNozzleReport2.0.Level_4.2012122100.0.0.tar.gz')
!curl $data_path -o mut_sig.tar.gz
!tar -xvf mut_sig.tar.gz > tmp
% Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 9281k 100 9281k 0 0 873k 0 0:00:10 0:00:10 --:--:-- 1028k
import pandas as pd
!mv gdac.broadinstitute.org_KIRC-TP.MutSigNozzleReport2.0.Level_4.2012122100.0.0/ ./mutsig
!ls ./mutsig/
ABCB1.stick_fig.png MSN.stick_fig.png BAP1.stick_fig.png MTOR.stick_fig.png CNTNAP4.stick_fig.png MUC4.stick_fig.png CR1.stick_fig.png nozzle.html EBPL.stick_fig.png nozzle.RData KDM5C.stick_fig.png NPNT.stick_fig.png KIRC-TP.bargraphs.png OR5M8.stick_fig.png KIRC-TP.clustered_muts.txt PBRM1.stick_fig.png KIRC-TP_coMut.pdf PIK3CA.stick_fig.png KIRC-TP_coMut.png PTEN.stick_fig.png KIRC-TP.cosmic_mutations.txt SETD2.stick_fig.png KIRC-TP.cosmic_sig_genes.txt SPAM1.stick_fig.png KIRC-TP.covplot.png STAG2.stick_fig.png KIRC-TP.final_analysis_set.maf SV2C.stick_fig.png KIRC-TP.mutation_breakdown.txt TOR1A.stick_fig.png KIRC-TP.mutation_rates.txt TP53.stick_fig.png KIRC-TP.patients.counts_and_rates.txt TPST1.stick_fig.png KIRC-TP.sig_genesets_2.txt TSPAN19.stick_fig.png KIRC-TP.sig_genesets.txt VHL.stick_fig.png KIRC-TP.sig_genes.txt WDR52.stick_fig.png MANIFEST.txt
pd.read_table?
maf = pd.read_table('./mutsig/KIRC-TP.final_analysis_set.maf')
maf
<class 'pandas.core.frame.DataFrame'> Int64Index: 23511 entries, 0 to 23510 Columns: 111 entries, Hugo_Symbol to pat_idx dtypes: float64(13), int64(26), object(72)
maf.head().T.head()
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
Hugo_Symbol | RCC1 | MANEAL | ORC1L | CDCP2 | INADL |
Entrez_Gene_Id | 1104 | 149175 | 4998 | 200008 | 10207 |
Center | broad.mit.edu | broad.mit.edu | broad.mit.edu | broad.mit.edu | broad.mit.edu |
NCBI_Build | 37 | 37 | 37 | 37 | 37 |
Chromosome | 1 | 1 | 1 | 1 | 1 |
non_silent = maf[maf.is_silent == 0]
non_silent.shape
(17721, 111)
hit_counts = non_silent.groupby(['Hugo_Symbol','Tumor_Sample_Barcode']).size()
hit_counts
Hugo_Symbol Tumor_Sample_Barcode A1BG TCGA-A3-3358-01A-01D-1534-10 1 A1CF TCGA-B0-4815-01A-01D-1501-10 1 TCGA-B0-5699-01A-11D-1534-10 1 TCGA-CW-5580-01A-01D-1669-08 1 A2BP1 TCGA-BP-4164-01A-02D-1386-10 1 TCGA-BP-5169-01A-01D-1429-08 1 TCGA-CJ-4907-01A-01D-1429-08 1 TCGA-CJ-5679-01A-11D-1534-10 1 A2M TCGA-B0-5099-01A-01D-1421-08 1 TCGA-B0-5691-01A-11D-1534-10 1 TCGA-BP-5009-01A-01D-1462-08 1 TCGA-CJ-4902-01A-01D-1429-08 1 TCGA-CJ-4920-01A-01D-1429-08 1 A2ML1 TCGA-BP-4985-01A-01D-1462-08 1 A4GNT TCGA-B0-5096-01A-01D-1421-08 1 ... ZYG11B TCGA-A3-3346-01A-01D-0966-08 1 TCGA-B0-4710-01A-01D-1501-10 1 TCGA-B0-5702-01A-11D-1534-10 1 TCGA-B0-5703-01A-11D-1534-10 1 TCGA-B8-5163-01A-01D-1421-08 1 TCGA-CJ-6030-01A-11D-1669-08 1 ZYX TCGA-CW-6093-01A-11D-1669-08 1 ZZEF1 TCGA-B0-4811-01A-01D-1501-10 1 TCGA-B0-4816-01A-01D-1501-10 1 TCGA-B0-5121-01A-02D-1421-08 1 TCGA-B2-4101-01A-02D-1458-08 1 TCGA-BP-4983-01A-01D-1462-08 1 TCGA-CJ-4882-01A-02D-1429-08 1 TCGA-CZ-4859-01A-02D-1429-08 1 ZZZ3 TCGA-B0-5108-01A-01D-1421-08 1 Length: 17521
hit_counts.value_counts()
1 17345 2 161 3 10 4 3 6 2
hit_matrix = hit_counts.unstack().fillna(0)
hit_matrix
<class 'pandas.core.frame.DataFrame'> Index: 9265 entries, A1BG to ZZZ3 Columns: 297 entries, TCGA-A3-3308-01A-01D-0966-08 to TCGA-EU-5907-01A-11D-1669-08 dtypes: float64(297)
fig, axs = subplots(1,2, figsize=(12,4))
axs[0].set_title('Patient Mutation Counts')
hit_matrix.sum().hist(ax=axs[0])
axs[1].set_title('Gene Mutation Counts')
hit_matrix.sum(axis=1).hist(ax=axs[1], log=True)
<matplotlib.axes.AxesSubplot at 0x1acf4690>
hit_matrix.sum(1).order().tail(10).plot(kind='bar')
<matplotlib.axes.AxesSubplot at 0x1a620ed0>
data_path = ('http://gdac.broadinstitute.org/runs/stddata__2012_12_21/data/KIRC/' +
'20121221/gdac.broadinstitute.org_KIRC.Clinical_Pick_Tier1.Level_4.' +
'2012122100.0.0.tar.gz')
!curl $data_path -o clinical.tar.gz
!tar -xvf clinical.tar.gz > tmp
% Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 60860 100 60860 0 0 79752 0 --:--:-- --:--:-- --:--:-- 98k
!mv gdac.broadinstitute.org_KIRC.Clinical_Pick_Tier1.Level_4.2012122100.0.0/ ./clin_folder
!ls ./clin_folder/
gdac.broadinstitute.org_KIRC.Clinical_Pick_Tier1.Level_4.2012122100.0.0 KIRC.clin.merged.picked.txt MANIFEST.txt nozzle.html nozzle.RData
clinical = pd.read_table('./clin_folder/KIRC.clin.merged.picked.txt', index_col=0)
clinical = clinical.T
clinical.gender.value_counts()
male 329 female 173
clinical.vitalstatus.value_counts()
0 342 1 160
clinical.tumorstage.value_counts().sort_index().plot(kind='bar')
title('Tumor Stage')
<matplotlib.text.Text at 0x1c804e90>
age = clinical.yearstobirth.astype(float)
age.hist()
title('Patient Age')
<matplotlib.text.Text at 0x1c645710>
hit_matrix
<class 'pandas.core.frame.DataFrame'> Index: 9265 entries, A1BG to ZZZ3 Columns: 297 entries, TCGA-A3-3308-01A-01D-0966-08 to TCGA-EU-5907-01A-11D-1669-08 dtypes: float64(297)
hit_matrix.columns = hit_matrix.columns.map(lambda s: s[:12])
hit_matrix = hit_matrix.groupby(level=0, axis=1).sum()
hit_matrix
<class 'pandas.core.frame.DataFrame'> Index: 9265 entries, A1BG to ZZZ3 Columns: 293 entries, TCGA-A3-3308 to TCGA-EU-5907 dtypes: float64(293)
clinical.index = clinical.index.map(str.upper)
age = clinical.yearstobirth.astype(float)
clinical
<class 'pandas.core.frame.DataFrame'> Index: 502 entries, TCGA-A3-3306 to TCGA-EU-5907 Data columns: Composite Element REF 502 non-null values yearstobirth 501 non-null values daystodeath 159 non-null values daystolastfollowup 498 non-null values vitalstatus 502 non-null values dccuploaddate 502 non-null values primarysiteofdesease 502 non-null values gender 502 non-null values dateofinitialpathologicdiagnosis 502 non-null values daystolastknownalive 492 non-null values karnofskyperformancescore 36 non-null values histologicaltype 502 non-null values pathologicspread(pt) 502 non-null values pathologicspread(pn) 502 non-null values pathologicspread(m) 502 non-null values tumorstage 502 non-null values radiations.radiation.regimenindication 502 non-null values batchnumber 502 non-null values dtypes: object(18)
from scipy.stats import f_oneway
def anova_pandas(hit_vec, response_vec):
'''
Wrapper around scipy.stats f_oneway function.
hit_vec: Series of a categorical variable
response_vec: Series of a continuous variable
'''
common = hit_vec.dropna().index.intersection(response_vec.dropna().index)
hit_vec, response_vec = hit_vec[common], response_vec[common]
try:
res = f_oneway(*[response_vec[hit_vec == num] for num in
hit_vec.unique()])
return pd.Series({'stat': res[0], 'p': res[1]})
except ValueError:
return pd.Series({'stat': nan, 'p': nan})
test = lambda hit_vec: anova_pandas(hit_vec, age.dropna())
to_test = hit_matrix[hit_matrix.sum(1) > 10]
result = to_test.clip_upper(1.).apply(test, axis=1)
result.sort(columns='p').head()
p | stat | |
---|---|---|
HMCN1 | 0.004757 | 8.093897 |
TTN | 0.025700 | 5.027587 |
VHL | 0.030434 | 4.730694 |
AHNAK2 | 0.071754 | 3.266207 |
USH2A | 0.087340 | 2.942532 |
pd.concat([age, hit_matrix.ix['HMCN1']], axis=1).boxplot(by='HMCN1')
<matplotlib.axes.AxesSubplot at 0x2e798650>
import rpy2.robjects as robjects
survival = robjects.packages.importr('survival')
base = robjects.packages.importr('base')
days = clinical[['daystodeath','daystolastfollowup']].max(1)
event = clinical.vitalstatus
hit_vec = hit_matrix.ix['HMCN1']
df = pd.concat([days, event, hit_vec], axis=1).astype(float)
df.columns = ['days','event','feature']
df
<class 'pandas.core.frame.DataFrame'> Index: 502 entries, TCGA-A3-3306 to TCGA-EU-5907 Data columns: days 502 non-null values event 502 non-null values feature 293 non-null values dtypes: float64(3)
from pandas.rpy.common import convert_to_r_dataframe, convert_robj
df_r = convert_to_r_dataframe(df)
logrank = robjects.Formula('Surv(days,event) ~ feature')
test = survival.coxph(logrank, df_r)
print '\n'.join(str(test).split('\n')[-6:])
coef exp(coef) se(coef) z p feature 0.713 2.04 0.296 2.41 0.016 Likelihood ratio test=4.96 on 1 df, p=0.0259 n= 293, number of events= 77 (209 observations deleted due to missingness)
logtest = base.summary(test).rx2('logtest')
print logtest
test df pvalue 2.75711710 1.00000000 0.09682257
logrank = robjects.Formula('Surv(days,event) ~ feature')
def logrank_pandas(feature):
df = pd.concat([days, event, feature], axis=1).astype(float)
df.columns = ['days','event','feature']
df_r = convert_to_r_dataframe(df)
test = survival.coxph(logrank, df_r)
logtest = base.summary(test).rx2('logtest')
return pd.Series({'logtest': logtest[0], 'p': logtest[2]})
to_test = hit_matrix[hit_matrix.sum(1) > 10]
result = to_test.clip_upper(1.).apply(logrank_pandas, axis=1)
result.sort(columns='p').head()
logtest | p | |
---|---|---|
BAP1 | 4.959390 | 0.025949 |
MUC16 | 4.719186 | 0.029828 |
DNAH2 | 3.600955 | 0.057746 |
HMCN1 | 2.757117 | 0.096823 |
KDM5C | 2.472023 | 0.115889 |
def draw_survival_curves(feature):
df = pd.concat([days, event, feature], axis=1).astype(float)
df.columns = ['days','event','feature']
df_r = convert_to_r_dataframe(df)
robjects.r.png(filename='tmp.png', width=300, height=250, res=100, pointsize=8)
ls = robjects.r.c('red','blue')
robjects.r.plot(survival.survfit(logrank, df_r), lty=1, col=ls, lwd=3, cex=1.5,
xlab='Days to Death', ylab= 'Survival')
robjects.r.title(feature.name)
robjects.r('dev.off()');
from IPython.display import Image
draw_survival_curves(hit_matrix.ix['BAP1'])
Image(filename='tmp.png')
draw_survival_curves(hit_matrix.ix['DNAH2'])
Image(filename='tmp.png')