Here we are downloading and processing most of the necessary data to run this analysis pipeline. I have set up a series of scripts to do this in an automated fashion in order to allow others to reproduce this study, as well as to update the results obtained here as more TCGA data is collected and released.
Downloading this data can take a considerable amount of time (~5 hours) and disk space (~45GB), so be prepared.
We use the firehose_get script provided by the Broad to download the data; please see the firehose_get documentation for troubleshooting. As we are making use of the Broad's initial processing pipeline and data formats, we cannot promise that this code will not break upon future updates that they make.
%pylab inline
Populating the interactive namespace from numpy and matplotlib
cd ../src/
/cellar/users/agross/TCGA_Code/TCGA/src
Change OUT_PATH to directory on your machine where you want to store the data
# --- Run configuration -------------------------------------------------
# Where data lands, which Firehose snapshot to pull, and which analysis
# parameters to use for the rest of the pipeline.
OUT_PATH = '../Data'        # change to the directory where you want the data stored
RUN_DATE = '2014_01_15'     # Firehose run date (analyses__ / stddata__ snapshot)
VERSION = 'all'
CANCER = 'HNSC'
FIGDIR = '../Figures'
DESCRIPTION = '''Updating analysis for updated dataset.'''

# Analysis parameters: minimum cohort size per test, and the (edited)
# MSigDB c2.cp gene-set file used for pathway-level features.
PARAMETERS = {
    'min_patients': 12,
    'pathway_file': '../Extra_Data/c2.cp.v3.0.symbols_edit.csv',
}
import pickle as pickle
import pandas as pd
import os as os
from Data.Containers import Run
from Data.Containers import get_run
from Data.Containers import Cancer
from Initialization.InitializeCN import initialize_cn
from Initialization.InitializeReal import initialize_real
from Initialization.InitializeMut import initialize_mut
from Initialization.PreprocessMethylation import process_meth
from IPython import utils
from IPython.display import HTML
# Apply the notebook's custom stylesheet so output renders consistently.
# `display` was never imported in this file; import it explicitly here.
from IPython.display import display

css_file = 'profile_default/static/custom/custom.css'
base = utils.path.get_ipython_dir()
# Use a context manager so the stylesheet file handle is closed promptly
# (the original left the file object dangling).
with open(os.path.join(base, css_file), 'r') as css:
    styles = "<style>\n%s\n</style>" % css.read()
display(HTML(styles))
!curl http://gdac.broadinstitute.org/runs/code/firehose_get_latest.zip -o fh_get.zip
!unzip fh_get.zip
% Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 6542 100 6542 0 0 8343 0 --:--:-- --:--:-- --:--:-- 10620 Archive: fh_get.zip replace firehose_get? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C
# Fetch the ingested-data manifest for this Firehose run.  The file has
# commented header lines (starting with '#'), so it is read twice: once
# line-by-line to locate the comment rows, then again to parse the table.
manifest_url = ('http://gdac.broadinstitute.org/runs/'
                'analyses__{}/ingested_data.tsv').format(RUN_DATE)
raw_lines = pd.read_table(manifest_url, sep='\n', header=None)[0].dropna()
is_comment = raw_lines.map(lambda line: line.startswith('#'))
comment_rows = list(is_comment[is_comment].index)
tab = pd.read_table(manifest_url, skiprows=comment_rows,
                    index_col=0).dropna()
# Every cancer type with clinical data; the last row is an aggregate line.
cancers = tab[tab.Clinical > 0].index[:-1]
Takes about 5 min to download data.
# Space-separated list of disease codes to pass to firehose_get below.
cancer_string = ' '.join(cancers)
!firehose_get -b analyses $RUN_DATE $cancer_string > tmp
!firehose_get -b -o miR_gene_expression stddata $RUN_DATE $cancer_string > tmp
!firehose_get -b -o RSEM_genes_normalized stddata $RUN_DATE $cancer_string > tmp
!firehose_get -b -o rppa stddata $RUN_DATE $cancer_string > tmp
!firehose_get -b -o clinical stddata $RUN_DATE $cancer_string > tmp
Takes about 25 min to download methylation data.
!firehose_get -b -o humanmethylation450 stddata $RUN_DATE HNSC
There is no going back from here, so check your data to make sure everything was downloaded correctly before proceeding.
!rm fh_get.zip
!rm firehose_get
# Create the output directory if needed.  Attempt the creation first and
# only re-check on failure, so a directory created between a check and
# the makedirs call (TOCTOU race) does not crash the pipeline; any other
# OSError (e.g. permissions) is still raised.
try:
    os.makedirs(OUT_PATH)
except OSError:
    if not os.path.isdir(OUT_PATH):
        raise
analyses_folder = 'analyses__' + RUN_DATE
!mv $analyses_folder {OUT_PATH + '/' + analyses_folder}
stddata_folder = 'stddata__' + RUN_DATE
!mv $stddata_folder {OUT_PATH + '/' + stddata_folder}
from Initialization.ProcessFirehose import process_all_cancers
# Process every downloaded Firehose archive under OUT_PATH for this run
# (see Initialization.ProcessFirehose).  NOTE(review): long-running step;
# presumably idempotent, but confirm before re-running on partial data.
process_all_cancers(OUT_PATH, RUN_DATE)
Get rid of all of the downloaded zip files that we processed
!rm -rf {OUT_PATH + '/' + stddata_folder}
!rm -rf {OUT_PATH + '/' + analyses_folder}
ls
data_path = '{}/Firehose__{}/'.format(OUT_PATH, RUN_DATE)
result_path = data_path + 'ucsd_analyses/'

# Mapping of TCGA disease codes to full study names.
cancer_codes = pd.read_table('../Extra_Data/diseaseStudy.txt',
                             index_col=0, squeeze=True)

# Re-read the run manifest to record how many samples of each data type
# were ingested per cancer.
run_dir = 'http://gdac.broadinstitute.org/runs'
f = '{}/analyses__{}/ingested_data.tsv'.format(run_dir, RUN_DATE)
sample_matrix = pd.read_table(f, index_col=0).dropna()
# Drop the aggregate rows.  `.ix` is deprecated (removed in pandas 1.0);
# an order-preserving boolean mask selects the same rows.
sample_matrix = sample_matrix[~sample_matrix.index.isin(
    ['PANCAN12', 'COADREAD', 'Totals'])]

run = Run(RUN_DATE, VERSION, data_path, result_path, PARAMETERS,
          cancer_codes, sample_matrix, DESCRIPTION)
run.save()
run
def init(c, run):
    """Initialize all data objects for cancer type `c` within `run`.

    Each data platform is initialized independently: a failure in one
    (e.g. a cancer with no RPPA data) is reported and does not stop the
    others.  Prints '<cancer>\t<platform>' for every platform that fails.
    """
    # Catch Exception rather than using a bare except so KeyboardInterrupt
    # and SystemExit still propagate; the print(...) form emits the same
    # single string on both Python 2 and Python 3.
    try:
        cancer_obj = Cancer(c, run)
        cancer_obj.initialize_data(run, save=True)
    except Exception:
        print(c + '\t' + 'all')
    try:
        initialize_real(c, run.report_path, 'mRNASeq',
                        create_meta_features=True)
    except Exception:
        print(c + '\t' + 'mRNASeq')
    try:
        initialize_real(c, run.report_path, 'RPPA',
                        create_meta_features=True, create_real_features=False)
    except Exception:
        print(c + '\t' + 'RPPA')
    try:
        initialize_real(c, run.report_path, 'miRNASeq',
                        create_meta_features=False)
    except Exception:
        print(c + '\t' + 'miRNASeq')
    try:
        initialize_cn(c, run.report_path, 'CN_broad')
    except Exception:
        print(c + '\t' + 'CN')
    try:
        initialize_mut(c, run.report_path, create_meta_features=True)
    except Exception:
        print(c + '\t' + 'mut')
# Initialize every cancer type in the run; failures are reported per
# platform by init() rather than aborting the loop.
for cancer in run.cancers:
    init(cancer, run)