import pandas as pd
import os as os
from bs4 import BeautifulSoup
from urllib2 import HTTPError
import NotebookImport
from Imports import OUT_PATH
importing IPython notebook from Imports.ipynb Populating the interactive namespace from numpy and matplotlib changing to source dirctory
populating namespace with data
Download a copy of a generic cacert.pem and set PATH_TO_CACERT below to its location.
# CA certificate bundle on the local filesystem.
PATH_TO_CACERT = '/cellar/users/agross/cacert.pem'

# Page that is scanned further down for links containing '.maf'.
maf_dashboard = 'https://confluence.broadinstitute.org/display/GDAC/MAF+Dashboard'

# Directory that receives this notebook's CSV output.  The trailing slash is
# kept because file names are appended to it by plain string concatenation.
out_path = OUT_PATH + '/MAFs_new_2/'
out_dir_exists = os.path.isdir(out_path)
if not out_dir_exists:
    os.makedirs(out_path)
# IPython shell escape: fetch the page at maf_dashboard with curl, using the
# CA bundle at PATH_TO_CACERT for certificate verification, and write the
# response body to tmp.html in the current working directory.
!curl --cacert $PATH_TO_CACERT $maf_dashboard -o tmp.html
% Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 131k 0 131k 0 0 47417 0 --:--:-- 0:00:02 --:--:-- 48890
# Read the page saved as tmp.html.  The context manager closes the file
# handle, which a bare open(...).read() leaves for the garbage collector.
with open('tmp.html', 'rb') as page:
    f = page.read()

# No parser is named here, so bs4 falls back to whichever one is installed.
soup = BeautifulSoup(f)

# Collect the href of every anchor whose target contains '.maf'.  Each href
# is looked up once; anchors without an href yield None and are skipped.
hrefs = (a.get('href') for a in soup.find_all('a'))
r = [h for h in hrefs if h is not None and '.maf' in h]
# Load every linked MAF into a dict keyed by the URL it was read from.
maf = {}
for url in r:
    try:
        # The first row is the header and column 0 becomes the row index;
        # anything after a '#' on a line is ignored by the parser.
        maf[url] = pd.read_table(url, header=0, index_col=0,
                                 low_memory=True, comment='#')
    except HTTPError:
        # Report the URL that could not be fetched and carry on with the
        # rest.  print(...) with one argument prints the same text on
        # Python 2 and Python 3.
        print(url)
/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows) /cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (4) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows) /cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (86,87) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows) /cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50,79,81,82,84) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows) /cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50,67,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows) /cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (67,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows) /cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50,67,79,81,82,84) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows) /cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (50,67,79,80,81) have mixed types. Specify dtype option on import or set low_memory=False. 
data = self._reader.read(nrows) /cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (50,56,57,67,72,73,74,75,77,78,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows) /cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows) /cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50,67) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows) /cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,67,79,81,82,84) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows) /cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,67,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows) /cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (4,38,50,57,60,61,62,67,70,71,72,73,74,75,76,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows) /cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (42,43,44,45) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows) /cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50) have mixed types. Specify dtype option on import or set low_memory=False. 
data = self._reader.read(nrows) /cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows) /cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (17,18,36,63,81) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows) /cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,79,81,82,84) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows) /cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (50,67,79,81,82,84) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows) /cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (48,50,67) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows) /cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (38,50,85) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows)
https://tcga-data.nci.nih.gov/tcgafiles/ftp_auth/distro_ftpusers/anonymous/tumor/laml/gsc/genome.wustl.edu/illuminaga_dnaseq/mutations/genome.wustl.edu_LAML.IlluminaGA_DNASeq.Level_2.1.2.0/genome.wustl.edu_LAML.IlluminaGA_DNASeq.preliminary.1.maf
# Stack the per-URL tables into one frame; concat uses the dict keys as the
# outer level of the resulting row index.
m2 = pd.concat(maf)

# Discard columns that hold no value in any row.
m3 = m2.dropna(axis=1, how='all')

cols = [
    'NCBI_Build', 'Chromosome', 'Start_position',
    'End_position', 'Strand', 'Reference_Allele',
    'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2',
    'Tumor_Sample_Barcode', 'Protein_Change',
    'Variant_Classification', 'Variant_Type',
]

# Columns that identify one record for de-duplication.  Hugo_Symbol is not in
# `cols`; presumably it comes from the row index (the tables were read with
# index_col=0) -- confirm against the MAF header.
dedup_keys = ['Hugo_Symbol', 'Tumor_Sample_Barcode', 'Start_position']

# Select the columns of interest, turn the index levels into ordinary
# columns, collapse each key group to the first non-null entry of every
# remaining column, and flatten the group keys back into columns.
m4 = (
    m3[cols]
    .reset_index()
    .groupby(dedup_keys)
    .first()
    .reset_index()
)
m4.to_csv(out_path + 'mega_maf.csv')
# Keep only rows whose Variant_Classification is not 'Silent'.  .loc with a
# boolean mask selects the same rows as the earlier .ix lookup; .ix was
# deprecated in pandas 0.20 and removed in pandas 1.0.
m5 = m4.loc[m4.Variant_Classification != 'Silent']

# Count the remaining rows for each (Hugo_Symbol, Tumor_Sample_Barcode) pair,
# turn the grouped result back into a flat table and write it out.
cc = m5.groupby(['Hugo_Symbol', 'Tumor_Sample_Barcode']).size()
cc = cc.reset_index()
cc.to_csv(out_path + 'meta.csv')
cc.shape
(1411730, 3)