import os
import pandas as pd
from bs4 import BeautifulSoup
from urllib2 import HTTPError

import NotebookImport
from Imports import OUT_PATH

PATH_TO_CACERT = '/cellar/users/agross/cacert.pem'

out_path = OUT_PATH + '/MAFs_new_2/'
if not os.path.isdir(out_path):
    os.makedirs(out_path)

# Pull down the Broad GDAC MAF Dashboard page and scrape out every MAF link.
maf_dashboard = 'https://confluence.broadinstitute.org/display/GDAC/MAF+Dashboard'
!curl --cacert $PATH_TO_CACERT $maf_dashboard -o tmp.html

html = open('tmp.html', 'rb').read()
soup = BeautifulSoup(html)
maf_urls = [a.get('href') for a in soup.find_all('a')
            if a.get('href') is not None and '.maf' in a.get('href')]

# Read each MAF directly from its URL; comment='#' skips the version-header
# lines at the top of each file. Any URL that errors out is printed and skipped.
maf = {}
for url in maf_urls:
    try:
        maf[url] = pd.read_table(url, header=0, index_col=0,
                                 low_memory=True, comment='#')
    except HTTPError:
        print url

# Stack all the MAFs into one frame and drop columns that are entirely empty.
m2 = pd.concat(maf)
m3 = m2.dropna(axis=1, how='all')

cols = ['NCBI_Build', 'Chromosome', 'Start_position', 'End_position', 'Strand',
        'Reference_Allele', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2',
        'Tumor_Sample_Barcode', 'Protein_Change', 'Variant_Classification',
        'Variant_Type']
m4 = m3[cols]

# De-duplicate calls reported in more than one MAF: keep the first record
# for each (gene, sample, position) triple.
m4 = m4.reset_index()
m4 = m4.groupby(['Hugo_Symbol', 'Tumor_Sample_Barcode', 'Start_position']).first()
m4 = m4.reset_index()
m4.to_csv(out_path + 'mega_maf.csv')

# Count non-silent mutations per gene per tumor sample.
m5 = m4[m4.Variant_Classification != 'Silent']
cc = m5.groupby(['Hugo_Symbol', 'Tumor_Sample_Barcode']).size()
cc = cc.reset_index()
cc.to_csv(out_path + 'meta.csv')
cc.shape
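
# The per-gene, per-sample counts in cc are often more useful reshaped into a
# gene x sample matrix for downstream analysis. A minimal sketch of that pivot
# follows; the column name 'n_non_silent' and the variable 'hit_matrix' are
# illustrative assumptions, not names used elsewhere in this pipeline.
cc.columns = ['Hugo_Symbol', 'Tumor_Sample_Barcode', 'n_non_silent']
counts = cc.pivot(index='Hugo_Symbol', columns='Tumor_Sample_Barcode',
                  values='n_non_silent')
# Binary indicator: 1 if the gene carries any non-silent mutation in a sample.
hit_matrix = (counts.fillna(0) > 0).astype(int)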