import os
import pandas as pd
from bs4 import BeautifulSoup
from urllib2 import HTTPError

import NotebookImport
from Imports import OUT_PATH

PATH_TO_CACERT = '/cellar/users/agross/cacert.pem'

out_path = OUT_PATH + '/MAFs_new_2/'
if not os.path.isdir(out_path):
    os.makedirs(out_path)

# Pull down the Broad GDAC MAF Dashboard page and scrape out every MAF link.
maf_dashboard = 'https://confluence.broadinstitute.org/display/GDAC/MAF+Dashboard'
!curl --cacert $PATH_TO_CACERT $maf_dashboard -o tmp.html

html = open('tmp.html', 'rb').read()
soup = BeautifulSoup(html)
maf_urls = [a.get('href') for a in soup.find_all('a')
            if a.get('href') is not None and '.maf' in a.get('href')]

# Read each MAF directly from its URL; comment='#' skips the version-header
# lines at the top of each file. Any URL that errors out is printed and skipped.
maf = {}
for url in maf_urls:
    try:
        maf[url] = pd.read_table(url, header=0, index_col=0,
                                 low_memory=True, comment='#')
    except HTTPError:
        print url

# Stack all the MAFs into one frame and drop columns that are entirely empty.
m2 = pd.concat(maf)
m3 = m2.dropna(axis=1, how='all')

cols = ['NCBI_Build', 'Chromosome', 'Start_position', 'End_position', 'Strand',
        'Reference_Allele', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2',
        'Tumor_Sample_Barcode', 'Protein_Change', 'Variant_Classification',
        'Variant_Type']
m4 = m3[cols]

# De-duplicate calls reported in more than one MAF: keep the first record
# for each (gene, sample, position) triple.
m4 = m4.reset_index()
m4 = m4.groupby(['Hugo_Symbol', 'Tumor_Sample_Barcode', 'Start_position']).first()
m4 = m4.reset_index()
m4.to_csv(out_path + 'mega_maf.csv')

# Count non-silent mutations per gene per tumor sample.
m5 = m4[m4.Variant_Classification != 'Silent']
cc = m5.groupby(['Hugo_Symbol', 'Tumor_Sample_Barcode']).size()
cc = cc.reset_index()
cc.to_csv(out_path + 'meta.csv')
cc.shape
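
# The per-gene, per-sample counts in cc are often more useful reshaped into a
# gene x sample matrix for downstream analysis. A minimal sketch of that pivot
# follows; the column name 'n_non_silent' and the variable 'hit_matrix' are
# illustrative assumptions, not names used elsewhere in this pipeline.
cc.columns = ['Hugo_Symbol', 'Tumor_Sample_Barcode', 'n_non_silent']
counts = cc.pivot(index='Hugo_Symbol', columns='Tumor_Sample_Barcode',
                  values='n_non_silent')
# Binary indicator: 1 if the gene carries any non-silent mutation in a sample.
hit_matrix = (counts.fillna(0) > 0).astype(int)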