cd ../src/

%pylab inline

from Processing.Imports import *

PATH = '/cellar/data/TCGA/protected/exome_andy/p53_all/'

pts_snv = [f[:12] for f in os.listdir(PATH) if 'call_stats' in f and '~' not in f]
vcf = pd.concat({p: pd.read_table(PATH + '{}_call_stats.txt'.format(p), skiprows=[0]) 
                 for p in pts_snv})
vcf = vcf.reset_index()
vcf = vcf.rename(columns={'level_0':'barcode'})
vcf['barcode'] = vcf['barcode'].map(lambda s: s.replace('_','-'))
pts_snv = map(lambda p: p.replace('_','-'), pts_snv)

vcf = vcf[vcf.contig == 17]
tab = vcf.groupby(['tumor_name','contig']).size().unstack()
vcf = vcf[vcf.judgement == 'KEEP']

indels = {}
pts_indels = {}
for f in os.listdir(PATH):
    if not f.endswith('indels.txt'):
        continue
    pts_indels[f] = '-'.join(f.split('_')[:3])
    try:
        indels[pts_indels[f]] = pd.read_table(PATH + f, skiprows=102, header=None)
    except ValueError:
        pass
pts_indels = pd.Series(pts_indels)
indels = pd.concat(indels)
indels.index.names = ['barcode','num']
indels.columns = ['chromosome','pos','id','ref','alt','qual','filter','info',
                  'format','tumor','normal']
indels = indels.reset_index(0)
indels = indels[indels['info'] == 'SOMATIC']

pts = list(set(pts_indels).intersection(set(pts_snv)))
len(pts), len(pts_indels), len(pts_snv)

def format_indels_for_oncotator(s):
    r = {}
    r['chr'] = s['chromosome']
    r['start'] = s['pos']
    r['end'] = int(s['pos']) + len(s['ref']) - 1
    r['reference_allele'] = s['ref']
    r['observed_allele'] = s['alt']
    return pd.Series(r)

def format_snvs_for_oncotator(s):
    r = {}
    r['chr'] = s['contig']
    r['start'] = s['position']
    r['end'] = s['position']
    r['reference_allele'] = s['ref_allele']
    r['observed_allele'] = s['alt_allele']
    r = pd.Series(r)
    return r

onc_indel = indels.set_index('barcode').apply(format_indels_for_oncotator,1)
onc_vcf = vcf.set_index('barcode').apply(format_snvs_for_oncotator, 1)
onc = pd.concat([onc_indel, onc_vcf])
onc = onc.reset_index('barcode')
onc = onc[['chr', 'start', 'end', 'reference_allele', 'observed_allele']]
onc.to_csv('tmp.txt', sep=' ', index=False, header=False)

ONCOTATOR_FILE = '/cellar/users/agross/Downloads/oncotator_output (9).txt'

tab = pd.read_table(ONCOTATOR_FILE, skiprows=[0])
tt = tab[['Chromosome','Start_position','End_position','Reference_Allele', 'Tumor_Seq_Allele1']]
tt = pd.Series({i: tuple(v) for i,v in tt.iterrows()})

onc_indel = indels.set_index('barcode').apply(format_indels_for_oncotator,1)
onc_vcf = vcf.set_index('barcode').apply(format_snvs_for_oncotator, 1)
onc = pd.concat([onc_indel, onc_vcf])
oo = onc[['chr','start','end','reference_allele','observed_allele']]
oo = [tuple(v) for i,v in oo.iterrows()]
maf = pd.DataFrame(tab.ix[tt[tt == s].index[0]] for s in oo)
maf.index = onc.index
maf = maf.set_index('Hugo_Symbol', append=True)

vc = maf.Variant_Classification.copy()
vc.index = vc.index.droplevel(1)
vc = (vc.isin(['Silent', 'Intron', "3'UTR", "5'UTR"])==False).groupby(level=0).sum()
vc = vc.ix[pts].fillna(0) > 0
vc.name = 'new'
vc.value_counts()

wildtype = pd.Series({(p, 'wildtype'):nan for p in vc[vc==False].index})
wildtype.index = pd.MultiIndex.from_tuples(wildtype.index)
wildtype = pd.DataFrame({'Entrez_Gene_Id': wildtype})
maf_wt = maf.append(wildtype)[maf.columns]
maf_wt = maf_wt.dropna(axis=1, how='all')

maf.to_csv('../Extra_Data/p53_calls_pancancer.maf')
vc.to_csv('../Extra_Data/p53_calls_pancancer.csv')

old = pd.read_csv('../Data/MAFs/meta.csv')
old['Tumor_Sample_Barcode'] = map(lambda s: s[:12], old['Tumor_Sample_Barcode'])

p53_old = old[old.Hugo_Symbol == 'TP53'].set_index('Tumor_Sample_Barcode')['0']
p53_old = p53_old.ix[old.Tumor_Sample_Barcode.unique()].fillna(0)
p53_old = p53_old.groupby(level=0).first()
p53_old.name = 'old'

pd.crosstab(vc>0, p53_old>0)

1400. / (1400 + 90)

2103. / (2103 + 79)