cd ../src/ %pylab inline from Processing.Imports import * PATH = '/cellar/data/TCGA/protected/exome_andy/p53_all/' pts_snv = [f[:12] for f in os.listdir(PATH) if 'call_stats' in f and '~' not in f] vcf = pd.concat({p: pd.read_table(PATH + '{}_call_stats.txt'.format(p), skiprows=[0]) for p in pts_snv}) vcf = vcf.reset_index() vcf = vcf.rename(columns={'level_0':'barcode'}) vcf['barcode'] = vcf['barcode'].map(lambda s: s.replace('_','-')) pts_snv = map(lambda p: p.replace('_','-'), pts_snv) vcf = vcf[vcf.contig == 17] tab = vcf.groupby(['tumor_name','contig']).size().unstack() vcf = vcf[vcf.judgement == 'KEEP'] indels = {} pts_indels = {} for f in os.listdir(PATH): if not f.endswith('indels.txt'): continue pts_indels[f] = '-'.join(f.split('_')[:3]) try: indels[pts_indels[f]] = pd.read_table(PATH + f, skiprows=102, header=None) except ValueError: pass pts_indels = pd.Series(pts_indels) indels = pd.concat(indels) indels.index.names = ['barcode','num'] indels.columns = ['chromosome','pos','id','ref','alt','qual','filter','info', 'format','tumor','normal'] indels = indels.reset_index(0) indels = indels[indels['info'] == 'SOMATIC'] pts = list(set(pts_indels).intersection(set(pts_snv))) len(pts), len(pts_indels), len(pts_snv) def format_indels_for_oncotator(s): r = {} r['chr'] = s['chromosome'] r['start'] = s['pos'] r['end'] = int(s['pos']) + len(s['ref']) - 1 r['reference_allele'] = s['ref'] r['observed_allele'] = s['alt'] return pd.Series(r) def format_snvs_for_oncotator(s): r = {} r['chr'] = s['contig'] r['start'] = s['position'] r['end'] = s['position'] r['reference_allele'] = s['ref_allele'] r['observed_allele'] = s['alt_allele'] r = pd.Series(r) return r onc_indel = indels.set_index('barcode').apply(format_indels_for_oncotator,1) onc_vcf = vcf.set_index('barcode').apply(format_snvs_for_oncotator, 1) onc = pd.concat([onc_indel, onc_vcf]) onc = onc.reset_index('barcode') onc = onc[['chr', 'start', 'end', 'reference_allele', 'observed_allele']] onc.to_csv('tmp.txt', sep=' ', index=False, header=False) ONCOTATOR_FILE = '/cellar/users/agross/Downloads/oncotator_output (9).txt' tab = pd.read_table(ONCOTATOR_FILE, skiprows=[0]) tt = tab[['Chromosome','Start_position','End_position','Reference_Allele', 'Tumor_Seq_Allele1']] tt = pd.Series({i: tuple(v) for i,v in tt.iterrows()}) onc_indel = indels.set_index('barcode').apply(format_indels_for_oncotator,1) onc_vcf = vcf.set_index('barcode').apply(format_snvs_for_oncotator, 1) onc = pd.concat([onc_indel, onc_vcf]) oo = onc[['chr','start','end','reference_allele','observed_allele']] oo = [tuple(v) for i,v in oo.iterrows()] maf = pd.DataFrame(tab.ix[tt[tt == s].index[0]] for s in oo) maf.index = onc.index maf = maf.set_index('Hugo_Symbol', append=True) vc = maf.Variant_Classification.copy() vc.index = vc.index.droplevel(1) vc = (vc.isin(['Silent', 'Intron', "3'UTR", "5'UTR"])==False).groupby(level=0).sum() vc = vc.ix[pts].fillna(0) > 0 vc.name = 'new' vc.value_counts() wildtype = pd.Series({(p, 'wildtype'):nan for p in vc[vc==False].index}) wildtype.index = pd.MultiIndex.from_tuples(wildtype.index) wildtype = pd.DataFrame({'Entrez_Gene_Id': wildtype}) maf_wt = maf.append(wildtype)[maf.columns] maf_wt = maf_wt.dropna(axis=1, how='all') maf.to_csv('../Extra_Data/p53_calls_pancancer.maf') vc.to_csv('../Extra_Data/p53_calls_pancancer.csv') old = pd.read_csv('../Data/MAFs/meta.csv') old['Tumor_Sample_Barcode'] = map(lambda s: s[:12], old['Tumor_Sample_Barcode']) p53_old = old[old.Hugo_Symbol == 'TP53'].set_index('Tumor_Sample_Barcode')['0'] p53_old = p53_old.ix[old.Tumor_Sample_Barcode.unique()].fillna(0) p53_old = p53_old.groupby(level=0).first() p53_old.name = 'old' pd.crosstab(vc>0, p53_old>0) 1400. / (1400 + 90) 2103. / (2103 + 79)