css
Here we are reading in variant calls for our molecular validation cohort. This script is still a little dirty... aka hard paths and the like. The rest of the analysis can be picked up from the MAF file that we generate here, but if you are actually trying to run this part of the analysis please contact me.
cd ../src/
/cellar/users/agross/TCGA_Code/TCGA/src
%pylab inline
Populating the interactive namespace from numpy and matplotlib
from Processing.Imports import *
PATH = '/cellar/data/TCGA/protected/exome_andy/hnsc_targeted/'
Read in all of the VCF files and pull out the SNVs.
pts_snv = [f[:12] for f in os.listdir(PATH) if 'call_stats' in f and '~' not in f]
vcf = pd.concat({p: pd.read_table(PATH + '{}_call_stats.txt'.format(p), skiprows=[0])
for p in pts_snv})
vcf = vcf.reset_index()
vcf = vcf.rename(columns={'level_0':'barcode'})
vcf['barcode'] = vcf['barcode'].map(lambda s: s.replace('_','-'))
pts_snv = map(lambda p: p.replace('_','-'), pts_snv)
pts_snv = [f[:12] for f in os.listdir(path) if 'call_stats' in f and '~' not in f] vcf = pd.concat([pd.read_table(path + '{}call_stats.txt'.format(p), skiprows=[0]) for p in pts_snv]) vcf['barcode'] = vcf.tumor_name.map(lambda s: s[:12]) pts_snv = map(lambda p: p.replace('','-'), pts_snv) tab = vcf.groupby(['tumor_name','contig']).size().unstack()
MuTect sometimes likes to fail ungracefully. We can tell this when we see no variants in the last few genes. Here I check for this and filter out those patients.
tab = vcf.groupby(['tumor_name','contig']).size().unstack()
drop = tab[[19,20,22]].isnull().sum(1).order() == 3
drop = [p[:12] for p,v in drop.iteritems() if v == True]
pts_snv = [p for p in pts_snv if p not in drop]
vcf = vcf[vcf.judgement == 'KEEP']
Read in all of the indel calls. I have this funky try catch because some of the files have the VCF header but no actual calls, Pandas does not like this. For this reason I keep track of the patients with the pts_indels dict and the indel calls with the indels dict, and then put it all together.
indels = {}
pts_indels = {}
for f in os.listdir(PATH):
if not f.endswith('indels.txt'):
continue
pts_indels[f] = '-'.join(f.split('_')[:3])
try:
indels[pts_indels[f]] = pd.read_table(PATH + f, skiprows=102, header=None)
except:
pass
pts_indels = pd.Series(pts_indels)
indels = pd.concat(indels)
indels.index.names = ['barcode','num']
indels.columns = ['chromosome','pos','id','ref','alt','qual','filter','info',
'format','tumor','normal']
indels = indels.reset_index(0)
indels = indels[indels['info'] == 'SOMATIC']
Not all variant calling runs were sucessfull. Here we are only using patients with a sucessfull SomaticIndelDetector run and a sucessfull MuTect run.
pts = list(set(pts_indels).intersection(set(pts_snv)))
len(pts), len(pts_indels), len(pts_snv)
(462, 463, 462)
def format_indels_for_oncotator(s):
r = {}
r['Chromosome'] = s['chromosome']
r['Start_Position'] = s['pos']
r['End_Position'] = int(s['pos']) + len(s['ref']) - 1
#if r['Start_Position'] > r['End_Position']:
# r['Start_Position'] = r['End_Position']
# r['End_Position'] = r['Start_Position']
r['Reference_Allele'] = s['ref']
r['Tumor_Seq_Allele1'] = s['alt']
r['Tumor_Seq_Allele2'] = s['alt']
r['NCBI_Build'] = '37'
return pd.Series(r)
def format_snvs_for_oncotator(s):
r = {}
r['Chromosome'] = s['contig']
r['Start_Position'] = s['position']
r['End_Position'] = s['position']
r['Reference_Allele'] = s['ref_allele']
r['Tumor_Seq_Allele1'] = s['alt_allele']
r['Tumor_Seq_Allele2'] = s['alt_allele']
r['NCBI_Build'] = '37'
return pd.Series(r)
onc_indel = indels.set_index('barcode').apply(format_indels_for_oncotator,1)
onc_vcf = vcf.set_index('barcode').apply(format_snvs_for_oncotator, 1)
onc = pd.concat([onc_indel, onc_vcf])
onc = onc.reset_index('barcode')
onc.to_csv('tmp.txt', sep=' ', index=False)
!java -Xmx1524M -jar /cellar/users/agross/sources/oncotator/oncotator.jar -nocache tmp.maf tmp1.maf
maf = pd.read_table('tmp1.maf')
maf = maf.set_index(['barcode','ONCOTATOR_GENE_SYMBOL'])
classification = maf['ONCOTATOR_VARIANT_CLASSIFICATION_BEST_EFFECT']
def format_indels_for_oncotator(s):
r = {}
r['chr'] = s['chromosome']
r['start'] = s['pos']
r['end'] = int(s['pos']) + len(s['ref']) - 1
r['reference_allele'] = s['ref']
r['observed_allele'] = s['alt']
return pd.Series(r)
def format_snvs_for_oncotator(s):
r = {}
r['chr'] = s['contig']
r['start'] = s['position']
r['end'] = s['position']
r['reference_allele'] = s['ref_allele']
r['observed_allele'] = s['alt_allele']
r = pd.Series(r)
return r
onc_indel = indels.set_index('barcode').apply(format_indels_for_oncotator,1)
onc_vcf = vcf.set_index('barcode').apply(format_snvs_for_oncotator, 1)
onc = pd.concat([onc_indel, onc_vcf])
onc = onc.reset_index('barcode')
onc = onc[['chr', 'start', 'end', 'reference_allele', 'observed_allele']]
onc.to_csv('tmp.txt', sep=' ', index=False, header=False)
onc.shape
(819, 5)
Best way is to paste variants into the Oncotator web site... need to find a better way.
tab = pd.read_table('/cellar/users/agross/Downloads/oncotator_output (5).txt', skiprows=[0])
tt = tab[['Chromosome','Start_position','End_position','Reference_Allele', 'Tumor_Seq_Allele1']]
tt = bpd.Series({i: tuple(v) for i,v in tt.iterrows()})
onc_indel = indels.set_index('barcode').apply(format_indels_for_oncotator,1)
onc_vcf = vcf.set_index('barcode').apply(format_snvs_for_oncotator, 1)
onc = pd.concat([onc_indel, onc_vcf])
oo = onc[['chr','start','end','reference_allele','observed_allele']]
oo = [tuple(v) for i,v in oo.iterrows()]
maf = pd.DataFrame(tab.ix[tt[tt == s].index[0]] for s in oo)
maf.index = onc.index
maf = maf.set_index('Hugo_Symbol', append=True)
m = maf.dropna(axis=1, how='all')
m['Included in Analysis'] = maf.Variant_Classification.isin(['Silent', 'Intron', "3'UTR", "5'UTR"])==False
m = m.iloc[[i[0] not in mut.features.columns for i in m.index]]
m.to_csv('/cellar/users/agross/figures/supplemental_table3.csv')
print '\n'.join(list((set(pts) - set(m.index.get_level_values(0).unique()) - set(mut.features.columns))))
TCGA-UP-A6WW TCGA-BB-A6UM TCGA-CQ-A4CH TCGA-CV-A6K0 TCGA-CN-A49C TCGA-DQ-7593 TCGA-MZ-A6I9 TCGA-CN-A640 TCGA-P3-A6SW TCGA-BA-A6DF TCGA-UF-A7JJ TCGA-CN-A6V7 TCGA-RS-A6TP TCGA-KU-A6H8 TCGA-CQ-7069 TCGA-TN-A7HJ TCGA-TN-A7HI TCGA-T2-A6X0 TCGA-KU-A6H7 TCGA-MT-A67G TCGA-CV-A45T TCGA-UF-A7JV TCGA-CN-A6UY TCGA-DQ-7596 TCGA-MZ-A5BI TCGA-QK-A6IF TCGA-CV-A45Y
maf.Variant_Classification.value_counts()
Missense_Mutation 415 Intron 146 Nonsense_Mutation 91 Silent 66 Splice_Site 35 Frame_Shift_Del 33 Frame_Shift_Ins 17 3'UTR 7 In_Frame_Del 5 5'UTR 3 In_Frame_Ins 1 dtype: int64
mut_new = (maf.Variant_Classification.isin(['Silent', 'Intron', "3'UTR", "5'UTR"])==False).groupby(level=[0,1]).sum()
mut_new = mut_new.unstack().ix[pts].fillna(0).T
(mut_new > 0).sum(1).order()
Hugo_Symbol GRB2 0 MAPK3 0 SNAPC5 0 KRAS 1 NRAS 1 YWHAB 2 RAF1 3 IRS1 4 MAP2K1 6 MAP2K2 6 MAPK1 7 SOS1 8 IRS2 9 EGFR 14 HRAS 28 ASPM 29 CASP8 47 MUC5B 48 TP53 307 dtype: int64
len(pts) - 306
156
maf.xs('HRAS', level=1).Protein_Change.value_counts()
p.G13V 8 p.G12S 6 p.G13R 3 p.Q61L 3 p.G12C 2 p.G12D 2 p.K117N 1 p.H27H 1 p.E76* 1 p.Q61H 1 p.A11D 1 p.G12V 1 p.G12A 1 p.G13C 1 dtype: int64
params = pd.read_table('../global_params.txt', header=None, squeeze=True,
index_col=0)
run_path = '{}/Firehose__{}/'.format(params.ix['OUT_PATH'], params.ix['RUN_DATE'])
run = get_run(run_path, 'Run_' + params.ix['VERSION'])
cancer = run.load_cancer(params.ix['CANCER'])
clinical = cancer.load_clinical()
cn = cancer.load_data('CN_broad')
cn.uncompress()
mut = cancer.load_data('Mutation')
mut.uncompress()
del_3p = cn.features.ix['Deletion'].ix['3p14.2'].ix[0]
del_3p = del_3p.ix[del_3p.index.diff(ti(clinical.hpv_inferred==1))]
rna = pickle.load(open(cancer.path + '/mRNASeq/store/no_hpv.p', 'rb'))
mirna = pickle.load(open(cancer.path + '/miRNASeq/store/no_hpv.p', 'rb'))
surv = clinical.survival.survival_5y
hpv_inferred = clinical.processed.hpv_inferred
keepers_o = true_index(hpv_inferred == 0)
keepers_o = keepers_o.intersection(mut.features.columns)
keepers_o = keepers_o.intersection(cn.features.columns)
keepers_o = keepers_o.intersection(surv.unstack().index)
keepers_o = keepers_o.intersection(rna.features.columns)
keepers_o = keepers_o.intersection(mirna.features.columns)
keepers_o = keepers_o.intersection(ti(clinical.clinical.age < 85))
len(keepers_o)
251
survival_and_stats(combine(mut_new.ix['TP53']>0, mut.features.ix['TP53']), surv)
pd.crosstab(mut.features.ix['TP53'], pd.Series(mut_new.ix['TP53']>0, name='new'))
new | False | True |
---|---|---|
TP53 | ||
0 | 74 | 4 |
1 | 14 | 181 |
2 rows × 2 columns
import Data.Firehose as FH
mmm = FH.get_submaf(run.data_path, 'HNSC', genes=['TP53'])
cc = combine(mut_new.ix['TP53']>0, mut.features.ix['TP53'])
mmm.set_index('Tumor_Sample_Barcode').ix[ti(cc=='second')].sort(axis=0)
NCBI_Build | Chromosome | Start_position | End_position | Strand | Reference_Allele | Alt_Allele | |
---|---|---|---|---|---|---|---|
Tumor_Sample_Barcode | |||||||
TCGA-BA-7269 | 37 | 17 | 7578180 | 7578181 | + | - | T |
TCGA-BB-4217 | 37 | 17 | 7578457 | 7578457 | + | C | A |
TCGA-CN-5358 | 37 | 17 | 7578222 | 7578223 | + | TC | - |
TCGA-CN-5360 | 37 | 17 | 7578474 | 7578474 | + | C | - |
TCGA-CN-6017 | 37 | 17 | 7579441 | 7579453 | + | CGGTGTAGGAGCT | - |
TCGA-CR-7386 | 37 | 17 | 7578217 | 7578218 | + | GT | - |
TCGA-CR-7393 | 37 | 17 | 7579566 | 7579566 | + | C | - |
TCGA-CV-5970 | 37 | 17 | 7577538 | 7577538 | + | C | T |
TCGA-CV-6960 | 37 | 17 | 7577142 | 7577143 | + | CC | AA |
TCGA-CV-7097 | 37 | 17 | 7576840 | 7576851 | + | CAAGACTTAGTA | - |
TCGA-CV-7178 | 37 | 17 | 7577120 | 7577137 | + | CGCACCTCAAAGCTGTTC | - |
TCGA-CV-7243 | 37 | 17 | 7578446 | 7578466 | + | TGGCCATGGCGCGGACGCGGG | - |
TCGA-CV-7432 | 37 | 17 | 7577554 | 7577564 | + | TGCAGGAACTG | - |
TCGA-CV-7432 | 37 | 17 | 7579312 | 7579312 | + | C | A |
TCGA-DQ-5630 | 37 | 17 | 7576868 | 7576886 | + | TTCTCCATCCAGTGGTTTC | - |
15 rows × 7 columns
fisher_exact_test(clinical.hpv_inferred,
mut_new.ix['TP53'].ix[mut_new.columns.diff(mut.features.columns)]>0)
odds_ratio 7.06e-02 p 7.60e-06 dtype: float64
(combo_new == 'both').value_counts()
True 66 False 45 dtype: int64
print len(combine(mut_new.ix['TP53'].ix[mut_new.columns.diff(mut.features.columns)]>0,
clinical.hpv_inferred))
pd.crosstab(mut_new.ix['TP53'].ix[mut_new.columns.diff(mut.features.columns)]>0,
clinical.hpv_inferred)
130
h | False | True |
---|---|---|
TP53 | ||
False | 28 | 14 |
True | 85 | 3 |
2 rows × 2 columns
fisher_exact_test(mut.features.ix['TP53'], clinical.hpv_inferred)
odds_ratio 6.25e-03 p 4.79e-23 dtype: float64
print len(combine(mut.features.ix['TP53'], clinical.hpv_inferred))
pd.crosstab(mut.features.ix['TP53'], clinical.hpv_inferred)
306
d | False | True |
---|---|---|
TP53 | ||
0 | 53 | 40 |
1 | 212 | 1 |
2 rows × 2 columns
pd.crosstab(del_3p<0, mut_new.ix['TP53']>0)
TP53 | False | True |
---|---|---|
Lesion | ||
False | 44 | 38 |
True | 39 | 227 |
2 rows × 2 columns
fisher_exact_test(del_3p<0, mut_new.ix['TP53']>0)
odds_ratio 6.74e+00 p 1.23e-11 dtype: float64
new = pd.Series(1, index=mut_new.columns.intersection(ti(hpv_inferred==False)).diff(mut.features.columns))
new = new.ix[keepers_o.union(new.index)].fillna(0)
new.value_counts()
0 251 1 113 dtype: int64
p53 = mut_new.ix['TP53']>0
combo = combine(p53, del_3p.dropna()<0)
combo_new = combo.ix[ti(new>0)].dropna()
fisher_exact_test(p53.ix[ti(new>0)], del_3p<0)
odds_ratio 6.99e+00 p 3.74e-05 dtype: float64
r1 = screen_feature(p53.ix[ti(new>0)], fisher_exact_test, cn.features.ix['Deletion'] < 0)
del r1['q']
r1['p_bonf'] = r1.p * len(r1)
r1.index = [i[0] for i in r1.index]
r2 = screen_feature(p53.ix[ti(new>0)], fisher_exact_test, cn.features.ix['Amplification'] > 0)
del r2['q']
r2['p_bonf'] = r2.p * len(r2)
r2.index = [i[0] for i in r2.index]
r3 = screen_feature(mut.features.ix['TP53'].ix[keepers_o], fisher_exact_test,
cn.features.ix['Deletion'] < 0)
del r3['q']
r3['p_bonf'] = r3.p * len(r3)
r3.index = [i[0] for i in r3.index]
r4 = screen_feature(mut.features.ix['TP53'].ix[keepers_o], fisher_exact_test,
cn.features.ix['Amplification'] > 0)
del r4['q']
r4['p_bonf'] = r4.p * len(r4)
r4.index = [i[0] for i in r4.index]
r3['odds_ratio_validation'] = r1.odds_ratio
r4['odds_ratio_validation'] = r2.odds_ratio
r3['p_validation'] = r1.p
r4['p_validation'] = r2.p
tab = pd.concat([r3.head(6), r4.head(6)], keys=['Deletion','Amplification'])
cols = [('odds ratio','test'),('p','test'),('q','bonf'), ('odds ratio','validation'), ('p', 'validation')]
tab.columns = pd.MultiIndex.from_tuples(cols)
tab.sort_index(axis=1)
odds ratio | p | q | ||||
---|---|---|---|---|---|---|
test | validation | test | validation | bonf | ||
Deletion | 3p14.3 | 6.34 | 6.99 | 5.34e-07 | 3.74e-05 | 2.56e-05 |
3p14.2 | 6.34 | 6.99 | 5.34e-07 | 3.74e-05 | 2.56e-05 | |
3p25.3 | 5.73 | 5.21 | 7.73e-07 | 3.94e-04 | 3.71e-05 | |
3p12.2 | 4.88 | 5.67 | 7.98e-06 | 1.89e-04 | 3.83e-04 | |
10p15.3 | 4.10 | 4.48 | 4.17e-04 | 1.57e-02 | 2.00e-02 | |
11q14.2 | 3.93 | 1.21 | 7.13e-04 | 8.16e-01 | 3.42e-02 | |
Amplification | 3q26.33 | 6.06 | 4.99 | 1.53e-07 | 5.37e-04 | 3.98e-06 |
8q24.21 | 4.08 | 3.64 | 5.14e-05 | 1.29e-02 | 1.34e-03 | |
12p13.33 | 3.30 | 2.16 | 1.68e-03 | 3.01e-01 | 4.36e-02 | |
18p11.31 | 2.68 | 3.19 | 1.14e-02 | 7.56e-02 | 2.96e-01 | |
9p24.1 | 0.41 | 0.79 | 1.48e-02 | 6.24e-01 | 3.84e-01 | |
8q11.21 | 2.04 | 3.15 | 3.33e-02 | 1.67e-02 | 8.66e-01 |
12 rows × 5 columns
print len(combine(p53.ix[ti(new>0)], del_3p<0))
pd.crosstab(p53.ix[ti(new>0)], del_3p<0)
111
Lesion | False | True |
---|---|---|
TP53 | ||
False | 18 | 10 |
True | 17 | 66 |
2 rows × 2 columns
combo_old = combine(mut.features.ix['TP53'] > 0, cn.features.ix['Deletion'].ix['3p14.2'].ix[0] < 0)
combo_old = combo_old.ix[keepers_o]
combo_old.ix[keepers_o].value_counts()
both 179 Lesion 27 TP53 23 neither 22 dtype: int64
fisher_exact_test(combo_new=='both', mut_new.ix['CASP8']>0)
odds_ratio 5.15e-02 p 2.13e-06 dtype: float64
pd.crosstab(combo_new=='both', mut_new.ix['CASP8']>0)
CASP8 | False | True |
---|---|---|
TP53 | ||
False | 28 | 17 |
True | 64 | 2 |
2 rows × 2 columns
v = mut_new.ix[run.gene_sets['REACTOME_SOS_MEDIATED_SIGNALLING']].sum()>0
fisher_exact_test(combo_new=='both', v)
odds_ratio 7.14e-02 p 3.62e-06 dtype: float64
pd.crosstab(combo_new=='both', v)
d | False | True |
---|---|---|
TP53 | ||
False | 27 | 18 |
True | 63 | 3 |
2 rows × 2 columns
ras_p = mut.features.ix['REACTOME_SOS_MEDIATED_SIGNALLING'].combine_first(v) > 0
fisher_exact_test(combo=='both', ras_p)
odds_ratio 1.05e-01 p 3.80e-11 dtype: float64
from Figures.MemoPlot import pathway_plot
combo_t.value_counts()
both 251 neither 42 TP53 42 Lesion 37 dtype: int64
len(ti(combo_t != 'both'))
121
combo_t.value_counts()
both 251 neither 42 TP53 42 Lesion 37 dtype: int64
df
TCGA-BA-4074 | TCGA-BA-4075 | TCGA-BA-4076 | TCGA-BA-4077 | TCGA-BA-4078 | TCGA-BA-5149 | TCGA-BA-5151 | TCGA-BA-5152 | TCGA-BA-5153 | TCGA-BA-5555 | TCGA-BA-5556 | TCGA-BA-5557 | TCGA-BA-5558 | TCGA-BA-5559 | TCGA-BA-6868 | TCGA-BA-6869 | TCGA-BA-6870 | TCGA-BA-6871 | TCGA-BA-6872 | TCGA-BA-6873 | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
SOS1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
MAP2K2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
HRAS | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
NRAS | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
IRS1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
IRS2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
MAPK3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
MAP2K1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
GRB2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
YWHAB | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
MAPK1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
RAF1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
KRAS | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
13 rows × 495 columns
p = 'REACTOME_SOS_MEDIATED_SIGNALLING'
gs = {'HRAS','CASP8','TP53'}
p53_t = mut.features.ix['TP53'].combine_first(mut_new.ix['TP53'])
combo_t = combine(p53_t>0, del_3p<0).ix[ti(hpv_inferred == False)].dropna()
combo_t.name = 'two_hit'
df = mut.df.ix[gs].combine_first(mut_new.ix[gs])
#df = mut.df.ix[gs]
df = df.append(combo_t=='both').append(del_3p<0)
pathway_plot((df > 0).ix[:, ti(hpv_inferred == False)], bar=None);
pathway_plot((df > 0).ix[:, ti(combo_t == 'both')]);
c = combine(mut.df.ix['MUC5B']>0, mut_new.ix['MUC5B'] > 0)
survival_and_stats(c.ix[ti(combo=='both')].dropna() =='neither', surv)