%pylab inline import pandas as pd import os as os cghub_manifest = 'https://cghub.ucsc.edu/reports/SUMMARY_STATS/2014-02-24T12:00:01-0800_data_manifest.tsv' CGHUB = 'https://cghub.ucsc.edu/cghub/data/analysis/download' DBSNP = '/home/moores/projects/pipeline/files/variation/dbsnp_138.vcf' COSMIC = '/home/moores/projects/pipeline/files/variation/cosmic-v67_20131024-GRCh37.vcf' REFERENCE = '/home/moores/projects/pipeline/files/reference/GRCh37.fa' KEY = '/home/moores/cghub.key' MUTECT_JAR = '/usr/local/share/java/mutect/muTect-1.1.5.jar' SID_JAR = '/home/moores/projects/pipeline/progs/GenomeAnalysisTK-2.2-2/GenomeAnalysisTK.jar' CACHE = '/home/moores/cache' NUM_PROCESSES = 16 VM_DIRECTORY = '/home/moores/projects/recall' LOCAL_DIRECTORY = '/cellar/users/agross/scripts' mutect = 'java -Xmx32g -jar {}'.format(MUTECT_JAR) mutect_args = {'analysis_type': 'MuTect', 'reference_sequence': REFERENCE, 'dbsnp' : DBSNP, 'cosmic': COSMIC, } somaticIndelDetector = 'java -Xmx32g -jar {}'.format(SID_JAR) indel_args = {'analysis_type': 'SomaticIndelDetector', 'reference_sequence': REFERENCE, } gt_fuse = 'gtfuse -c {} --inactivity-timeout=2'.format(KEY) regions = pd.read_csv('../Extra_Data/hgnc_regions.txt') regions = regions.dropna().set_index('HGNC symbol') regions = regions.sort('Chromosome Name').groupby(level=0).first() def get_region(g): ''' Given a gene, return the chromosomal in a format that GATK likes. ''' r = regions.ix[g] return '{}:{}-{}'.format(r['Chromosome Name'], r['Gene Start (bp)'] - 200, r['Gene End (bp)'] + 200) def build_call(pat, genes, normal_code='NB', tumor_code='TP', directory='', out=''): ''' genes is a list of genes: ['HRAS','TP53'] directory is like: /home/moores/projects/recall out is a folder to output the data. One caveat to remember with the argslists is that the normal input file needs to be fed in before the tumor input file. ''' pat_ = pat.replace('-','_') normal_id = tcga.ix[pat, normal_code]['analysis_id'] normal_ext = tcga.ix[pat, normal_code]['filename'] tumor_id = tcga.ix[pat, tumor_code]['analysis_id'] tumor_ext = tcga.ix[pat, tumor_code]['filename'] normal_folder = '{}/normal_{}'.format(directory, pat_) init_normal = 'mkdir {}'.format(normal_folder) mount_normal = '{} {}/{} {}'.format(gt_fuse, CGHUB, normal_id, normal_folder) tumor_folder = '{}/tumor_{}'.format(directory, pat_) init_tumor = 'mkdir {}'.format(tumor_folder) mount_tumor = '{} {}/{} {}'.format(gt_fuse, CGHUB, tumor_id, tumor_folder) normal_f = '{}/{}/{}'.format(normal_folder, normal_id, normal_ext) tumor_f = '{}/{}/{}'.format(tumor_folder, tumor_id, tumor_ext) '''Call Mutect''' mutect_args['out'] = '{}/{}/{}_call_stats.txt'.format(directory, out, pat_) mutect_args['performanceLog'] = '{}/{}/{}_mutect_log.txt'.format(directory, out, pat_) mutect_args['coverage_file'] = '{}/{}/{}_overage.wig.txt'.format(directory, out, pat_) arglist = [''] + [k + ' ' + v for k,v in mutect_args.iteritems()] arglist += ['input_file:normal ' + normal_f, 'input_file:tumor ' + tumor_f] arglist += ['intervals ' + get_region(g) for g in genes] mutect_call = mutect + ' --'.join(arglist) '''Call SomaticIndelLocator''' indel_args['out'] = '{}/{}/{}_indels.txt'.format(directory, out, pat_) indel_args['performanceLog'] = '{}/{}/{}_indel_log.txt'.format(directory, out, pat_) arglist = [''] + [k + ' ' + v for k,v in indel_args.iteritems()] arglist += ['input_file:normal ' + normal_f, 'input_file:tumor ' + tumor_f] arglist += ['intervals ' + get_region(g) for g in genes] indel_call = somaticIndelDetector + ' --'.join(arglist) unmount_normal = 'fusermount -u {}'.format(normal_folder) clear_normal_cache = 'rm -rf {}/{}*'.format(CACHE, normal_id) clean_normal = 'rm -rf {}'.format(normal_folder) unmount_tumor = 'fusermount -u {}'.format(tumor_folder) clear_tumor_cache = 'rm -rf {}/{}*'.format(CACHE, tumor_id) clean_tumor = 'rm -rf {}'.format(tumor_folder) r = ' ; '.join([init_normal, mount_normal, init_tumor, mount_tumor, mutect_call, indel_call, unmount_normal, clear_tumor_cache, clean_normal, unmount_tumor, clear_tumor_cache, clean_tumor]) return r def generate_scripts(patients, genes, out): ''' Parses calls out to one script for each process you want to run. Then builds a driver script to run them all in parallel. Saves the files to the output directory. The threads are names thread_n.sh and the driver is named driver.sh. ''' script_dir = '{}/{}'.format(LOCAL_DIRECTORY, out) if not os.path.isdir(script_dir): os.makedirs(script_dir) for i in range(NUM_PROCESSES): calls = ' ; '.join([build_call(pat, genes, directory=VM_DIRECTORY, out=out) for pat in patients[i::NUM_PROCESSES]]) f = open('{}/thread_{}.sh'.format(script_dir, i), 'wb') f.write(calls) f.close() super_script = 'mkdir {}/{}\n'.format(VM_DIRECTORY, out) threads = ['bash thread_{}.sh &'.format(i) for i in range(NUM_PROCESSES)] super_script += '\n'.join(threads) f = open('{}/driver.sh'.format(script_dir), 'wb') f.write(super_script) f.close() tcga = pd.read_table(cghub_manifest, low_memory=False) tcga = tcga[tcga.study == 'TCGA'] tcga = tcga[tcga.library_type =='WXS'] tcga = tcga.sort('uploaded') tcga['patient'] = tcga['barcode'].map(lambda s: s[:12]) tcga = tcga[tcga.state == 'Live'] tcga = tcga.set_index(['patient', 'sample_type']) tcga = tcga.groupby(level=[0,1]).last() pats = set(tcga.index.get_level_values(0)) tumor = tcga.xs('TP', level='sample_type') tumor = set(tumor.index.get_level_values(0)) normal = tcga.xs('NB', level='sample_type') normal = set(normal.index.get_level_values(0)) normal_tissue = tcga.xs('NT', level='sample_type') normal_tissue = set(normal_tissue.index.get_level_values(0)) metastatic = tcga.xs('TM', level='sample_type') metastatic = set(metastatic.index.get_level_values(0)) blood = tcga.xs('TB', level='sample_type') blood = set(blood.index.get_level_values(0)) tn_blood = [i for i in pats if i in tumor and i in normal] tn_tissue = [i for i in pats if i in tumor and i in normal_tissue] met_norm = [i for i in pats if i in normal and i in metastatic] blood_tumor = [i for i in pats if i in normal_tissue and i in blood] len(pats), len(tn_blood), len(tn_tissue), len(met_norm), len(blood_tumor) generate_scripts(tn_blood, ['TP53'], 'p53_all2') hnsc = tcga[tcga.disease == 'HNSC'] hnsc = hnsc.ix[tn_blood] hnsc_pts = hnsc.index.get_level_values(0).unique() hnsc_pts.shape sos_signaling = ['GRB2', 'HRAS', 'IRS1', 'IRS2', 'KRAS', 'MAP2K1', 'MAP2K2', 'MAPK1', 'MAPK3', 'NRAS', 'RAF1', 'SOS1', 'YWHAB'] genes = sos_signaling + ['TP53', 'MUC5B', 'CASP8', 'ASPM', 'EGFR'] generate_scripts(hnsc_pts, genes, 'HNSCC_validation')