# Before anything, run: %%bash export PATH=/tigress/BEE/gtex/test/bin/bin/:/tigress/BEE/gtex/tools/bedtools2/bin/:$PATH export LD_LIBRARY_PATH=/tigress/BEE/gtex/test/bin/lib/zlib/lib/:/tigress/BEE/gtex/test/bin/lib/gslib/lib/ %%bash # Set up environment mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/trans mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/ mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_specific mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/trans/featurecounts/ mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/trans/featurecounts/tissue_specific mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/trans/featurecounts/tissue_together %%bash python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/format_gene_list.py \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_positions/ chr_index.txt 200 %%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/format_gene_list.py ################################################## # format_gene_list.py # Format the gene list and slice them into pieces for eQTLBMA # ################################################## # eqtlbma_home_dir = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/' # gene_orig_dir = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_positions/' # geno_orig_dir = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype/' # chr_index_file = 'chr_index.txt' # slice_size = 200 eqtlbma_home_dir = argv[1] # create gene lists gene_orig_dir = argv[2] gene_dest_dir = eqtlbma_home_dir + 'input_files/gene_list/' misc_dest_dir = eqtlbma_home_dir + 'input_files/misc/' # Indicates the indeces of job at which each chromosome start chr_index_file = argv[3] # sizes (of number of genes) to slice each chr into. slice_size = int(argv[4]) index = 1 f_chr_out = open(misc_dest_dir + chr_index_file, 'w') for i in range(22): chr_dict = [] f_chr_out.write('chr' + str(i+1) + '\t' + str(index) + '\n') f = open(gene_orig_dir + 'gene_positions_Chr' + str(i+1) + '.txt') for line in f.readlines(): entry = str.split(line.strip(), '\t') gene_id = str.split(entry[0], '.')[0] chr_dict.append([gene_id, entry[1], entry[2], entry[3]]) f.close() num_split = len(chr_dict)/slice_size+1 num_increment = len(chr_dict)/num_split+1 for j in range(num_split): f_out = open(gene_dest_dir + 'gene_coords_' + "{:0>3d}".format(index) + '.bed', 'w') for item in chr_dict[j*num_increment:(j+1)*num_increment]: f_out.write('chr' + item[1] + '\t' + item[2] + '\t' + item[3] + '\t' + item[0] + '\n') f_out.close() index = index+1 f_chr_out.write('END' + '\t' + str(index-1)) f_chr_out.close() %%bash python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/format_snp_list.py \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/SNP_positions/ chr_index.txt 100000 %%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/format_snp_list.py ################################################## # format_snp_list.py # Format the snp list and slice them into pieces for eQTLBMA # ################################################## # eqtlbma_home_dir = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/' # snp_ref_dir = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/SNP_positions/' # chr_index_file = 'chr_index.txt' # window = '100000' eqtlbma_home_dir = argv[1] misc_dest_dir = eqtlbma_home_dir + 'input_files/misc/' snp_ref_dir = argv[2] snp_dest_dir = eqtlbma_home_dir + 'input_files/snp_list/' chr_index_file = argv[3] window = argv[4] # reformat snps into BED format for i in range(22): print i f = open(snp_ref_dir + 'SNP_positions_Chr' + str(i+1) + '.txt') f_out = open(snp_ref_dir + 'SNP_positions_Chr' + str(i+1) + '.bed', 'w') f.readline() for line in f.readlines(): entry = str.split(line.strip(), '\t') f_out.write(entry[1] + '\t' + entry[2] + '\t' + str(int(entry[2])+1) + '\t' + entry[0] + '\n') f.close() f_out.close() # Reconstruct chromosome indices f = open(misc_dest_dir + chr_index_file) chr_index = [] for line in f.readlines(): entry = str.split(line.strip(), '\t') chr_index.append([entry[0], entry[1]]) f.close() last = int(chr_index[-1][1]) # 278 in our case cur_chr = 0 f_script_out = open(snp_dest_dir + 'snp_collection_script.sh', 'w') for i in range(last): if i+1 == int(chr_index[cur_chr][1]) and i+1 != last: cur_chr = cur_chr+1 # Use bedtools to capture a window of specific size bashCommand = r"/tigress/BEE/gtex/tools/bedtools2/bin/bedtools window -w %s -a %sgene_coords_%s.bed -b %sSNP_positions_Chr%s.bed | cut -f8 | sort -V | uniq | gzip > %ssnp_coords_%s.txt.gz"%(window, gene_dest_dir, "{:0>3d}".format(i+1), snp_ref_dir, str(cur_chr), snp_dest_dir, "{:0>3d}".format(i+1)) f_script_out.write(bashCommand + '\n') f_script_out.close() %%bash python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/format_exp_list.py \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/ \ featurecounts _genes_certain_biotypes_counts_normalized.txt \ /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_optimal_PEER_k.txt chr_index.txt %%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/format_exp_list.py ################################################## # format_snp_list.py # Format the expression matrices and lists for eQTLBMA # ################################################## # eqtlbma_home_dir = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/' # method = 'featurecounts' # in_suffix = '_genes_certain_biotypes_counts_normalized.txt' # exp_src_dir = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/' + method + '/' # PEER_k_file = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_optimal_PEER_k.txt' # chr_index_file = 'chr_index.txt' # Make expression files eqtlbma_home_dir = argv[1] method = argv[2] in_suffix = argv[3] exp_src_dir = argv[4] + method + '/' PEER_k_file = argv[5] exp_dest_dir = eqtlbma_home_dir + 'input_files/expression_matrices/' exp_list_dir = eqtlbma_home_dir + 'input_files/expfile_list/' misc_dest_dir = eqtlbma_home_dir + 'input_files/misc/' chr_index_file = argv[6] # Which expression matrix to take? f = open(PEER_k_file) # header f.readline() tissue_peer = {} for line in f.readlines(): entry = str.split(line.strip(), '\t') tissue_peer[entry[0]] = entry[-1] f.close() # Take the given exp. mat., create the actual expression matrices that are sliced for eQTLBMA input. for tissue in tissue_peer: print tissue tissue_exp_dict = {} expr_src_file = exp_src_dir + 'TPM_GC_3GenPC_' + tissue_peer[tissue] + 'PEER1000/' + tissue + '_' + method + in_suffix f_in = open(expr_src_file) header = f_in.readline() for line in f_in.readlines(): entry = str.split(line.strip(), '\t') gene_id = str.split(entry[0], '.')[0] tissue_exp_dict[gene_id] = entry[1:] f_in.close() print "Done reading in expfile." for i in range(last): if i%10 == 0: print i f_gene_list = open(gene_dest_dir + 'gene_coords_' + "{:0>3d}".format(i+1) + '.bed') f_exp_file = open(exp_dest_dir + tissue + '_' + "{:0>3d}".format(i+1) + in_suffix, 'w') f_exp_file.write(header) for line in f_gene_list.readlines(): entry = str.split(line.strip(), '\t') gene_id = entry[-1] if gene_id in tissue_exp_dict: f_exp_file.write(gene_id + '\t' + '\t'.join(tissue_exp_dict[gene_id]) + '\n') f_exp_file.close() f_gene_list.close() # Reconstruct chromosome indices f = open(misc_dest_dir + chr_index_file) chr_index = [] for line in f.readlines(): entry = str.split(line.strip(), '\t') chr_index.append([entry[0], entry[1]]) f.close() last = int(chr_index[-1][1]) # 278 in our case # Write the exp file lists for i in range(last): f_exp_list = open(exp_list_dir + 'list_expfiles_' + "{:0>3d}".format(i+1) + '.txt', 'w') for tissue in tissue_peer: f_exp_list.write(tissue + '\t' + exp_dest_dir + tissue + '_' + "{:0>3d}".format(i+1) + in_suffix + '\n') f_exp_list.close() %%bash # Create list of seeds, No changes here: nbSeeds=$(ls gene_list/* | wc -l); echo "set.seed(1859); x <- sample.int(n=1000000, size=${nbSeeds}); write(x, gzfile(\"list_seeds.txt.gz\"),1)" | R --vanilla --quiet %%bash echo "Original Grid" cat /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/input_files/misc/grid_phi2_oma2_general.txt echo "Extended Grid" cat /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/input_files/misc/grid_phi2_oma2_general_extended_grid.txt %%bash python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/prepare_eQTLBMA_jobs.py \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/input_files/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/input_files/geno_location_list/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_optimal_PEER_k.txt \ 0 0 chr_index.txt 100000 python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/prepare_eQTLBMA_jobs.py \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/input_files/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype_permute/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/input_files/geno_location_list_permute/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_optimal_PEER_k.txt \ 1 0 chr_index.txt 100000 python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/prepare_eQTLBMA_jobs.py \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis_extended/featurecounts/tissue_together/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/input_files/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/input_files/geno_location_list/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_optimal_PEER_k.txt \ 0 1 chr_index.txt 100000 python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/prepare_eQTLBMA_jobs.py \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis_extended/featurecounts/tissue_together/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/input_files/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype_permute/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/input_files/geno_location_list_permute/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_optimal_PEER_k.txt \ 1 1 chr_index.txt 100000 # temp python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/prepare_eQTLBMA_jobs.py \ /tigress/bj5/eQTLBMA/cis_extend_grid/featurecounts/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/input_files/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype_permute/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/input_files/geno_location_list_permute/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_optimal_PEER_k.txt \ 1 1 chr_index.txt 100000 %%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/prepare_eQTLBMA_jobs.py ################################################## # prepare_eQTLBMA_jobs.py # Prepare the eQTLBMA jobs to submit to queue # ################################################## from sys import argv # Since we are Make genotype location list eqtlbma_home_dir = argv[1] input_dir = argv[2] gene_orig_dir = argv[3] geno_location_dir = argv[4] PEER_k_file = argv[5] f = open(PEER_k_file) # header f.readline() tissue_peer = {} for line in f.readlines(): entry = str.split(line.strip(), '\t') tissue_peer[entry[0]] = entry[-1] f.close() for i in range(22): f = open(geno_location_dir + 'list_genotypesChr' + str(i+1) + '.txt', 'w') for tissue in tissue_peer: f.write(tissue + '\t' + gene_orig_dir + 'genotypesChr' + str(i+1) + '_Imput.txt' + '\n') f.close() permute = bool(int(argv[6])) extended = bool(int(argv[7])) joblog_dir = eqtlbma_home_dir + "joblogs/" if not permute else eqtlbma_home_dir + "joblogs_permute/" script_dir = eqtlbma_home_dir + "run_scripts/" if not permute else eqtlbma_home_dir + "run_scripts_permute/" misc_dest_dir = input_dir + 'misc/' chr_index_file = argv[8] window = argv[9] general_grid_file = misc_dest_dir + 'grid_phi2_oma2_general.txt' if not extended else misc_dest_dir + 'grid_phi2_oma2_general_extended_grid.txt' config_grid_file = misc_dest_dir + 'grid_phi2_oma2_with-configs.txt' if not extended else misc_dest_dir + 'grid_phi2_oma2_with-configs_extended_grid.txt' master_script = script_dir + 'eQTLBMA_parallel_master.sh' if not permute else script_dir + 'eQTLBMA_parallel_permute_master.sh' master_handle=open(master_script, 'w') master_handle.write("#!/bin/bash\n\n") # Reconstruct chromosome indices f = open(misc_dest_dir + chr_index_file) chr_index = [] for line in f.readlines(): entry = str.split(line.strip(), '\t') chr_index.append([entry[0], entry[1]]) f.close() last = int(chr_index[-1][1]) # 278 in our case cur_chr = 0 for i in range(last): if i+1 == int(chr_index[cur_chr][1]) and i+1 != last: cur_chr = cur_chr+1 index = "{:0>3d}".format(i+1) sbatchfile = script_dir + 'eQTLBMA_parallel_' + index + ".slurm" if not permute else script_dir + 'eQTLBMA_parallel_permute_' + index + ".slurm" sbatchhandle=open(sbatchfile, 'w') cmd=r"""#!/usr/bin/env bash #SBATCH -J %s_eQTLBMA # job name #SBATCH --mem=8000 # 8 GB requested #SBATCH -t 24:00:00 #SBATCH -e %s # err output directory #SBATCH -o %s # out output directory eqtlbma_bf_parallel.bash \ --p2b /tigress/BEE/gtex/test/bin/bin/eqtlbma_bf \ --geneD %sgene_list \ --snpD %ssnp_list \ --seedF %smisc/list_seeds.txt.gz \ --task %s \ --geno %slist_genotypesChr%s.txt \ --scoord /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/SNP_positions/SNP_positions_Chr%s.bed \ --exp %sexpfile_list/list_expfiles_%s.txt \ --out %sout_eqtlbma \ --outss \ --analys join \ --gridL %s \ --gridS %s \ --anchor TSS+TES \ --cis %s \ --bfs sin \ --error hybrid \ """%(index, joblog_dir+'err/eQTLBMA_'+index+'.err', joblog_dir+'out/eQTLBMA_'+index+'.out', input_dir, input_dir, input_dir, index, geno_location_dir, str(cur_chr), str(cur_chr), input_dir, index, joblog_dir, general_grid_file, config_grid_file, window) sbatchhandle.write(cmd) sbatchhandle.close() master_handle.write("sbatch " + sbatchfile + " \n") master_handle.close() print 'sh %s'%(master_script) # In R, run: source("http://bioconductor.org/biocLite.R") biocLite("qvalue")