# Before anything, run:
%%bash
export PATH=/tigress/BEE/gtex/test/bin/bin/:/tigress/BEE/gtex/tools/bedtools2/bin/:$PATH
export LD_LIBRARY_PATH=/tigress/BEE/gtex/test/bin/lib/zlib/lib/:/tigress/BEE/gtex/test/bin/lib/gslib/lib/
Since setting up eQTLBMA takes a bit of extra work, I decided to separate out the notebook. Now that we have the PEER-corrected expression matrices ready for our analysis, we can perform our analysis in the following fashion:
%%bash
# Set up environment
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/trans
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_specific
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/trans/featurecounts/
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/trans/featurecounts/tissue_specific
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/trans/featurecounts/tissue_together
%%bash
python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/format_gene_list.py \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_positions/ chr_index.txt 200
%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/format_gene_list.py
##################################################
# format_gene_list.py
# Format the gene list and slice them into pieces for eQTLBMA
#
##################################################
# eqtlbma_home_dir = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/'
# gene_orig_dir = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_positions/'
# geno_orig_dir = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype/'
# chr_index_file = 'chr_index.txt'
# slice_size = 200
eqtlbma_home_dir = argv[1]
# create gene lists
gene_orig_dir = argv[2]
gene_dest_dir = eqtlbma_home_dir + 'input_files/gene_list/'
misc_dest_dir = eqtlbma_home_dir + 'input_files/misc/'
# Indicates the indeces of job at which each chromosome start
chr_index_file = argv[3]
# sizes (of number of genes) to slice each chr into.
slice_size = int(argv[4])
index = 1
f_chr_out = open(misc_dest_dir + chr_index_file, 'w')
for i in range(22):
chr_dict = []
f_chr_out.write('chr' + str(i+1) + '\t' + str(index) + '\n')
f = open(gene_orig_dir + 'gene_positions_Chr' + str(i+1) + '.txt')
for line in f.readlines():
entry = str.split(line.strip(), '\t')
gene_id = str.split(entry[0], '.')[0]
chr_dict.append([gene_id, entry[1], entry[2], entry[3]])
f.close()
num_split = len(chr_dict)/slice_size+1
num_increment = len(chr_dict)/num_split+1
for j in range(num_split):
f_out = open(gene_dest_dir + 'gene_coords_' + "{:0>3d}".format(index) + '.bed', 'w')
for item in chr_dict[j*num_increment:(j+1)*num_increment]:
f_out.write('chr' + item[1] + '\t' + item[2] + '\t' + item[3] + '\t' + item[0] + '\n')
f_out.close()
index = index+1
f_chr_out.write('END' + '\t' + str(index-1))
f_chr_out.close()
Overwriting /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/format_gene_list.py
%%bash
python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/format_snp_list.py \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/SNP_positions/ chr_index.txt 100000
%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/format_snp_list.py
##################################################
# format_snp_list.py
# Format the snp list and slice them into pieces for eQTLBMA
#
##################################################
# eqtlbma_home_dir = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/'
# snp_ref_dir = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/SNP_positions/'
# chr_index_file = 'chr_index.txt'
# window = '100000'
eqtlbma_home_dir = argv[1]
misc_dest_dir = eqtlbma_home_dir + 'input_files/misc/'
snp_ref_dir = argv[2]
snp_dest_dir = eqtlbma_home_dir + 'input_files/snp_list/'
chr_index_file = argv[3]
window = argv[4]
# reformat snps into BED format
for i in range(22):
print i
f = open(snp_ref_dir + 'SNP_positions_Chr' + str(i+1) + '.txt')
f_out = open(snp_ref_dir + 'SNP_positions_Chr' + str(i+1) + '.bed', 'w')
f.readline()
for line in f.readlines():
entry = str.split(line.strip(), '\t')
f_out.write(entry[1] + '\t' + entry[2] + '\t' + str(int(entry[2])+1) + '\t' + entry[0] + '\n')
f.close()
f_out.close()
# Reconstruct chromosome indices
f = open(misc_dest_dir + chr_index_file)
chr_index = []
for line in f.readlines():
entry = str.split(line.strip(), '\t')
chr_index.append([entry[0], entry[1]])
f.close()
last = int(chr_index[-1][1]) # 278 in our case
cur_chr = 0
f_script_out = open(snp_dest_dir + 'snp_collection_script.sh', 'w')
for i in range(last):
if i+1 == int(chr_index[cur_chr][1]) and i+1 != last:
cur_chr = cur_chr+1
# Use bedtools to capture a window of specific size
bashCommand = r"/tigress/BEE/gtex/tools/bedtools2/bin/bedtools window -w %s -a %sgene_coords_%s.bed -b %sSNP_positions_Chr%s.bed | cut -f8 | sort -V | uniq | gzip > %ssnp_coords_%s.txt.gz"%(window, gene_dest_dir, "{:0>3d}".format(i+1), snp_ref_dir, str(cur_chr), snp_dest_dir, "{:0>3d}".format(i+1))
f_script_out.write(bashCommand + '\n')
f_script_out.close()
Overwriting /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/format_snp_list.py
%%bash
python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/format_exp_list.py \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/ \
featurecounts _genes_certain_biotypes_counts_normalized.txt \
/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_optimal_PEER_k.txt chr_index.txt
%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/format_exp_list.py
##################################################
# format_snp_list.py
# Format the expression matrices and lists for eQTLBMA
#
##################################################
# eqtlbma_home_dir = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/'
# method = 'featurecounts'
# in_suffix = '_genes_certain_biotypes_counts_normalized.txt'
# exp_src_dir = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/' + method + '/'
# PEER_k_file = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_optimal_PEER_k.txt'
# chr_index_file = 'chr_index.txt'
# Make expression files
eqtlbma_home_dir = argv[1]
method = argv[2]
in_suffix = argv[3]
exp_src_dir = argv[4] + method + '/'
PEER_k_file = argv[5]
exp_dest_dir = eqtlbma_home_dir + 'input_files/expression_matrices/'
exp_list_dir = eqtlbma_home_dir + 'input_files/expfile_list/'
misc_dest_dir = eqtlbma_home_dir + 'input_files/misc/'
chr_index_file = argv[6]
# Which expression matrix to take?
f = open(PEER_k_file)
# header
f.readline()
tissue_peer = {}
for line in f.readlines():
entry = str.split(line.strip(), '\t')
tissue_peer[entry[0]] = entry[-1]
f.close()
# Take the given exp. mat., create the actual expression matrices that are sliced for eQTLBMA input.
for tissue in tissue_peer:
print tissue
tissue_exp_dict = {}
expr_src_file = exp_src_dir + 'TPM_GC_3GenPC_' + tissue_peer[tissue] + 'PEER1000/' + tissue + '_' + method + in_suffix
f_in = open(expr_src_file)
header = f_in.readline()
for line in f_in.readlines():
entry = str.split(line.strip(), '\t')
gene_id = str.split(entry[0], '.')[0]
tissue_exp_dict[gene_id] = entry[1:]
f_in.close()
print "Done reading in expfile."
for i in range(last):
if i%10 == 0:
print i
f_gene_list = open(gene_dest_dir + 'gene_coords_' + "{:0>3d}".format(i+1) + '.bed')
f_exp_file = open(exp_dest_dir + tissue + '_' + "{:0>3d}".format(i+1) + in_suffix, 'w')
f_exp_file.write(header)
for line in f_gene_list.readlines():
entry = str.split(line.strip(), '\t')
gene_id = entry[-1]
if gene_id in tissue_exp_dict:
f_exp_file.write(gene_id + '\t' + '\t'.join(tissue_exp_dict[gene_id]) + '\n')
f_exp_file.close()
f_gene_list.close()
# Reconstruct chromosome indices
f = open(misc_dest_dir + chr_index_file)
chr_index = []
for line in f.readlines():
entry = str.split(line.strip(), '\t')
chr_index.append([entry[0], entry[1]])
f.close()
last = int(chr_index[-1][1]) # 278 in our case
# Write the exp file lists
for i in range(last):
f_exp_list = open(exp_list_dir + 'list_expfiles_' + "{:0>3d}".format(i+1) + '.txt', 'w')
for tissue in tissue_peer:
f_exp_list.write(tissue + '\t' + exp_dest_dir + tissue + '_' + "{:0>3d}".format(i+1) + in_suffix + '\n')
f_exp_list.close()
Overwriting /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/format_exp_list.py
After running these scripts, we have the necessary input files under eqtlbma_home_dir + input_files/:
The list of seeds can be generated, like so:
%%bash
# Create list of seeds, No changes here:
nbSeeds=$(ls gene_list/* | wc -l); echo "set.seed(1859); x <- sample.int(n=1000000, size=${nbSeeds}); write(x, gzfile(\"list_seeds.txt.gz\"),1)" | R --vanilla --quiet
The grid files look different for original and extended runs that account for more tissue heterogeneity:
%%bash
echo "Original Grid"
cat /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/input_files/misc/grid_phi2_oma2_general.txt
echo "Extended Grid"
cat /tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/input_files/misc/grid_phi2_oma2_general_extended_grid.txt
Original Grid 0.0025 0.0075 0 0.01 0.01 0.03 0 0.04 0.04 0.12 0 0.16 0.16 0.48 0 0.64 0.64 1.92 0 2.56 Extended Grid 0.0075 0.0025 0.005 0.005 0.0025 0.0075 0 0.01 0.03 0.01 0.02 0.02 0.01 0.03 0 0.04 0.12 0.04 0.08 0.08 0.04 0.12 0 0.16 0.48 0.16 0.32 0.32 0.16 0.48 0 0.64 1.92 0.64 1.28 1.28 0.64 1.92 0 2.56
We have everything that we need. Now let's actually generate the eQTLBMA runs and submit them to the queue:
%%bash
python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/prepare_eQTLBMA_jobs.py \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/input_files/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/input_files/geno_location_list/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_optimal_PEER_k.txt \
0 0 chr_index.txt 100000
python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/prepare_eQTLBMA_jobs.py \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/input_files/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype_permute/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/input_files/geno_location_list_permute/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_optimal_PEER_k.txt \
1 0 chr_index.txt 100000
python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/prepare_eQTLBMA_jobs.py \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis_extended/featurecounts/tissue_together/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/input_files/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/input_files/geno_location_list/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_optimal_PEER_k.txt \
0 1 chr_index.txt 100000
python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/prepare_eQTLBMA_jobs.py \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis_extended/featurecounts/tissue_together/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/input_files/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype_permute/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/input_files/geno_location_list_permute/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_optimal_PEER_k.txt \
1 1 chr_index.txt 100000
# temp
python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/prepare_eQTLBMA_jobs.py \
/tigress/bj5/eQTLBMA/cis_extend_grid/featurecounts/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/input_files/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype_permute/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/eQTLBMA/cis/featurecounts/tissue_together/input_files/geno_location_list_permute/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_optimal_PEER_k.txt \
1 1 chr_index.txt 100000
%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/prepare_eQTLBMA_jobs.py
##################################################
# prepare_eQTLBMA_jobs.py
# Prepare the eQTLBMA jobs to submit to queue
#
##################################################
from sys import argv
# Since we are Make genotype location list
eqtlbma_home_dir = argv[1]
input_dir = argv[2]
gene_orig_dir = argv[3]
geno_location_dir = argv[4]
PEER_k_file = argv[5]
f = open(PEER_k_file)
# header
f.readline()
tissue_peer = {}
for line in f.readlines():
entry = str.split(line.strip(), '\t')
tissue_peer[entry[0]] = entry[-1]
f.close()
for i in range(22):
f = open(geno_location_dir + 'list_genotypesChr' + str(i+1) + '.txt', 'w')
for tissue in tissue_peer:
f.write(tissue + '\t' + gene_orig_dir + 'genotypesChr' + str(i+1) + '_Imput.txt' + '\n')
f.close()
permute = bool(int(argv[6]))
extended = bool(int(argv[7]))
joblog_dir = eqtlbma_home_dir + "joblogs/" if not permute else eqtlbma_home_dir + "joblogs_permute/"
script_dir = eqtlbma_home_dir + "run_scripts/" if not permute else eqtlbma_home_dir + "run_scripts_permute/"
misc_dest_dir = input_dir + 'misc/'
chr_index_file = argv[8]
window = argv[9]
general_grid_file = misc_dest_dir + 'grid_phi2_oma2_general.txt' if not extended else misc_dest_dir + 'grid_phi2_oma2_general_extended_grid.txt'
config_grid_file = misc_dest_dir + 'grid_phi2_oma2_with-configs.txt' if not extended else misc_dest_dir + 'grid_phi2_oma2_with-configs_extended_grid.txt'
master_script = script_dir + 'eQTLBMA_parallel_master.sh' if not permute else script_dir + 'eQTLBMA_parallel_permute_master.sh'
master_handle=open(master_script, 'w')
master_handle.write("#!/bin/bash\n\n")
# Reconstruct chromosome indices
f = open(misc_dest_dir + chr_index_file)
chr_index = []
for line in f.readlines():
entry = str.split(line.strip(), '\t')
chr_index.append([entry[0], entry[1]])
f.close()
last = int(chr_index[-1][1]) # 278 in our case
cur_chr = 0
for i in range(last):
if i+1 == int(chr_index[cur_chr][1]) and i+1 != last:
cur_chr = cur_chr+1
index = "{:0>3d}".format(i+1)
sbatchfile = script_dir + 'eQTLBMA_parallel_' + index + ".slurm" if not permute else script_dir + 'eQTLBMA_parallel_permute_' + index + ".slurm"
sbatchhandle=open(sbatchfile, 'w')
cmd=r"""#!/usr/bin/env bash
#SBATCH -J %s_eQTLBMA # job name
#SBATCH --mem=8000 # 8 GB requested
#SBATCH -t 24:00:00
#SBATCH -e %s # err output directory
#SBATCH -o %s # out output directory
eqtlbma_bf_parallel.bash \
--p2b /tigress/BEE/gtex/test/bin/bin/eqtlbma_bf \
--geneD %sgene_list \
--snpD %ssnp_list \
--seedF %smisc/list_seeds.txt.gz \
--task %s \
--geno %slist_genotypesChr%s.txt \
--scoord /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/SNP_positions/SNP_positions_Chr%s.bed \
--exp %sexpfile_list/list_expfiles_%s.txt \
--out %sout_eqtlbma \
--outss \
--analys join \
--gridL %s \
--gridS %s \
--anchor TSS+TES \
--cis %s \
--bfs sin \
--error hybrid \
"""%(index, joblog_dir+'err/eQTLBMA_'+index+'.err', joblog_dir+'out/eQTLBMA_'+index+'.out', input_dir, input_dir, input_dir, index, geno_location_dir, str(cur_chr), str(cur_chr), input_dir, index, joblog_dir, general_grid_file, config_grid_file, window)
sbatchhandle.write(cmd)
sbatchhandle.close()
master_handle.write("sbatch " + sbatchfile + " \n")
master_handle.close()
print 'sh %s'%(master_script)
Overwriting /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/eQTLBMA/prepare_eQTLBMA_jobs.py
Let's collect cis-eQTLs by doing FDR control using R q-value package, since we have the observed statistics and null distribution.
# In R, run:
source("http://bioconductor.org/biocLite.R")
biocLite("qvalue")