%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/upload_all_gists.sh
#!/bin/bash
# upload_all_gists.sh
gist /home/bj5/notebooks/eQTL_cis.ipynb -u 0dd6bd0cc0252667a1dc
gist /home/bj5/notebooks/misc_meta_analysis.ipynb -u 646278c49a10b6307bd9
gist /home/bj5/notebooks/cis_pipeline_gallery.ipynb -u 633dddf85ad9dc54c07b
gist /home/bj5/notebooks/GEUVADIS_pipeline.ipynb -u 243bf73f1a75dbcfc549
gist /home/bj5/notebooks/eQTLBMA_pipeline.ipynb -u 250135a17a004b014434
Overwriting /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/upload_all_gists.sh
Let's place useful files under resources in cis-analysis pipeline:
%%bash
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/PEER
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL
cd /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references
# Gencode 19 gene annotation
ln -s /tigress/BEE/gtex/dbGaP-7716/45696/gtex/exchange/GTEx_phs000424/exchange/analysis_releases/GTEx_Analysis_2015-01-12/reference_files/gencode.v19.genes.patched_contigs.gtf.gz ./
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/summaries
cd /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/summaries
cp /tigress/BEE/gtex/results/group_general/checkpoints/STAR_1pass_hg19_summary_report ./
cp /tigress/BEE/gtex/results/group_general/checkpoints/STAR_2pass_hg19_featurecounts_summary_report ./
cp /tigress/BEE/gtex/results/group_general/checkpoints/STAR_2pass_hg19_rsem_summary_report ./
cp /tigress/BEE/gtex/results/group_general/checkpoints/trim_summary_report ./
Convert the gencode annotation into a file of genes with corresponding names and positions. In particular, we will use the gene list that Ian made for non-overlapping genes, with and without certain biotypes.
%%bash
cd /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references
cp /tigress/BEE/gtex/external_sources/hg19/gencode.v19.nonoverlap.genes_for_quantification_certain_and_uncertain.txt ./
ln -s /tigress/BEE/gtex/external_sources/hg19/gencode.v19.annotation.nonoverlap.gtf ./
The information about annotated genes are taken from gencode (v.19) and are saved in :
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt
%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_gene_names_positions.py
##################################################
# obtain_gene_names_positions.py
#
#
##################################################
import gzip
anno_file = gzip.open('/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gencode.v19.genes.patched_contigs.gtf.gz')
meta_file = open('/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt', 'w')
meta_file.write('\t'.join(['gene_id','chr','start','end','gene_name','gene_type']))
for i in range(5):
anno_file.readline()
for line in anno_file.readlines():
annotation = str.split(line, '\t')
if annotation[2] == "transcript":
chr = annotation[0]
start = annotation[3]
end = annotation[4]
desc = str.split(annotation[8], ";")
gene_id = str.strip(str.split(desc[0], " ")[1], '"')
gene_name = str.strip(str.split(desc[4], " ")[2], '"')
gene_type = str.strip(str.split(desc[2], " ")[2], '"')
meta_file.write('\n')
meta_file.write('\t'.join([gene_id,chr,start,end,gene_name,gene_type]))
anno_file.close()
meta_file.close()
Overwriting /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_gene_names_positions.py
%%bash
python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_gene_names_positions.py
cd /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references
# Deposit each chromosome-specific genes to a separate file:
for i in $(seq 1 22); \
do awk -v chr="$i" 'BEGIN{OFS=FS="\t"} $2==chr {print $1,$2,$3,$4}' gene_names_positions.txt > /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_positions/gene_positions_Chr$i.txt; \
echo $i; \
done
Let's also obtain genotypes, both imputed and unimputed, in a format that can be read in by various eQTL detection packages. First, we can obtain the genotypes without imputation from:
/tigress/BEE/gtex/data/genotype/imputed_genotypes/allelic_dosage/
And the files are named:
GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_allchr_genot_imput_info04_maf05_HWEp1E6_dbSNPIDs.chr${i}.allelic_dosage.txt
And are saved as :
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype/genotypesChr${i}_Noimput.txt
%%bash
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype
cd /tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype
for i in $(seq 1 22); \
do head -n 1 /tigress/BEE/gtex/data/genotype/imputed_genotypes/allelic_dosage/GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_allchr_genot_imput_info04_maf05_HWEp1E6_dbSNPIDs.chr${i}.allelic_dosage.txt > genotypesChr${i}.txt; \
echo $i; \
done
python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype/modify_header.py
# After the genotype files have been given the correct header:
for i in $(seq 1 22); \
do grep -v "^SNP" /tigress/BEE/gtex/data/genotype/imputed_genotypes/allelic_dosage/GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_allchr_genot_imput_info04_maf05_HWEp1E6_dbSNPIDs.chr${i}.allelic_dosage.txt >> genotypesChr${i}.txt; \
echo $i; \
# Remove snps that are "."
cat genotypesChr${i}.txt | grep -ve '^\.' > genotypesChr${i}_Noimput.txt; \
done
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
mkdir: cannot create directory `/tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype': File exists
%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/modify_header.py
##################################################
# modify_header.py
# Modifies the header from the genotype file
#
##################################################
for i in range(22):
f = open('/tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype/genotypesChr' + str(i+1) + '.txt')
listHead = str.split(f.readline().strip(), "\t")
f.close()
# There are 450 individuals for phase 1:
for j in range(len(listHead)):
if "-" in listHead[j]:
listHead[j] = '.'.join(str.split(listHead[j], "-")[0:2])
# Overwrite with new header
f = open('/tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype/genotypesChr' + str(i+1) + '.txt', 'w')
f.write('\t'.join(listHead) + '\n')
f.close()
Writing /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/modify_header.py
We can also obtain imputed genotypes, from the directory:
/tigress/BEE/gtex/data/genotype/imputed_genotypes/vcf
in vcf format:
GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_allchr_genot_imput_info04_maf05_HWEp1E6_dbSNPIDs.chr${i}.vcf.gz
And are saved as:
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype/genotypesChr${i}_Imput.txt
%%bash
python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_imputed_genotype_wrapper.py
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_imputed_genotype_wrapper.sh Number of jobs: 23
%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_imputed_genotype_wrapper.py
##################################################
# obtain_imputed_genotype_wrapper.py
#
#
##################################################
vcf_dir = '/tigress/BEE/gtex/data/genotype/imputed_genotypes/vcf/'
output_dir = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype/'
master_script = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_imputed_genotype_wrapper.sh"
master_handle=open(master_script, 'w')
master_handle.write("#!/bin/bash\n\n")
k = 0
for i in range(22):
k+=1
chr_num = str(i+1)
sbatchfile = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/batch/obtain_imputed_genotype_Chr" + chr_num + ".slurm"
sbatchhandle=open(sbatchfile, 'w')
cmd=r"""#!/bin/bash
#SBATCH -J geno_%s_obtain # job name
#SBATCH --mem=12000 # 12 GB requested
#SBATCH -t 00:59:00 # to be placed in the test queue
#SBATCH -D %s # set working directory
/usr/lib/python2.7/bin/python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_imputed_genotype.py \
%s %s %s
cat genotypesChr%s_temp.txt | grep -ve '^\.' > genotypesChr%s_Imput.txt
rm genotypesChr%s_temp.txt
"""%(i, output_dir, vcf_dir, output_dir, chr_num, chr_num, chr_num, chr_num)
sbatchhandle.write(cmd)
sbatchhandle.close()
master_handle.write("sbatch " + sbatchfile + " \n")
master_handle.close()
print 'sh %s'%(master_script)
print 'Number of jobs:', k+1
Overwriting /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_imputed_genotype_wrapper.py
%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_imputed_genotype.py
##################################################
# obtain_imputed_genotype.py
# Obtain the imputed genotype and keep them in decimals.
#
##################################################
from sys import argv
import gzip
vcf_dir = argv[1]
output_dir = argv[2]
chr_num = argv[3]
# Take the imputed genotypes directly from vcf files
f = gzip.open(vcf_dir + "GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_allchr_genot_imput_info04_maf05_HWEp1E6_dbSNPIDs.chr" + chr_num + ".vcf.gz")
line = f.readline()
out_f = open(output_dir + "genotypesChr" + chr_num + "_temp.txt", 'w')
while True:
line = f.readline()
if line[0:2] != '##':
break
# We went through the annotations, now the header line:
listHead = str.split(line.strip(), "\t")
# First 9 entries are other annotations
for j in range(9, len(listHead)):
listHead[j] = '.'.join(str.split(listHead[j], "-")[0:2])
out_f.write('SNP' + '\t' + '\t'.join(listHead[9:len(listHead)]))
count = 0
for line in f.readlines():
count = count+1
genotype_list = []
listGen = str.split(line.strip(), "\t")
# Add in rs SNP ID
genotype_list.append(listGen[2])
for k in range(9, len(listGen)):
imput = str.split(listGen[k], ":")
# Confidence under 90% - take the decimal form
if imput[0] == './.':
genotype_list.append(imput[2])
# Confidence over 90% - take 0, 1 or 2
else:
genotype_list.append(str(int(listGen[k][0]) + int(listGen[k][2])))
if count%10000 == 0:
print count
out_f.write('\n' + '\t'.join(genotype_list))
f.close()
out_f.close()
Overwriting /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_imputed_genotype.py
We also need, for the cis-analysis, the locations of the SNPS. For v19, we will use the dbSNP 142 for hg-19, which is given as UCSC snp142.txt.gz at:
/tigress/BEE/gtex/data/genotype/auxiliary/snp142.txt.gz
Which is a very large (2.6G gzipped) collection of the position and the properties of the known snps.
We can extract the needed information in the following fashion:
%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_snp_positions.slurm
#!/bin/bash
#SBATCH -J snp_get # job name
#SBATCH --mem=12000 # 12 GB requested
#SBATCH -t 00:59:00 # to be placed in the test queue
/usr/lib/python2.7/bin/python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_snp_positions.py \
/tigress/BEE/gtex/data/genotype/auxiliary/snp142.txt.gz \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/SNP_positions/
for i in $(seq 1 22); \
do /usr/lib/python2.7/bin/python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/select_snps.py \
$i /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/SNP_positions/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype/; \
echo $i; \
done
Writing /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_snp_positions.slurm
%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_snp_positions.py
##################################################
# obtain_snp_positions.py
# Obtain the positions of the known snps from UCSC snp142 (hg-19)
#
##################################################
from sys import argv
import gzip
dbsnp = argv[1]
snp_pos_dir = argv[2]
# Assumes that the chromosomes appear sequentially
current_chr = 1
prev = 'chr1'
snp_file = gzip.open(dbsnp)
output_file = open(snp_pos_dir + 'SNP_positions_Chr' + str(current_chr) + '_temp.txt', 'w')
print "writing: SNP_positions_Chr" + str(current_chr) + '_temp.txt'
output_file.write('rsID' + '\t' + 'chr' + '\t' + 'pos')
count = 0
while True:
line = snp_file.readline()
if line == '':
break
count += 1
entry = str.split(line.strip(), '\t')
# Take care of alternative scaffolds - skip them for now
if len(str.split(entry[1], '_')) > 1:
continue
if entry[1] != prev:
output_file.close()
print "Finished: SNP_positions_Chr" + str(current_chr) + '_temp.txt'
# Autosomes only
current_chr = int(entry[1][3:])
output_file = open(snp_pos_dir + 'SNP_positions_Chr' + str(current_chr) + '_temp.txt', 'w')
print "writing: SNP_positions_Chr" + str(current_chr) + '_temp.txt'
output_file.write('rsID' + '\t' + 'chr' + '\t' + 'pos')
output_file.write('\n' + '\t'.join([entry[4], entry[1], entry[2]]))
prev = entry[1]
if count%1000000 == 0:
print count
output_file.close()
"Finished: SNP_positions_Chr" + str(current_chr) + '_temp.txt'
Overwriting /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_snp_positions.py
%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/select_snps.py
##################################################
# select_snps.py
# Obtain only the snps that are in the genotype files
#
##################################################
from sys import argv
chr_num = argv[1]
snp_pos_dir = argv[2]
geno_file_dir = argv[3]
# construct a dictionary of all snps in the genotype file
snp_map = {}
geno_file = open(geno_file_dir + 'genotypesChr' + chr_num + '_Noimput.txt')
line = geno_file.readline()
for line in geno_file.readlines():
snp = str.split(line.strip(), '\t')[0]
snp_map[snp] = True
geno_file.close()
# Only take the snps that are in the genotype file
f_in = open(snp_pos_dir + 'SNP_positions_Chr' + chr_num + '_temp.txt')
f_out = open(snp_pos_dir + 'SNP_positions_Chr' + chr_num + '.txt', 'w')
f_out.write('rsID' + '\t' + 'chr' + '\t' + 'pos' + '\n')
for line in f_in.readlines():
if str.split(line.strip(), '\t')[0] in snp_map:
f_out.write(line)
f_in.close()
f_out.close()
Overwriting /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/select_snps.py
As a result, we have the gene expression files, genotype files, gene position files and snp position files all ready to be plugged into the eQTL pipeline.
%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/tissue_rankings_by_samples.py
##################################################
#
# tissue_rankings_by_samples.py
#
##################################################
import glob
in_path = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_3GenPC_10PEER1000/'
mat_suffix = '_featurecounts_genes_certain_biotypes_counts_normalized.txt'
matrices = glob.glob(in_path + '*' + mat_suffix)
tissue_dict = {}
for file in matrices:
f_handle = open(file)
line = f_handle.readline().strip()
samples = str.split(line, '\t')
tissue_name = str.split(str.split(file, mat_suffix)[0], in_path)[1]
tissue_dict[tissue_name] = len(samples)
f_handle.close()
import operator
sorted_tissues = sorted(tissue_dict.items(), key=operator.itemgetter(1), reverse=True)
out_file = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_number_of_samples.txt'
f_handle = open(out_file, 'w')
for (key, value) in sorted_tissues:
f_handle.write(key + '\t' + str(value) + '\n')
f_handle.close()
Overwriting /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/tissue_rankings_by_samples.py
Running the following script outputs the tissues sorted by the available number of samples in /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_number_of_samples.txt
%%bash
/usr/lib/python2.7/bin/python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/tissue_rankings_by_samples.py
head -n 10 /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_number_of_samples.txt
Muscle-Skeletal 361 WholeBlood 337 Skin-SunExposed_Lowerleg_ 302 Adipose-Subcutaneous 297 Artery-Tibial 285 Thyroid 278 Lung 278 Cells-Transformedfibroblasts 272 Nerve-Tibial 256 Esophagus-Mucosa 241
/home/bj5/notebooks