This is the master notebook for the cis-pipeline. There are separate notebooks for:
misc. and meta analyses: http://nbviewer.ipython.org/gist/bjo/646278c49a10b6307bd9 eQTLBMA part of the pipeline: http://nbviewer.ipython.org/gist/bjo/250135a17a004b014434 Collection of relevant plots: http://nbviewer.ipython.org/gist/bjo/633dddf85ad9dc54c07b
The notebook is organized in a following fashion:
Now that the tissue-level quantification has been finished for GTEx Phase 1, we can move on to the cis-eQTL pipeline. First, we can take the TPM-converted matrices (view Ian's consortium notebook for more details) and control for the GC content, which is one of the standard practices in eQTL studies, and an example is shown in this work: http://www.nature.com/nature/journal/v464/n7289/full/nature08872.html
For now, we'll use the 'certain biotypes', and use FeatureCounts TPM data as an example, and we'll only take expression from individuals for which we have known genotypes. The correction of other matrices will be conducted later in the notebook. The TPM-converted matrices are located at:
/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/expression_matrices/featurecounts/TPM_with_genotypes
First, we need to calculate GC content for each transcripts, and we'll use the ENCODE v.19 annotations, as they were also used for the quantification. (This portion is very similar to the normalization step from Ian's notebook on RNA-seq mapping)
%%bash
cd /tigress/BEE/gtex/external_sources/hg19/
wget ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_19/gencode.v19.pc_transcripts.fa.gz
wget ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_19/gencode.v19.lncRNA_transcripts.fa.gz
Find the longest transcripts per gene
%%bash
cd /tigress/BEE/gtex/external_sources/hg19/
gzip -cd gencode.v19.lncRNA_transcripts.fa.gz
gzip -cd gencode.v19.pc_transcripts.fa.gz
mkdir /tigress/BEE/gtex/data/auxiliary_bj5
mkdir /tigress/BEE/gtex/data/auxiliary_bj5/normalization
python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/isolate_longest_sequences_in_transcriptomes.py \
/tigress/BEE/gtex/external_sources/hg19/gencode.v19.pc_transcripts.fa \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.longest.fa
python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/isolate_longest_sequences_in_transcriptomes.py \
/tigress/BEE/gtex/external_sources/hg19/gencode.v19.lncRNA_transcripts.fa \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.longest.fa
Compute sequence lengths
%%bash
# Transcripts
python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/compute_sequence_length.py \
/tigress/BEE/gtex/external_sources/hg19/gencode.v19.pc_transcripts.fa \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.seq_length.txt
python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/compute_sequence_length.py \
/tigress/BEE/gtex/external_sources/hg19/gencode.v19.lncRNA_transcripts.fa \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.seq_length.txt
cat /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.seq_length.txt \
> /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.transcripts.seq_length.txt
tail -n+2 /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.seq_length.txt \
>> /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.transcripts.seq_length.txt
# Genes
python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/compute_sequence_length.py \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.longest.fa \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_genes.seq_length.txt
python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/compute_sequence_length.py \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.longest.fa \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_genes.seq_length.txt
cat /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_genes.seq_length.txt \
> /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.genes.seq_length.txt
tail -n+2 /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_genes.seq_length.txt \
>> /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.genes.seq_length.txt
Compute GC content
%%bash
# Transcripts
python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/compute_GC_content.py \
/tigress/BEE/gtex/external_sources/hg19/gencode.v19.pc_transcripts.fa \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.gc_content.txt
python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/compute_GC_content.py \
/tigress/BEE/gtex/external_sources/hg19/gencode.v19.lncRNA_transcripts.fa \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.gc_content.txt
cat /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.gc_content.txt \
> /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.transcripts.gc_content.txt
tail -n+2 /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.gc_content.txt \
>> /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.transcripts.gc_content.txt
# Genes
python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/compute_GC_content.py \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.longest.fa \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_genes.gc_content.txt
python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/compute_GC_content.py \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.longest.fa \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_genes.gc_content.txt
cat /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_genes.gc_content.txt \
> /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.genes.gc_content.txt
tail -n+2 /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_genes.gc_content.txt \
>> /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.genes.gc_content.txt
Let's set up the jobs for GC-correction. Before that, we need to point the R library to the correction location:
%%bash
echo 'export R_LIBS=/tigress/BEE/tools/R_libs' >> ~/.bashrc
source ~/.bashrc
Note: This may throw an error depending on the situation of the R library and the R version. Another alternative that works is to install the required packages manually and point the R library to the user's folder, namely:
# In R:
source("http://bioconductor.org/biocLite.R")
biocLite("EDASeq")
%%bash
mkdir /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/
mkdir /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/
mkdir /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_with_genotypes/
python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/normalize_only_GC_wrapper.py \
/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/expression_matrices/featurecounts/TPM_with_genotypes/ _certain_biotypes_TPM.txt \
/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_with_genotypes/
sh /tigress/BEE/gtex/tmp/low_level_scripts/normalize_only_GC_wrapper.sh
%%writefile /tigress/BEE/gtex/analyses/group_general/top_level_scripts/normalize_only_GC_wrapper.py
##################################################
# normalize_only_GC.py
#
#
##################################################
import glob
from sys import argv
import os.path
in_path = argv[1]
in_suffix = argv[2]
in_path = in_path + '*' + in_suffix
out_path = argv[3]
# in_path = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/expression_matrices/featurecounts/TPM_with_genotypes/*_certain_biotypes_TPM.txt'
# out_path = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_with_genotypes/'
matrices = glob.glob(in_path + '*')
master_script = "/tigress/BEE/gtex/tmp/low_level_scripts/normalize_only_GC_wrapper.sh"
master_handle=open(master_script, 'w')
master_handle.write("#!/bin/bash\n\n")
seq_lengths, gc_content = {}, {}
seq_lengths['pc_genes'] = '/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_genes.seq_length.txt'
seq_lengths['pc_transcripts'] = '/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.seq_length.txt'
seq_lengths['lncRNA_genes'] = '/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_genes.seq_length.txt'
seq_lengths['lncRNA_transcripts'] = '/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.seq_length.txt'
gc_content['pc_genes'] = '/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_genes.gc_content.txt'
gc_content['pc_transcripts'] = '/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.gc_content.txt'
gc_content['lncRNA_genes'] = '/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_genes.gc_content.txt'
gc_content['lncRNA_transcripts'] = '/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.gc_content.txt'
k = 0
for i, matrix in enumerate(matrices):
out_matrix = out_path + matrix.split('/')[-1].replace('_TPM.txt','_TPM_GC.txt')
print out_matrix
if os.path.isfile(out_matrix):
continue
k+=1
genes_or_transcripts = ['genes' if 'genes' in matrix else 'transcripts'][0]
sbatchfile = "/tigress/BEE/gtex/tmp/low_level_scripts/norm_GC_" + str(i) + ".slurm"
sbatchhandle=open(sbatchfile, 'w')
cmd=r"""#!/bin/bash
#SBATCH -J norm_%s # job name
#SBATCH --mem=12000 # 12 GB requested
#SBATCH -t 01:00:00 # to be placed in the test queue
/usr/bin/Rscript /tigress/BEE/gtex/analyses/group_general/top_level_scripts/normalize_only_GC.R \
%s %s %s %s %s %s
"""%(i, matrix, seq_lengths['pc_'+genes_or_transcripts], seq_lengths['lncRNA_'+genes_or_transcripts], \
gc_content['pc_'+genes_or_transcripts], gc_content['lncRNA_'+genes_or_transcripts], out_matrix)
sbatchhandle.write(cmd)
sbatchhandle.close()
master_handle.write("sbatch " + sbatchfile + " \n")
master_handle.close()
print 'sh %s'%(master_script)
print 'Number of jobs:', k+1
Overwriting /tigress/BEE/gtex/analyses/group_general/top_level_scripts/normalize_only_GC_wrapper.py
%%writefile /tigress/BEE/gtex/analyses/group_general/top_level_scripts/normalize_only_GC.R
##################################################
# normalize_only_GC.R
#
# Use EDASeq R package to normalize gene expression within-lane by GC-content
#
# See:
# Risso, Davide, et al. "GC-content normalization for RNA-Seq data." BMC bioinformatics 12.1 (2011): 480.
##################################################
#!/usr/bin/Rscript
## Collect arguments
args <-commandArgs(TRUE)
## Default setting when no arguments passed
if (length(args) != 6 ){
args <- c("--help")
}
## Help section
if("--help" %in% args) {
cat("
normalize_only_GC.R
Usage:
./normalize_only_GC.R /path/to/expression_matrix.txt /path/to/seq_length_protein_coding.txt \
/path/to/seq_length_lncRNA.txt /path/to/GC_content_protein_coding.txt \
/path/to/GC_content_lncRNA.txt /path/to/normalized_expression_matrix.txt\n")
q(save="no")
}
library(EDASeq)
expr = args[1]
seq_length_pc = args[2]
seq_length_lncRNA = args[3]
GC_content_pc = args[4]
GC_content_lncRNA = args[5]
norm_expr_out = args[6]
# expr = "/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/expression_matrices/featurecounts/TPM_with_genotypes/Brain-Hypothalamus_featurecounts_genes_certain_biotypes_TPM.txt"
# seq_length_pc = "/tigress/BEE/gtex/data/auxiliary/normalization/gencode.v19.pc_genes.seq_length.txt"
# seq_length_lncRNA = "/tigress/BEE/gtex/data/auxiliary/normalization/gencode.v19.lncRNA_genes.seq_length.txt"
# GC_content_pc = "/tigress/BEE/gtex/data/auxiliary/normalization/gencode.v19.pc_genes.gc_content.txt"
# GC_content_lncRNA = "/tigress/BEE/gtex/data/auxiliary/normalization/gencode.v19.lncRNA_genes.gc_content.txt"
# norm_expr_out = "/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_with_genotypes/Brain-Hypothalamus_featurecounts_genes_certain_biotypes_TPM_GC.txt"
# import data
expr = read.table(file=expr, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names=1)
seq_length_pc = read.table(file=seq_length_pc, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names=1)
seq_length_lncRNA = read.table(file=seq_length_lncRNA, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names=1)
GC_content_pc = read.table(file=GC_content_pc, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names=1)
GC_content_lncRNA = read.table(file=GC_content_lncRNA, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names=1)
# turn covariates into named vectors to be used by normalization functions
df_to_named_vector <- function(df, col){
tmp <- as.numeric(df[,col])
names(tmp) <- row.names(df)
return(tmp)
}
seq_length_pc <- df_to_named_vector(seq_length_pc, 'length')
seq_length_lncRNA <- df_to_named_vector(seq_length_lncRNA, 'length')
GC_content_pc <- df_to_named_vector(GC_content_pc, 'gccontent')
GC_content_lncRNA <- df_to_named_vector(GC_content_lncRNA, 'gccontent')
seqs_lncRNA <- intersect(rownames(expr), names(seq_length_lncRNA))
seqs_lncRNA <- intersect(seqs_lncRNA, names(GC_content_lncRNA))
expr_lncRNA <- as.matrix(expr[seqs_lncRNA, , drop=FALSE ])
seqs_pc <- intersect(rownames(expr), names(seq_length_pc))
seqs_pc <- intersect(seqs_pc, names(GC_content_pc))
expr_pc <- as.matrix(expr[seqs_pc, , drop=FALSE ])
# reduce expression matrices to only those genes/transcripts that are expressed (non-zero) in
# at least 10% of samples for this particular tissue
expr_lncRNA = as.matrix(expr_lncRNA[apply(expr_lncRNA, 1, function(x) length(x[x != 0])/length(x) >= 0.10),])
expr_pc = as.matrix(expr_pc[apply(expr_pc, 1, function(x) length(x[x != 0])/length(x) >= 0.10),])
# create eSet objects
data_lncRNA <- newSeqExpressionSet(expr_lncRNA, featureData=AnnotatedDataFrame(data.frame(gc=GC_content_lncRNA[row.names(expr_lncRNA)])), round=FALSE)
data_pc <- newSeqExpressionSet(expr_pc, featureData=AnnotatedDataFrame(data.frame(gc=GC_content_pc[row.names(expr_pc)])), round=FALSE)
# Perform within-lane normalizations separately for lincRNA and mRNA because we would expect
# different distributions
# Within-lane normalization for GC-content
norm_lncRNA <- normCounts(withinLaneNormalization(data_lncRNA, "gc", which="full", offset=FALSE, round=FALSE))
norm_pc <- normCounts(withinLaneNormalization(data_pc, "gc", which="full", offset=FALSE, round=FALSE))
# bind the pc and lncRNA dataframes together
norm <- rbind(norm_pc,norm_lncRNA)
write.table(norm, file=norm_expr_out, quote=FALSE, sep='\t', row.names=T, col.names=NA)
Overwriting /tigress/BEE/gtex/analyses/group_general/top_level_scripts/normalize_only_GC.R
First, let us set up the directory for our cis mapping pipeline:
%%bash
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/LFA
Once we have obtained the matrices, we can begin the process of explicitly modeling hidden covaraites through PEER:
http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3398141/
We first need to install PEER:
# In R:
source("http://bioconductor.org/biocLite.R")
biocLite("peer")
In the pipeline, we will take the top 3 genotype PCs given by the GTEx consortium. The genotype PCs have been calculated by the GTEx consortium:
%%bash
cp GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_PostImput_20genotPCs.txt \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/
# In R, run the following:
genotype_PCs = read.table('GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_PostImput_20genotPCs.txt', sep = '\t', stringsAsFactors=FALSE, header=TRUE)
rownames(genotype_PCs) = sapply(c(1:dim(genotype_PCs)[1]), function(x) {paste(strsplit(genotype_PCs$FID[x], split = "-")[[1]][1], strsplit(genotype_PCs$FID[x], split = "-")[[1]][2], sep = ".")})
genotype_PCs_top3 = genotype_PCs[,c("C1","C2","C3")]
write.table(genotype_PCs_top3, file = "GTEx_Genotype_PCs_top3.txt", sep = '\t')
load("subSample1MLFA1.RData")
row.names(subSample1MLFA) = sapply(row.names(subSample1MLFA), function(x) {paste("GTEX.", x, sep = "")})
genotype_PCs_top3_LFA = subSample1MLFA[,c(1:3)]
colnames(genotype_PCs_top3_LFA) = c("C1","C2","C3")
write.table(genotype_PCs_top3_LFA, file = "LFA_Genotype_PCs_top3.txt", sep = "\t")
We can alternatively take take the top three genotype PCs that have been generated by the known genotypes and using LFA: http://arxiv.org/abs/1312.2041 and https://github.com/StoreyLab/lfa
# In R:
install.packages("devtools")
library("devtools")
install_github("Storeylab/lfa")
%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/LFA/lfaWrapper.sh
#!/usr/bin/Rscript
# sbatch -t 24:00:00 -N 1 --ntasks-per-node=1 --mail-type=begin --mail-type=end --mail-user=bj5@princeton.edu --mem=125000 ./lfaWrapper.sh
library('lfa')
library('Matrix')
library('R.utils')
NUM_SUBJECTS = 450
NUM_CHRS = 22
# Take in subject names
tempRowNames = read.table('/tigress/BEE/gtex/data/genotype/imputed_genotypes/allelic_dosage/GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_allchr_genot_imput_info04_maf05_HWEp1E6_dbSNPIDs.chr22.allelic_dosage.txt', header = TRUE)
indNames = colnames(tempRowNames)
indNames = indNames[2:NUM_SUBJECTS+1]
remove(tempRowNames)
indNames = substr(indNames,6,10)
# Tidy the subject IDs
for(i in seq(1,length(indNames))){
if(substr(indNames[i],5,5)=='.'){
indNames[i] = substr(indNames[i],1,4)
}
}
numRow = 0
for(i in seq(1,NUM_CHRS)){
numRow = numRow + countLines(paste("/tigress/BEE/gtex/data/genotype/imputed_genotypes/plink/GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_allchr_genot_imput_info04_maf05_HWEp1E6_dbSNPIDs.chr",i,".bim",sep=''))
}
print(numRow)
genotypeHolder = matrix(nrow=numRow,ncol=NUM_SUBJECTS)
genoHolderIndex = 1
for(i in seq(1,NUM_CHRS)){
dataLength = countLines(paste('/tigress/BEE/gtex/data/genotype/imputed_genotypes/plink/GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_allchr_genot_imput_info04_maf05_HWEp1E6_dbSNPIDs.chr',i,'.bim',sep=""))
temp = read.bed(paste('/tigress/BEE/gtex/data/genotype/imputed_genotypes/plink/GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_allchr_genot_imput_info04_maf05_HWEp1E6_dbSNPIDs.chr',i,sep=""))
genotypeHolder[genoHolderIndex:(genoHolderIndex + dataLength - 1),] = temp
genoHolderIndex = genoHolderIndex + dataLength
remove(temp)
}
print(object.size(genotypeHolder))
# Sample 1M snps to form genotype PCs
for(i in seq(1,10)){
indices = sample(1:numRow,1000000,replace = FALSE)
genotypeSample = genotypeHolder[indices,]
subSample1MLFA = lfa(genotypeSample,d=51)
rownames(subSample1MLFA) = indNames
save(subSample1MLFA, file=paste('subSample1MLFA',i,'.RData',sep=''))
}
Overwriting /tigress/BEE/gtex/analyses/user_specific/cis-mapping/LFA/lfaWrapper.sh
Running the following code results in files subSample1MLFA1.RData through subSample1MLFA10.RData, and all ten iterations correlate well with each other in for the first 3 PCs.
Now let's prepare the scripts that set up tissue-specific PEER factor estimation; we'll start with featurecounts and PEER factors from 10 to 75 in increments of 5.
%%bash
python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_peer_correction_chr11_wrapper.py \
/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/normalized_by_length_GC_and_depth_hg19/ \
_genes_certain_biotypes_counts_normalized.txt \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/GTEx_Genotype_PCs_top3.txt \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt \
featurecounts /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/PEER/ 10 15 20 25 30 35 40 45 50 55 60
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_10_cis_peer_correction_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_15_cis_peer_correction_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_20_cis_peer_correction_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_25_cis_peer_correction_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_30_cis_peer_correction_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_35_cis_peer_correction_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_40_cis_peer_correction_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_45_cis_peer_correction_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_50_cis_peer_correction_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_55_cis_peer_correction_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_60_cis_peer_correction_chr11_wrapper.sh
%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_peer_correction_chr11_wrapper.py
##################################################
# cis_peer_correction_chr11_wrapper.py
#
#
##################################################
import glob
from sys import argv
import os
in_path = argv[1]
in_suffix = argv[2]
genotype_pcs = argv[3]
gene_metadata = argv[4]
method = argv[5]
# in_path = in_path + method + '/TPM_with_genotypes/' + '*' + '_' + method + in_suffix
# out_path = argv[6] + "to be appended later"
joblog_dir = argv[7]
# Array of PEER factors that are given in the input
peer_factors = argv[8:len(argv)]
# Examples
# argv[1] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/expression_matrices/'
# or
# argv[1] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/normalized_by_length_GC_and_depth_hg19/'
# argv[2] = '_genes_certain_biotypes_TPM.txt'
# or
# argv[2] = '_genes_certain_biotypes_counts_normalized.txt'
# argv[3] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/GTEx_Genotype_PCs_top3.txt'
# argv[4] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt'
# argv[5] = 'featurecounts'
# argv[6] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/'
# argv[7] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/PEER'
# argv[8:len(argv)] = ['10','20','30','40','50']
# Make the job log directories
if not os.path.exists(joblog_dir):
os.makedirs(joblog_dir)
if not os.path.exists(joblog_dir + 'err'):
os.makedirs(joblog_dir + 'err')
if not os.path.exists(joblog_dir + 'out'):
os.makedirs(joblog_dir + 'out')
matrices = glob.glob(in_path + '*' + in_suffix)
for peer_factor in peer_factors:
out_path = argv[6] + method + '/TPM_GC_3GenPC_' + peer_factor + 'PEER1000/'
if not os.path.exists(out_path):
os.makedirs(out_path)
master_script = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/" + method + "_" + peer_factor + "_cis_peer_correction_chr11_wrapper.sh"
master_handle=open(master_script, 'w')
master_handle.write("#!/bin/bash\n\n")
# Examples
# peer_factor = 10
# residual_matrix = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_3GenPC_10PEER1000/Adipose-Subcutaneous_featurecounts_genes_certain_biotypes_TPM.txt'
# tissue_name = 'Adipose-Subcutaneous'
factor_file_directory = out_path + 'PEER_factors'
if not os.path.exists(factor_file_directory):
os.makedirs(factor_file_directory)
k = 0
for i, matrix in enumerate(matrices):
filename = matrix.split('/')[-1]
tissue_name = str.split(filename, '_'+method)[0]
out_matrix = out_path + filename
# out_matrix = out_path + filename.replace('_TPM.txt','_TPM_GC.txt')
if os.path.isfile(out_matrix):
continue
k+=1
genes_or_transcripts = ['genes' if 'genes' in matrix else 'transcripts'][0]
outfilename = method+"_"+tissue_name+"_"+peer_factor
sbatchfile = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/batch/PEER/" + method + '_TPM_GC_3GenPC_' + peer_factor + 'PEER1000_' + tissue_name + ".slurm"
sbatchhandle=open(sbatchfile, 'w')
cmd=r"""#!/bin/bash
#SBATCH -J PEER_%s_%s # job name
#SBATCH --mem=6000 # 6 GB requested
#SBATCH -t 04:00:00
#SBATCH -e %s # err output directory
#SBATCH -o %s # out output directory
/usr/bin/Rscript /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_peer_correction_chr11.R \
%s %s %s %s %s %s %s
"""%(tissue_name, peer_factor, joblog_dir+'err/'+outfilename+'.err', joblog_dir+'out/'+outfilename+'.out', matrix, genotype_pcs, gene_metadata, peer_factor, out_matrix, tissue_name, factor_file_directory)
sbatchhandle.write(cmd)
sbatchhandle.close()
master_handle.write("sbatch " + sbatchfile + " \n")
master_handle.close()
print 'sh %s'%(master_script)
Overwriting /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_peer_correction_chr11_wrapper.py
%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_peer_correction_chr11.R
####################################################
# cis_peer_correction_chr11.R
# Exploratory: Testing with chr 11.
#
####################################################
#!/usr/bin/Rscript
## Collect arguments
args <-commandArgs(TRUE)
## Default setting when no arguments passed
if (length(args) != 7 ){
args <- c("--help")
}
## Help section
if("--help" %in% args) {
cat("
cis_peer_correction.R
Usage:
./cis_peer_correction.R /path/to/expression_matrix.txt /path/to/genotype_pcs.RData \
/path/to/gene_metadata/ num_of_peer_factors \
/path/to/normalized_expression_matrix.txt tissue_name /path/to/factor_files\n")
q(save="no")
}
expr_matrix = args[1]
genotype_pcs = args[2]
gene_metadata = args[3]
peer_factors = as.numeric(args[4])
residual_matrix = args[5]
tissue_name = args[6]
factor_file_directory = args[7]
# expr_matrix = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/expression_matrices/featurecounts/TPM_with_genotypes/Brain-Hypothalamus_featurecounts_genes_certain_biotypes_TPM.txt'
# expr_matrix = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/normalized_by_length_GC_and_depth_hg19/Brain-Hypothalamus_featurecounts_genes_certain_biotypes_counts_normalized.txt'
# genotype_pcs = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/GTEx_Genotype_PCs_top3.txt'
# gene_metadata = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt'
# peer_factors = '10'
# residual_matrix = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_3GenPC_10PEER1000/Brain-Hypothalamus_featurecounts_genes_certain_biotypes_TPM_GC.txt'
# tissue_name = 'Brain-Hypothalamus'
# factor_file_directory = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_3GenPC_10PEER1000/PEER_factors'
library(peer)
library(dplyr)
# Load in the genotype PCs (top 3 will be used for this pipeline), saved as samSample1MLFA
genotype_PCs_top3 = read.table(genotype_pcs, stringsAsFactors=FALSE, header=TRUE, sep='\t')
gene_metadata = read.table(gene_metadata, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names=1)
# Load in the expression matrices
# Also correct for covariates using PEER with K factors and N steps
expr_matrix = read.table(file=expr_matrix, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names=1)
# The samples included are already sorted for samples with genotypes.
# Data massaging: left_join is similar to a databaes left join, matching entries in first table to the second
expr_matrix$gene_id = row.names(expr_matrix)
gene_metadata$gene_id = row.names(gene_metadata)
joined_matrix = left_join(expr_matrix, gene_metadata, by="gene_id")
# Take only chr 11 for now.
joined_matrix = filter(joined_matrix, chr == 11)
joined_matrix = arrange(joined_matrix, start)
row.names(joined_matrix) = joined_matrix$gene_id
joined_matrix = select(joined_matrix, 1:(dim(joined_matrix)[2]-6))
# Optional: remove genes that are expressed in less than 10% of individuals.
joined_matrix = joined_matrix[sapply(1:dim(joined_matrix)[1], function(x) {(sum(joined_matrix[x,]==0)/dim(joined_matrix)[2])<0.1}),]
# PEER takes in an inverted matrix as input
inverted_matrix = as.data.frame(t(data.matrix(joined_matrix)))
genotype_PCs_top3_df = as.data.frame(genotype_PCs_top3[,1:3])
inverted_matrix$subject_id = row.names(inverted_matrix)
genotype_PCs_top3_df$subject_id = row.names(genotype_PCs_top3_df)
inverted_joined = left_join(inverted_matrix, genotype_PCs_top3_df, by="subject_id")
subject_list = inverted_joined$subject_id
# Top 3 PCs, can be changed for future tasks
genotype_top3 = as.matrix(inverted_joined[,(dim(inverted_joined)[2]-2):dim(inverted_joined)[2]])
inverted_joined = as.matrix(select(inverted_joined, c(1:(dim(inverted_joined)[2]-4))))
# Now set up PEER for detection of hidden covariates
model = PEER()
PEER_setPhenoMean(model,inverted_joined)
PEER_setCovariates(model, genotype_top3)
PEER_setNk(model,peer_factors)
PEER_update(model)
# Now that PEER has been set up, we can obtain the residuals and save the resulting matrix:
residuals = PEER_getResiduals(model)
colnames(residuals) = colnames(inverted_joined)
rownames(residuals) = subject_list
write.table(as.data.frame(t(data.matrix(residuals))), file=residual_matrix, quote = FALSE, sep = "\t")
# Also save the factors, weights, and alpha values:
write.table(PEER_getX(model), file=paste(factor_file_directory, '/', tissue_name, '_Factors.txt', sep=""), row.names=FALSE, col.names=FALSE)
write.table(PEER_getW(model), file=paste(factor_file_directory, '/', tissue_name, '_Loadings.txt', sep=""), row.names=FALSE, col.names=FALSE)
write.table(PEER_getAlpha(model), file=paste(factor_file_directory, '/', tissue_name, '_Alphas.txt', sep=""), row.names=FALSE, col.names=FALSE)
Overwriting /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_peer_correction_chr11.R
The result is that we have the expression matrices that are the residuals of K peer factors and 3 genotype PCs, saved under:
/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_3GenPC_10PEER1000
as well as the corresponding peer factors, loadings and alphas.
We can also repeat the procedure for htseq and rsem.
%%bash
python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_peer_correction_chr11_rerun_wrapper.py \
/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/normalized_by_length_GC_and_depth_hg19/ \
_genes_certain_biotypes_counts_normalized.txt \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/GTEx_Genotype_PCs_top3.txt \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt \
featurecounts /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/PEER/ 10 15 20 25 30 35 40 45 50 55 60
10 20 30 Missing: Vagina Missing: SmallIntestine-TerminalIleum Missing: Cells-EBV-transformedlymphocytes 40 Missing: Vagina Missing: Ovary Missing: SmallIntestine-TerminalIleum 50 Missing: Pituitary Missing: Brain-Hypothalamus Missing: Vagina Missing: Prostate Missing: Brain-CerebellarHemisphere Missing: Ovary Missing: Testis Missing: Cells-EBV-transformedlymphocytes Missing: Brain-Putamen_basalganglia_ Missing: Kidney-Cortex Missing: Colon-Transverse sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_cis_peer_correction_chr11_rerun_wrapper.sh Number of jobs: 17
%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_peer_correction_chr11_rerun_wrapper.py
##################################################
# cis_peer_correction_chr11_rerun_wrapper.py
# provide jobs that have failed with more time and memory
#
##################################################
import glob
from sys import argv
import os
in_path = argv[1]
in_suffix = argv[2]
genotype_pcs = argv[3]
gene_metadata = argv[4]
method = argv[5]
# in_path = in_path + method + '/TPM_with_genotypes/' + '*' + '_' + method + in_suffix
# out_path = argv[6] + "to be appended later"
joblog_dir = argv[7]
# Array of PEER factors that are given in the input
peer_factors = argv[8:len(argv)]
# Examples
# argv[1] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/expression_matrices/'
# or
# argv[1] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/normalized_by_length_GC_and_depth_hg19/'
# argv[2] = '_genes_certain_biotypes_TPM.txt'
# or
# argv[2] = '_genes_certain_biotypes_counts_normalized.txt'
# argv[3] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/GTEx_Genotype_PCs_top3.txt'
# argv[4] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt'
# argv[5] = 'featurecounts'
# argv[6] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/'
# argv[7] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/PEER'
# argv[8:len(argv)] = ['10','20','30','40','50']
# Create full tissue_list.
tissue_list = []
matrices = glob.glob(in_path + '*' + in_suffix)
for i, matrix in enumerate(matrices):
filename = matrix.split('/')[-1]
tissue_name = str.split(filename, '_'+method)[0]
tissue_list.append(tissue_name)
master_script = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/" + method + "_cis_peer_correction_chr11_rerun_wrapper.sh"
master_handle=open(master_script, 'w')
master_handle.write("#!/bin/bash\n\n")
k = 0
for peer_factor in peer_factors:
print(peer_factor)
out_path = argv[6] + method + '/TPM_GC_3GenPC_' + peer_factor + 'PEER1000/'
# Examples
# peer_factor = 10
# residual_matrix = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_3GenPC_10PEER1000/Adipose-Subcutaneous_featurecounts_genes_certain_biotypes_TPM.txt'
# tissue_name = 'Adipose-Subcutaneous'
factor_file_directory = out_path + 'PEER_factors'
for tissue in tissue_list:
out_matrix = out_path + tissue + "_" + method + in_suffix
# out_matrix = out_path + tissue + "_" + method + in_suffix.replace('_TPM.txt','_TPM_GC.txt')
if os.path.isfile(out_matrix):
continue
print("Missing: " + tissue)
genes_or_transcripts = ['genes' if 'genes' in matrix else 'transcripts'][0]
outfilename = method+"_"+tissue+"_"+peer_factor
sbatchfile = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/batch/PEER/" + method + '_TPM_GC_3GenPC_' + peer_factor + 'PEER1000_' + tissue + ".slurm"
sbatchhandle=open(sbatchfile, 'w')
cmd=r"""#!/bin/bash
#SBATCH -J PEER_%s_%s # job name
#SBATCH --mem=12000 # 12 GB requested
#SBATCH -t 08:00:00
#SBATCH -e %s # err output directory
#SBATCH -o %s # out output directory
/usr/bin/Rscript /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_peer_correction_chr11.R \
%s %s %s %s %s %s %s
"""%(tissue, peer_factor, joblog_dir+'err/'+outfilename+'.err', joblog_dir+'out/'+outfilename+'.out', matrix, genotype_pcs, gene_metadata, peer_factor, out_matrix, tissue, factor_file_directory)
sbatchhandle.write(cmd)
sbatchhandle.close()
master_handle.write("sbatch " + sbatchfile + " \n")
k+=1
master_handle.close()
print 'sh %s'%(master_script)
print 'Number of jobs:', k
Overwriting /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_peer_correction_chr11_rerun_wrapper.py
With the expression matrices, and other necessary inputs generated in the accompanying notebook misc_meta_analysis.ipynb, we can go ahead and perform univariate eQTL detection using MatrixEQTL:
http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/
%%bash
# Set up the MatrixEQTL env.
cd /tigress/BEE/gtex/analyses/user_specific/cis-mapping
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/TPM_GC_3GenPC_10PEER1000
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/TPM_GC_3GenPC_20PEER1000
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/TPM_GC_3GenPC_30PEER1000
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/TPM_GC_3GenPC_40PEER1000
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/TPM_GC_3GenPC_50PEER1000
mkdir: cannot create directory `/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL': File exists
%%bash
python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_wrapper.py \
/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ _genes_certain_biotypes_counts_normalized.txt \
featurecounts /tigress/BEE/gtex/analyses/user_specific/cis-mapping \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL/ 0 \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/analysis 10 15 20 25 30 35 40 45 50 55 60
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_10_Noimput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_15_Noimput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_20_Noimput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_25_Noimput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_30_Noimput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_35_Noimput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_40_Noimput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_45_Noimput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_50_Noimput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_55_Noimput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_60_Noimput_cis_matrix_eqtl_chr11_wrapper.sh
python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_wrapper.py \
/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ _genes_certain_biotypes_counts_normalized.txt \
featurecounts /tigress/BEE/gtex/analyses/user_specific/cis-mapping \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL/ 1 \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/analysis 10 15 20 25 30 35 40 45 50 55 60
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_10_Imput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_15_Imput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_20_Imput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_25_Imput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_30_Imput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_35_Imput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_40_Imput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_45_Imput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_50_Imput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_55_Imput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_60_Imput_cis_matrix_eqtl_chr11_wrapper.sh
%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_wrapper.py
##################################################
# cis_matrix_eqtl_wrapper.py
# Wrapper for the cis_matrix_eqtl.R file and submitting the batch jobs
#
##################################################
import glob
from sys import argv
import os.path
in_path = argv[1]
in_suffix = argv[2]
method = argv[3]
mapping_dir = argv[4]
joblog_dir = argv[6]
incl_imput = argv[7]
analysis_dir = argv[8]
# Array of PEER factors that are given in the input
peer_factors = argv[9:len(argv)]
# Examples
# argv[1] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/'
# argv[2] = '_genes_certain_biotypes_TPM_GC.txt'
# argv[3] = 'featurecounts'
# argv[4] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping'
# argv[5] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/'
# argv[6] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL/'
# argv[7] = '0' or '1'
# argv[8] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/analysis'
# argv[9:len(argv)] = ['10','20','30','40','50']
# Make the job log directories
if not os.path.exists(joblog_dir):
os.makedirs(joblog_dir)
if not os.path.exists(joblog_dir + 'err'):
os.makedirs(joblog_dir + 'err')
if not os.path.exists(joblog_dir + 'out'):
os.makedirs(joblog_dir + 'out')
if incl_imput == "0":
imput = "Noimput"
else:
imput = "Imput"
for peer_factor in peer_factors:
print(peer_factor)
in_path = argv[1] + method + '/TPM_GC_3GenPC_' + peer_factor + 'PEER1000/' + '*' + '_' + method + in_suffix
out_path = argv[5] + method + '/TPM_GC_3GenPC_' + peer_factor + 'PEER1000/'
if not os.path.exists(out_path):
os.makedirs(out_path)
matrices = glob.glob(in_path + '*')
print("Number of matrices: " + str(len(matrices)))
master_script = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/" + method + "_" + peer_factor + '_' + imput + "_cis_matrix_eqtl_chr11_wrapper.sh"
master_handle=open(master_script, 'w')
master_handle.write("#!/bin/bash\n\n")
# MatrixEQTL parameters
incl_trans = '0' # Don't include trans
chr_number = '11' # chromosome number
cis_dist = '1e5' # cis window
k = 0
for i, matrix in enumerate(matrices):
filename = matrix.split('/')[-1]
tissue_name = str.split(filename, '_'+method)[0]
out_file = out_path + filename.replace('.txt','_MatrixEQTL')
if os.path.isfile(out_file):
continue
k+=1
genes_or_transcripts = ['genes' if 'genes' in matrix else 'transcripts'][0]
sbatchfile = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/batch/MatrixEQTL/" + method + '_TPM_GC_3GenPC_' + peer_factor + '_MatrixEQTL_chr' + chr_number + '_' + tissue_name + '_' + imput + ".slurm"
outfilename = method+"_"+tissue_name+"_"+peer_factor
sbatchhandle=open(sbatchfile, 'w')
cmd=r"""#!/bin/bash
#SBATCH -J meqtl_%s_%s # job name
#SBATCH --mem=6000 # 6 GB requested
#SBATCH -t 04:00:00
#SBATCH -e %s # err output directory
#SBATCH -o %s # out output directory
/usr/bin/Rscript /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl.R \
%s %s %s %s %s %s %s %s %s %s
"""%(tissue_name, peer_factor, joblog_dir+'err/'+outfilename+'_'+imput+'.err', joblog_dir+'out/'+outfilename+'_'+imput+'.out', matrix, incl_trans, incl_imput, chr_number, cis_dist, peer_factor, mapping_dir, out_file, method, analysis_dir)
sbatchhandle.write(cmd)
sbatchhandle.close()
master_handle.write("sbatch " + sbatchfile + " \n")
master_handle.close()
print 'sh %s'%(master_script)
print 'Number of jobs:', k
Overwriting /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_wrapper.py
%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl.R
####################################################
# cis_matrix_eqtl.R
# Performs univariate cis-eQTL analysis using Matrix eQTL
#
####################################################
#!/usr/bin/Rscript
# Collect arguments
args <-commandArgs(TRUE)
## Default setting when no arguments passed
if (length(args) != 10 ){
args <- c("--help")
}
## Help section
if("--help" %in% args) {
cat("
cis_peer_correction.R
Usage:
./cis_matrix_eqtl.R /path/to/expression_matrix.txt include_trans_binary \
include_imputed_binary chromosome_number cis_disance_def peer_factor \
/path/to/mapping/directory /path/to/output/files method /path/to/analysis/dir\n")
q(save="no")
}
expression_file_location = args[1]
incl_trans = args[2]
incl_imput = args[3]
chr_number = args[4]
cis_dist = as.numeric(args[5])
peer_factor = args[6]
# Example
# args[1] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_3GenPC_45PEER1000/Adipose-Subcutaneous_featurecounts_genes_certain_biotypes_TPM_GC.txt'
# args[2] = '0'
# args[3] = '0'
# args[4] = '11'
# args[5] = '1e5'
# args[6] = '10'
imput = 'Imput'
if (incl_imput == '0') {imput = 'Noimput'}
SNP_file_name = paste(args[7], '/genotype/genotypesChr', chr_number, '_', imput, '.txt', sep="")
covariates_file_name = character()
SNP_position_file = paste(args[7], '/references/SNP_positions/SNP_positions_Chr', chr_number, '.txt', sep="")
gene_position_file = paste(args[7], '/references/gene_positions/gene_positions_Chr', chr_number, '.txt', sep="")
output_file_name_cis = paste(args[8], '_Chr', chr_number, '_', imput, '_cis.txt', sep="")
output_file_name_trans = paste(args[8], '_Chr', chr_number, '_', imput, '_trans.txt', sep="")
method = paste(args[9])
expression_file_name = strsplit(expression_file_location, split="/")[[1]][length(strsplit(expression_file_location, split="/")[[1]])]
tissue_name = strsplit(expression_file_name, split = paste('_', method, sep=""))[[1]][1]
analysis_dir = paste(args[10], "/", tissue_name, sep="")
# Example
# args[7] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping'
# args[8] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/
# ... TPM_GC_3GenPC_30PEER1000/Adipose-Subcutaneous_featurecounts_genes_certain_biotypes_TPM_GC_MatrixEQTL'
# args[9] = 'featurecounts'
# args[10] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/analysis'
# Need to check that the subjects read in from expression matrices match the subjects in genotype data
library(dplyr)
expression_matrix = read.csv(file = expression_file_location, header = TRUE, sep = '\t', stringsAsFactors = FALSE)
genotype_matrix = read.csv(file = SNP_file_name, header = TRUE, sep = '\t', stringsAsFactors = FALSE)
row.names(genotype_matrix) = genotype_matrix$SNP
genotype_matrix = genotype_matrix[,sapply(colnames(expression_matrix), function(x) {match(x, colnames(genotype_matrix))})]
# Also obtain genes and snps that are in the expression and genotype files, respectively.
gene_positions = read.table(file = gene_position_file, header = FALSE, sep = '\t', stringsAsFactors = FALSE)
snp_positions = read.table(file = SNP_position_file, header = TRUE, sep = '\t', stringsAsFactors = FALSE)
colnames(gene_positions) = c("gene_ID", "chr_num", "start", "end")
gene_positions = mutate(gene_positions, chr = paste("chr", chr_num, sep = ""))
gene_positions = select(gene_positions, c(gene_ID, chr, start, end))
# Make sure the list of genes and their ordering are the same
expression_matrix = expression_matrix[rownames(expression_matrix) %in% gene_positions$gene_ID,]
gene_positions = gene_positions[sapply(rownames(expression_matrix), function(x) {match(x, gene_positions$gene_ID)}),]
# Import MatrixEQTL
library(MatrixEQTL)
# Use linear model with p threshold of 0.01
useModel = modelLINEAR;
pvOutputThreshold_cis = 1e-2
pvOutputThreshold_trans = 0
if (incl_trans == '1') {pvOutputThreshold_trans = 1e-2}
snps = SlicedData$new()
snps$CreateFromMatrix(as.matrix(genotype_matrix));
gene = SlicedData$new()
gene$CreateFromMatrix(as.matrix(expression_matrix));
gene_qnorm = gene$copy();
# Quantile-normalize the expression values - ties broken by averaging
for( sl in 1:length(gene_qnorm) ) {
mat = gene_qnorm[[sl]];
mat = t(apply(mat, 1, rank, ties.method = "average"));
mat = qnorm(mat / (ncol(gene_qnorm)+1));
gene_qnorm[[sl]] = mat;
}
rm(sl, mat);
# me = Matrix_eQTL_main(
# snps = snps,
# gene = gene,
# output_file_name = output_file_trans,
# pvOutputThreshold = pvOutputThreshold,
# useModel = useModel,
# verbose = TRUE,
# pvalue.hist = "qqplot",
# min.pv.by.genesnp = FALSE,
# noFDRsaveMemory = FALSE);
# Only for generating the P-value hist:
me = Matrix_eQTL_main(
snps = snps,
gene = gene_qnorm,
output_file_name = output_file_name_trans,
pvOutputThreshold = pvOutputThreshold_trans,
useModel = useModel,
verbose = TRUE,
output_file_name.cis = output_file_name_cis,
pvOutputThreshold.cis = pvOutputThreshold_cis,
snpspos = snp_positions,
genepos = gene_positions,
cisDist = cis_dist,
pvalue.hist = 100,
min.pv.by.genesnp = FALSE,
noFDRsaveMemory = FALSE)
# Save the histogram
output_file_name_hist = paste(analysis_dir, '/', tissue_name, '_', imput, '_Chr', chr_number, '_', peer_factor, '_hist.pdf', sep="")
print(paste("Saving to:", output_file_name_hist))
pdf(output_file_name_hist)
plot(me)
dev.off()
# repeat for the qqplot:
me_q = Matrix_eQTL_main(
snps = snps,
gene = gene_qnorm,
output_file_name = output_file_name_trans,
pvOutputThreshold = pvOutputThreshold_trans,
useModel = useModel,
verbose = TRUE,
output_file_name.cis = output_file_name_cis,
pvOutputThreshold.cis = pvOutputThreshold_cis,
snpspos = snp_positions,
genepos = gene_positions,
cisDist = cis_dist,
pvalue.hist = "qqplot",
min.pv.by.genesnp = FALSE,
noFDRsaveMemory = FALSE)
# Save the qqplot
output_file_name_qqplot = paste(analysis_dir, '/', tissue_name, '_', imput, '_Chr', chr_number, '_', peer_factor, '_qqplot.pdf', sep="")
print(paste("Saving to:", output_file_name_hist))
pdf(output_file_name_qqplot)
plot(me_q)
dev.off()
# Save the Matrix EQTL object as RData
save(gene_qnorm, me, snps, file=paste(args[8], '_Chr', chr_number, '_', imput, '_me.RData', sep=""))
Overwriting /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl.R
python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_rerun_wrapper.py \
/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ _genes_certain_biotypes_counts_normalized.txt \
featurecounts /tigress/BEE/gtex/analyses/user_specific/cis-mapping \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL/ 0 \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/analysis 10 15 20 25 30 35 40 45 50 55 60
python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_rerun_wrapper.py \
/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ _genes_certain_biotypes_counts_normalized.txt \
featurecounts /tigress/BEE/gtex/analyses/user_specific/cis-mapping \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL/ 1 \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/analysis 10 15 20 25 30 35 40 45 50 55 60
%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_rerun_wrapper.py
##################################################
# cis_matrix_eqtl_rerun_wrapper.py
# provide jobs that have failed with more time and memory
#
##################################################
import glob
from sys import argv
import os.path
in_path = argv[1]
in_suffix = argv[2]
method = argv[3]
mapping_dir = argv[4]
joblog_dir = argv[6]
incl_imput = argv[7]
analysis_dir = argv[8]
# Array of PEER factors that are given in the input
peer_factors = argv[9:len(argv)]
# Examples
# argv[1] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/'
# argv[2] = '_genes_certain_biotypes_TPM_GC.txt'
# argv[3] = 'featurecounts'
# argv[4] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping'
# argv[5] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/'
# argv[6] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL/'
# argv[7] = '0' or '1'
# argv[8] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/analysis'
# argv[9:len(argv)] = ['10','20','30','40','50']
# Make tissue list:
tissue_list = []
# Use 'TPM_GC_3GenPC_10PEER1000' to build tissue list
in_path = in_path + method + '/TPM_GC_3GenPC_10PEER1000/'
matrices = glob.glob(in_path + '*' + in_suffix)
for i, matrix in enumerate(matrices):
filename = matrix.split('/')[-1]
tissue_name = str.split(filename, '_'+method)[0]
tissue_list.append(tissue_name)
if incl_imput == "0":
imput = "Noimput"
else:
imput = "Imput"
master_script = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/" + method + '_' + imput + "_cis_matrix_eqtl_chr11_rerun_wrapper.sh"
master_handle=open(master_script, 'w')
master_handle.write("#!/bin/bash\n\n")
k = 0
for peer_factor in peer_factors:
print(peer_factor)
out_path = argv[5] + method + '/TPM_GC_3GenPC_' + peer_factor + 'PEER1000/'
# MatrixEQTL parameters
incl_trans = '0' # Don't include trans
chr_number = '11' # chromosome number
cis_dist = '1e5' # cis window
for tissue in tissue_list:
out_file = out_path + tissue + "_" + method + in_suffix.replace('.txt','_MatrixEQTL')
check_file = out_path + tissue + "_" + method + in_suffix.replace('.txt','_MatrixEQTL_Chr') + chr_number + '_' + imput + '_me.RData'
if os.path.isfile(check_file):
continue
k+=1
print "Missing: " + check_file
genes_or_transcripts = ['genes' if 'genes' in matrix else 'transcripts'][0]
sbatchfile = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/batch/MatrixEQTL/" + method + '_TPM_GC_3GenPC_' + peer_factor + '_MatrixEQTL_chr' + chr_number + '_' + tissue + "_" + imput + ".slurm"
outfilename = method+"_"+tissue+"_"+peer_factor
sbatchhandle=open(sbatchfile, 'w')
cmd=r"""#!/bin/bash
#SBATCH -J meqtl_%s_%s # job name
#SBATCH --mem=12000 # 12 GB requested
#SBATCH -t 08:00:00 # to be placed in the test queue
#SBATCH -e %s # err output directory
#SBATCH -o %s # out output directory
/usr/bin/Rscript /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl.R \
%s %s %s %s %s %s %s %s %s %s
"""%(tissue, peer_factor, joblog_dir+'err/'+outfilename+'_'+imput+'.err', joblog_dir+'out/'+outfilename+'_'+imput+'.out', matrix, incl_trans, incl_imput, chr_number, cis_dist, peer_factor, mapping_dir, out_file, method, analysis_dir)
sbatchhandle.write(cmd)
sbatchhandle.close()
master_handle.write("sbatch " + sbatchfile + " \n")
master_handle.close()
print 'sh %s'%(master_script)
print 'Number of jobs:', k
Overwriting /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_rerun_wrapper.py
MatrixEQTL outputs the following metric: effect size, p-value, FDR, t-statistic, as well as the list of significant SNP-gene pairs.
Since we have a lot of statistics, let's try to construct plots that will hopefully help with the analyses. But we need to create the necessary directories first:
%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/create_dirs.py
##################################################
# create_dirs.py
# create necessary directories
#
##################################################
import glob
from sys import argv
import os.path
in_path = argv[1]
in_suffix = argv[2]
method = argv[3]
out_path = argv[4] + method + '/'
in_path = in_path + method + '/TPM_GC_3GenPC_10PEER1000/' + '*' + '_' + method + in_suffix
# in_path = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/expression_matrices/'
# in_suffix = '_genes_certain_biotypes_TPM.txt'
# method = 'featurecounts'
# out_path = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/"
directory = out_path + 'analysis'
if not os.path.exists(directory):
os.makedirs(directory)
matrices = glob.glob(in_path + '*')
print(len(matrices))
for i, matrix in enumerate(matrices):
filename = matrix.split('/')[-1]
tissue_name = str.split(filename, '_'+method)[0]
if not os.path.exists(directory + '/' + tissue_name):
os.makedirs(directory + '/' + tissue_name)
if not os.path.exists(directory + '/' + tissue_name + '/Examples/'):
os.makedirs(directory + '/' + tissue_name + '/Examples')
Overwriting /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/create_dirs.py
%%bash
python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/create_dirs.py \
/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ _genes_certain_biotypes_counts_normalized.txt \
featurecounts /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/
Now let's actually create the plots:
# In R:
install.packages("ggplot2")
%%bash
python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_post_wrapper.py \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/ featurecounts 1e5 \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/SNP_positions/SNP_positions_Chr11.txt \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_number_of_samples.txt 100
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_cis_matrix_eqtl_post_wrapper.sh
%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_post_wrapper.py
##################################################
# cis_matrix_eqtl_post_wrapper.py
#
#
##################################################
import glob
from sys import argv
import os.path
in_path = argv[1]
method = argv[2]
cis_cutoff = argv[3]
gene_name_file = argv[4]
snp_positions_file = argv[5]
tissue_list_file = argv[6]
num_indiv_cutoff = int(argv[7])
# Examples
# argv[1] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/'
# argv[2] = 'featurecounts'
# argv[3] = '1e5'
# argv[4] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt'
# argv[5] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/SNP_positions/SNP_positions_Chr11.txt'
# argv[6] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_number_of_samples.txt'
# argv[7] = '100'
master_script = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/" + method + "_cis_matrix_eqtl_post_wrapper.sh"
master_handle=open(master_script, 'w')
master_handle.write("#!/bin/bash\n\n")
# Make tissue list:
f = open(tissue_list_file)
for line in f.readlines():
entry = str.split(line.strip(),'\t')
# If the number of individuals is greater than the cutoff:
if int(entry[1]) >= num_indiv_cutoff:
tissue = entry[0]
sbatchfile = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/batch/" + method + '_' + tissue + "_cis_matrix_eqtl_post.slurm"
sbatchhandle=open(sbatchfile, 'w')
cmd=r"""#!/bin/bash
#SBATCH -J %s_Matrix_post # job name
#SBATCH --mem=4000 # 4 GB requested
#SBATCH -e /tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/err/Matrix_%s_post.err # err output directory
#SBATCH -o /tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/out/Matrix_%s_post.out # out output directory
#SBATCH -t 5:00:00
/usr/bin/Rscript /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/post_analyses_MatrixEQTL.R \
%s %s %s %s \
%s \
%s \
10 15 20 25 30 35 40 45 50 55 60
"""%(tissue, tissue, tissue, in_path, method, cis_cutoff, tissue, gene_name_file, snp_positions_file)
sbatchhandle.write(cmd)
sbatchhandle.close()
master_handle.write("sbatch " + sbatchfile + " \n")
master_handle.close()
print 'sh %s'%(master_script)
Overwriting /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_post_wrapper.py
%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/post_analyses_MatrixEQTL.R
####################################################
# post_analyses_MatrixEQTL.R
# Iterates through K PEER factors for every tissue and produces interesting graphs.
#
####################################################
#!/usr/bin/Rscript
# Collect arguments
args <-commandArgs(TRUE)
library(dplyr)
library(MatrixEQTL)
library(ggplot2)
# Load in data
# load("Adipose-Subcutaneous_featurecounts_genes_certain_biotypes_TPM_GC_MatrixEQTL_Chr11_Imput_me.RData")
# file_name = Adipose-Subcutaneous_featurecounts_genes_certain_biotypes_TPM_GC_MatrixEQTL_Chr11_Imput_me.RData
in_dir = args[1]
method = args[2]
cis_dist = as.numeric(args[3])
in_dir = paste(in_dir, method, "/", sep='')
tissue = args[4]
gene_name_file = args[5]
snp_position_file = args[6]
peer_factors = args[7:length(args)]
# args[1] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/'
# args[2] = 'featurecounts'
# args[3] = '1e5'
# args[4] = 'WholeBlood'
# args[5] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt'
# args[6] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/SNP_positions/SNP_positions_Chr11.txt'
# args[7] = c(10,15,20,25,30,35,40,45,50,55,60)
# Save the numer of significant pairs and unique genes detected at each level.
Counts_by_tissue = data.frame(tissue = character(0), control = character(0), imputed = character(0), peer_K = numeric(0), pairs = numeric(0), genes = numeric(0), stringsAsFactors = FALSE)
#Genes_snps_by_tissue = data.frame(tissue = character(0), imputed = character(0), gene_list = character(0), snp_list = character(0))
gene_names = read.csv(gene_name_file, header = TRUE, sep = '\t', stringsAsFactors = FALSE)
row.names(gene_names) = gene_names$gene_id
print(dim(gene_names))
snp_positions = read.csv(snp_position_file, header = TRUE, sep = '\t', stringsAsFactors = FALSE)
row.names(snp_positions) = snp_positions$rsID
print(dim(snp_positions))
print(tissue)
for (peer in peer_factors) {
print(peer)
# First iterate through nonimputed:
result_path = paste(in_dir, "TPM_GC_3GenPC_", peer, "PEER1000/", sep='')
analysis_path = paste(in_dir, "analysis/", tissue, '/', sep='')
# Match a regular expression pattern:
result_file <- dir(result_path, pattern = paste(tissue,".*\\Noimput_me.RData",sep=''))
# The result file of MatrixEQTL exists but the summary file (output of this) doesn't
if (length(result_file) > 0) {
load(paste(result_path, result_file[1], sep=''))
# Plot top 5 eQTL - unique gene pairs, if they exist.
temp = data.frame(me$cis$eqtls)
temp$snps = as.character(temp$snps)
temp$gene = as.character(temp$gene)
top_genes = head(unique(temp$gene), n=5)
# Only proceed if there are at least 5 genes.
if (length(top_genes) == 5) {
temp$TSS = gene_names[temp$gene,][,"start"]
temp$TES = gene_names[temp$gene,][,"end"]
temp$gene_name = gene_names[temp$gene,][,"gene_name"]
temp$SNP_pos = snp_positions[temp$snps,][,"pos"]
# Difference mapping: -1e5 to 0 if before TSS
# scale from 1 to 1e5 if in gene
# 1e5 to 2e5 if after TES
temp$diff = sapply(c(1:dim(temp)[1]), function(x) {
if ((temp$SNP_pos[x] - temp$TSS[x])<0) {
temp$SNP_pos[x] - temp$TSS[x]
} else if ((temp$SNP_pos[x] - temp$TES[x])>0) {
temp$SNP_pos[x] - temp$TES[x] + cis_dist
} else {
cis_dist * (temp$SNP_pos[x] - temp$TSS[x]) / (temp$TES[x] - temp$TSS[x])
}})
tryCatch({
g = ggplot(temp[temp$FDR < 0.01,], aes(x = diff)) + geom_histogram(binwidth = 10000) + ggtitle('Distribution of distances to TSS') + xlab('Distance') + scale_x_discrete(breaks=c(0, 1e5), labels=c("TSS", "TES"))
print(paste("Trying to save: ", analysis_path, tissue, "_Noimput_", peer, "_Distance_distribution.pdf", sep=''))
ggsave(filename = paste(analysis_path, tissue, "_Noimput_", peer, "_Distance_distribution.pdf", sep=''), plot=g, width = 7, height = 7)
}, error = function(e) {
print(paste("Could not save: ", analysis_path, tissue, "_Noimput_", peer, "_Distance_distribution.pdf", sep=''))
})
for (j in 1:5) {
index = which(temp$gene == top_genes[j])[1]
test_snp = snps$FindRow(temp[index,"snps"])
test_gene_q = gene_qnorm$FindRow(temp[index,"gene"])
test.df = data.frame(snp = as.numeric(test_snp$row), gene = as.numeric(test_gene_q$row))
row.names(test.df) = colnames(snps)
test.df = test.df[!is.na(test.df$snp),]
test.df$snp_chr = as.character(test.df$snp)
test_name = as.character(temp[index,"gene_name"])
test_rsID = as.character(temp[index,"snps"])
tryCatch({
g = ggplot(test.df, aes(x = snp_chr, y = gene, color=snp_chr)) + geom_violin() + geom_jitter() + ggtitle(paste(test_name, test_rsID, sep='_')) + xlab(test_rsID) + ylab(test_name)
print(paste("Trying to save: ", analysis_path, "Examples/", tissue, "_Noimput_", peer, "_", test_name, "_", test_rsID, ".pdf", sep=''))
ggsave(filename = paste(analysis_path, "Examples/", tissue, "_Noimput_", peer, "_", test_name, "_", test_rsID, ".pdf", sep=''), plot = g, width = 7, height = 7)
}, error = function(e) {
print(paste("Could not save: ", analysis_path, "Examples/", tissue, "_Noimput_", peer, "_", test_name, "_", test_rsID, ".pdf", sep=''))
})
}
}
# Take all
n_pairs = dim(me$cis$eqtls)[1]
n_genes = length(unique(me$cis$eqtls$gene))
Counts_by_tissue = rbind(Counts_by_tissue, data.frame(tissue = tissue, control = "pval", imputed = "Noimput", peer_K = peer, pairs = n_pairs, genes = n_genes, stringsAsFactors = FALSE))
# Take FDR-controlled at 5%
n_pairs = dim(filter(me$cis$eqtls, FDR <= 0.01))[1]
n_genes = length(unique(filter(me$cis$eqtls, FDR <= 0.01)$gene))
Counts_by_tissue = rbind(Counts_by_tissue, data.frame(tissue = tissue, control = "FDR", imputed = "Noimput", peer_K = peer, pairs = n_pairs, genes = n_genes, stringsAsFactors = FALSE))
}
}
if (dim(Counts_by_tissue)[1] != 0) {
print(paste("Save: ", analysis_path, tissue, "_Summary_Noimput.RData", sep=""))
save(Counts_by_tissue, file=paste(analysis_path, tissue, "_Summary_Noimput.RData", sep=""))
}
Counts_by_tissue = data.frame(tissue = character(0), control = character(0), imputed = character(0), peer_K = numeric(0), pairs = numeric(0), genes = numeric(0), stringsAsFactors = FALSE)
for (peer in peer_factors) {
print(peer)
# Now iterate through imputed:
# First iterate through nonimputed:
result_path = paste(in_dir, "TPM_GC_3GenPC_", peer, "PEER1000/", sep='')
analysis_path = paste(in_dir, "analysis/", tissue, '/', sep='')
# Match a regular expression pattern:
result_file <- dir(result_path, pattern = paste(tissue,".*\\Imput_me.RData",sep=''))
if (length(result_file) > 0) {
load(paste(result_path, result_file[1], sep=''))
temp = data.frame(me$cis$eqtls)
temp$snps = as.character(temp$snps)
temp$gene = as.character(temp$gene)
top_genes = head(unique(temp$gene), n=5)
# Only proceed if there is at least 5 genes.
if (length(top_genes) == 5) {
temp$TSS = gene_names[temp$gene,][,"start"]
temp$TES = gene_names[temp$gene,][,"end"]
temp$gene_name = gene_names[temp$gene,][,"gene_name"]
temp$SNP_pos = snp_positions[temp$snps,][,"pos"]
# Difference mapping: -1e5 to 0 if before TSS
# scale from 1 to 1e5 if in gene
# 1e5 to 2e5 if after TES
temp$diff = sapply(c(1:dim(temp)[1]), function(x) {
if ((temp$SNP_pos[x] - temp$TSS[x])<0) {
temp$SNP_pos[x] - temp$TSS[x]
} else if ((temp$SNP_pos[x] - temp$TES[x])>0) {
temp$SNP_pos[x] - temp$TES[x] + cis_dist
} else {
cis_dist * (temp$SNP_pos[x] - temp$TSS[x]) / (temp$TES[x] - temp$TSS[x])
}})
tryCatch({
g = ggplot(temp[temp$FDR < 0.01,], aes(x = diff)) + geom_histogram(binwidth = 10000) + ggtitle('Distribution of distances to TSS') + xlab('Distance') + scale_x_discrete(breaks=c(0, 1e5), labels=c("TSS", "TES"))
print(paste("Trying to save: ", analysis_path, tissue, "_Imput_", peer, "_Distance_distribution.pdf", sep=''))
ggsave(filename = paste(analysis_path, tissue, "_Imput_", peer, "_Distance_distribution.pdf", sep=''), plot=g, width = 7, height = 7)
}, error = function(e) {
print(paste("Could not save: ", analysis_path, tissue, "_Imput_", peer, "_Distance_distribution.pdf", sep=''))
})
for (j in 1:5) {
index = which(temp$gene == top_genes[j])[1]
test_snp = snps$FindRow(temp[index,"snps"])
test_gene_q = gene_qnorm$FindRow(temp[index,"gene"])
test.df = data.frame(snp = as.numeric(test_snp$row), gene = as.numeric(test_gene_q$row))
row.names(test.df) = colnames(snps)
test.df = test.df[!is.na(test.df$snp),]
test_name = as.character(temp[index,"gene_name"])
test_rsID = as.character(temp[index,"snps"])
tryCatch({
g = ggplot(test.df, aes(x = snp, y = gene, color=snp)) + geom_jitter() + ggtitle(paste(test_name, test_rsID, sep='_')) + xlab(test_rsID) + ylab(test_name)
print(paste("Trying to save: ", analysis_path, "Examples/", tissue, "_Imput_", peer, "_", test_name, "_", test_rsID, ".pdf", sep=''))
ggsave(filename = paste(analysis_path, "Examples/", tissue, "_Imput_", peer, "_", test_name, "_", test_rsID, ".pdf", sep=''), plot = g, width = 7, height = 7)
}, error = function(e) {
print(paste("Could not save: ", analysis_path, "Examples/", tissue, "_Imput_", peer, "_", test_name, "_", test_rsID, ".pdf", sep=''))
})
}
}
# Take all
n_pairs = dim(me$cis$eqtls)[1]
n_genes = length(unique(me$cis$eqtls$gene))
Counts_by_tissue = rbind(Counts_by_tissue, data.frame(tissue = tissue, control = "pval", imputed = "Imput", peer_K = peer, pairs = n_pairs, genes = n_genes, stringsAsFactors = FALSE))
# Take FDR-controlled at 1%
n_pairs = dim(filter(me$cis$eqtls, FDR <= 0.01))[1]
n_genes = length(unique(filter(me$cis$eqtls, FDR <= 0.01)$gene))
Counts_by_tissue = rbind(Counts_by_tissue, data.frame(tissue = tissue, control = "FDR", imputed = "Imput", peer_K = peer, pairs = n_pairs, genes = n_genes, stringsAsFactors = FALSE))
}
}
if (dim(Counts_by_tissue)[1] != 0) {
print(paste("Save: ", analysis_path, tissue, "_Summary_Imput.RData", sep=""))
save(Counts_by_tissue, file=paste(analysis_path, tissue, "_Summary_Imput.RData", sep=""))
}
Overwriting /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/post_analyses_MatrixEQTL.R
TODO: Upload the plots of pair sas function PEER factors in this section
For the actual cis-eQTL detection, we will use the optimal number of PEER factors for each tissue, and feed the matrix into eQTLBMA.
%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/post_analyses_MatrixEQTL_aggregate.R
####################################################
# post_analyses_MatrixEQTL_aggregate.R
# Aggregates all results and optimizes for K PEER factors in a tissue-specific manner.
#
####################################################
#!/usr/bin/Rscript
library(ggplot2)
library(dplyr)
analysis_path = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/analysis/'
tissue_list_file = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_number_of_samples.txt'
num_indiv_cutoff = 100
tissue_list = read.table(tissue_list_file, header=FALSE, sep="\t", stringsAsFactors=FALSE)
colnames(tissue_list) = c("Tissue", "Count")
rownames(tissue_list) = tissue_list["Tissue"][[1]]
tissue_list = tissue_list[tissue_list["Count"]>=num_indiv_cutoff,]
Counts_by_tissue_aggregate = data.frame(tissue = character(0), control = character(0), imputed = character(0), peer_K = numeric(0), pairs = numeric(0), genes = numeric(0), stringsAsFactors = FALSE)
for (i in c(1:dim(tissue_list)[1])) {
tissue = tissue_list["Tissue"][[1]][i]
load(paste(analysis_path, tissue, "/", tissue, "_Summary_Imput.RData", sep=""))
Counts_by_tissue_aggregate = rbind(Counts_by_tissue_aggregate, Counts_by_tissue)
load(paste(analysis_path, tissue, "/", tissue, "_Summary_Noimput.RData", sep=""))
Counts_by_tissue_aggregate = rbind(Counts_by_tissue_aggregate, Counts_by_tissue)
}
for (i in c(1:dim(tissue_list)[1])) {
tissue = tissue_list["Tissue"][[1]][i]
print(tissue)
print(sum(Counts_by_tissue_aggregate["tissue"] == tissue))
}
# We have the data frame with all counts.
save(Counts_by_tissue_aggregate, file=paste(analysis_path, "All_Counts.RData", sep=""))
# Save the plots of number for all tissues:
g = ggplot(data = filter(Counts_by_tissue_aggregate, control == "pval"), aes(x = peer_K, y = pairs, color = tissue)) + geom_line(aes(group = tissue)) + facet_grid(. ~ imputed) + theme(legend.position = "none")
ggsave(filename = paste(analysis_path, "All_tissues_pval_pairs.pdf", sep=''), plot = g, width = 10, height = 10)
g = ggplot(data = filter(Counts_by_tissue_aggregate, control == "FDR"), aes(x = peer_K, y = pairs, color = tissue)) + geom_line(aes(group = tissue)) + facet_grid(. ~ imputed) + theme(legend.position = "none")
ggsave(filename = paste(analysis_path, "All_tissues_FDR_pairs.pdf", sep=''), plot = g, width = 10, height = 10)
g = ggplot(data = filter(Counts_by_tissue_aggregate, control == "pval"), aes(x = peer_K, y = genes, color = tissue)) + geom_line(aes(group = tissue)) + facet_grid(. ~ imputed) + theme(legend.position = "none")
ggsave(filename = paste(analysis_path, "All_tissues_pval_genes.pdf", sep=''), plot = g, width = 10, height = 10)
g = ggplot(data = filter(Counts_by_tissue_aggregate, control == "FDR"), aes(x = peer_K, y = genes, color = tissue)) + geom_line(aes(group = tissue)) + facet_grid(. ~ imputed) + theme(legend.position = "none")
ggsave(filename = paste(analysis_path, "All_tissues_FDR_genes.pdf", sep=''), plot = g, width = 10, height = 10)
# Optimize on the number of K PEER factors to use for each tissue:
# Criteria 1: when it decreases
peer_K_to_iterate = c(15, 20, 25, 30, 35, 40, 45, 50, 55, 60)
tissue_list["optimal_dec"] = 10
for (i in c(1:dim(tissue_list)[1])) {
tissue_name = tissue_list["Tissue"][[1]][i]
current_opt = 10
current_val = filter(Counts_by_tissue_aggregate, tissue==tissue_name, control=="FDR", imputed=="Imput", peer_K == 10)["pairs"][[1]][1]
for (j in peer_K_to_iterate) {
next_val = filter(Counts_by_tissue_aggregate, tissue==tissue_name, control=="FDR", imputed=="Imput", peer_K == j)["pairs"][[1]][1]
if (next_val <= current_val) {
tissue_list[tissue_name, "optimal_dec"] = current_opt
break
}
current_opt = j
current_val = next_val
}
if (current_opt == peer_K_to_iterate[length(peer_K_to_iterate)]) {
tissue_list[tissue_name, "optimal_dec"] = current_opt
}
}
# Another Criteria: when it reaches 5% of maximal
# Tunable alpha, when it reaches N% of maximal
alpha = 0.05
tissue_list["optimal_95"] = 10
for (i in c(1:dim(tissue_list)[1])) {
tissue_name = tissue_list["Tissue"][[1]][i]
current_opt = 10
thresh = max(filter(Counts_by_tissue_aggregate, tissue==tissue_name, control=="FDR", imputed=="Imput")["pairs"]) * (1-alpha)
for (j in peer_K_to_iterate) {
next_val = filter(Counts_by_tissue_aggregate, tissue==tissue_name, control=="FDR", imputed=="Imput", peer_K == j)["pairs"][[1]][1]
if (next_val >= thresh) {
tissue_list[tissue_name, "optimal_95"] = current_opt
break
}
current_opt = j
}
if (current_opt == peer_K_to_iterate[length(peer_K_to_iterate)]) {
tissue_list[tissue_name, "optimal_95"] = current_opt
}
}
# Another Criteria: when it reaches 1% of maximal
# Tunable alpha, when it reaches N% of maximal
alpha = 0.01
tissue_list["optimal_99"] = 10
for (i in c(1:dim(tissue_list)[1])) {
tissue_name = tissue_list["Tissue"][[1]][i]
current_opt = 10
thresh = max(filter(Counts_by_tissue_aggregate, tissue==tissue_name, control=="FDR", imputed=="Imput")["pairs"]) * (1-alpha)
for (j in peer_K_to_iterate) {
next_val = filter(Counts_by_tissue_aggregate, tissue==tissue_name, control=="FDR", imputed=="Imput", peer_K == j)["pairs"][[1]][1]
if (next_val >= thresh) {
tissue_list[tissue_name, "optimal_99"] = current_opt
break
}
current_opt = j
}
if (current_opt == peer_K_to_iterate[length(peer_K_to_iterate)]) {
tissue_list[tissue_name, "optimal_99"] = current_opt
}
}
# Save the optimal K for future use:
write.table(tissue_list, file='/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_optimal_PEER_k.txt', row.names=FALSE, quote=FALSE, sep="\t")
Overwriting /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/post_analyses_MatrixEQTL_aggregate.R
Now, we have the optimal number of PEER factors for each tissue, for tissues that have the number of experiments above 100:
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_optimal_PEER_k.txt
We will use the matrices chosen by this metric to perform cis-eQTL detection, both with MatrixEQTL and eQTLBMA.
The following block summarizes what we have so far, and run the pipeline for all autosomal chromosomes (1-22)
For the actual cis-eQTL detection, we will use the optimal number of PEER factors for each tissue, and feed the matrix into eQTLBMA.
%%bash
python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_wrapper_all.py \
/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ _genes_certain_biotypes_counts_normalized.txt \
featurecounts /tigress/BEE/gtex/analyses/user_specific/cis-mapping \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL/ 1 \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/featurecounts/analysis \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_optimal_PEER_k.txt
python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/create_dirs.py \
/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ _genes_certain_biotypes_counts_normalized.txt \
featurecounts /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_Imput_cis_matrix_eqtl_wrapper_all.sh Number of jobs: 616
%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_wrapper_all.py
##################################################
# cis_matrix_eqtl_wrapper_all.py
# Wrapper for the cis_matrix_eqtl.R file and submitting the batch jobs
#
##################################################
import glob
from sys import argv
import os.path
in_path = argv[1]
in_suffix = argv[2]
method = argv[3]
mapping_dir = argv[4]
joblog_dir = argv[6]
incl_imput = argv[7]
analysis_dir = argv[8]
peer_factors_file = argv[9]
# Examples
# argv[1] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/'
# argv[2] = '_genes_certain_biotypes_counts_normalized.txt'
# argv[3] = 'featurecounts'
# argv[4] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping'
# argv[5] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/'
# argv[6] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL/'
# argv[7] = '0' or '1'
# argv[8] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/featurecounts/analysis'
# argv[9] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_optimal_PEER_k.txt'
# Make the job log directories
if not os.path.exists(joblog_dir):
os.makedirs(joblog_dir)
if not os.path.exists(joblog_dir + 'err'):
os.makedirs(joblog_dir + 'err')
if not os.path.exists(joblog_dir + 'out'):
os.makedirs(joblog_dir + 'out')
if not os.path.exists(analysis_dir):
os.makedirs(analysis_dir)
if incl_imput == "0":
imput = "Noimput"
else:
imput = "Imput"
# Create the array of jobs to run:
tissue_list = []
peer_factors = []
f = open(peer_factors_file)
# First line is a header:
f.readline()
for line in f.readlines():
entry = str.split(line.strip(),'\t')
tissue_list.append(entry[0])
# We use the optimal_99 critera:
peer_factors.append(entry[-1])
out_path = argv[5] + method + '/'
if not os.path.exists(out_path):
os.makedirs(out_path)
master_script = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/" + method + '_' + imput + "_cis_matrix_eqtl_wrapper_all.sh"
master_handle=open(master_script, 'w')
master_handle.write("#!/bin/bash\n\n")
# MatrixEQTL parameters
incl_trans = '0' # Don't include trans
cis_dist = '1e5' # cis window
k = 0
for i in range(len(tissue_list)):
tissue_name = tissue_list[i]
peer_factor = peer_factors[i]
filename = tissue_name + '_' + method + in_suffix
matrix = argv[1] + method + '/TPM_GC_3GenPC_' + peer_factor + 'PEER1000/' + filename
out_file = out_path + filename.replace('.txt','_MatrixEQTL')
# Iterate through 22 chromosomes
for j in range(22):
chr_number = str(j+1)
sbatchfile = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/batch/MatrixEQTL/" + method + '_TPM_GC_3GenPC_MatrixEQTL_chr' + chr_number + '_' + tissue_name + '_' + imput + ".slurm"
job_outfile = method+"_"+tissue_name+"_cis_all"
sbatchhandle=open(sbatchfile, 'w')
cmd=r"""#!/bin/bash
#SBATCH -J meqtl_%s # job name
#SBATCH --mem=8000 # 8 GB requested
#SBATCH -t 06:00:00 # to be placed in the test queue
#SBATCH -e %s # err output directory
#SBATCH -o %s # out output directory
/usr/bin/Rscript /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl.R \
%s %s %s %s %s %s %s %s %s %s
"""%(tissue_name, joblog_dir+'err/'+job_outfile+'_'+imput+'.err', joblog_dir+'out/'+job_outfile+'_'+imput+'.out', matrix, incl_trans, incl_imput, chr_number, cis_dist, peer_factor, mapping_dir, out_file, method, analysis_dir)
sbatchhandle.write(cmd)
sbatchhandle.close()
master_handle.write("sbatch " + sbatchfile + " \n")
k += 1
master_handle.close()
print 'sh %s'%(master_script)
print 'Number of jobs:', k
Overwriting /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_wrapper_all.py
We first look through MatrixEQTL results.
%%bash
python /tigress/bj5/cis_meta/scripts/MatrixEQTL_cis_FDR_control_wrapper.py
%%writefile /tigress/bj5/cis_meta/scripts/MatrixEQTL_cis_FDR_control_wrapper.py
####################################################
# MatrixEQTL_cis_FDR_control_wrapper.py
#
#
####################################################
tissue_list_file = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_number_of_samples.txt'
f = open(tissue_list_file)
tissue_list = []
for line in f.readlines():
tissue_list.append(str.split(line.strip(), '\t')[0])
f.close()
original_dir = "/tigress/bj5/MatrixEQTL/cis/featurecounts/"
write_dir = "/tigress/bj5/cis_meta/MatrixEQTL/"
master_script = "/tigress/bj5/cis_meta/scripts/MatrixEQTL_FDR_control.sh"
master_handle=open(master_script, 'w')
master_handle.write("#!/bin/bash\n\n")
for item in tissue_list:
sbatchfile = "/tigress/bj5/cis_meta/scripts/batch/MatrixEQTL_FDR_control_" + item + ".slurm"
sbatchhandle=open(sbatchfile, 'w')
cmd=r"""#!/bin/bash
#SBATCH -J Mat_%s_FDR # job name
#SBATCH --mem=4000 # 8 GB requested
#SBATCH -t 01:00:00 # to be placed in the test queue
#SBATCH -e %s # err output directory
#SBATCH -o %s # out output directory
/usr/bin/Rscript /tigress/bj5/cis_meta/scripts/MatrixEQTL_cis_FDR_control.R \
%s %s %s
"""%(item, write_dir + item + ".err", write_dir + item + ".out", original_dir, item, write_dir)
sbatchhandle.write(cmd)
sbatchhandle.close()
master_handle.write("sbatch " + sbatchfile + " \n")
master_handle.close()
print 'sh %s'%(master_script)
Overwriting /tigress/bj5/cis_meta/scripts/MatrixEQTL_cis_FDR_control_wrapper.py
%%writefile /tigress/bj5/cis_meta/scripts/MatrixEQTL_cis_FDR_control.R
####################################################
# MatrixEQTL_cis_FDR_control.R
#
#
####################################################
#!/usr/bin/Rscript
# Collect arguments
args <-commandArgs(TRUE)
# FDR Control of p-values from the null distribution
original_dir = '/tigress/bj5/MatrixEQTL/cis/featurecounts/'
tissue = 'Adipose-Subcutaneous'
method = 'featurecounts'
file_attr = '_genes_certain_biotypes_counts_normalized_MatrixEQTL_Chr'
imputed = 'Imput'
suffix = '_me.RData'
write_dir = '/tigress/bj5/cis_meta/MatrixEQTL/'
original_dir = args[1]
tissue = args[2]
write_dir = args[3]
snps_list = character()
genes_list = character()
pvals_list = numeric()
betas_list = numeric()
statistic_list = numeric()
total_tests = 0
total_called_eqtls = 0
for (i in c(1:22)) {
print(i)
filename = paste(original_dir, tissue, '_', method, file_attr, i, '_', imputed, suffix, sep='')
load(filename)
snps_list = append(snps_list, sapply(me$cis$eqtls$snps, as.character))
genes_list = append(genes_list, sapply(me$cis$eqtls$gene, as.character))
pvals_list = append(pvals_list, me$cis$eqtls$pvalue)
betas_list = append(betas_list, me$cis$eqtls$beta)
statistic_list = append(statistic_list, me$cis$eqtls$statistic)
total_tests = total_tests + me$cis$ntests
total_called_eqtls = total_called_eqtls + me$cis$neqtls
}
ordering = order(pvals_list)
ordered_pvals_list = pvals_list[ordering]
ordered_snps_list = snps_list[ordering]
ordered_genes_list = genes_list[ordering]
ordered_betas_list = betas_list[ordering]
ordered_statistic_list = statistic_list[ordering]
# Perform Benjamini-Hochberg
alpha_hat = sapply(c(1:total_called_eqtls), function(x) {ordered_pvals_list[x] * total_tests / x})
alpha_hat[(alpha_hat >= 1.0)] = 1.0
cur_index = 1
alpha = numeric()
ptm <- proc.time()
while (cur_index <= length(alpha_hat)) {
temp = alpha_hat[cur_index:length(alpha_hat)]
min_index = which.min(temp)
min_alpha = min(temp)
alpha = append(alpha, rep(min_alpha, min_index))
cur_index = cur_index + min_index
}
print("FDR calculation time: ")
proc.time() - ptm
num_eqtls = sum(alpha <= 0.05)
result = data.frame(snp = ordered_snps_list[1:num_eqtls], gene = ordered_genes_list[1:num_eqtls], pval = ordered_pvals_list[1:num_eqtls], beta = ordered_betas_list[1:num_eqtls], statistic = ordered_statistic_list[1:num_eqtls], FDR = alpha[1:num_eqtls])
write.table(result, file=paste(write_dir, tissue, "_FDR_5percent.txt", sep=""), quote = FALSE, sep = "\t", row.names = FALSE)
print(paste("Total number of tests:", total_tests))
print(paste("Total eQTLs called at 5% FDR:", num_eqtls))
Overwriting /tigress/bj5/cis_meta/scripts/MatrixEQTL_cis_FDR_control.R