%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/upload_all_gists.sh
#!/bin/bash

# upload_all_gists.sh

gist /home/bj5/notebooks/eQTL_cis_master.ipynb -u 7cc21527485b72ad1686
gist /home/bj5/notebooks/eQTL_cis_master_scripts.ipynb -u bdf9366166b8474bb299
gist /home/bj5/notebooks/eQTL_misc_setup_scripts.ipynb -u 646278c49a10b6307bd9

gist /home/bj5/notebooks/cis_pipeline_gallery.ipynb -u 633dddf85ad9dc54c07b
gist /home/bj5/notebooks/GEUVADIS_pipeline.ipynb -u 243bf73f1a75dbcfc549
gist /home/bj5/notebooks/eQTLBMA_pipeline.ipynb -u 250135a17a004b014434


%%bash

mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/PEER
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL
cd /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references

# Gencode 19 gene annotation
ln -s /tigress/BEE/gtex/dbGaP-7716/45696/gtex/exchange/GTEx_phs000424/exchange/analysis_releases/GTEx_Analysis_2015-01-12/reference_files/gencode.v19.genes.patched_contigs.gtf.gz ./

mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/summaries
cd /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/summaries
cp /tigress/BEE/gtex/results/group_general/checkpoints/STAR_1pass_hg19_summary_report ./
cp /tigress/BEE/gtex/results/group_general/checkpoints/STAR_2pass_hg19_featurecounts_summary_report ./
cp /tigress/BEE/gtex/results/group_general/checkpoints/STAR_2pass_hg19_rsem_summary_report ./
cp /tigress/BEE/gtex/results/group_general/checkpoints/trim_summary_report ./

%%bash

cd /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references
cp /tigress/BEE/gtex/external_sources/hg19/gencode.v19.nonoverlap.genes_for_quantification_certain_and_uncertain.txt ./
ln -s /tigress/BEE/gtex/external_sources/hg19/gencode.v19.annotation.nonoverlap.gtf ./

%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_gene_names_positions.py

##################################################
#  obtain_gene_names_positions.py
#
#
##################################################

import gzip

anno_file = gzip.open('/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gencode.v19.genes.patched_contigs.gtf.gz')
meta_file = open('/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt', 'w')
meta_file.write('\t'.join(['gene_id','chr','start','end','gene_name','gene_type']))

for i in range(5):
    anno_file.readline()
    
for line in anno_file.readlines():
    annotation = str.split(line, '\t')
    if annotation[2] == "transcript":
        chr = annotation[0]
        start = annotation[3]
        end = annotation[4]
        desc = str.split(annotation[8], ";")
        gene_id = str.strip(str.split(desc[0], " ")[1], '"')
        gene_name = str.strip(str.split(desc[4], " ")[2], '"')
        gene_type = str.strip(str.split(desc[2], " ")[2], '"')
        meta_file.write('\n')
        meta_file.write('\t'.join([gene_id,chr,start,end,gene_name,gene_type])) 
        
anno_file.close()
meta_file.close()

%%bash
python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_gene_names_positions.py

cd /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references
# Deposit each chromosome-specific genes to a separate file:

for i in $(seq 1 22); \
do awk -v chr="$i" 'BEGIN{OFS=FS="\t"} $2==chr {print $1,$2,$3,$4}' gene_names_positions.txt > /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_positions/gene_positions_Chr$i.txt; \
echo $i; \
done

%%bash 

mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype
cd /tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype

for i in $(seq 1 22); \
do head -n 1 /tigress/BEE/gtex/data/genotype/imputed_genotypes/allelic_dosage/GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_allchr_genot_imput_info04_maf05_HWEp1E6_dbSNPIDs.chr${i}.allelic_dosage.txt > genotypesChr${i}.txt; \
echo $i; \
done

python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype/modify_header.py

# After the genotype files have been given the correct header:
for i in $(seq 1 22); \
do grep -v "^SNP" /tigress/BEE/gtex/data/genotype/imputed_genotypes/allelic_dosage/GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_allchr_genot_imput_info04_maf05_HWEp1E6_dbSNPIDs.chr${i}.allelic_dosage.txt >> genotypesChr${i}.txt; \
echo $i; \
# Remove snps that are "."
cat genotypesChr${i}.txt | grep -ve '^\.' > genotypesChr${i}_Noimput.txt; \
done


%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/modify_header.py

##################################################
#  modify_header.py
#  Modifies the header from the genotype file
#
##################################################

for i in range(22):
    f = open('/tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype/genotypesChr' + str(i+1) + '.txt')
    listHead = str.split(f.readline().strip(), "\t")
    f.close()
    
    # There are 450 individuals for phase 1:
    for j in range(len(listHead)):
        if "-" in listHead[j]:
            listHead[j] = '.'.join(str.split(listHead[j], "-")[0:2])
            
    # Overwrite with new header
    f = open('/tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype/genotypesChr' + str(i+1) + '.txt', 'w')
    f.write('\t'.join(listHead) + '\n')
    f.close()

%%bash

python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_imputed_genotype_wrapper.py

%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_imputed_genotype_wrapper.py

##################################################
#  obtain_imputed_genotype_wrapper.py
#
#
##################################################

vcf_dir = '/tigress/BEE/gtex/data/genotype/imputed_genotypes/vcf/'
output_dir = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype/'

master_script = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_imputed_genotype_wrapper.sh"
master_handle=open(master_script, 'w')
master_handle.write("#!/bin/bash\n\n")

k = 0
for i in range(22):
    k+=1
    chr_num = str(i+1)
    sbatchfile = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/batch/obtain_imputed_genotype_Chr" + chr_num + ".slurm"
    sbatchhandle=open(sbatchfile, 'w')
    cmd=r"""#!/bin/bash
#SBATCH -J geno_%s_obtain      # job name
#SBATCH --mem=12000           # 12 GB requested
#SBATCH -t 00:59:00           # to be placed in the test queue
#SBATCH -D %s               # set working directory

/usr/lib/python2.7/bin/python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_imputed_genotype.py \
%s %s %s

cat genotypesChr%s_temp.txt | grep -ve '^\.' > genotypesChr%s_Imput.txt
rm genotypesChr%s_temp.txt

"""%(i, output_dir, vcf_dir, output_dir, chr_num, chr_num, chr_num, chr_num)
    sbatchhandle.write(cmd)
    sbatchhandle.close()
    master_handle.write("sbatch " + sbatchfile  + " \n")

master_handle.close()

print 'sh %s'%(master_script)
print 'Number of jobs:', k+1

%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_imputed_genotype.py

##################################################
#  obtain_imputed_genotype.py
#  Obtain the imputed genotype and keep them in decimals.
#
##################################################

from sys import argv
import gzip

vcf_dir = argv[1]
output_dir = argv[2]
chr_num = argv[3]

# Take the imputed genotypes directly from vcf files
f = gzip.open(vcf_dir + "GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_allchr_genot_imput_info04_maf05_HWEp1E6_dbSNPIDs.chr" + chr_num + ".vcf.gz")
line = f.readline()
out_f = open(output_dir + "genotypesChr" + chr_num + "_temp.txt", 'w')
    
while True:
    line = f.readline()
    if line[0:2] != '##':
        break
            
# We went through the annotations, now the header line:
listHead = str.split(line.strip(), "\t")
# First 9 entries are other annotations
for j in range(9, len(listHead)):
    listHead[j] = '.'.join(str.split(listHead[j], "-")[0:2])
        
out_f.write('SNP' + '\t' + '\t'.join(listHead[9:len(listHead)]))
    
count = 0
for line in f.readlines():
    count = count+1
    genotype_list = []
    listGen = str.split(line.strip(), "\t")
    # Add in rs SNP ID
    genotype_list.append(listGen[2])
    for k in range(9, len(listGen)):
        imput = str.split(listGen[k], ":")
            
        # Confidence under 90% - take the decimal form
        if imput[0] == './.':
            genotype_list.append(imput[2])
        # Confidence over 90% - take 0, 1 or 2
        else:
            genotype_list.append(str(int(listGen[k][0]) + int(listGen[k][2])))
            
    if count%10000 == 0:
        print count
    out_f.write('\n' + '\t'.join(genotype_list))
        
f.close()
out_f.close()

%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_snp_positions.slurm

#!/bin/bash
#SBATCH -J snp_get      # job name
#SBATCH --mem=12000           # 12 GB requested
#SBATCH -t 00:59:00           # to be placed in the test queue

/usr/lib/python2.7/bin/python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_snp_positions.py \
/tigress/BEE/gtex/data/genotype/auxiliary/snp142.txt.gz \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/SNP_positions/

for i in $(seq 1 22); \
do /usr/lib/python2.7/bin/python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/select_snps.py \
$i /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/SNP_positions/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype/; \
echo $i; \
done


%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_snp_positions.py

##################################################
#  obtain_snp_positions.py
#  Obtain the positions of the known snps from UCSC snp142 (hg-19)
#
##################################################

from sys import argv
import gzip

dbsnp = argv[1]
snp_pos_dir = argv[2]

# Assumes that the chromosomes appear sequentially
current_chr = 1
prev = 'chr1'

snp_file = gzip.open(dbsnp)
output_file = open(snp_pos_dir + 'SNP_positions_Chr' + str(current_chr) + '_temp.txt', 'w')
print "writing: SNP_positions_Chr" + str(current_chr) + '_temp.txt'
output_file.write('rsID' + '\t' + 'chr' + '\t' + 'pos')

count = 0
while True:
    line = snp_file.readline()
    if line == '':
        break
    count += 1
    entry = str.split(line.strip(), '\t')
    # Take care of alternative scaffolds - skip them for now
    if len(str.split(entry[1], '_')) > 1:
        continue
    if entry[1] != prev:
        output_file.close()
        print "Finished: SNP_positions_Chr" + str(current_chr) + '_temp.txt'
        # Autosomes only
        current_chr = int(entry[1][3:])
        output_file = open(snp_pos_dir + 'SNP_positions_Chr' + str(current_chr) + '_temp.txt', 'w')
        print "writing: SNP_positions_Chr" + str(current_chr) + '_temp.txt'
        output_file.write('rsID' + '\t' + 'chr' + '\t' + 'pos')
    output_file.write('\n' + '\t'.join([entry[4], entry[1], entry[2]]))
    prev = entry[1]
    if count%1000000 == 0:
        print count
    
output_file.close()
"Finished: SNP_positions_Chr" + str(current_chr) + '_temp.txt'

%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/select_snps.py

##################################################
#  select_snps.py
#  Obtain only the snps that are in the genotype files
#
##################################################

from sys import argv

chr_num = argv[1]
snp_pos_dir = argv[2]
geno_file_dir = argv[3]

# construct a dictionary of all snps in the genotype file
snp_map = {}
geno_file = open(geno_file_dir + 'genotypesChr' + chr_num + '_Noimput.txt')
line = geno_file.readline()

for line in geno_file.readlines():
    snp = str.split(line.strip(), '\t')[0]
    snp_map[snp] = True
geno_file.close()

# Only take the snps that are in the genotype file
f_in = open(snp_pos_dir + 'SNP_positions_Chr' + chr_num + '_temp.txt')
f_out = open(snp_pos_dir + 'SNP_positions_Chr' + chr_num + '.txt', 'w')

f_out.write('rsID' + '\t' + 'chr' + '\t' + 'pos' + '\n')

for line in f_in.readlines():
    if str.split(line.strip(), '\t')[0] in snp_map:
        f_out.write(line)

f_in.close()
f_out.close()

%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/tissue_rankings_by_samples.py
##################################################
#
#  tissue_rankings_by_samples.py
#
##################################################

import glob
in_path = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_3GenPC_10PEER1000/'
mat_suffix = '_featurecounts_genes_certain_biotypes_counts_normalized.txt'
matrices = glob.glob(in_path + '*' + mat_suffix)

tissue_dict = {}
for file in matrices:
    f_handle = open(file)
    line = f_handle.readline().strip()
    samples = str.split(line, '\t')
    tissue_name = str.split(str.split(file, mat_suffix)[0], in_path)[1]
    tissue_dict[tissue_name] = len(samples)
    f_handle.close()

import operator
sorted_tissues = sorted(tissue_dict.items(), key=operator.itemgetter(1), reverse=True)

out_file = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_number_of_samples.txt'

f_handle = open(out_file, 'w')
for (key, value) in sorted_tissues:
    f_handle.write(key + '\t' + str(value) + '\n')
f_handle.close()

%%bash 
/usr/lib/python2.7/bin/python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/tissue_rankings_by_samples.py

head -n 10 /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_number_of_samples.txt

%%bash
# Obtain three expression matrices for comparison
python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_GTEx_tissue_matrix.py \
"Muscle - Skeletal" Muscle-Skeletal

python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_GTEx_tissue_matrix.py \
"Whole Blood" WholeBlood

python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_GTEx_tissue_matrix.py \
"Adipose - Subcutaneous" Adipose-Subcutaneous

%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_GTEx_tissue_matrix.py
##################################################
#  obtain_GTEx_tissue_matrix.py
#  obtains tissue-specific matrix from dbGAP
#
##################################################

from sys import argv
# Choose which matrix to extract
# search_string = 'Muscle - Skeletal'
# converted_tissue_name = 'Muscle-Skeletal'
search_string = argv[1]
converted_tissue_name = argv[2]

matrix_file_location = '/tigress/bj5/dbGAP/expr_mat/GTEx_Analysis_2015-01-12_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.gct'

experiment_list = []
# Obtain from dbGAP
sample_annotation_file_location = '/tigress/BEE/gtex/dbGaP-7716/45696/gtex/exchange/GTEx_phs000424/exchange/analysis_releases/GTEx_Analysis_2015-01-12/sample_annotations/GTEx_Analysis_2015-01-12_Annotations_SampleAttributesDS.txt'

anno_f = open(sample_annotation_file_location)
# Header
anno_f.readline()
for line in anno_f.readlines():
    entry = str.split(line.strip(), '\t')
    # Column 14 in the file
    if entry[13] == search_string:
        experiment_list.append(entry[0])

# Finished getting all the experiments
anno_f.close()
print("Number of experiments for " + search_string + " : " + str(len(experiment_list)))

# Take only protein_coding 'genes'
gene_annotation_file_location = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt'

protein_coding_gene_list = []

anno_f = open(gene_annotation_file_location)
# Header
anno_f.readline()
for line in anno_f.readlines():
    entry = str.split(line.strip(), '\t')
    # Column 14 in the file
    if entry[5] == 'protein_coding':
        protein_coding_gene_list.append(entry[0])

anno_f.close()
print("Number of protein-coding genes : " + str(len(protein_coding_gene_list)))

# Now we only obtain the relevant rows and columns

expr_f = open(matrix_file_location)
expr_out_f = open('/tigress/bj5/dbGAP/expr_mat/GTEx_Analysis_2015-01-12_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm_' + converted_tissue_name + '.gct', 'w')

# Header
expr_f.readline()
expr_f.readline()
line = expr_f.readline()
total_experiment_list = str.split(line.strip(),'\t')

# Create experiment list with genotypes
experiment_list_with_geno = []
for exp in experiment_list:
    if exp in total_experiment_list:
        experiment_list_with_geno.append(exp)

# Change the experiment list to subject list and write the header
subject_list = ['-'.join(str.split(x, '-')[0:2]) for x in experiment_list_with_geno]
expr_out_f.write('\t'.join(subject_list) + '\n')

# Obtain the indeces for each line
indeces = [total_experiment_list.index(x) for x in experiment_list_with_geno]

# Now write out the filtered expr_list:
for line in expr_f.readlines():
    entry = str.split(line.strip(), '\t')
    if entry[0] in protein_coding_gene_list:
        expr_out_f.write(entry[0] + '\t' + '\t'.join([entry[x] for x in indeces]) + '\n')

expr_out_f.close()
expr_f.close()