%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/upload_all_gists.sh #!/bin/bash # upload_all_gists.sh gist /home/bj5/notebooks/eQTL_cis_master.ipynb -u 7cc21527485b72ad1686 gist /home/bj5/notebooks/eQTL_cis_master_scripts.ipynb -u bdf9366166b8474bb299 gist /home/bj5/notebooks/eQTL_misc_setup_scripts.ipynb -u 646278c49a10b6307bd9 gist /home/bj5/notebooks/cis_pipeline_gallery.ipynb -u 633dddf85ad9dc54c07b gist /home/bj5/notebooks/GEUVADIS_pipeline.ipynb -u 243bf73f1a75dbcfc549 gist /home/bj5/notebooks/eQTLBMA_pipeline.ipynb -u 250135a17a004b014434 %%bash mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/PEER mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL cd /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references # Gencode 19 gene annotation ln -s /tigress/BEE/gtex/dbGaP-7716/45696/gtex/exchange/GTEx_phs000424/exchange/analysis_releases/GTEx_Analysis_2015-01-12/reference_files/gencode.v19.genes.patched_contigs.gtf.gz ./ mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/summaries cd /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/summaries cp /tigress/BEE/gtex/results/group_general/checkpoints/STAR_1pass_hg19_summary_report ./ cp /tigress/BEE/gtex/results/group_general/checkpoints/STAR_2pass_hg19_featurecounts_summary_report ./ cp /tigress/BEE/gtex/results/group_general/checkpoints/STAR_2pass_hg19_rsem_summary_report ./ cp /tigress/BEE/gtex/results/group_general/checkpoints/trim_summary_report ./ %%bash cd /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references cp /tigress/BEE/gtex/external_sources/hg19/gencode.v19.nonoverlap.genes_for_quantification_certain_and_uncertain.txt ./ ln -s /tigress/BEE/gtex/external_sources/hg19/gencode.v19.annotation.nonoverlap.gtf ./ %%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_gene_names_positions.py ################################################## # obtain_gene_names_positions.py # # ################################################## import gzip anno_file = gzip.open('/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gencode.v19.genes.patched_contigs.gtf.gz') meta_file = open('/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt', 'w') meta_file.write('\t'.join(['gene_id','chr','start','end','gene_name','gene_type'])) for i in range(5): anno_file.readline() for line in anno_file.readlines(): annotation = str.split(line, '\t') if annotation[2] == "transcript": chr = annotation[0] start = annotation[3] end = annotation[4] desc = str.split(annotation[8], ";") gene_id = str.strip(str.split(desc[0], " ")[1], '"') gene_name = str.strip(str.split(desc[4], " ")[2], '"') gene_type = str.strip(str.split(desc[2], " ")[2], '"') meta_file.write('\n') meta_file.write('\t'.join([gene_id,chr,start,end,gene_name,gene_type])) anno_file.close() meta_file.close() %%bash python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_gene_names_positions.py cd /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references # Deposit each chromosome-specific genes to a separate file: for i in $(seq 1 22); \ do awk -v chr="$i" 'BEGIN{OFS=FS="\t"} $2==chr {print $1,$2,$3,$4}' gene_names_positions.txt > /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_positions/gene_positions_Chr$i.txt; \ echo $i; \ done %%bash mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype cd /tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype for i in $(seq 1 22); \ do head -n 1 /tigress/BEE/gtex/data/genotype/imputed_genotypes/allelic_dosage/GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_allchr_genot_imput_info04_maf05_HWEp1E6_dbSNPIDs.chr${i}.allelic_dosage.txt > genotypesChr${i}.txt; \ echo $i; \ done python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype/modify_header.py # After the genotype files have been given the correct header: for i in $(seq 1 22); \ do grep -v "^SNP" /tigress/BEE/gtex/data/genotype/imputed_genotypes/allelic_dosage/GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_allchr_genot_imput_info04_maf05_HWEp1E6_dbSNPIDs.chr${i}.allelic_dosage.txt >> genotypesChr${i}.txt; \ echo $i; \ # Remove snps that are "." cat genotypesChr${i}.txt | grep -ve '^\.' > genotypesChr${i}_Noimput.txt; \ done %%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/modify_header.py ################################################## # modify_header.py # Modifies the header from the genotype file # ################################################## for i in range(22): f = open('/tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype/genotypesChr' + str(i+1) + '.txt') listHead = str.split(f.readline().strip(), "\t") f.close() # There are 450 individuals for phase 1: for j in range(len(listHead)): if "-" in listHead[j]: listHead[j] = '.'.join(str.split(listHead[j], "-")[0:2]) # Overwrite with new header f = open('/tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype/genotypesChr' + str(i+1) + '.txt', 'w') f.write('\t'.join(listHead) + '\n') f.close() %%bash python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_imputed_genotype_wrapper.py %%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_imputed_genotype_wrapper.py ################################################## # obtain_imputed_genotype_wrapper.py # # ################################################## vcf_dir = '/tigress/BEE/gtex/data/genotype/imputed_genotypes/vcf/' output_dir = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype/' master_script = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_imputed_genotype_wrapper.sh" master_handle=open(master_script, 'w') master_handle.write("#!/bin/bash\n\n") k = 0 for i in range(22): k+=1 chr_num = str(i+1) sbatchfile = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/batch/obtain_imputed_genotype_Chr" + chr_num + ".slurm" sbatchhandle=open(sbatchfile, 'w') cmd=r"""#!/bin/bash #SBATCH -J geno_%s_obtain # job name #SBATCH --mem=12000 # 12 GB requested #SBATCH -t 00:59:00 # to be placed in the test queue #SBATCH -D %s # set working directory /usr/lib/python2.7/bin/python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_imputed_genotype.py \ %s %s %s cat genotypesChr%s_temp.txt | grep -ve '^\.' > genotypesChr%s_Imput.txt rm genotypesChr%s_temp.txt """%(i, output_dir, vcf_dir, output_dir, chr_num, chr_num, chr_num, chr_num) sbatchhandle.write(cmd) sbatchhandle.close() master_handle.write("sbatch " + sbatchfile + " \n") master_handle.close() print 'sh %s'%(master_script) print 'Number of jobs:', k+1 %%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_imputed_genotype.py ################################################## # obtain_imputed_genotype.py # Obtain the imputed genotype and keep them in decimals. # ################################################## from sys import argv import gzip vcf_dir = argv[1] output_dir = argv[2] chr_num = argv[3] # Take the imputed genotypes directly from vcf files f = gzip.open(vcf_dir + "GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_allchr_genot_imput_info04_maf05_HWEp1E6_dbSNPIDs.chr" + chr_num + ".vcf.gz") line = f.readline() out_f = open(output_dir + "genotypesChr" + chr_num + "_temp.txt", 'w') while True: line = f.readline() if line[0:2] != '##': break # We went through the annotations, now the header line: listHead = str.split(line.strip(), "\t") # First 9 entries are other annotations for j in range(9, len(listHead)): listHead[j] = '.'.join(str.split(listHead[j], "-")[0:2]) out_f.write('SNP' + '\t' + '\t'.join(listHead[9:len(listHead)])) count = 0 for line in f.readlines(): count = count+1 genotype_list = [] listGen = str.split(line.strip(), "\t") # Add in rs SNP ID genotype_list.append(listGen[2]) for k in range(9, len(listGen)): imput = str.split(listGen[k], ":") # Confidence under 90% - take the decimal form if imput[0] == './.': genotype_list.append(imput[2]) # Confidence over 90% - take 0, 1 or 2 else: genotype_list.append(str(int(listGen[k][0]) + int(listGen[k][2]))) if count%10000 == 0: print count out_f.write('\n' + '\t'.join(genotype_list)) f.close() out_f.close() %%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_snp_positions.slurm #!/bin/bash #SBATCH -J snp_get # job name #SBATCH --mem=12000 # 12 GB requested #SBATCH -t 00:59:00 # to be placed in the test queue /usr/lib/python2.7/bin/python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_snp_positions.py \ /tigress/BEE/gtex/data/genotype/auxiliary/snp142.txt.gz \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/SNP_positions/ for i in $(seq 1 22); \ do /usr/lib/python2.7/bin/python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/select_snps.py \ $i /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/SNP_positions/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/genotype/; \ echo $i; \ done %%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_snp_positions.py ################################################## # obtain_snp_positions.py # Obtain the positions of the known snps from UCSC snp142 (hg-19) # ################################################## from sys import argv import gzip dbsnp = argv[1] snp_pos_dir = argv[2] # Assumes that the chromosomes appear sequentially current_chr = 1 prev = 'chr1' snp_file = gzip.open(dbsnp) output_file = open(snp_pos_dir + 'SNP_positions_Chr' + str(current_chr) + '_temp.txt', 'w') print "writing: SNP_positions_Chr" + str(current_chr) + '_temp.txt' output_file.write('rsID' + '\t' + 'chr' + '\t' + 'pos') count = 0 while True: line = snp_file.readline() if line == '': break count += 1 entry = str.split(line.strip(), '\t') # Take care of alternative scaffolds - skip them for now if len(str.split(entry[1], '_')) > 1: continue if entry[1] != prev: output_file.close() print "Finished: SNP_positions_Chr" + str(current_chr) + '_temp.txt' # Autosomes only current_chr = int(entry[1][3:]) output_file = open(snp_pos_dir + 'SNP_positions_Chr' + str(current_chr) + '_temp.txt', 'w') print "writing: SNP_positions_Chr" + str(current_chr) + '_temp.txt' output_file.write('rsID' + '\t' + 'chr' + '\t' + 'pos') output_file.write('\n' + '\t'.join([entry[4], entry[1], entry[2]])) prev = entry[1] if count%1000000 == 0: print count output_file.close() "Finished: SNP_positions_Chr" + str(current_chr) + '_temp.txt' %%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/select_snps.py ################################################## # select_snps.py # Obtain only the snps that are in the genotype files # ################################################## from sys import argv chr_num = argv[1] snp_pos_dir = argv[2] geno_file_dir = argv[3] # construct a dictionary of all snps in the genotype file snp_map = {} geno_file = open(geno_file_dir + 'genotypesChr' + chr_num + '_Noimput.txt') line = geno_file.readline() for line in geno_file.readlines(): snp = str.split(line.strip(), '\t')[0] snp_map[snp] = True geno_file.close() # Only take the snps that are in the genotype file f_in = open(snp_pos_dir + 'SNP_positions_Chr' + chr_num + '_temp.txt') f_out = open(snp_pos_dir + 'SNP_positions_Chr' + chr_num + '.txt', 'w') f_out.write('rsID' + '\t' + 'chr' + '\t' + 'pos' + '\n') for line in f_in.readlines(): if str.split(line.strip(), '\t')[0] in snp_map: f_out.write(line) f_in.close() f_out.close() %%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/tissue_rankings_by_samples.py ################################################## # # tissue_rankings_by_samples.py # ################################################## import glob in_path = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_3GenPC_10PEER1000/' mat_suffix = '_featurecounts_genes_certain_biotypes_counts_normalized.txt' matrices = glob.glob(in_path + '*' + mat_suffix) tissue_dict = {} for file in matrices: f_handle = open(file) line = f_handle.readline().strip() samples = str.split(line, '\t') tissue_name = str.split(str.split(file, mat_suffix)[0], in_path)[1] tissue_dict[tissue_name] = len(samples) f_handle.close() import operator sorted_tissues = sorted(tissue_dict.items(), key=operator.itemgetter(1), reverse=True) out_file = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_number_of_samples.txt' f_handle = open(out_file, 'w') for (key, value) in sorted_tissues: f_handle.write(key + '\t' + str(value) + '\n') f_handle.close() %%bash /usr/lib/python2.7/bin/python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/tissue_rankings_by_samples.py head -n 10 /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_number_of_samples.txt %%bash # Obtain three expression matrices for comparison python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_GTEx_tissue_matrix.py \ "Muscle - Skeletal" Muscle-Skeletal python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_GTEx_tissue_matrix.py \ "Whole Blood" WholeBlood python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_GTEx_tissue_matrix.py \ "Adipose - Subcutaneous" Adipose-Subcutaneous %%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/obtain_GTEx_tissue_matrix.py ################################################## # obtain_GTEx_tissue_matrix.py # obtains tissue-specific matrix from dbGAP # ################################################## from sys import argv # Choose which matrix to extract # search_string = 'Muscle - Skeletal' # converted_tissue_name = 'Muscle-Skeletal' search_string = argv[1] converted_tissue_name = argv[2] matrix_file_location = '/tigress/bj5/dbGAP/expr_mat/GTEx_Analysis_2015-01-12_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.gct' experiment_list = [] # Obtain from dbGAP sample_annotation_file_location = '/tigress/BEE/gtex/dbGaP-7716/45696/gtex/exchange/GTEx_phs000424/exchange/analysis_releases/GTEx_Analysis_2015-01-12/sample_annotations/GTEx_Analysis_2015-01-12_Annotations_SampleAttributesDS.txt' anno_f = open(sample_annotation_file_location) # Header anno_f.readline() for line in anno_f.readlines(): entry = str.split(line.strip(), '\t') # Column 14 in the file if entry[13] == search_string: experiment_list.append(entry[0]) # Finished getting all the experiments anno_f.close() print("Number of experiments for " + search_string + " : " + str(len(experiment_list))) # Take only protein_coding 'genes' gene_annotation_file_location = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt' protein_coding_gene_list = [] anno_f = open(gene_annotation_file_location) # Header anno_f.readline() for line in anno_f.readlines(): entry = str.split(line.strip(), '\t') # Column 14 in the file if entry[5] == 'protein_coding': protein_coding_gene_list.append(entry[0]) anno_f.close() print("Number of protein-coding genes : " + str(len(protein_coding_gene_list))) # Now we only obtain the relevant rows and columns expr_f = open(matrix_file_location) expr_out_f = open('/tigress/bj5/dbGAP/expr_mat/GTEx_Analysis_2015-01-12_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm_' + converted_tissue_name + '.gct', 'w') # Header expr_f.readline() expr_f.readline() line = expr_f.readline() total_experiment_list = str.split(line.strip(),'\t') # Create experiment list with genotypes experiment_list_with_geno = [] for exp in experiment_list: if exp in total_experiment_list: experiment_list_with_geno.append(exp) # Change the experiment list to subject list and write the header subject_list = ['-'.join(str.split(x, '-')[0:2]) for x in experiment_list_with_geno] expr_out_f.write('\t'.join(subject_list) + '\n') # Obtain the indeces for each line indeces = [total_experiment_list.index(x) for x in experiment_list_with_geno] # Now write out the filtered expr_list: for line in expr_f.readlines(): entry = str.split(line.strip(), '\t') if entry[0] in protein_coding_gene_list: expr_out_f.write(entry[0] + '\t' + '\t'.join([entry[x] for x in indeces]) + '\n') expr_out_f.close() expr_f.close()