from IPython.display import HTML
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn
%pylab inline
snpeff_map = {"SNPeff":"Normalized SO Name",
"FRAME_SHIFT":"frameshift_variant",
"STOP_GAINED":"stop_gained",
"STOP_LOST":"stop_lost",
"SPLICE_SITE_DONOR": "splicing_variant",
"SPLICE_SITE_ACCEPTOR": "splicing_variant",
"CODON_INSERTION": "inframe_variant",
"CODON_CHANGE_PLUS_CODON_INSERTION": "inframe_variant",
"CODON_DELETION": "inframe_variant",
"CODON_CHANGE_PLUS_CODON_DELETION": "inframe_variant",
"NON_SYNONYMOUS_CODING": "nonsynonymous_variant",
"NON_SYNONYMOUS_START": "nonsynonymous_variant",
"NON_SYNONYMOUS_STOP": "nonsynonymous_variant",
"START_LOST": "nonsynonymous_variant",
"CODON_CHANGE": "nonsynonymous_variant",
"SYNONYMOUS_CODING": "synonymous_variant",
"SYNONYMOUS_START": "synonymous_variant",
"SYNONYMOUS_STOP": "synonymous_variant",
"UTR_3_PRIME": "3_prime_UTR_variant",
"UTR_3_DELETED": "3_prime_UTR_variant",
"UTR_5_PRIME": "5_prime_UTR_variant",
"UTR_5_DELETED": "5_prime_UTR_variant",
"START_GAINED": "5_prime_UTR_variant",
"UPSTREAM":"upstream_gene_variant",
"DOWNSTREAM":"downstream_gene_variant",
"REGULATION":"regulatory_region_variant",
"INTRON": "intron_variant",
"INTRON_CONSERVED": "intron_variant",
"INTERGENIC":"intergenic_variant",
"INTERGENIC_CONSERVED":"intergenic_variant",
"EXON": "ignored",
"GENE": "ignored",
"EXON_DELETED": "ignored",
"CDS": "ignored",
"CHROMOSOME ": "ignored",
"INTRAGENIC":"ignored",
"MICRO_RNA":"ignored",
"SPLICE_SITE_REGION": "ignored",
"SPLICE_SITE_BRANCH": "ignored",
"SPLICE_SITE_BRANCH_U12": "ignored"}
#annovar is not consistent with their naming of functional consequences, hence the multiple representations of frameshift_deletion, etc.
annovar_map = {
"frameshift deletion": "frameshift_variant",
"frameshift_deletion": "frameshift_variant",
"frameshift insertion": "frameshift_variant",
"frameshift_insertion": "frameshift_variant",
"frameshift_block_substitution": "frameshift_variant",
"stopgain": "stop_gained",
"stopgain SNV": "stop_gained",
"stoploss": "stop_lost",
"stoploss SNV": "stop_lost",
"splicing": "splicing_variant",
"nonframeshift_deletion": "inframe_variant",
"nonframeshift deletion": "inframe_variant",
"nonframeshift_insertion": "inframe_variant",
"nonframeshift insertion": "inframe_variant",
"nonframeshift_block_substitution": "nonsynonymous_variant",
"nonsynonymous_SNV": "nonsynonymous_variant",
"nonsynonymous SNV": "nonsynonymous_variant",
"synonymous_SNV": "synonymous_variant",
"synonymous SNV": "synonymous_variant",
"UTR3": "3_prime_UTR_variant",
"UTR5": "5_prime_UTR_variant",
"ncRNA": "ignored",
"ncRNA_exonic": "ignored",
"ncRNA_UTR3": "ignored",
"ncRNA_UTR5": "ignored",
"ncRNA_intronic": "ignored",
"upstream": "upstream_gene_variant",
"downstream": "downstream_gene_variant",
"intronic": "intron_variant",
"intergenic": "intergenic_variant",
"unknown": "ignored",
"exonic": "ignored"}
vep_map= {"frameshift_variant": "frameshift_variant",
"stop_gained": "stop_gained",
"stop_lost": "stop_lost",
"splice_donor_variant": "splicing_variant",
"splice_acceptor_variant": "splicing_variant",
"inframe_insertion": "inframe_variant",
"inframe_deletion": "inframe_variant",
"initiator_codon_variant": "nonsynonymous_variant",
"missense_variant": "nonsynonymous_variant",
"stop_retained_variant": "nonsynonymous_variant",
"incomplete_terminal_codon_variant": "nonsynonymous_variant",
"synonymous_variant": "synonymous_variant",
"3_prime_UTR_variant": "3_prime_UTR_variant",
"5_prime_UTR_variant": "5_prime_UTR_variant",
"upstream_gene_variant": "upstream_gene_variant",
"downstream_gene_variant": "downstream_gene_variant",
"regulatory_region_variant": "regulatory_region_variant",
"regulatory_region_ablation": "regulatory_region_variant",
"regulatory_region_amplification": "regulatory_region_variant",
"intron_variant": "intron_variant",
"intergenic_variant": "intergenic_variant",
"transcript_ablation": "ignored",
"coding_sequence_variant": "ignored",
"splice_region_variant": "ignored",
"feature_truncation": "ignored",
"feature_elongation": "ignored",
"TF_binding_site_variant": "ignored",
"TFBS_amplification": "ignored",
"TFBS_ablation": "ignored",
"NMD_transcript_variant": "ignored",
"nc_transcript_variant": "ignored",
"non_coding_exon_variant": "ignored",
"mature_miRNA_variant": "ignored"}
with pd.get_store('classified_variant_store.h5') as store:
print store.keys()
for snpeff_store in ["cftr_snpeff_ensembl", "cftr_snpeff_refseq"]:
with pd.get_store('classified_variant_store.h5') as store:
snpeff_df = store.get(snpeff_store)
snpeff_df["normalized_so"] = snpeff_df.apply(lambda row: snpeff_map[row["Effect"]], axis=1)
subset_snpeff = snpeff_df[["POS", "ID", "REF", "ALT", "Effect", "Gene_Name", "Transcript_ID", "Amino_Acid_Change"]]
subset_snpeff.rename(columns={"Amino_Acid_Change": "hgvs_snpeff"}, inplace=True)
subset_snpeff["normalized_so_snpeff"] = subset_snpeff.apply(lambda row: snpeff_map[row["Effect"]], axis=1)
store[snpeff_store + "_subset"] = subset_snpeff
print(str.title(" ".join(snpeff_store.split("_")) + " Metrics"))
print subset_snpeff["Effect"].value_counts()
print("\n")
print subset_snpeff.count()
print("\n\n")
del snpeff_df
del subset_snpeff
for vep_store in ["cftr_vep_ensembl", "cftr_vep_refseq"]:
with pd.get_store('classified_variant_store.h5') as store:
vep_df = store.get(vep_store)
vep_df["normalized_so"] = vep_df.apply(lambda row: vep_map[row["Consequence"]], axis=1)
subset_vep = vep_df[["POS", "ID", "REF", "ALT", "Gene", "Consequence", "Feature", "HGVSc", "HGVSp" ]]
subset_vep["hgvs_vep"] = subset_vep["HGVSp"].map(str) + subset_vep["HGVSc"]
del subset_vep["HGVSp"]
del subset_vep["HGVSc"]
subset_vep["normalized_so_vep"] = subset_vep.apply(lambda row: vep_map[row["Consequence"]], axis=1)
store[vep_store + "_subset"] = subset_vep
print(str.title(" ".join(vep_store.split("_")) + " Metrics"))
print subset_vep["Consequence"].value_counts()
print("\n")
print subset_vep.count()
print("\n\n")
del vep_df
del subset_vep
for annovar_store in ["cftr_annovar_ensembl", "cftr_annovar_refseq"]:
with pd.get_store('classified_variant_store.h5') as store:
annovar_df = store.get(annovar_store)
annovar_df["normalized_so"] = annovar_df.apply(lambda row: annovar_map[row["combined_effect"]], axis=1)
subset_annovar = annovar_df[["POS", "REF", "ALT", "Gene", "combined_effect", "hgvs" ]]
subset_annovar["normalized_so_annovar"] = subset_annovar.apply(lambda row: annovar_map[row["combined_effect"]], axis=1)
store[annovar_store + "_subset"] = subset_annovar
print(str.title(" ".join(annovar_store.split("_")) + " Metrics"))
print subset_annovar["combined_effect"].value_counts()
print("\n")
print subset_annovar.count()
print("\n\n")
del annovar_df
del subset_annovar
for tx_set in ["ensembl", "refseq"]:
with pd.get_store('classified_variant_store.h5') as store:
snpeff_df = store.get("cftr_snpeff_" + tx_set + "_subset")
vep_df = store.get("cftr_vep_" + tx_set + "_subset")
annovar_df = store.get("cftr_annovar_" + tx_set + "_subset")
vc_snpeff = snpeff_df.groupby(["normalized_so_snpeff"]).size()
vc_snpeff.name = "SNPeff"
vc_vep = vep_df.groupby(["normalized_so_vep"]).size()
vc_vep.name = "VEP"
vc_annovar = annovar_df.groupby(["normalized_so_annovar"]).size()
vc_annovar.name = "Annovar"
vc_df = pd.DataFrame([vc_snpeff, vc_vep, vc_annovar])
vc_df.transpose().plot(kind="barh", fontsize=13, figsize=(16,8))
print vc_df.transpose()