from IPython.display import HTML import pandas as pd import numpy as np import os import matplotlib.pyplot as plt import seaborn %pylab inline snpeff_map = {"SNPeff":"Normalized SO Name", "FRAME_SHIFT":"frameshift_variant", "STOP_GAINED":"stop_gained", "STOP_LOST":"stop_lost", "SPLICE_SITE_DONOR": "splicing_variant", "SPLICE_SITE_ACCEPTOR": "splicing_variant", "CODON_INSERTION": "inframe_variant", "CODON_CHANGE_PLUS_CODON_INSERTION": "inframe_variant", "CODON_DELETION": "inframe_variant", "CODON_CHANGE_PLUS_CODON_DELETION": "inframe_variant", "NON_SYNONYMOUS_CODING": "nonsynonymous_variant", "NON_SYNONYMOUS_START": "nonsynonymous_variant", "NON_SYNONYMOUS_STOP": "nonsynonymous_variant", "START_LOST": "nonsynonymous_variant", "CODON_CHANGE": "nonsynonymous_variant", "SYNONYMOUS_CODING": "synonymous_variant", "SYNONYMOUS_START": "synonymous_variant", "SYNONYMOUS_STOP": "synonymous_variant", "UTR_3_PRIME": "3_prime_UTR_variant", "UTR_3_DELETED": "3_prime_UTR_variant", "UTR_5_PRIME": "5_prime_UTR_variant", "UTR_5_DELETED": "5_prime_UTR_variant", "START_GAINED": "5_prime_UTR_variant", "UPSTREAM":"upstream_gene_variant", "DOWNSTREAM":"downstream_gene_variant", "REGULATION":"regulatory_region_variant", "INTRON": "intron_variant", "INTRON_CONSERVED": "intron_variant", "INTERGENIC":"intergenic_variant", "INTERGENIC_CONSERVED":"intergenic_variant", "EXON": "ignored", "GENE": "ignored", "EXON_DELETED": "ignored", "CDS": "ignored", "CHROMOSOME ": "ignored", "INTRAGENIC":"ignored", "MICRO_RNA":"ignored", "SPLICE_SITE_REGION": "ignored", "SPLICE_SITE_BRANCH": "ignored", "SPLICE_SITE_BRANCH_U12": "ignored"} #annovar is not consistent with their naming of functional consequences, hence the multiple representations of frameshift_deletion, etc. annovar_map = { "frameshift deletion": "frameshift_variant", "frameshift_deletion": "frameshift_variant", "frameshift insertion": "frameshift_variant", "frameshift_insertion": "frameshift_variant", "frameshift_block_substitution": "frameshift_variant", "stopgain": "stop_gained", "stopgain SNV": "stop_gained", "stoploss": "stop_lost", "stoploss SNV": "stop_lost", "splicing": "splicing_variant", "nonframeshift_deletion": "inframe_variant", "nonframeshift deletion": "inframe_variant", "nonframeshift_insertion": "inframe_variant", "nonframeshift insertion": "inframe_variant", "nonframeshift_block_substitution": "nonsynonymous_variant", "nonsynonymous_SNV": "nonsynonymous_variant", "nonsynonymous SNV": "nonsynonymous_variant", "synonymous_SNV": "synonymous_variant", "synonymous SNV": "synonymous_variant", "UTR3": "3_prime_UTR_variant", "UTR5": "5_prime_UTR_variant", "ncRNA": "ignored", "ncRNA_exonic": "ignored", "ncRNA_UTR3": "ignored", "ncRNA_UTR5": "ignored", "ncRNA_intronic": "ignored", "upstream": "upstream_gene_variant", "downstream": "downstream_gene_variant", "intronic": "intron_variant", "intergenic": "intergenic_variant", "unknown": "ignored", "exonic": "ignored"} vep_map= {"frameshift_variant": "frameshift_variant", "stop_gained": "stop_gained", "stop_lost": "stop_lost", "splice_donor_variant": "splicing_variant", "splice_acceptor_variant": "splicing_variant", "inframe_insertion": "inframe_variant", "inframe_deletion": "inframe_variant", "initiator_codon_variant": "nonsynonymous_variant", "missense_variant": "nonsynonymous_variant", "stop_retained_variant": "nonsynonymous_variant", "incomplete_terminal_codon_variant": "nonsynonymous_variant", "synonymous_variant": "synonymous_variant", "3_prime_UTR_variant": "3_prime_UTR_variant", "5_prime_UTR_variant": "5_prime_UTR_variant", "upstream_gene_variant": "upstream_gene_variant", "downstream_gene_variant": "downstream_gene_variant", "regulatory_region_variant": "regulatory_region_variant", "regulatory_region_ablation": "regulatory_region_variant", "regulatory_region_amplification": "regulatory_region_variant", "intron_variant": "intron_variant", "intergenic_variant": "intergenic_variant", "transcript_ablation": "ignored", "coding_sequence_variant": "ignored", "splice_region_variant": "ignored", "feature_truncation": "ignored", "feature_elongation": "ignored", "TF_binding_site_variant": "ignored", "TFBS_amplification": "ignored", "TFBS_ablation": "ignored", "NMD_transcript_variant": "ignored", "nc_transcript_variant": "ignored", "non_coding_exon_variant": "ignored", "mature_miRNA_variant": "ignored"} with pd.get_store('classified_variant_store.h5') as store: print store.keys() for snpeff_store in ["cftr_snpeff_ensembl", "cftr_snpeff_refseq"]: with pd.get_store('classified_variant_store.h5') as store: snpeff_df = store.get(snpeff_store) snpeff_df["normalized_so"] = snpeff_df.apply(lambda row: snpeff_map[row["Effect"]], axis=1) subset_snpeff = snpeff_df[["POS", "ID", "REF", "ALT", "Effect", "Gene_Name", "Transcript_ID", "Amino_Acid_Change"]] subset_snpeff.rename(columns={"Amino_Acid_Change": "hgvs_snpeff"}, inplace=True) subset_snpeff["normalized_so_snpeff"] = subset_snpeff.apply(lambda row: snpeff_map[row["Effect"]], axis=1) store[snpeff_store + "_subset"] = subset_snpeff print(str.title(" ".join(snpeff_store.split("_")) + " Metrics")) print subset_snpeff["Effect"].value_counts() print("\n") print subset_snpeff.count() print("\n\n") del snpeff_df del subset_snpeff for vep_store in ["cftr_vep_ensembl", "cftr_vep_refseq"]: with pd.get_store('classified_variant_store.h5') as store: vep_df = store.get(vep_store) vep_df["normalized_so"] = vep_df.apply(lambda row: vep_map[row["Consequence"]], axis=1) subset_vep = vep_df[["POS", "ID", "REF", "ALT", "Gene", "Consequence", "Feature", "HGVSc", "HGVSp" ]] subset_vep["hgvs_vep"] = subset_vep["HGVSp"].map(str) + subset_vep["HGVSc"] del subset_vep["HGVSp"] del subset_vep["HGVSc"] subset_vep["normalized_so_vep"] = subset_vep.apply(lambda row: vep_map[row["Consequence"]], axis=1) store[vep_store + "_subset"] = subset_vep print(str.title(" ".join(vep_store.split("_")) + " Metrics")) print subset_vep["Consequence"].value_counts() print("\n") print subset_vep.count() print("\n\n") del vep_df del subset_vep for annovar_store in ["cftr_annovar_ensembl", "cftr_annovar_refseq"]: with pd.get_store('classified_variant_store.h5') as store: annovar_df = store.get(annovar_store) annovar_df["normalized_so"] = annovar_df.apply(lambda row: annovar_map[row["combined_effect"]], axis=1) subset_annovar = annovar_df[["POS", "REF", "ALT", "Gene", "combined_effect", "hgvs" ]] subset_annovar["normalized_so_annovar"] = subset_annovar.apply(lambda row: annovar_map[row["combined_effect"]], axis=1) store[annovar_store + "_subset"] = subset_annovar print(str.title(" ".join(annovar_store.split("_")) + " Metrics")) print subset_annovar["combined_effect"].value_counts() print("\n") print subset_annovar.count() print("\n\n") del annovar_df del subset_annovar for tx_set in ["ensembl", "refseq"]: with pd.get_store('classified_variant_store.h5') as store: snpeff_df = store.get("cftr_snpeff_" + tx_set + "_subset") vep_df = store.get("cftr_vep_" + tx_set + "_subset") annovar_df = store.get("cftr_annovar_" + tx_set + "_subset") vc_snpeff = snpeff_df.groupby(["normalized_so_snpeff"]).size() vc_snpeff.name = "SNPeff" vc_vep = vep_df.groupby(["normalized_so_vep"]).size() vc_vep.name = "VEP" vc_annovar = annovar_df.groupby(["normalized_so_annovar"]).size() vc_annovar.name = "Annovar" vc_df = pd.DataFrame([vc_snpeff, vc_vep, vc_annovar]) vc_df.transpose().plot(kind="barh", fontsize=13, figsize=(16,8)) print vc_df.transpose()