The goal here is to determine if what Snpeff's label means (generally) the same thing as VEP's label means the same thing as what ANNOVAR's label. Unfortunately each classifier uses slightly differt terms so we need to normalize the terms over all three tools. There is some hope that all tools will adopt the terms as defined by The Sequence Ontology Project, but as of May 2014 we still need create this mapping.
First, let's take a look at the terms each tool uses.
SNPeff classifies variants by region and effect.
VEP uses the sequence ontology terms as the SO project defines them.
ANNOVAR does something similar to SNPeff in that it breaks it's classification into a region and an effect
I can't decide what resolution the buckets should be (ie feature type, or just variant category-LOF, Missense, Syn, etc). Note that SNPeff catagorizes variants that chage the start/stop codon to another start/stop codon as both synonymous and non-synonomous. Also note that VEP doesn't differentiate between SNPs and mod 3 indels. Lastly, the annotation of splice site variants is tricky. ANNOVAR annotates anything within 2bp of a splice site (on either side). This is similar to the splice_donor_variant and splice_acceptor_variant, except what with this classification variants on the exonic side of the splice site are not counted. The splice_region_variant has no analogous match in ANNOVAR and will be ignored in this analysis.
Normalized SO Name | ANNOVAR | VEP | SNPeff |
---|---|---|---|
frameshift_variant | frameshift_deletion, frameshift_insertion, frameshift_block_substitution | frameshift_variant | FRAME_SHIFT |
stop_gained | stopgain | stop_gained | STOP_GAINED |
stop_lost | stoploss | stop_lost | STOP_LOST |
splicing_variant | splicing | splice_donor_variant, splice_acceptor_variant | SPLICE_SITE_DONOR, SPLICE_SITE_ACCEPTOR |
inframe_variant | nonframeshift_deletion, nonframeshift_insertion | inframe_insertion, inframe_deletion | CODON_INSERTION, CODON_CHANGE_PLUS_CODON_INSERTION, CODON_DELETION, CODON_CHANGE_PLUS_CODON_DELETION |
nonsynonymous_variant | nonsynonymous_SNV, nonframeshift_block_substitution | initiator_codon_variant, missense_variant, stop_retained_variant, incomplete_terminal_codon_variant | CODON_CHANGE, NON_SYNONYMOUS_CODING, NON_SYNONYMOUS_START, NON_SYNONYMOUS_STOP, START_LOST |
synonymous_variant | synonymous_SNV | synonymous_variant | SYNONYMOUS_CODING, SYNONYMOUS_START, SYNONYMOUS_STOP |
3_prime_UTR_variant | UTR3 | 3_prime_UTR_variant | UTR_3_PRIME, UTR_3_DELETED |
5_prime_UTR_variant | UTR5 | 5_prime_UTR_variant | UTR_5_PRIME, UTR_5_DELETED, START_GAINED |
upstream_gene_variant | upstream | upstream_gene_variant | UPSTREAM |
downstream_gene_variant | downstream | downstream_gene_variant | DOWNSTREAM |
regulatory_region_variant | N/A | regulatory_region_variant, regulatory_region_ablation, regulatory_region_amplification | REGULATION |
intron_variant | intronic | intron_variant | INTRON, INTRON_CONSERVED |
intergenic_variant | intergenic | intergenic_variant | INTERGENIC, INTERGENIC_CONSERVED |
Ignored | unknown, exonic, ncRNA | transcript_ablation , coding_sequence_variant, splice_region_variant, feature_truncation, feature_elongation, TF_binding_site_variant, TFBS_amplification, TFBS_ablation, NMD_transcript_variant, nc_transcript_variant, non_coding_exon_variant, mature_miRNA_variant | EXON, GENE, EXON_DELETED, CDS, CHROMOSOME, SPLICE_SITE_REGION, SPLICE_SITE_BRANCH, SPLICE_SITE_BRANCH_U12, MICRO_RNA, INTRAGENIC |
from IPython.display import HTML
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn
%pylab inline
snpeff_map = {"SNPeff":"Normalized SO Name",
"FRAME_SHIFT":"frameshift_variant",
"STOP_GAINED":"stop_gained",
"STOP_LOST":"stop_lost",
"SPLICE_SITE_DONOR": "splicing_variant",
"SPLICE_SITE_ACCEPTOR": "splicing_variant",
"CODON_INSERTION": "inframe_variant",
"CODON_CHANGE_PLUS_CODON_INSERTION": "inframe_variant",
"CODON_DELETION": "inframe_variant",
"CODON_CHANGE_PLUS_CODON_DELETION": "inframe_variant",
"NON_SYNONYMOUS_CODING": "nonsynonymous_variant",
"NON_SYNONYMOUS_START": "nonsynonymous_variant",
"NON_SYNONYMOUS_STOP": "nonsynonymous_variant",
"START_LOST": "nonsynonymous_variant",
"CODON_CHANGE": "nonsynonymous_variant",
"SYNONYMOUS_CODING": "synonymous_variant",
"SYNONYMOUS_START": "synonymous_variant",
"SYNONYMOUS_STOP": "synonymous_variant",
"UTR_3_PRIME": "3_prime_UTR_variant",
"UTR_3_DELETED": "3_prime_UTR_variant",
"UTR_5_PRIME": "5_prime_UTR_variant",
"UTR_5_DELETED": "5_prime_UTR_variant",
"START_GAINED": "5_prime_UTR_variant",
"UPSTREAM":"upstream_gene_variant",
"DOWNSTREAM":"downstream_gene_variant",
"REGULATION":"regulatory_region_variant",
"INTRON": "intron_variant",
"INTRON_CONSERVED": "intron_variant",
"INTERGENIC":"intergenic_variant",
"INTERGENIC_CONSERVED":"intergenic_variant",
"EXON": "ignored",
"GENE": "ignored",
"EXON_DELETED": "ignored",
"CDS": "ignored",
"CHROMOSOME ": "ignored",
"INTRAGENIC":"ignored",
"MICRO_RNA":"ignored",
"SPLICE_SITE_REGION": "ignored",
"SPLICE_SITE_BRANCH": "ignored",
"SPLICE_SITE_BRANCH_U12": "ignored"}
#annovar is not consistent with their naming of functional consequences, hence the multiple representations of frameshift_deletion, etc.
annovar_map = {
"frameshift deletion": "frameshift_variant",
"frameshift_deletion": "frameshift_variant",
"frameshift insertion": "frameshift_variant",
"frameshift_insertion": "frameshift_variant",
"frameshift_block_substitution": "frameshift_variant",
"stopgain": "stop_gained",
"stopgain SNV": "stop_gained",
"stoploss": "stop_lost",
"stoploss SNV": "stop_lost",
"splicing": "splicing_variant",
"nonframeshift_deletion": "inframe_variant",
"nonframeshift deletion": "inframe_variant",
"nonframeshift_insertion": "inframe_variant",
"nonframeshift insertion": "inframe_variant",
"nonframeshift_block_substitution": "nonsynonymous_variant",
"nonsynonymous_SNV": "nonsynonymous_variant",
"nonsynonymous SNV": "nonsynonymous_variant",
"synonymous_SNV": "synonymous_variant",
"synonymous SNV": "synonymous_variant",
"UTR3": "3_prime_UTR_variant",
"UTR5": "5_prime_UTR_variant",
"ncRNA": "ignored",
"ncRNA_exonic": "ignored",
"ncRNA_UTR3": "ignored",
"ncRNA_UTR5": "ignored",
"ncRNA_intronic": "ignored",
"upstream": "upstream_gene_variant",
"downstream": "downstream_gene_variant",
"intronic": "intron_variant",
"intergenic": "intergenic_variant",
"unknown": "ignored",
"exonic": "ignored"}
vep_map= {"frameshift_variant": "frameshift_variant",
"stop_gained": "stop_gained",
"stop_lost": "stop_lost",
"splice_donor_variant": "splicing_variant",
"splice_acceptor_variant": "splicing_variant",
"inframe_insertion": "inframe_variant",
"inframe_deletion": "inframe_variant",
"initiator_codon_variant": "nonsynonymous_variant",
"missense_variant": "nonsynonymous_variant",
"stop_retained_variant": "nonsynonymous_variant",
"incomplete_terminal_codon_variant": "nonsynonymous_variant",
"synonymous_variant": "synonymous_variant",
"3_prime_UTR_variant": "3_prime_UTR_variant",
"5_prime_UTR_variant": "5_prime_UTR_variant",
"upstream_gene_variant": "upstream_gene_variant",
"downstream_gene_variant": "downstream_gene_variant",
"regulatory_region_variant": "regulatory_region_variant",
"regulatory_region_ablation": "regulatory_region_variant",
"regulatory_region_amplification": "regulatory_region_variant",
"intron_variant": "intron_variant",
"intergenic_variant": "intergenic_variant",
"transcript_ablation": "ignored",
"coding_sequence_variant": "ignored",
"splice_region_variant": "ignored",
"feature_truncation": "ignored",
"feature_elongation": "ignored",
"TF_binding_site_variant": "ignored",
"TFBS_amplification": "ignored",
"TFBS_ablation": "ignored",
"NMD_transcript_variant": "ignored",
"nc_transcript_variant": "ignored",
"non_coding_exon_variant": "ignored",
"mature_miRNA_variant": "ignored"}
with pd.get_store('classified_variant_store.h5') as store:
print store.keys()
for snpeff_store in ["cftr_snpeff_ensembl", "cftr_snpeff_refseq"]:
with pd.get_store('classified_variant_store.h5') as store:
snpeff_df = store.get(snpeff_store)
snpeff_df["normalized_so"] = snpeff_df.apply(lambda row: snpeff_map[row["Effect"]], axis=1)
subset_snpeff = snpeff_df[["POS", "ID", "REF", "ALT", "Effect", "Gene_Name", "Transcript_ID", "Amino_Acid_Change"]]
subset_snpeff.rename(columns={"Amino_Acid_Change": "hgvs_snpeff"}, inplace=True)
subset_snpeff["normalized_so_snpeff"] = subset_snpeff.apply(lambda row: snpeff_map[row["Effect"]], axis=1)
store[snpeff_store + "_subset"] = subset_snpeff
print(str.title(" ".join(snpeff_store.split("_")) + " Metrics"))
print subset_snpeff["Effect"].value_counts()
print("\n")
print subset_snpeff.count()
print("\n\n")
del snpeff_df
del subset_snpeff
for vep_store in ["cftr_vep_ensembl", "cftr_vep_refseq"]:
with pd.get_store('classified_variant_store.h5') as store:
vep_df = store.get(vep_store)
vep_df["normalized_so"] = vep_df.apply(lambda row: vep_map[row["Consequence"]], axis=1)
subset_vep = vep_df[["POS", "ID", "REF", "ALT", "Gene", "Consequence", "Feature", "HGVSc", "HGVSp" ]]
subset_vep["hgvs_vep"] = subset_vep["HGVSp"].map(str) + subset_vep["HGVSc"]
del subset_vep["HGVSp"]
del subset_vep["HGVSc"]
subset_vep["normalized_so_vep"] = subset_vep.apply(lambda row: vep_map[row["Consequence"]], axis=1)
store[vep_store + "_subset"] = subset_vep
print(str.title(" ".join(vep_store.split("_")) + " Metrics"))
print subset_vep["Consequence"].value_counts()
print("\n")
print subset_vep.count()
print("\n\n")
del vep_df
del subset_vep
for annovar_store in ["cftr_annovar_ensembl", "cftr_annovar_refseq"]:
with pd.get_store('classified_variant_store.h5') as store:
annovar_df = store.get(annovar_store)
annovar_df["normalized_so"] = annovar_df.apply(lambda row: annovar_map[row["combined_effect"]], axis=1)
subset_annovar = annovar_df[["POS", "REF", "ALT", "Gene", "combined_effect", "hgvs" ]]
subset_annovar["normalized_so_annovar"] = subset_annovar.apply(lambda row: annovar_map[row["combined_effect"]], axis=1)
store[annovar_store + "_subset"] = subset_annovar
print(str.title(" ".join(annovar_store.split("_")) + " Metrics"))
print subset_annovar["combined_effect"].value_counts()
print("\n")
print subset_annovar.count()
print("\n\n")
del annovar_df
del subset_annovar
for tx_set in ["ensembl", "refseq"]:
with pd.get_store('classified_variant_store.h5') as store:
snpeff_df = store.get("cftr_snpeff_" + tx_set + "_subset")
vep_df = store.get("cftr_vep_" + tx_set + "_subset")
annovar_df = store.get("cftr_annovar_" + tx_set + "_subset")
vc_snpeff = snpeff_df.groupby(["normalized_so_snpeff"]).size()
vc_snpeff.name = "SNPeff"
vc_vep = vep_df.groupby(["normalized_so_vep"]).size()
vc_vep.name = "VEP"
vc_annovar = annovar_df.groupby(["normalized_so_annovar"]).size()
vc_annovar.name = "Annovar"
vc_df = pd.DataFrame([vc_snpeff, vc_vep, vc_annovar])
vc_df.transpose().plot(kind="barh", fontsize=13, figsize=(16,8))
print vc_df.transpose()