Unfortunately, VEP's web interface groups the RefSeq transcript set together with "other transcripts", which makes this particular comparison invalid. I've included it for completeness anyway.
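(If one wanted to salvage a RefSeq-only comparison from the same VEP output, one option would be to filter on the transcript accession prefix before aggregating. The sketch below is illustrative only, and assumes the VEP table still carries its Feature column, i.e. before that column is dropped further down.)
# Sketch: restrict a VEP annotation DataFrame to RefSeq transcripts by accession prefix.
# Assumes the "Feature" column (transcript accession) is still present.
def refseq_only(vep_df):
    is_refseq = vep_df["Feature"].astype(str).str.match(r"(NM_|NR_|XM_|XR_)")
    return vep_df[is_refseq]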
from IPython.display import HTML
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from matplotlib_venn import venn3, venn3_circles, venn3_unweighted
import seaborn
%matplotlib inline
# These follow the way Annovar defines precedence. Empirically, stop_gained outranks
# frameshift_variant in Annovar, hence the reversed numbering below.
precedence_dict = {
    "splicing_variant": 1,
    "frameshift_variant": 4,
    "stop_gained": 2,
    "stop_lost": 3,
    "inframe_variant": 5,
    "nonsynonymous_variant": 6,
    "synonymous_variant": 7,
    "5_prime_UTR_variant": 8,
    "3_prime_UTR_variant": 9,
    "intron_variant": 10,
    "upstream_gene_variant": 11,
    "downstream_gene_variant": 12,
    "intergenic_variant": 13,
    "regulatory_region_variant": 16,
    "ignored": 17
}
def ranked(col):
    """Return the highest-precedence (most severe) consequence term in a group."""
    return min(col, key=lambda val: precedence_dict[val])
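As a quick illustration (hypothetical input, not data from this comparison), ranked picks the most severe consequence out of a group of terms:
# Hypothetical example: stop_gained has the lowest precedence value (2), so it wins.
ranked(["intron_variant", "stop_gained", "synonymous_variant"])  # -> "stop_gained"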
with pd.HDFStore('classified_variant_store.h5') as store:
    snpeff_subset = store.get("cftr_snpeff_refseq_subset")
grouped_snpeff_subset = snpeff_subset.groupby(["Gene_Name", "POS", "REF", "ALT"])
grouped_snpeff_subset = grouped_snpeff_subset.agg({"normalized_so_snpeff": ranked})
grouped_snpeff_subset = grouped_snpeff_subset.rename(columns={"normalized_so_snpeff": "normalized_so_snpeff_max"}).reset_index()
grouped_snpeff_subset = pd.merge(grouped_snpeff_subset, snpeff_subset, how="left", on=["POS", "REF", "ALT", "Gene_Name"])
grouped_snpeff_subset = grouped_snpeff_subset[grouped_snpeff_subset["normalized_so_snpeff_max"] == grouped_snpeff_subset["normalized_so_snpeff"]]
# Kludge: ties are broken by taking the first element in the group (i.e. effectively at random);
# this should only really affect transcript-level comparisons (HGVS etc.).
# A deterministic alternative is sketched after this cell.
grouped_snpeff_subset = grouped_snpeff_subset.groupby(["Gene_Name", "POS", "REF", "ALT"]).first()
agg_snpeff = grouped_snpeff_subset.reset_index()
del agg_snpeff["normalized_so_snpeff_max"]
del grouped_snpeff_subset
del snpeff_subset
agg_snpeff.rename(columns={"Gene_Name":"Gene"}, inplace=True)
agg_snpeff[100000:100050]
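If that random tie-break ever matters, it could be made deterministic by sorting on a transcript identifier before taking the first row. A minimal sketch on a toy frame ("Feature_ID" is a hypothetical column name standing in for whatever transcript column the SnpEff table actually carries):
# Sketch only: deterministic tie-breaking. "Feature_ID" is a made-up column for illustration.
toy = pd.DataFrame({
    "Gene_Name": ["CFTR", "CFTR"],
    "POS": [100, 100],
    "REF": ["G", "G"],
    "ALT": ["A", "A"],
    "Feature_ID": ["tx_b", "tx_a"],
    "normalized_so_snpeff": ["nonsynonymous_variant", "nonsynonymous_variant"],
})
# Sorting first means .first() always picks the same transcript (tx_a here),
# rather than whichever row happened to come first in the input.
deterministic = (toy.sort_values("Feature_ID")
                    .groupby(["Gene_Name", "POS", "REF", "ALT"])
                    .first())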
with pd.HDFStore('classified_variant_store.h5') as store:
    vep_subset = store.get("cftr_vep_refseq_subset")
del vep_subset["Feature"]
vep_subset.drop_duplicates(inplace=True)
grouped_vep_subset = vep_subset.groupby(["Gene", "POS", "REF", "ALT"])
grouped_vep_subset = grouped_vep_subset.agg({"normalized_so_vep": ranked})
grouped_vep_subset = grouped_vep_subset.rename(columns={"normalized_so_vep": "normalized_so_vep_max"}).reset_index()
grouped_vep_subset = pd.merge(grouped_vep_subset, vep_subset, how="left", on=["POS", "REF", "ALT", "Gene"])
grouped_vep_subset = grouped_vep_subset[grouped_vep_subset["normalized_so_vep_max"] == grouped_vep_subset["normalized_so_vep"]]
grouped_vep_subset = grouped_vep_subset.groupby(["Gene", "POS", "REF", "ALT"]).first()
agg_vep = grouped_vep_subset.reset_index()
del grouped_vep_subset
del vep_subset
del agg_vep["normalized_so_vep_max"]
agg_vep[80000:80023]
with pd.HDFStore('classified_variant_store.h5') as store:
    annovar_subset = store.get("cftr_annovar_ensembl_subset")
grouped_annovar_subset = annovar_subset.groupby(["Gene", "POS", "REF", "ALT"])
agg_annovar = grouped_annovar_subset.agg({"normalized_so_annovar": ranked}).reset_index()
del annovar_subset['normalized_so_annovar']
agg_annovar = pd.merge(agg_annovar, annovar_subset, on=["Gene", "POS", "REF", "ALT"])
del grouped_annovar_subset
del annovar_subset
agg_annovar[2000:2050]
vc_snpeff = agg_snpeff.groupby(["normalized_so_snpeff"]).size()
vc_snpeff.name = "SNPeff"
vc_vep = agg_vep.groupby(["normalized_so_vep"]).size()
vc_vep.name = "VEP"
vc_annovar = agg_annovar.groupby(["normalized_so_annovar"]).size()
vc_annovar.name = "Annovar"
vc_df = pd.DataFrame([vc_snpeff, vc_vep, vc_annovar])
vc_df.transpose().plot(kind="barh", fontsize=13, figsize=(16,8))
It's worth noting that the RefSeq + "other transcripts" set is what was used in VEP's web app, which explains the huge number of extra annotations in the VEP counts above.
master_df = pd.merge(agg_snpeff, agg_vep, how="outer", on=["Gene", "POS", "REF", "ALT"])
master_df = pd.merge(master_df, agg_annovar, how="outer", on=["Gene", "POS", "REF", "ALT"])
master_df[100000:100001]
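Before the per-effect Venn diagrams, a quick overall concordance figure can be computed directly from master_df. This is just a sketch; note that rows missing from one tool count as disagreements, because NaN never equals NaN.
# Fraction of merged variants where all three tools report the same normalized SO term.
all_agree = ((master_df["normalized_so_snpeff"] == master_df["normalized_so_vep"]) &
             (master_df["normalized_so_snpeff"] == master_df["normalized_so_annovar"]))
print("all three agree on {:.1%} of {} variants".format(all_agree.mean(), len(master_df)))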
for effect in master_df["normalized_so_snpeff"].unique():
    vep_effect = master_df[master_df["normalized_so_vep"] == effect]
    annovar_effect = master_df[master_df["normalized_so_annovar"] == effect]
    snpeff_effect = master_df[master_df["normalized_so_snpeff"] == effect]
    fig = plt.figure(figsize=(10,10), dpi=300)
    fig.suptitle(effect, fontsize=14, fontweight='bold')
    v = venn3_unweighted([set(vep_effect.index.values), set(snpeff_effect.index.values), set(annovar_effect.index.values)],
                         set_labels=("VEP", "SNPeff", "Annovar"))
    plt.show()
sampletables = '<h1>Other algos agree, but...</h1>'
for effect in master_df["normalized_so_snpeff"].unique():
    sampletables += "<h2> Annovar doesn't match for <em>" + str(effect) + "</em></h2>"
    query = master_df.loc[(master_df["normalized_so_annovar"] != effect) &
                          (master_df["normalized_so_snpeff"] == effect) &
                          (master_df["normalized_so_vep"] == effect)]
    num_rows = len(query)
    if num_rows > 0:
        sampletables += query.head(5).to_html()
        sampletables += "<p>" + str(num_rows) + " rows</p>"
HTML(sampletables)
sampletables = '<h1>At least 1 column doesn\'t match</h1>'
for effect in master_df["normalized_so_snpeff"].unique():
    sampletables += "<h2> Annovar doesn't match for <em>" + str(effect) + "</em></h2>"
    query = master_df.loc[(master_df["normalized_so_annovar"] != effect) &
                          ((master_df["normalized_so_snpeff"] == effect) | (master_df["normalized_so_vep"] == effect))]
    num_rows = len(query)
    if num_rows > 0:
        sampletables += query.head(5).to_html()
        sampletables += "<p>" + str(num_rows) + " rows</p>"
HTML(sampletables)
sampletables = '<h1>Other algos agree, but...</h1>'
for effect in master_df["normalized_so_snpeff"].unique():
    sampletables += "<h2> Snpeff doesn't match for <em>" + str(effect) + "</em></h2>"
    query = master_df.loc[(master_df["normalized_so_annovar"] == effect) &
                          (master_df["normalized_so_snpeff"] != effect) &
                          (master_df["normalized_so_vep"] == effect)]
    num_rows = len(query)
    if num_rows > 0:
        sampletables += query.head(5).to_html()
        sampletables += "<p>" + str(num_rows) + " rows</p>"
HTML(sampletables)
sampletables = '<h1>At least 1 column doesn\'t match</h1>'
for effect in master_df["normalized_so_snpeff"].unique():
sampletables += "<h2> Snpeff doesn't match for <em>" + str(effect) + "</em></h2>"
query = num_rows = master_df.loc[(master_df["normalized_so_snpeff"]!=effect) &
((master_df["normalized_so_annovar"]==effect) | (master_df["normalized_so_vep"]==effect))]
num_rows = query.count()[0]
if num_rows > 0:
sampletables += query.head(5).to_html()
sampletables += "<p>" + str(num_rows) + " rows</p>"
HTML(sampletables)
sampletables = '<h1>Other algos agree, but...</h1>'
for effect in master_df["normalized_so_snpeff"].unique():
    sampletables += "<h2> VEP doesn't match for <em>" + str(effect) + "</em></h2>"
    query = master_df.loc[(master_df["normalized_so_annovar"] == effect) &
                          (master_df["normalized_so_snpeff"] == effect) &
                          (master_df["normalized_so_vep"] != effect)]
    num_rows = len(query)
    if num_rows > 0:
        sampletables += query.head(5).to_html()
        sampletables += "<p>" + str(num_rows) + " rows</p>"
HTML(sampletables)
sampletables = '<h1>At least 1 column doesn\'t match</h1>'
for effect in master_df["normalized_so_snpeff"].unique():
sampletables += "<h2> VEP doesn't match for <em>" + str(effect) + "</em></h2>"
query = master_df.loc[(master_df["normalized_so_vep"] != effect) &
((master_df["normalized_so_annovar"]==effect) | (master_df["normalized_so_snpeff"]==effect))]
num_rows = query.count()[0]
if num_rows > 0:
sampletables += query.head(5).to_html()
sampletables += "<p>" + str(num_rows) + " rows</p>"
HTML(sampletables)