from IPython.display import HTML
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from matplotlib_venn import venn3, venn3_circles, venn3_unweighted
import seaborn
%pylab inline

# These ranks are defined by the way Annovar defines precedence. I found empirically
# that stop_gained outranks frameshift_variant in Annovar, hence the reversal of those two.
# NOTE: intron_variant and upstream_gene_variant appear twice below; Python keeps the
# later values (14 and 15), so those are the ranks actually used.
precedence_dict = {
    "splicing_variant": 1,
    "frameshift_variant": 4,
    "stop_gained": 2,
    "stop_lost": 3,
    "inframe_variant": 5,
    "nonsynonymous_variant": 6,
    "synonymous_variant": 7,
    "5_prime_UTR_variant": 8,
    "3_prime_UTR_variant": 9,
    "intron_variant": 10,
    "upstream_gene_variant": 11,
    "downstream_gene_variant": 12,
    "intergenic_variant": 13,
    "intron_variant": 14,
    "upstream_gene_variant": 15,
    "regulatory_region_variant": 16,
    "ignored": 17
}

def ranked(col):
    # Return the most severe consequence in the group (lowest precedence number).
    return max(col, key=lambda val: -1 * precedence_dict[val])

# SNPeff: collapse transcript-level annotations to one consequence per (gene, variant).
with pd.get_store('classified_variant_store.h5') as store:
    snpeff_subset = store.get("cftr_snpeff_ensembl_subset")

ensembl_symbol_mapping = {"CFTR": "ENSG00000001626",
                          "AC000111.3": "ENSG00000232661",
                          "AC000111.4": "ENSG00000237974",
                          "AC000111.5": "ENSG00000234001",
                          "AC000111.6": "ENSG00000083622",
                          "CTTNBP2": "ENSG00000077063",
                          "": ""}
snpeff_subset["EnsemblGene"] = snpeff_subset["Gene_Name"].apply(lambda x: ensembl_symbol_mapping[x])

grouped_snpeff_subset = snpeff_subset.groupby(["EnsemblGene", "POS", "REF", "ALT"])
grouped_snpeff_subset = grouped_snpeff_subset.agg({"normalized_so_snpeff": ranked})
grouped_snpeff_subset = grouped_snpeff_subset.rename(columns={"normalized_so_snpeff": "normalized_so_snpeff_max"}).reset_index()
grouped_snpeff_subset = pd.merge(grouped_snpeff_subset, snpeff_subset, how="left",
                                 on=["POS", "REF", "ALT", "EnsemblGene"])
grouped_snpeff_subset = grouped_snpeff_subset[grouped_snpeff_subset["normalized_so_snpeff_max"] ==
                                              grouped_snpeff_subset["normalized_so_snpeff"]]
# Kludge: ties are broken by taking the first element in the group (i.e. effectively at
# random); this should only really affect the transcript-level comparisons (HGVS etc.).
grouped_snpeff_subset = grouped_snpeff_subset.groupby(["EnsemblGene", "POS", "REF", "ALT"]).first()
agg_snpeff = grouped_snpeff_subset.reset_index()
del agg_snpeff["normalized_so_snpeff_max"]
del grouped_snpeff_subset
del snpeff_subset
agg_snpeff.rename(columns={"EnsemblGene": "Gene"}, inplace=True)
agg_snpeff[100000:100050]

# VEP: same collapse, one consequence per (gene, variant).
with pd.get_store('classified_variant_store.h5') as store:
    vep_subset = store.get("cftr_vep_ensembl_subset")

del vep_subset["Feature"]
vep_subset.drop_duplicates(inplace=True)
grouped_vep_subset = vep_subset.groupby(["Gene", "POS", "REF", "ALT"])
grouped_vep_subset = grouped_vep_subset.agg({"normalized_so_vep": ranked})
grouped_vep_subset = grouped_vep_subset.rename(columns={"normalized_so_vep": "normalized_so_vep_max"}).reset_index()
grouped_vep_subset = pd.merge(grouped_vep_subset, vep_subset, how="left",
                              on=["POS", "REF", "ALT", "Gene"])
grouped_vep_subset = grouped_vep_subset[grouped_vep_subset["normalized_so_vep_max"] ==
                                        grouped_vep_subset["normalized_so_vep"]]
grouped_vep_subset = grouped_vep_subset.groupby(["Gene", "POS", "REF", "ALT"]).first()
agg_vep = grouped_vep_subset.reset_index()
del grouped_vep_subset
del vep_subset
del agg_vep["normalized_so_vep_max"]
agg_vep[80000:80023]

# Annovar: one annotation per (gene, variant), keeping the most severe consequence.
with pd.get_store('classified_variant_store.h5') as store:
    annovar_subset = store.get("cftr_annovar_ensembl_subset")

grouped_annovar_subset = annovar_subset.groupby(["Gene", "POS", "REF", "ALT"])
agg_annovar = grouped_annovar_subset.agg({"normalized_so_annovar": ranked}).reset_index()
del annovar_subset['normalized_so_annovar']
agg_annovar = pd.merge(agg_annovar, annovar_subset, on=["Gene", "POS", "REF", "ALT"])
del grouped_annovar_subset
del annovar_subset
agg_annovar[2000:2050]

# Per-tool counts of each normalized consequence.
vc_snpeff = agg_snpeff.groupby(["normalized_so_snpeff"]).size()
vc_snpeff.name = "SNPeff"
vc_vep = agg_vep.groupby(["normalized_so_vep"]).size()
vc_vep.name = "VEP"
vc_annovar = agg_annovar.groupby(["normalized_so_annovar"]).size()
vc_annovar.name = "Annovar"
vc_df = pd.DataFrame([vc_snpeff, vc_vep, vc_annovar])
vc_df.transpose().plot(kind="barh", fontsize=13, figsize=(16, 8))

# One row per (gene, variant) with all three tools' calls side by side.
master_df = pd.merge(agg_snpeff, agg_vep, how="outer", on=["Gene", "POS", "REF", "ALT"])
master_df = pd.merge(master_df, agg_annovar, how="outer", on=["Gene", "POS", "REF", "ALT"])

# Number of variants where all three tools agree on the normalized consequence.
all_count = 0
for effect in precedence_dict.keys():
    all_count += master_df[(master_df["normalized_so_vep"] == effect) &
                           (master_df["normalized_so_snpeff"] == effect) &
                           (master_df["normalized_so_annovar"] == effect)].count()
num_matching = all_count["POS"]
num_total = master_df.count()["POS"]
print num_matching
print num_total
print "Percent matching: " + str(100.0 * num_matching / num_total)

# Same calculation, restricted to coding-level effects.
all_count = 0
total_count = 0
# List of coding-level effects.
effects = [eff for eff, priority in precedence_dict.iteritems() if priority < 8]
total_count = master_df[master_df["normalized_so_vep"].isin(effects) |
                        master_df["normalized_so_snpeff"].isin(effects) |
                        master_df["normalized_so_annovar"].isin(effects)].count()
for effect in effects:
    all_count += master_df[(master_df["normalized_so_vep"] == effect) &
                           (master_df["normalized_so_snpeff"] == effect) &
                           (master_df["normalized_so_annovar"] == effect)].count()
num_matching = all_count["POS"]
num_total = total_count["POS"]
print num_matching
print num_total
print "Percent matching: " + str(100.0 * num_matching / num_total)

# Per-effect Venn diagrams of which tools assign that consequence to which variants.
for effect in master_df["normalized_so_snpeff"].unique():
    vep_effect = master_df[master_df["normalized_so_vep"] == effect]
    annovar_effect = master_df[master_df["normalized_so_annovar"] == effect]
    snpeff_effect = master_df[master_df["normalized_so_snpeff"] == effect]
    fig = plt.figure(figsize=(10, 10), dpi=300)
    fig.suptitle(effect, fontsize=14, fontweight='bold')
    v = venn3_unweighted([set(vep_effect.index.values),
                          set(snpeff_effect.index.values),
                          set(annovar_effect.index.values)],
                         set_labels=("VEP", "SNPeff", "Annovar"))
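# To make the aggregation step above concrete: a minimal, self-contained illustration of
# how ranked() and precedence_dict collapse several transcript-level annotations for one
# variant down to the single most severe term. The toy annotations below are hypothetical
# and not taken from the CFTR data.
example_annotations = pd.Series(["synonymous_variant", "intron_variant", "stop_gained"])
print ranked(example_annotations)  # -> "stop_gained" (rank 2 beats ranks 7 and 14)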

# Variants where SNPeff and VEP agree on an effect but Annovar calls something else.
sampletables = "<h2>Other algo's agree, but...</h2>"
for effect in master_df["normalized_so_snpeff"].unique():
    sampletables += "<h3>Annovar doesn't match for " + str(effect) + "</h3>"
    query = master_df.loc[(master_df["normalized_so_annovar"] != effect) &
                          (master_df["normalized_so_snpeff"] == effect) &
                          (master_df["normalized_so_vep"] == effect)]
    num_rows = query.count()[0]
    if num_rows > 0:
        sampletables += query.head(5).to_html()
        sampletables += "<p>" + str(num_rows) + " rows</p>"
HTML(sampletables)
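# The cells below repeat this report for each tool, and again with a looser
# "at least one other tool calls it" condition; only the column required to disagree
# changes. As a design note, the repetition could be collapsed into one helper. The
# sketch below is illustrative only: the name mismatch_tables and its arguments are
# hypothetical and not part of the original analysis.
def mismatch_tables(df, disagree_col, agree_cols, require_all=True, title=""):
    # For each normalized effect, list rows where disagree_col differs while the
    # columns in agree_cols (all of them, or at least one) call that effect.
    html = "<h2>" + title + "</h2>"
    for effect in df["normalized_so_snpeff"].unique():
        matches = [df[c] == effect for c in agree_cols]
        if require_all:
            agreement = reduce(lambda a, b: a & b, matches)
        else:
            agreement = reduce(lambda a, b: a | b, matches)
        query = df.loc[(df[disagree_col] != effect) & agreement]
        num_rows = query.count()[0]
        html += "<h3>" + disagree_col + " doesn't match for " + str(effect) + "</h3>"
        if num_rows > 0:
            html += query.head(5).to_html()
            html += "<p>" + str(num_rows) + " rows</p>"
    return HTML(html)

# Example usage, equivalent to the cell above:
# mismatch_tables(master_df, "normalized_so_annovar",
#                 ["normalized_so_snpeff", "normalized_so_vep"],
#                 require_all=True, title="Other algo's agree, but...")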

sampletables = "<h2>At least 1 column doesn't match</h2>"
for effect in master_df["normalized_so_snpeff"].unique():
    sampletables += "<h3>Annovar doesn't match for " + str(effect) + "</h3>"
    query = master_df.loc[(master_df["normalized_so_annovar"] != effect) &
                          ((master_df["normalized_so_snpeff"] == effect) |
                           (master_df["normalized_so_vep"] == effect))]
    num_rows = query.count()[0]
    if num_rows > 0:
        sampletables += query.head(5).to_html()
        sampletables += "<p>" + str(num_rows) + " rows</p>"
HTML(sampletables)


sampletables = "<h2>Other algo's agree, but...</h2>"
for effect in master_df["normalized_so_snpeff"].unique():
    sampletables += "<h3>Snpeff doesn't match for " + str(effect) + "</h3>"
    query = master_df.loc[(master_df["normalized_so_annovar"] == effect) &
                          (master_df["normalized_so_snpeff"] != effect) &
                          (master_df["normalized_so_vep"] == effect)]
    num_rows = query.count()[0]
    if num_rows > 0:
        sampletables += query.tail(5).to_html()
        sampletables += "<p>" + str(num_rows) + " rows</p>"
HTML(sampletables)
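# Sketch (not part of the original notebook): for any row of the report above it can help
# to see what SNPeff called those variants instead. value_counts() is standard pandas;
# splicing_variant is just used as an example effect here.
snpeff_disagrees = master_df[(master_df["normalized_so_annovar"] == "splicing_variant") &
                             (master_df["normalized_so_vep"] == "splicing_variant") &
                             (master_df["normalized_so_snpeff"] != "splicing_variant")]
snpeff_disagrees["normalized_so_snpeff"].value_counts()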

sampletables = "<h2>At least 1 column doesn't match</h2>"
for effect in master_df["normalized_so_snpeff"].unique():
    sampletables += "<h3>Snpeff doesn't match for " + str(effect) + "</h3>"
    query = master_df.loc[(master_df["normalized_so_snpeff"] != effect) &
                          ((master_df["normalized_so_annovar"] == effect) |
                           (master_df["normalized_so_vep"] == effect))]
    num_rows = query.count()[0]
    if num_rows > 0:
        sampletables += query.head(5).to_html()
        sampletables += "<p>" + str(num_rows) + " rows</p>"
HTML(sampletables)

sampletables = "<h2>Other algo's agree, but...</h2>"
for effect in master_df["normalized_so_snpeff"].unique():
    sampletables += "<h3>VEP doesn't match for " + str(effect) + "</h3>"
    query = master_df.loc[(master_df["normalized_so_annovar"] == effect) &
                          (master_df["normalized_so_snpeff"] == effect) &
                          (master_df["normalized_so_vep"] != effect)]
    num_rows = query.count()[0]
    if num_rows > 0:
        sampletables += query.head(5).to_html()
        sampletables += "<p>" + str(num_rows) + " rows</p>"
HTML(sampletables)

sampletables = "<h2>At least 1 column doesn't match</h2>"
for effect in master_df["normalized_so_snpeff"].unique():
    sampletables += "<h3>VEP doesn't match for " + str(effect) + "</h3>"
    query = master_df.loc[(master_df["normalized_so_vep"] != effect) &
                          ((master_df["normalized_so_annovar"] == effect) |
                           (master_df["normalized_so_snpeff"] == effect))]
    num_rows = query.count()[0]
    if num_rows > 0:
        sampletables += query.head(5).to_html()
        sampletables += "<p>" + str(num_rows) + " rows</p>"
HTML(sampletables)

# Variants that only Annovar calls splicing_variant, and those where SNPeff differs.
master_df.loc[(master_df["normalized_so_annovar"] == "splicing_variant") &
              (master_df["normalized_so_snpeff"] != "splicing_variant") &
              (master_df["normalized_so_vep"] != "splicing_variant")].head(50)
master_df.loc[(master_df["normalized_so_annovar"] == "splicing_variant") &
              (master_df["normalized_so_snpeff"] != "splicing_variant")].tail()