from IPython.display import HTML
import pandas as pd
import numpy as np
import os
# snp_eff_ensembl_df = pd.read_table("./cftr_snpeff_ensembl.vcf", skiprows=7, header=0, usecols=range(10))
snp_eff_ensembl_df = pd.read_table("./snpeff_test.vcf", skiprows=7, header=0, usecols=range(10))
snp_eff_ensembl_df.head()
info_df = snp_eff_ensembl_df["INFO"].str.replace("EFF=", "").str.split(',').apply(pd.Series, 1).stack()
var_class_df = info_df.apply(lambda x: pd.Series(x.split('(')))
detail_df = var_class_df[1].str.rstrip(")").apply(lambda x: pd.Series(x.split('|')))
detail_df.columns = ['Effect_Impact', 'Functional_Class', 'Codon_Change', 'Amino_Acid_Change',
'Amino_Acid_length', 'Gene_Name', 'Transcript_BioType', 'Gene_Coding',
'Transcript_ID', 'Exon_Rank', 'Genotype_Number', 'WARNINGS']
del var_class_df[1]
var_class_df.columns = ['Effect']
var_class_df = var_class_df.join(detail_df)
var_class_df.index = var_class_df.index.droplevel(-1)
var_class_df.head()
combo_df = snp_eff_ensembl_df.join(var_class_df)
combo_df.replace(to_replace='', value=np.nan, inplace=True)
HTML(combo_df[combo_df["Effect"] == "XON"].head().to_html())
# combo_df_drop_nulls = combo_df[combo_df["Transcript_ID"].isnull() == False]
# combo_df_drop_nulls = combo_df_drop_nulls.set_index(keys=['POS', 'Transcript_ID'])
# HTML(combo_df_drop_nulls.head().to_html())
store_file="classified_variant_store.h5"
try:
os.remove(store_file)
except OSError:
pass
store = pd.HDFStore(store_file)
store.close()
def build_snpeff_hdf():
with pd.get_store("./classified_variant_store.h5") as store:
for snpeff_file in ["cftr_snpeff_ensembl.vcf", "cftr_snpeff_refseq.vcf"]:
snpeff_df = pd.read_table(snpeff_file, skiprows=7, header=0, usecols=range(10))
var_class_df = snpeff_df["INFO"].str.replace("EFF=", "").str.split(',').apply(pd.Series, 1).stack()
var_class_df = var_class_df.apply(lambda x: pd.Series(x.split('(')))
detail_df = var_class_df[1].str.rstrip(")").apply(lambda x: pd.Series(x.split('|')))
num_col = len(detail_df.columns)
detail_df.columns = ['Effect_Impact', 'Functional_Class', 'Codon_Change', 'Amino_Acid_Change',
'Amino_Acid_length', 'Gene_Name', 'Transcript_BioType', 'Gene_Coding',
'Transcript_ID', 'Exon_Rank', 'Genotype_Number', "Warnings", "Errors"][:num_col]
del var_class_df[1]
var_class_df.columns = ['Effect']
var_class_df = var_class_df.join(detail_df)
var_class_df.index = var_class_df.index.droplevel(-1)
var_class_df.head()
snpeff_df = snpeff_df.join(var_class_df)
# combo_df.replace(to_replace='', value=np.nan, inplace=True)
# combo_df_drop_nulls = combo_df[combo_df["Transcript_ID"].isnull() == False]
# combo_df_drop_nulls = combo_df_drop_nulls.set_index(keys=['POS', 'Transcript_ID'])
snpeff_df.dropna(axis=1, how='all')
store[snpeff_file.rstrip(".vcf")] = snpeff_df
build_snpeff_hdf()
def build_vep_hdf():
with pd.get_store("./classified_variant_store.h5") as store:
for vep_file in [ "cftr_vep_refseq.vcf", "cftr_vep_ensembl.vcf"]:
vep_df = pd.read_table(vep_file, skiprows=5, header=0, usecols=range(10))
info_df = vep_df["INFO"].str.lstrip("CSQ=").str.split(',').apply(pd.Series, 1).stack()
detail_df = info_df.apply(lambda x: pd.Series(x.split('|')))
headers = "Allele|Gene|Feature|Feature_type|Consequence|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|AA_MAF|EA_MAF|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|DISTANCE|STRAND|CLIN_SIG|SYMBOL|SYMBOL_SOURCE|SIFT|PolyPhen|GMAF|BIOTYPE|HGVSc|HGVSp|AFR_MAF|AMR_MAF|ASN_MAF|EUR_MAF"
headers_list = headers.split("|")
detail_df.columns = headers_list
consq_series = detail_df["Consequence"].str.split('&').apply(pd.Series, 1).stack()
consq_series.index = consq_series.index.droplevel(-1)
consq_series.name = "Consequence"
del detail_df["Consequence"]
detail_df = detail_df.join(consq_series)
# Note that it is key to drop index level in the reverse order that they are built up
detail_df.index = detail_df.index.droplevel(-1)
vep_df = vep_df.join(detail_df)
# vep_ensembl_df.replace(to_replace='', value=np.nan, inplace=True)
vep_df = vep_df.dropna(axis=1, how='all')
vep_df = vep_df.drop_duplicates()
store[vep_file.rstrip(".vcf")] = vep_df
build_vep_hdf()
def combine_effects(row, suffix):
if row["Func" + suffix] == "exonic":
return row["ExonicFunc" + suffix]
else:
return row["Func" + suffix]
def combine_hgvs(row, suffix):
if row["Func" + suffix] == "splicing":
return row["hgvs"]
else:
return row["AAChange" + suffix]
def build_annovar_hdf():
with pd.get_store("./classified_variant_store.h5") as store:
for annovar_file in ["cftr_annovar_ensembl.tsv", "cftr_annovar_refseq.tsv"]:
annovar_df = pd.read_table(annovar_file)
if annovar_file == "cftr_annovar_ensembl.tsv":
suffix = ".ensgene"
else:
suffix = ".refgene"
func = annovar_df["Func" + suffix].str.split(";").apply(pd.Series, 1).stack()
func.index = func.index.droplevel(-1)
func.name = "Func" + suffix
del annovar_df["Func" + suffix]
annovar_df = annovar_df.join(func)
annovar_df.loc[annovar_df["Func" + suffix] == 'intergenic', ["Gene" + suffix]] = np.nan
gene_series = annovar_df["Gene" + suffix].str.extract(r'(\w+)\(*')
print gene_series.head()
gene_series.name = "Gene"
annovar_df = annovar_df.join(gene_series)
annovar_df["combined_effect"] = annovar_df.apply(combine_effects, axis=1, args=(suffix,))
annovar_df.drop_duplicates(inplace=True)
splice_hgvs_series = annovar_df["Gene" + suffix].str.findall(r'\w+\((.+)\)').apply(pd.Series).stack()
splice_hgvs_series.index = splice_hgvs_series.index.droplevel(-1)
splice_hgvs_series.name = "hgvs"
annovar_df = annovar_df.join(splice_hgvs_series)
annovar_df["hgvs"] = annovar_df.apply(combine_hgvs, axis=1, args=(suffix,))
del annovar_df["Gene" + suffix]
#We need to keep our original labels to join on in the next step
annovar_df = annovar_df.rename(columns={'Otherinfo.2':'POS', "Otherinfo.4": "REF", "Otherinfo.5": "ALT"})
annovar_df = annovar_df.dropna(axis=1, how='all')
store[annovar_file.rstrip(".tsv")] = annovar_df
build_annovar_hdf()