from IPython.display import HTML import pandas as pd import numpy as np import os # snp_eff_ensembl_df = pd.read_table("./cftr_snpeff_ensembl.vcf", skiprows=7, header=0, usecols=range(10)) snp_eff_ensembl_df = pd.read_table("./snpeff_test.vcf", skiprows=7, header=0, usecols=range(10)) snp_eff_ensembl_df.head() info_df = snp_eff_ensembl_df["INFO"].str.replace("EFF=", "").str.split(',').apply(pd.Series, 1).stack() var_class_df = info_df.apply(lambda x: pd.Series(x.split('('))) detail_df = var_class_df[1].str.rstrip(")").apply(lambda x: pd.Series(x.split('|'))) detail_df.columns = ['Effect_Impact', 'Functional_Class', 'Codon_Change', 'Amino_Acid_Change', 'Amino_Acid_length', 'Gene_Name', 'Transcript_BioType', 'Gene_Coding', 'Transcript_ID', 'Exon_Rank', 'Genotype_Number', 'WARNINGS'] del var_class_df[1] var_class_df.columns = ['Effect'] var_class_df = var_class_df.join(detail_df) var_class_df.index = var_class_df.index.droplevel(-1) var_class_df.head() combo_df = snp_eff_ensembl_df.join(var_class_df) combo_df.replace(to_replace='', value=np.nan, inplace=True) HTML(combo_df[combo_df["Effect"] == "XON"].head().to_html()) # combo_df_drop_nulls = combo_df[combo_df["Transcript_ID"].isnull() == False] # combo_df_drop_nulls = combo_df_drop_nulls.set_index(keys=['POS', 'Transcript_ID']) # HTML(combo_df_drop_nulls.head().to_html()) store_file="classified_variant_store.h5" try: os.remove(store_file) except OSError: pass store = pd.HDFStore(store_file) store.close() def build_snpeff_hdf(): with pd.get_store("./classified_variant_store.h5") as store: for snpeff_file in ["cftr_snpeff_ensembl.vcf", "cftr_snpeff_refseq.vcf"]: snpeff_df = pd.read_table(snpeff_file, skiprows=7, header=0, usecols=range(10)) var_class_df = snpeff_df["INFO"].str.replace("EFF=", "").str.split(',').apply(pd.Series, 1).stack() var_class_df = var_class_df.apply(lambda x: pd.Series(x.split('('))) detail_df = var_class_df[1].str.rstrip(")").apply(lambda x: pd.Series(x.split('|'))) num_col = len(detail_df.columns) detail_df.columns = ['Effect_Impact', 'Functional_Class', 'Codon_Change', 'Amino_Acid_Change', 'Amino_Acid_length', 'Gene_Name', 'Transcript_BioType', 'Gene_Coding', 'Transcript_ID', 'Exon_Rank', 'Genotype_Number', "Warnings", "Errors"][:num_col] del var_class_df[1] var_class_df.columns = ['Effect'] var_class_df = var_class_df.join(detail_df) var_class_df.index = var_class_df.index.droplevel(-1) var_class_df.head() snpeff_df = snpeff_df.join(var_class_df) # combo_df.replace(to_replace='', value=np.nan, inplace=True) # combo_df_drop_nulls = combo_df[combo_df["Transcript_ID"].isnull() == False] # combo_df_drop_nulls = combo_df_drop_nulls.set_index(keys=['POS', 'Transcript_ID']) snpeff_df.dropna(axis=1, how='all') store[snpeff_file.rstrip(".vcf")] = snpeff_df build_snpeff_hdf() def build_vep_hdf(): with pd.get_store("./classified_variant_store.h5") as store: for vep_file in [ "cftr_vep_refseq.vcf", "cftr_vep_ensembl.vcf"]: vep_df = pd.read_table(vep_file, skiprows=5, header=0, usecols=range(10)) info_df = vep_df["INFO"].str.lstrip("CSQ=").str.split(',').apply(pd.Series, 1).stack() detail_df = info_df.apply(lambda x: pd.Series(x.split('|'))) headers = "Allele|Gene|Feature|Feature_type|Consequence|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|AA_MAF|EA_MAF|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|DISTANCE|STRAND|CLIN_SIG|SYMBOL|SYMBOL_SOURCE|SIFT|PolyPhen|GMAF|BIOTYPE|HGVSc|HGVSp|AFR_MAF|AMR_MAF|ASN_MAF|EUR_MAF" headers_list = headers.split("|") detail_df.columns = headers_list consq_series = detail_df["Consequence"].str.split('&').apply(pd.Series, 1).stack() consq_series.index = consq_series.index.droplevel(-1) consq_series.name = "Consequence" del detail_df["Consequence"] detail_df = detail_df.join(consq_series) # Note that it is key to drop index level in the reverse order that they are built up detail_df.index = detail_df.index.droplevel(-1) vep_df = vep_df.join(detail_df) # vep_ensembl_df.replace(to_replace='', value=np.nan, inplace=True) vep_df = vep_df.dropna(axis=1, how='all') vep_df = vep_df.drop_duplicates() store[vep_file.rstrip(".vcf")] = vep_df build_vep_hdf() def combine_effects(row, suffix): if row["Func" + suffix] == "exonic": return row["ExonicFunc" + suffix] else: return row["Func" + suffix] def combine_hgvs(row, suffix): if row["Func" + suffix] == "splicing": return row["hgvs"] else: return row["AAChange" + suffix] def build_annovar_hdf(): with pd.get_store("./classified_variant_store.h5") as store: for annovar_file in ["cftr_annovar_ensembl.tsv", "cftr_annovar_refseq.tsv"]: annovar_df = pd.read_table(annovar_file) if annovar_file == "cftr_annovar_ensembl.tsv": suffix = ".ensgene" else: suffix = ".refgene" func = annovar_df["Func" + suffix].str.split(";").apply(pd.Series, 1).stack() func.index = func.index.droplevel(-1) func.name = "Func" + suffix del annovar_df["Func" + suffix] annovar_df = annovar_df.join(func) annovar_df.loc[annovar_df["Func" + suffix] == 'intergenic', ["Gene" + suffix]] = np.nan gene_series = annovar_df["Gene" + suffix].str.extract(r'(\w+)\(*') print gene_series.head() gene_series.name = "Gene" annovar_df = annovar_df.join(gene_series) annovar_df["combined_effect"] = annovar_df.apply(combine_effects, axis=1, args=(suffix,)) annovar_df.drop_duplicates(inplace=True) splice_hgvs_series = annovar_df["Gene" + suffix].str.findall(r'\w+\((.+)\)').apply(pd.Series).stack() splice_hgvs_series.index = splice_hgvs_series.index.droplevel(-1) splice_hgvs_series.name = "hgvs" annovar_df = annovar_df.join(splice_hgvs_series) annovar_df["hgvs"] = annovar_df.apply(combine_hgvs, axis=1, args=(suffix,)) del annovar_df["Gene" + suffix] #We need to keep our original labels to join on in the next step annovar_df = annovar_df.rename(columns={'Otherinfo.2':'POS', "Otherinfo.4": "REF", "Otherinfo.5": "ALT"}) annovar_df = annovar_df.dropna(axis=1, how='all') store[annovar_file.rstrip(".tsv")] = annovar_df build_annovar_hdf()