#!/usr/bin/env python
# coding: utf-8

# Map problem names onto Disease Ontology (DO) terms.
#
# Steps, in cell order:
#   1. Read data/problems.tsv and add a lowercased copy of the `problem` column.
#   2. Download the DO term-name table and keep only rows whose `type` is
#      'name' or 'exact-synonym'.
#   3. Left-join problems to DO terms on the lowercased name and write the
#      result to data/problem-to-doid.tsv.
#   4. Compute summary values: distinct (problem, doid) pairs, the share of
#      problems that mapped, problems that mapped to several DO terms, and how
#      many DO slim terms are reached directly and through propagation.
#
# Bare expressions such as `problem_df[:3]` are notebook display cells; they
# have no effect when the file is run as a plain script.

# In[1]:

import os

import pandas


# In[ ]:


# In[2]:

path = os.path.join('data', 'problems.tsv')
problem_df = pandas.read_table(path)
# Use the vectorized string accessor: a missing name is read as a NaN float,
# and str.lower(float) raises TypeError. `.str.lower()` keeps it as NaN.
problem_df['name_lower'] = problem_df['problem'].str.lower()
problem_df[:3]


# In[3]:

len(problem_df)


# In[4]:

url = 'http://git.dhimmel.com/disease-ontology/data/term-names.tsv'
doterm_df = pandas.read_table(url)
# Bracket access avoids relying on attribute lookup for a column called `name`.
doterm_df['name_lower'] = doterm_df['name'].str.lower()
set(doterm_df.type)


# In[5]:

doterm_df = doterm_df.query("type in ['exact-synonym', 'name']")
# merge() matches null keys against each other, so a DO term without a usable
# name would otherwise be joined to every problem whose name is missing.
doterm_df = doterm_df.dropna(subset=['name_lower'])
doterm_df[:3]


# In[6]:

# Left join keeps every problem row; unmatched problems get NaN in the DO columns.
domap_df = problem_df.merge(doterm_df, how='left', on='name_lower')
path = os.path.join('data', 'problem-to-doid.tsv')
domap_df.to_csv(path, index=False, sep='\t')


# In[7]:

# Distinct (problem, DO term) pairs; dropna() removes the unmatched problems.
pair_df = domap_df[['problem_definition_id', 'doid']].dropna().drop_duplicates()
len(pair_df)


# In[8]:

domap_df[:50]


# In[ ]:


# In[9]:

mapped_problems = set(pair_df.problem_definition_id)


# In[10]:

# Share of problem rows with at least one DO match.
len(mapped_problems) / len(problem_df)


# In[18]:

len(mapped_problems)


# In[11]:

# Find problems that mapped to multiple DO terms
duplicates = set(pair_df[pair_df.duplicated('problem_definition_id')].problem_definition_id)
domap_df[domap_df.problem_definition_id.isin(duplicates)]


# In[ ]:


# In[16]:

url = 'http://git.dhimmel.com/disease-ontology/data/slim-terms.tsv'
doslim_df = pandas.read_table(url)
len(doslim_df)


# In[17]:

# Number of slim-term rows whose doid was matched directly by a problem.
sum(doslim_df.doid.isin(set(pair_df.doid)))


# In[24]:

# with propagation
url = 'http://git.dhimmel.com/disease-ontology/data/slim-terms-prop.tsv'
doslim_df = pandas.read_table(url)
# Distinct slim terms having at least one subsumed term matched by a problem.
len(set(doslim_df.slim_id[doslim_df.subsumed_id.isin(set(pair_df.doid))]))


# In[ ]: