#!/usr/bin/env python
# coding: utf-8

"""Map DOLite disease names onto the ``doid`` values of a term-name table.

Inputs (paths are relative to the working directory):

* ``do_lite.txt`` -- tab-separated DOLite table, read as the three columns
  ``disease_name``, ``gene_entrez`` and ``gene_symbol``.
* ``../data/term-names.tsv`` -- tab-separated table; only its ``doid`` and
  ``name`` columns are used here.

Outputs:

* ``dolite_terms.txt`` -- the sorted, unique DOLite disease names, one per
  line.
* ``dolite_to_doid.tsv`` -- unique ``(doid, dolite_name)`` pairs obtained by
  a case-insensitive exact match on the name. Every DOLite name is kept;
  ``doid`` is missing where no term name matched.

This file is a Jupyter notebook export: the ``# In[n]:`` comments mark the
original cell boundaries, and the bare expressions that end some cells only
display a value when run interactively.
"""

# In[52]:

import os

import pandas


# In[53]:

# downloaded from http://django.nubic.northwestern.edu/fundo/media/data/do_lite.txt
# Passing ``names`` makes pandas treat the first line as data, not a header.
dolite = pandas.read_csv(
    'do_lite.txt',
    sep='\t',
    names=['disease_name', 'gene_entrez', 'gene_symbol'],
)
dolite.head()


# In[54]:

# Unique disease names in a deterministic (sorted) order.
diseases = sorted(set(dolite['disease_name']))
len(diseases)


# In[55]:

# Write UTF-8 explicitly: read_csv decoded the names as UTF-8, and the
# default for open() depends on the platform locale.
with open('dolite_terms.txt', 'w', encoding='utf-8') as write_file:
    write_file.write('\n'.join(diseases))


# In[56]:

path = os.path.join('..', 'data', 'term-names.tsv')
donames = pandas.read_csv(path, sep='\t')
donames.head()


# In[57]:

# Lower-cased names are the join key. The ``.str`` accessor propagates
# missing names instead of raising on them.
donames['name_lower'] = donames['name'].str.lower()
doname_map = donames[['doid', 'name_lower']].drop_duplicates()
doname_map.head()


# In[58]:

dolite_df = pandas.DataFrame(data=diseases, columns=['dolite_name'])
dolite_df['name_lower'] = dolite_df['dolite_name'].str.lower()
dolite_df.head()


# In[59]:

# Left join keeps every DOLite name; unmatched names get a missing doid.
# ``name_lower`` is the only column the two frames share, so naming it
# explicitly selects the same key the implicit merge would use.
mapping_df = dolite_df.merge(doname_map, how='left', on='name_lower')
mapping_df = mapping_df[['doid', 'dolite_name']].drop_duplicates()
mapping_df.to_csv('dolite_to_doid.tsv', sep='\t', index=False)
mapping_df.head()


# In[60]:

# number of matches (rows whose doid is not missing)
mapping_df['doid'].notna().sum()