import os
import pandas
# DO Lite disease-gene associations, downloaded from
# http://django.nubic.northwestern.edu/fundo/media/data/do_lite.txt
# The file is read as tab-separated text; column names are supplied here.
dolite = pandas.read_csv(
    'do_lite.txt',
    sep='\t',
    names=['disease_name', 'gene_entrez', 'gene_symbol'],
)
# Preview the first five rows.
dolite.head(5)
| | disease_name | gene_entrez | gene_symbol |
---|---|---|---|
0 | Adenocarcinoma, Mucinous | 10801 | SEPT9 |
1 | Adenocarcinoma, Mucinous | 10164 | CHST4 |
2 | Adenocarcinoma, Mucinous | 3860 | KRT13 |
3 | Hemorrhagic fevers, Viral | 3383 | ICAM1 |
4 | Hemorrhagic fevers, Viral | 3569 | IL6 |
# Collapse the per-gene rows to the distinct disease names, in sorted order.
unique_disease_names = set(dolite['disease_name'])
diseases = sorted(unique_disease_names)
# Number of distinct DO Lite disease names.
len(diseases)
561
# Save the names held in `diseases`, one per line (newline-joined, so the
# file has no trailing newline).
with open('dolite_terms.txt', 'w') as write_file:
    write_file.write('\n'.join(diseases))
# Tab-separated table of term names; the preview shows columns
# doid, name and type.
path = os.path.join('..', 'data', 'term-names.tsv')
donames = pandas.read_csv(
    path,
    sep='\t',
)
# Preview the first five rows.
donames.head(5)
| | doid | name | type |
---|---|---|---|
0 | DOID:3301 | gonadoblastoma | name |
1 | DOID:3652 | Leigh disease | name |
2 | DOID:3652 | Infantile necrotizing encephalomyelopathy | exact-synonym |
3 | DOID:3652 | juvenile subacute necrotizing encephalomyelopathy | exact-synonym |
4 | DOID:3652 | Leigh syndrome | exact-synonym |
# Add a lower-cased copy of every term name so that names can be compared
# case-insensitively.
donames['name_lower'] = [term_name.lower() for term_name in donames['name']]
# Keep one row per distinct (doid, lower-cased name) pair.
doid_name_pairs = donames[['doid', 'name_lower']]
doname_map = doid_name_pairs.drop_duplicates()
# Preview the first five rows.
doname_map.head(5)
| | doid | name_lower |
---|---|---|
0 | DOID:3301 | gonadoblastoma |
1 | DOID:3652 | leigh disease |
2 | DOID:3652 | infantile necrotizing encephalomyelopathy |
3 | DOID:3652 | juvenile subacute necrotizing encephalomyelopathy |
4 | DOID:3652 | leigh syndrome |
# One row per DO Lite disease name, with a lower-cased copy of the name in
# a second column.
dolite_df = pandas.DataFrame(diseases, columns=['dolite_name'])
dolite_df['name_lower'] = [
    disease.lower() for disease in dolite_df['dolite_name']
]
# Preview the first five rows.
dolite_df.head(5)
| | dolite_name | name_lower |
---|---|---|
0 | AIDS | aids |
1 | Abortion | abortion |
2 | Abruption placentae | abruption placentae |
3 | Achalasia and cardiospasm | achalasia and cardiospasm |
4 | Acne | acne |
# Left-join on the lower-cased name so every DO Lite name is kept; names
# with no matching term get a missing (NaN) doid.
merged_names = dolite_df.merge(doname_map, how='left', on='name_lower')
# Drop the helper column and collapse repeated (doid, dolite_name) pairs.
mapping_df = merged_names[['doid', 'dolite_name']].drop_duplicates()
# Save the mapping as a tab-separated file without the row index.
mapping_df.to_csv('dolite_to_doid.tsv', sep='\t', index=False)
# Preview the first five rows.
mapping_df.head(5)
| | doid | dolite_name |
---|---|---|
0 | DOID:635 | AIDS |
1 | NaN | Abortion |
2 | NaN | Abruption placentae |
3 | NaN | Achalasia and cardiospasm |
4 | DOID:6543 | Acne |
# Number of matches: rows of mapping_df whose doid is a string. Unmatched
# rows show NaN in the preview, which is not a str, so they are not counted.
len([doid for doid in mapping_df['doid'] if isinstance(doid, str)])
372