import io
import functools
import itertools
import gzip
import pandas
import eutility
# Load the table of MeSH terms (it supplies the mesh_name column used below)
url = 'https://raw.githubusercontent.com/dhimmel/mesh/e561301360e6de2140dedeaa7c7e17ce4714eb7f/data/terms.tsv'
mesh_df = pandas.read_table(url)

# Load the DO Slim cross-references and keep only those pointing at MeSH ('MSH')
url = 'https://raw.githubusercontent.com/dhimmel/disease-ontology/9fd75f14b17e01bebc97faf1bfa1b9025e9ce4de/data/xrefs-slim.tsv'
doslim_xref_df = pandas.read_table(url)
is_mesh_xref = doslim_xref_df.resource == 'MSH'
doslim_xref_df = doslim_xref_df.loc[is_mesh_xref, ['doid_code', 'doid_name', 'resource_id']]
doslim_xref_df = doslim_xref_df.rename(columns={'resource_id': 'mesh_id'})

# Join on the columns the two tables share (pandas' default; presumably
# just mesh_id) and save the resulting disease-to-MeSH mapping
disease_df = doslim_xref_df.merge(mesh_df)
disease_df.to_csv('data/DO-slim-to-mesh.tsv', sep='\t', index=False)
disease_df.head()
doid_code | doid_name | mesh_id | mesh_name | |
---|---|---|---|---|
0 | DOID:2531 | hematologic cancer | D019337 | Hematologic Neoplasms |
1 | DOID:1319 | brain cancer | D001932 | Brain Neoplasms |
2 | DOID:263 | kidney cancer | D007680 | Kidney Neoplasms |
3 | DOID:1793 | pancreatic cancer | D010190 | Pancreatic Neoplasms |
4 | DOID:4159 | skin cancer | D012878 | Skin Neoplasms |
# Diseases
# Query PubMed once per disease and record the matching article identifiers.
rows_out = []
for _, row in disease_df.iterrows():
    term_query = '{disease}[MeSH Major Topic]'.format(disease=row.mesh_name.lower())
    # presumably returns a list of PubMed ID strings -- verify against eutility
    pmids = eutility.esearch_query({'db': 'pubmed', 'term': term_query}, retmax=10000)
    n_articles = len(pmids)
    # Extend the per-row Series with the query and its results
    row['term_query'] = term_query
    row['n_articles'] = n_articles
    row['pubmed_ids'] = '|'.join(pmids)
    rows_out.append(row)
    print('{} articles for {}'.format(n_articles, row.mesh_name))
disease_pmids_df = pandas.DataFrame(rows_out)
7199 articles for Hematologic Neoplasms 98466 articles for Brain Neoplasms 48992 articles for Kidney Neoplasms 45514 articles for Pancreatic Neoplasms 84775 articles for Skin Neoplasms 82806 articles for Bone Neoplasms 53364 articles for Ovarian Neoplasms 186097 articles for Breast Neoplasms 48742 articles for Glioma 88124 articles for Uterine Neoplasms 18380 articles for Adrenal Gland Neoplasms 32981 articles for Esophageal Neoplasms 12012 articles for Salivary Gland Neoplasms 81939 articles for Prostatic Neoplasms 63628 articles for Stomach Neoplasms 37204 articles for Urinary Bladder Neoplasms 14652 articles for Peripheral Nervous System Neoplasms 96253 articles for Liver Neoplasms 3491 articles for Vaginal Neoplasms 207632 articles for Head and Neck Neoplasms 32426 articles for Rectal Neoplasms 28522 articles for Eye Neoplasms 50324 articles for Colonic Neoplasms 19300 articles for Laryngeal Neoplasms 222943 articles for Neoplasms, Germ Cell and Embryonal 7270 articles for Thymus Neoplasms 8485 articles for Myosarcoma 1981 articles for Appendiceal Neoplasms 3430 articles for Ureteral Neoplasms 5828 articles for Vulvar Neoplasms 1636 articles for Jejunal Neoplasms 2432 articles for Vascular Neoplasms 9690 articles for Mesothelioma 59405 articles for Melanoma 2017 articles for Fallopian Tube Neoplasms 18355 articles for Testicular Neoplasms 5867 articles for Gallbladder Neoplasms 15100 articles for Meningeal Neoplasms 10948 articles for Bile Duct Neoplasms 9539 articles for Mediastinal Neoplasms 7679 articles for Spinal Cord Neoplasms 6221 articles for Retroperitoneal Neoplasms 24654 articles for Crohn Disease 38980 articles for Multiple Sclerosis 71135 articles for Diabetes Mellitus, Type 2 21812 articles for Colitis, Ulcerative 50368 articles for Diabetes Mellitus, Type 1 75897 articles for Arthritis, Rheumatoid 32486 articles for Coronary Artery Disease 143223 articles for Coronary Disease 293061 articles for Myocardial Ischemia 103287 articles for Obesity 
13615 articles for Celiac Disease 39378 articles for Lupus Erythematosus, Systemic 19549 articles for Refractive Errors 5256 articles for Liver Cirrhosis, Biliary 3445 articles for Vitiligo 13218 articles for Macular Degeneration 15999 articles for Metabolic Syndrome X 87932 articles for Asthma 68694 articles for Schizophrenia 17883 articles for Migraine Disorders 54563 articles for Alzheimer Disease 10904 articles for Graves Disease 39786 articles for Parkinson Disease 11640 articles for Dermatitis, Atopic 24216 articles for Bipolar Disorder 9297 articles for Spondylitis, Ankylosing 8748 articles for Polycystic Ovary Syndrome 154312 articles for Hypertension 13910 articles for Scleroderma, Systemic 6705 articles for Behcet Syndrome 3802 articles for Osteitis Deformans 18506 articles for Leprosy 18546 articles for Intracranial Aneurysm 35084 articles for Glaucoma 11318 articles for Amyotrophic Lateral Sclerosis 2261 articles for Restless Legs Syndrome 4264 articles for Mucocutaneous Lymph Node Syndrome 17451 articles for Atherosclerosis 2110 articles for Alopecia Areata 32190 articles for Osteoporosis 20180 articles for Hypothyroidism 4156 articles for Glomerulonephritis, IGA 4443 articles for Creutzfeldt-Jakob Syndrome 841 articles for Azoospermia 102081 articles for Epilepsy 36282 articles for Hepatitis B 30074 articles for Pulmonary Disease, Chronic Obstructive 12701 articles for Aortic Aneurysm, Abdominal 54351 articles for Kidney Failure, Chronic 2930 articles for Arthritis, Psoriatic 1895 articles for Glomerulonephritis, Membranous 5868 articles for Diabetes, Gestational 43052 articles for Malaria 13789 articles for Autistic Disorder 9994 articles for Cardiomyopathy, Dilated 716 articles for Arthritis, Gouty 14214 articles for Leiomyoma 2271 articles for Cholangitis, Sclerosing 2353 articles for Narcolepsy 9434 articles for Cleft Lip 1223 articles for Idiopathic Pulmonary Fibrosis 16577 articles for Attention Deficit Disorder with Hyperactivity 3113 articles 
for Tourette Syndrome 7767 articles for Aortic Aneurysm, Thoracic 62986 articles for Depressive Disorder 34975 articles for Pancreatitis 12096 articles for Nephrolithiasis 16153 articles for Periodontitis 5172 articles for Barrett Esophagus 535 articles for Fuchs' Endothelial Dystrophy 3861 articles for Otosclerosis 1469 articles for Conduct Disorder 2932 articles for Glomerulosclerosis, Focal Segmental 25723 articles for Dental Caries 104377 articles for Anemia 4587 articles for Panic Disorder 58288 articles for Acquired Immunodeficiency Syndrome
# Write the disease PubMed table as gzip-compressed TSV.
# Opening in text mode ('wt') lets the context manager own the text layer, so
# it is flushed and closed before the gzip stream is finalised. Wrapping a
# binary handle in io.TextIOWrapper by hand left the wrapper unflushed when the
# underlying file was closed.
with gzip.open('data/disease-pmids.tsv.gz', 'wt') as write_file:
    disease_pmids_df.to_csv(write_file, sep='\t', index=False)
# Symptoms
# Load the tab-separated table of MeSH symptom terms
url = 'https://raw.githubusercontent.com/dhimmel/mesh/e561301360e6de2140dedeaa7c7e17ce4714eb7f/data/symptoms.tsv'
symptom_df = pandas.read_csv(url, sep='\t')
symptom_df.head()
mesh_id | mesh_name | in_hsdn | |
---|---|---|---|
0 | D000006 | Abdomen, Acute | 1 |
1 | D000270 | Adie Syndrome | 0 |
2 | D000326 | Adrenoleukodystrophy | 0 |
3 | D000334 | Aerophagy | 1 |
4 | D000370 | Ageusia | 1 |
# Query PubMed once per symptom and record the matching article identifiers.
rows_out = []
for _, row in symptom_df.iterrows():
    term_query = '{symptom}[MeSH Terms:noexp]'.format(symptom=row.mesh_name.lower())
    # presumably returns a list of PubMed ID strings -- verify against eutility
    pmids = eutility.esearch_query({'db': 'pubmed', 'term': term_query}, retmax=5000, sleep=2)
    n_articles = len(pmids)
    # Extend the per-row Series with the query and its results
    row['term_query'] = term_query
    row['n_articles'] = n_articles
    row['pubmed_ids'] = '|'.join(pmids)
    rows_out.append(row)
    print('{} articles for {}'.format(n_articles, row.mesh_name))
symptom_pmids_df = pandas.DataFrame(rows_out)
# Write the symptom PubMed table as gzip-compressed TSV.
# Opening in text mode ('wt') lets the context manager own the text layer, so
# it is flushed and closed before the gzip stream is finalised. Wrapping a
# binary handle in io.TextIOWrapper by hand left the wrapper unflushed when the
# underlying file was closed.
with gzip.open('data/symptom-pmids.tsv.gz', 'wt') as write_file:
    symptom_pmids_df.to_csv(write_file, sep='\t', index=False)
symptom_pmids_df.head()
mesh_id | mesh_name | in_hsdn | term_query | n_articles | pubmed_ids | |
---|---|---|---|---|---|---|
0 | D000006 | Abdomen, Acute | 1 | abdomen, acute[MeSH Terms:noexp] | 8465 | 25669229|25650451|25619050|25608417|25543890|2... |
1 | D000270 | Adie Syndrome | 0 | adie syndrome[MeSH Terms:noexp] | 311 | 24995781|24625775|24215593|23952008|23809464|2... |
2 | D000326 | Adrenoleukodystrophy | 0 | adrenoleukodystrophy[MeSH Terms:noexp] | 1506 | 25583825|25378668|25297370|25275259|25149411|2... |
3 | D000334 | Aerophagy | 1 | aerophagy[MeSH Terms:noexp] | 260 | 25073665|24796405|23772202|23772201|23636521|2... |
4 | D000370 | Ageusia | 1 | ageusia[MeSH Terms:noexp] | 220 | 24825557|24782205|24191925|24137848|24088167|2... |
def read_pmids_tsv(path, key, min_articles = 5):
    """Read a gzipped term-to-PubMed TSV and index its articles by term.

    Parameters
    ----------
    path : path of a gzip-compressed, tab-separated file that has at least
        the columns named by `key`, `n_articles` and `pubmed_ids`
        (pipe-delimited identifiers).
    key : name of the column whose values become the dictionary keys.
    min_articles : rows with `n_articles` below this value are discarded.

    Returns
    -------
    (pmids_df, term_to_pmids) : the filtered table without its `pubmed_ids`
        column, and a dict mapping each term to the set of its PubMed IDs
        (as strings).
    """
    # Force the identifier column to text: a column in which every row holds a
    # single ID would otherwise be parsed as numbers and have no `.split`.
    pmids_df = pandas.read_table(path, compression='gzip', dtype={'pubmed_ids': str})
    pmids_df = pmids_df[pmids_df.n_articles >= min_articles]

    term_to_pmids = dict()
    for term, pmids in zip(pmids_df[key], pmids_df['pubmed_ids']):
        # An empty cell is read back as NaN (not a str): treat it as "no articles"
        if isinstance(pmids, str) and pmids:
            term_to_pmids[term] = set(pmids.split('|'))
        else:
            term_to_pmids[term] = set()

    # Return a new frame rather than dropping in place on a filtered slice
    pmids_df = pmids_df.drop('pubmed_ids', axis=1)
    return pmids_df, term_to_pmids
# Reload the saved tables; each call returns the per-term table and a
# dict mapping term -> set of PubMed IDs
symptom_df, symptom_to_pmids = read_pmids_tsv('data/symptom-pmids.tsv.gz', key='mesh_id')
disease_df, disease_to_pmids = read_pmids_tsv('data/disease-pmids.tsv.gz', key='doid_code')
# All articles annotated with any symptom. set().union(...) also works when
# the mapping is empty, whereas set.union(*[]) raises TypeError.
symptom_pmids = set().union(*symptom_to_pmids.values())
len(symptom_pmids)
1741776
# All articles annotated with any disease. set().union(...) also works when
# the mapping is empty, whereas set.union(*[]) raises TypeError.
disease_pmids = set().union(*disease_to_pmids.values())
len(disease_pmids)
3413567
def score_pmid_cooccurrence(term0_to_pmids, term1_to_pmids, term0_name='term_0', term1_name='term_1'):
    """Score the literature co-occurrence of every term pair.

    Only articles annotated with at least one term from each mapping form the
    background. Terms left with no background article are skipped.

    Parameters
    ----------
    term0_to_pmids, term1_to_pmids : dicts mapping a term to the set of
        article identifiers annotated with it. Neither dict is modified.
    term0_name, term1_name : column names for the two term columns.

    Returns
    -------
    pandas.DataFrame with one row per (term0, term1) pair and the columns
    [term0_name, term1_name, 'cooccurrence', 'expected', 'enrichment',
    'odds_ratio', 'p_fisher']. It is empty when no article is shared
    between the two mappings.
    """
    # Imported locally: nothing at the top of the file brings scipy.stats into
    # scope before this function is called.
    import scipy.stats

    # set().union(...) tolerates an empty mapping; set.union(*[]) raises TypeError
    all_pmids0 = set().union(*term0_to_pmids.values())
    all_pmids1 = set().union(*term1_to_pmids.values())
    pmids_in_both = all_pmids0 & all_pmids1
    total_pmids = len(pmids_in_both)

    # Restrict every term to the background and drop terms that become empty.
    # New dicts are built so that the caller's mappings stay untouched.
    restricted = []
    for term_to_pmids in (term0_to_pmids, term1_to_pmids):
        kept = {}
        for term, pmids in term_to_pmids.items():
            shared = pmids & pmids_in_both
            if shared:
                kept[term] = shared
        restricted.append(kept)
    term0_to_pmids, term1_to_pmids = restricted

    rows = list()
    for term0, term1 in itertools.product(term0_to_pmids, term1_to_pmids):
        pmids0 = term0_to_pmids[term0]
        pmids1 = term1_to_pmids[term1]
        count = len(pmids0 & pmids1)
        # Count expected if the two terms were annotated independently
        expected = len(pmids0) * len(pmids1) / total_pmids
        enrichment = count / expected
        # NOTE(review): the second row holds expected (generally non-integer)
        # counts rather than observed ones, while scipy documents fisher_exact
        # as taking a table of integers -- confirm this is the intended test.
        contingency_table = [[count, total_pmids - count], [expected, total_pmids - expected]]
        oddsratio, pvalue = scipy.stats.fisher_exact(contingency_table, alternative='greater')
        rows.append([term0, term1, count, expected, enrichment, oddsratio, pvalue])

    columns = [term0_name, term1_name, 'cooccurrence', 'expected', 'enrichment', 'odds_ratio', 'p_fisher']
    df = pandas.DataFrame(rows, columns=columns)
    return df
# Score every disease-symptom pair, then attach the symptom and disease names
cooc_df = score_pmid_cooccurrence(disease_to_pmids, symptom_to_pmids, 'doid_code', 'mesh_id')
cooc_df = symptom_df[['mesh_id', 'mesh_name']].merge(cooc_df)
cooc_df = disease_df[['doid_code', 'doid_name']].merge(cooc_df)
# DataFrame.sort has been removed from pandas; sort_values is its replacement.
# Within each disease, the smallest p-values come first.
cooc_df = cooc_df.sort_values(['doid_name', 'p_fisher'])
cooc_df.to_csv('data/disease-symptom-cooccurrence.tsv', index=False, sep='\t')
cooc_df.head()
doid_code | doid_name | mesh_id | mesh_name | cooccurrence | expected | enrichment | odds_ratio | p_fisher | |
---|---|---|---|---|---|---|---|---|---|
26450 | DOID:10652 | Alzheimer's disease | D008569 | Memory Disorders | 1564 | 67.100289 | 23.308394 | 23.432635 | 0.000000e+00 |
26360 | DOID:10652 | Alzheimer's disease | D004314 | Down Syndrome | 800 | 32.051845 | 24.959561 | 25.048966 | 1.443810e-193 |
26494 | DOID:10652 | Alzheimer's disease | D011595 | Psychomotor Agitation | 331 | 13.675650 | 24.203603 | 25.482125 | 3.284937e-81 |
26299 | DOID:10652 | Alzheimer's disease | D000647 | Amnesia | 303 | 12.059837 | 25.124717 | 25.268675 | 2.382544e-74 |
26423 | DOID:10652 | Alzheimer's disease | D006816 | Huntington Disease | 251 | 10.884701 | 23.059889 | 25.115362 | 9.196812e-62 |
import numpy
import scipy
import seaborn
import matplotlib.pyplot as plt
%matplotlib inline
# Histogram of the natural-log enrichment over pairs with Fisher p < 0.05
is_significant = cooc_df.p_fisher < 0.05
sig_df = cooc_df[is_significant]
log_enrichment = numpy.log(sig_df.enrichment)
plt.hist(list(log_enrichment), bins=50);