In [81]:
import io
import functools
import itertools
import gzip

import pandas

import eutility
In [5]:
# Read MeSH terms to MeSH names
url = 'https://raw.githubusercontent.com/dhimmel/mesh/e561301360e6de2140dedeaa7c7e17ce4714eb7f/data/terms.tsv'
mesh_df = pandas.read_table(url)

# Read MeSH terms mapped to DO Slim terms
url = 'https://raw.githubusercontent.com/dhimmel/disease-ontology/9fd75f14b17e01bebc97faf1bfa1b9025e9ce4de/data/xrefs-slim.tsv'
doslim_xref_df = pandas.read_table(url)
doslim_xref_df = doslim_xref_df[doslim_xref_df.resource == 'MSH'][['doid_code', 'doid_name', 'resource_id']].rename(columns={'resource_id': 'mesh_id'})
disease_df = doslim_xref_df.merge(mesh_df)
disease_df.to_csv('data/DO-slim-to-mesh.tsv', sep='\t', index=False)
disease_df.head()
Out[5]:
doid_code doid_name mesh_id mesh_name
0 DOID:2531 hematologic cancer D019337 Hematologic Neoplasms
1 DOID:1319 brain cancer D001932 Brain Neoplasms
2 DOID:263 kidney cancer D007680 Kidney Neoplasms
3 DOID:1793 pancreatic cancer D010190 Pancreatic Neoplasms
4 DOID:4159 skin cancer D012878 Skin Neoplasms
In [ ]:
# Diseases
In [17]:
rows_out = list()

for i, row in disease_df.iterrows():
    term_query = '{disease}[MeSH Major Topic]'.format(disease = row.mesh_name.lower())
    payload = {'db': 'pubmed', 'term': term_query}
    pmids = eutility.esearch_query(payload, retmax = 10000)
    row['term_query'] = term_query
    row['n_articles'] = len(pmids)
    row['pubmed_ids'] = '|'.join(pmids)
    rows_out.append(row)
    print('{} articles for {}'.format(len(pmids), row.mesh_name))

disease_pmids_df = pandas.DataFrame(rows_out)
7199 articles for Hematologic Neoplasms
98466 articles for Brain Neoplasms
48992 articles for Kidney Neoplasms
45514 articles for Pancreatic Neoplasms
84775 articles for Skin Neoplasms
82806 articles for Bone Neoplasms
53364 articles for Ovarian Neoplasms
186097 articles for Breast Neoplasms
48742 articles for Glioma
88124 articles for Uterine Neoplasms
18380 articles for Adrenal Gland Neoplasms
32981 articles for Esophageal Neoplasms
12012 articles for Salivary Gland Neoplasms
81939 articles for Prostatic Neoplasms
63628 articles for Stomach Neoplasms
37204 articles for Urinary Bladder Neoplasms
14652 articles for Peripheral Nervous System Neoplasms
96253 articles for Liver Neoplasms
3491 articles for Vaginal Neoplasms
207632 articles for Head and Neck Neoplasms
32426 articles for Rectal Neoplasms
28522 articles for Eye Neoplasms
50324 articles for Colonic Neoplasms
19300 articles for Laryngeal Neoplasms
222943 articles for Neoplasms, Germ Cell and Embryonal
7270 articles for Thymus Neoplasms
8485 articles for Myosarcoma
1981 articles for Appendiceal Neoplasms
3430 articles for Ureteral Neoplasms
5828 articles for Vulvar Neoplasms
1636 articles for Jejunal Neoplasms
2432 articles for Vascular Neoplasms
9690 articles for Mesothelioma
59405 articles for Melanoma
2017 articles for Fallopian Tube Neoplasms
18355 articles for Testicular Neoplasms
5867 articles for Gallbladder Neoplasms
15100 articles for Meningeal Neoplasms
10948 articles for Bile Duct Neoplasms
9539 articles for Mediastinal Neoplasms
7679 articles for Spinal Cord Neoplasms
6221 articles for Retroperitoneal Neoplasms
24654 articles for Crohn Disease
38980 articles for Multiple Sclerosis
71135 articles for Diabetes Mellitus, Type 2
21812 articles for Colitis, Ulcerative
50368 articles for Diabetes Mellitus, Type 1
75897 articles for Arthritis, Rheumatoid
32486 articles for Coronary Artery Disease
143223 articles for Coronary Disease
293061 articles for Myocardial Ischemia
103287 articles for Obesity
13615 articles for Celiac Disease
39378 articles for Lupus Erythematosus, Systemic
19549 articles for Refractive Errors
5256 articles for Liver Cirrhosis, Biliary
3445 articles for Vitiligo
13218 articles for Macular Degeneration
15999 articles for Metabolic Syndrome X
87932 articles for Asthma
68694 articles for Schizophrenia
17883 articles for Migraine Disorders
54563 articles for Alzheimer Disease
10904 articles for Graves Disease
39786 articles for Parkinson Disease
11640 articles for Dermatitis, Atopic
24216 articles for Bipolar Disorder
9297 articles for Spondylitis, Ankylosing
8748 articles for Polycystic Ovary Syndrome
154312 articles for Hypertension
13910 articles for Scleroderma, Systemic
6705 articles for Behcet Syndrome
3802 articles for Osteitis Deformans
18506 articles for Leprosy
18546 articles for Intracranial Aneurysm
35084 articles for Glaucoma
11318 articles for Amyotrophic Lateral Sclerosis
2261 articles for Restless Legs Syndrome
4264 articles for Mucocutaneous Lymph Node Syndrome
17451 articles for Atherosclerosis
2110 articles for Alopecia Areata
32190 articles for Osteoporosis
20180 articles for Hypothyroidism
4156 articles for Glomerulonephritis, IGA
4443 articles for Creutzfeldt-Jakob Syndrome
841 articles for Azoospermia
102081 articles for Epilepsy
36282 articles for Hepatitis B
30074 articles for Pulmonary Disease, Chronic Obstructive
12701 articles for Aortic Aneurysm, Abdominal
54351 articles for Kidney Failure, Chronic
2930 articles for Arthritis, Psoriatic
1895 articles for Glomerulonephritis, Membranous
5868 articles for Diabetes, Gestational
43052 articles for Malaria
13789 articles for Autistic Disorder
9994 articles for Cardiomyopathy, Dilated
716 articles for Arthritis, Gouty
14214 articles for Leiomyoma
2271 articles for Cholangitis, Sclerosing
2353 articles for Narcolepsy
9434 articles for Cleft Lip
1223 articles for Idiopathic Pulmonary Fibrosis
16577 articles for Attention Deficit Disorder with Hyperactivity
3113 articles for Tourette Syndrome
7767 articles for Aortic Aneurysm, Thoracic
62986 articles for Depressive Disorder
34975 articles for Pancreatitis
12096 articles for Nephrolithiasis
16153 articles for Periodontitis
5172 articles for Barrett Esophagus
535 articles for Fuchs' Endothelial Dystrophy
3861 articles for Otosclerosis
1469 articles for Conduct Disorder
2932 articles for Glomerulosclerosis, Focal Segmental
25723 articles for Dental Caries
104377 articles for Anemia
4587 articles for Panic Disorder
58288 articles for Acquired Immunodeficiency Syndrome
In [25]:
with gzip.open('data/disease-pmids.tsv.gz', 'w') as write_file:
    write_file = io.TextIOWrapper(write_file)
    disease_pmids_df.to_csv(write_file, sep='\t', index=False)
In [23]:
# Symptoms
In [24]:
# Read MeSH Symptoms
url = 'https://raw.githubusercontent.com/dhimmel/mesh/e561301360e6de2140dedeaa7c7e17ce4714eb7f/data/symptoms.tsv'
symptom_df = pandas.read_table(url)
symptom_df.head()
Out[24]:
mesh_id mesh_name in_hsdn
0 D000006 Abdomen, Acute 1
1 D000270 Adie Syndrome 0
2 D000326 Adrenoleukodystrophy 0
3 D000334 Aerophagy 1
4 D000370 Ageusia 1
In [ ]:
rows_out = list()

for i, row in symptom_df.iterrows():
    term_query = '{symptom}[MeSH Terms:noexp]'.format(symptom = row.mesh_name.lower())
    payload = {'db': 'pubmed', 'term': term_query}
    pmids = eutility.esearch_query(payload, retmax = 5000, sleep=2)
    row['term_query'] = term_query
    row['n_articles'] = len(pmids)
    row['pubmed_ids'] = '|'.join(pmids)
    rows_out.append(row)
    print('{} articles for {}'.format(len(pmids), row.mesh_name))
In [41]:
symptom_pmids_df = pandas.DataFrame(rows_out)

with gzip.open('data/symptom-pmids.tsv.gz', 'w') as write_file:
    write_file = io.TextIOWrapper(write_file)
    symptom_pmids_df.to_csv(write_file, sep='\t', index=False)

symptom_pmids_df.head()
Out[41]:
mesh_id mesh_name in_hsdn term_query n_articles pubmed_ids
0 D000006 Abdomen, Acute 1 abdomen, acute[MeSH Terms:noexp] 8465 25669229|25650451|25619050|25608417|25543890|2...
1 D000270 Adie Syndrome 0 adie syndrome[MeSH Terms:noexp] 311 24995781|24625775|24215593|23952008|23809464|2...
2 D000326 Adrenoleukodystrophy 0 adrenoleukodystrophy[MeSH Terms:noexp] 1506 25583825|25378668|25297370|25275259|25149411|2...
3 D000334 Aerophagy 1 aerophagy[MeSH Terms:noexp] 260 25073665|24796405|23772202|23772201|23636521|2...
4 D000370 Ageusia 1 ageusia[MeSH Terms:noexp] 220 24825557|24782205|24191925|24137848|24088167|2...
In [ ]:
 
In [130]:
def read_pmids_tsv(path, key, min_articles = 5):
    term_to_pmids = dict()
    pmids_df = pandas.read_table(path, compression='gzip')
    pmids_df = pmids_df[pmids_df.n_articles >= min_articles]
    for i, row in pmids_df.iterrows():
        term = row[key]
        pmids = row.pubmed_ids.split('|')
        term_to_pmids[term] = set(pmids)
    pmids_df.drop('pubmed_ids', axis=1, inplace=True)
    return pmids_df, term_to_pmids
In [131]:
symptom_df, symptom_to_pmids = read_pmids_tsv('data/symptom-pmids.tsv.gz', key='mesh_id')
disease_df, disease_to_pmids = read_pmids_tsv('data/disease-pmids.tsv.gz', key='doid_code')
In [79]:
symptom_pmids = set.union(*symptom_to_pmids.values())
len(symptom_pmids)
Out[79]:
1741776
In [80]:
disease_pmids = set.union(*disease_to_pmids.values())
len(disease_pmids)
Out[80]:
3413567
In [138]:
def score_pmid_cooccurrence(term0_to_pmids, term1_to_pmids, term0_name='term_0', term1_name='term_1'):
    all_pmids0 = set.union(*term0_to_pmids.values())
    all_pmids1 = set.union(*term1_to_pmids.values())
    pmids_in_both = all_pmids0 & all_pmids1
    total_pmids = len(pmids_in_both)
    
    term0_to_pmids = term0_to_pmids.copy()
    term1_to_pmids = term1_to_pmids.copy()
    for d in term0_to_pmids, term1_to_pmids:
        for key, value in list(d.items()):
            d[key] = value & pmids_in_both
            if not d[key]:
                del d[key]
    
    rows = list()
    for term0, term1 in itertools.product(term0_to_pmids, term1_to_pmids):
        pmids0 = term0_to_pmids[term0]
        pmids1 = term1_to_pmids[term1]
        count = len(pmids0 & pmids1)
        expected = len(pmids0) * len(pmids1) / total_pmids
        enrichment = count / expected
        contingency_table = [[count, total_pmids - count], [expected, total_pmids - expected]]
        oddsratio, pvalue = scipy.stats.fisher_exact(contingency_table, alternative='greater')
        rows.append([term0, term1, count, expected, enrichment, oddsratio, pvalue])
    columns = [term0_name, term1_name, 'cooccurrence', 'expected', 'enrichment', 'odds_ratio', 'p_fisher']
    df = pandas.DataFrame(rows, columns=columns)
    return df
In [141]:
cooc_df = score_pmid_cooccurrence(disease_to_pmids, symptom_to_pmids, 'doid_code', 'mesh_id')
cooc_df = symptom_df[['mesh_id', 'mesh_name']].merge(cooc_df)
cooc_df = disease_df[['doid_code', 'doid_name']].merge(cooc_df)
cooc_df = cooc_df.sort(['doid_name', 'p_fisher'])
cooc_df.to_csv('data/disease-symptom-cooccurrence.tsv', index=False, sep='\t')
cooc_df.head()
Out[141]:
doid_code doid_name mesh_id mesh_name cooccurrence expected enrichment odds_ratio p_fisher
26450 DOID:10652 Alzheimer's disease D008569 Memory Disorders 1564 67.100289 23.308394 23.432635 0.000000e+00
26360 DOID:10652 Alzheimer's disease D004314 Down Syndrome 800 32.051845 24.959561 25.048966 1.443810e-193
26494 DOID:10652 Alzheimer's disease D011595 Psychomotor Agitation 331 13.675650 24.203603 25.482125 3.284937e-81
26299 DOID:10652 Alzheimer's disease D000647 Amnesia 303 12.059837 25.124717 25.268675 2.382544e-74
26423 DOID:10652 Alzheimer's disease D006816 Huntington Disease 251 10.884701 23.059889 25.115362 9.196812e-62
In [ ]:
 
In [111]:
import numpy
import scipy
import seaborn
import matplotlib.pyplot as plt

%matplotlib inline
In [129]:
sig_df = cooc_df[cooc_df.p_fisher < 0.05]
plt.hist(list(numpy.log(sig_df.enrichment)), bins = 50);
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: