import io
import gzip
import pandas
import requests
import networkx
import eutility
import cooccurrence
# MeSH term table, fetched from a fixed commit of dhimmel/mesh so reruns see the same data.
url = (
    'https://raw.githubusercontent.com/dhimmel/mesh/'
    'e561301360e6de2140dedeaa7c7e17ce4714eb7f/data/terms.tsv'
)
mesh_df = pandas.read_table(url)

# DO Slim cross-references, fetched from a fixed commit of dhimmel/disease-ontology.
url = (
    'https://raw.githubusercontent.com/dhimmel/disease-ontology/'
    '72614ade9f1cc5a5317b8f6836e1e464b31d5587/data/xrefs-slim.tsv'
)
doslim_xref_df = pandas.read_table(url)

# Keep only the cross-references whose resource is 'MSH' and expose the
# referenced identifier under the column name `mesh_id`.
doslim_xref_df = doslim_xref_df.loc[
    doslim_xref_df.resource == 'MSH',
    ['doid_code', 'doid_name', 'resource_id'],
]
doslim_xref_df = doslim_xref_df.rename(columns={'resource_id': 'mesh_id'})

# No join key is given, so pandas merges on the columns both tables share.
disease_df = doslim_xref_df.merge(mesh_df)
disease_df.to_csv('data/DO-slim-to-mesh.tsv', sep='\t', index=False)
len(disease_df)
135
# For every disease, search PubMed for articles tagged with its MeSH name as a
# major topic and record the query, the hit count and the pipe-joined PubMed IDs.
disease_records = []
for _, record in disease_df.iterrows():
    # iterrows() yields a fresh Series per row, so adding fields to it does
    # not modify disease_df itself.
    query = '{disease}[MeSH Major Topic]'.format(disease=record.mesh_name.lower())
    # NOTE(review): retmax is forwarded to eutility as-is; the recorded counts
    # exceed 10000, so it is presumably a per-request batch size -- confirm.
    article_ids = eutility.esearch_query({'db': 'pubmed', 'term': query}, retmax=10000)
    n_found = len(article_ids)
    record['term_query'] = query
    record['n_articles'] = n_found
    record['pubmed_ids'] = '|'.join(article_ids)
    disease_records.append(record)
    print('{} articles for {}'.format(n_found, record.mesh_name))
disease_pmids_df = pandas.DataFrame(disease_records)
7213 articles for Hematologic Neoplasms 98558 articles for Brain Neoplasms 137542 articles for Lung Neoplasms 49057 articles for Kidney Neoplasms 45600 articles for Pancreatic Neoplasms 84831 articles for Skin Neoplasms 82879 articles for Bone Neoplasms 21863 articles for Pharyngeal Neoplasms 53403 articles for Ovarian Neoplasms 186325 articles for Breast Neoplasms 48809 articles for Glioma 88187 articles for Uterine Neoplasms 18395 articles for Adrenal Gland Neoplasms 33026 articles for Esophageal Neoplasms 12025 articles for Salivary Gland Neoplasms 82041 articles for Prostatic Neoplasms 63673 articles for Stomach Neoplasms 37244 articles for Urinary Bladder Neoplasms 14661 articles for Peripheral Nervous System Neoplasms 32853 articles for Thyroid Neoplasms 96367 articles for Liver Neoplasms 49717 articles for Uterine Cervical Neoplasms 3492 articles for Vaginal Neoplasms 207845 articles for Head and Neck Neoplasms 32455 articles for Rectal Neoplasms 28541 articles for Eye Neoplasms 50361 articles for Colonic Neoplasms 19305 articles for Laryngeal Neoplasms 223120 articles for Neoplasms, Germ Cell and Embryonal 7274 articles for Thymus Neoplasms 8490 articles for Myosarcoma 4342 articles for Duodenal Neoplasms 2147 articles for Ileal Neoplasms 89596 articles for Sarcoma 1984 articles for Appendiceal Neoplasms 3854 articles for Penile Neoplasms 3432 articles for Ureteral Neoplasms 2514 articles for Tracheal Neoplasms 5829 articles for Vulvar Neoplasms 1636 articles for Jejunal Neoplasms 9733 articles for Peritoneal Neoplasms 2437 articles for Vascular Neoplasms 9700 articles for Mesothelioma 59447 articles for Melanoma 2018 articles for Fallopian Tube Neoplasms 18367 articles for Testicular Neoplasms 5872 articles for Gallbladder Neoplasms 15110 articles for Meningeal Neoplasms 10961 articles for Bile Duct Neoplasms 9541 articles for Mediastinal Neoplasms 7693 articles for Spinal Cord Neoplasms 6224 articles for Retroperitoneal Neoplasms 24687 articles for Crohn 
Disease 39017 articles for Multiple Sclerosis 71286 articles for Diabetes Mellitus, Type 2 21833 articles for Colitis, Ulcerative 50405 articles for Diabetes Mellitus, Type 1 75972 articles for Arthritis, Rheumatoid 32563 articles for Coronary Artery Disease 143358 articles for Coronary Disease 293328 articles for Myocardial Ischemia 103441 articles for Obesity 13619 articles for Celiac Disease 39405 articles for Lupus Erythematosus, Systemic 19571 articles for Refractive Errors 5261 articles for Liver Cirrhosis, Biliary 3448 articles for Vitiligo 13232 articles for Macular Degeneration 16029 articles for Metabolic Syndrome X 87970 articles for Asthma 24985 articles for Psoriasis 68725 articles for Schizophrenia 17897 articles for Migraine Disorders 54613 articles for Alzheimer Disease 10918 articles for Graves Disease 39842 articles for Parkinson Disease 11649 articles for Dermatitis, Atopic 24237 articles for Bipolar Disorder 9306 articles for Spondylitis, Ankylosing 8762 articles for Polycystic Ovary Syndrome 154478 articles for Hypertension 13923 articles for Scleroderma, Systemic 6711 articles for Behcet Syndrome 3804 articles for Osteitis Deformans 18506 articles for Leprosy 18562 articles for Intracranial Aneurysm 35107 articles for Glaucoma 11330 articles for Amyotrophic Lateral Sclerosis 2263 articles for Restless Legs Syndrome 4271 articles for Mucocutaneous Lymph Node Syndrome 17504 articles for Atherosclerosis 2111 articles for Alopecia Areata 32226 articles for Osteoporosis 20187 articles for Hypothyroidism 4163 articles for Glomerulonephritis, IGA 49237 articles for Alcoholism 4445 articles for Creutzfeldt-Jakob Syndrome 842 articles for Azoospermia 102134 articles for Epilepsy 36304 articles for Hepatitis B 30117 articles for Pulmonary Disease, Chronic Obstructive 12709 articles for Aortic Aneurysm, Abdominal 54376 articles for Kidney Failure, Chronic 32784 articles for Osteoarthritis 2935 articles for Arthritis, Psoriatic 6253 articles for Tobacco 
Use Disorder 1897 articles for Glomerulonephritis, Membranous 5886 articles for Diabetes, Gestational 43086 articles for Malaria 13804 articles for Autistic Disorder 10003 articles for Cardiomyopathy, Dilated 717 articles for Arthritis, Gouty 14223 articles for Leiomyoma 2274 articles for Cholangitis, Sclerosing 2357 articles for Narcolepsy 1449 articles for Intervertebral Disc Degeneration 9441 articles for Cleft Lip 1226 articles for Idiopathic Pulmonary Fibrosis 16613 articles for Attention Deficit Disorder with Hyperactivity 3118 articles for Tourette Syndrome 7770 articles for Aortic Aneurysm, Thoracic 63035 articles for Depressive Disorder 13751 articles for Rhinitis, Allergic 34999 articles for Pancreatitis 12098 articles for Nephrolithiasis 16165 articles for Periodontitis 5175 articles for Barrett Esophagus 539 articles for Fuchs' Endothelial Dystrophy 3862 articles for Otosclerosis 1470 articles for Conduct Disorder 2943 articles for Glomerulosclerosis, Focal Segmental 25730 articles for Dental Caries 104427 articles for Anemia 4588 articles for Panic Disorder 58290 articles for Acquired Immunodeficiency Syndrome
# Save the per-disease PubMed results as a gzip-compressed TSV.
# The file is opened in text mode ('wt') so gzip itself owns the text layer:
# leaving the `with` block flushes and closes it. Wrapping a binary GzipFile in
# a separate io.TextIOWrapper leaves that wrapper unflushed when the underlying
# file is closed, which can drop the buffered tail of the table.
with gzip.open('data/disease-pmids.tsv.gz', 'wt') as write_file:
    disease_pmids_df.to_csv(write_file, sep='\t', index=False)
# MeSH symptom table, fetched from the same fixed dhimmel/mesh commit used for
# the term table so both inputs stay in sync.
url = (
    'https://raw.githubusercontent.com/dhimmel/mesh/'
    'e561301360e6de2140dedeaa7c7e17ce4714eb7f/data/symptoms.tsv'
)
symptom_df = pandas.read_table(url)
# Preview the first rows (displayed when run as a notebook cell).
symptom_df.head()
mesh_id | mesh_name | in_hsdn | |
---|---|---|---|
0 | D000006 | Abdomen, Acute | 1 |
1 | D000270 | Adie Syndrome | 0 |
2 | D000326 | Adrenoleukodystrophy | 0 |
3 | D000334 | Aerophagy | 1 |
4 | D000370 | Ageusia | 1 |
# For every symptom, search PubMed for articles indexed with its MeSH name
# (the `:noexp` tag asks PubMed not to expand the term to its descendants) and
# record the query, the hit count and the pipe-joined PubMed IDs.
symptom_records = []
for _, record in symptom_df.iterrows():
    # iterrows() yields a fresh Series per row, so adding fields to it does
    # not modify symptom_df itself.
    query = '{symptom}[MeSH Terms:noexp]'.format(symptom=record.mesh_name.lower())
    # NOTE(review): retmax and sleep are forwarded to eutility as-is;
    # presumably a per-request batch size and a pause in seconds -- confirm.
    article_ids = eutility.esearch_query(
        {'db': 'pubmed', 'term': query},
        retmax=5000,
        sleep=2,
    )
    n_found = len(article_ids)
    record['term_query'] = query
    record['n_articles'] = n_found
    record['pubmed_ids'] = '|'.join(article_ids)
    symptom_records.append(record)
    print('{} articles for {}'.format(n_found, record.mesh_name))
symptom_pmids_df = pandas.DataFrame(symptom_records)
# Save the per-symptom PubMed results as a gzip-compressed TSV.
# The file is opened in text mode ('wt') so gzip itself owns the text layer:
# leaving the `with` block flushes and closes it. Wrapping a binary GzipFile in
# a separate io.TextIOWrapper leaves that wrapper unflushed when the underlying
# file is closed, which can drop the buffered tail of the table.
with gzip.open('data/symptom-pmids.tsv.gz', 'wt') as write_file:
    symptom_pmids_df.to_csv(write_file, sep='\t', index=False)
# Preview the first rows (displayed when run as a notebook cell).
symptom_pmids_df.head()
mesh_id | mesh_name | in_hsdn | term_query | n_articles | pubmed_ids | |
---|---|---|---|---|---|---|
0 | D000006 | Abdomen, Acute | 1 | abdomen, acute[MeSH Terms:noexp] | 8465 | 25669229|25650451|25619050|25608417|25543890|2... |
1 | D000270 | Adie Syndrome | 0 | adie syndrome[MeSH Terms:noexp] | 311 | 24995781|24625775|24215593|23952008|23809464|2... |
2 | D000326 | Adrenoleukodystrophy | 0 | adrenoleukodystrophy[MeSH Terms:noexp] | 1506 | 25583825|25378668|25297370|25275259|25149411|2... |
3 | D000334 | Aerophagy | 1 | aerophagy[MeSH Terms:noexp] | 260 | 25073665|24796405|23772202|23772201|23636521|2... |
4 | D000370 | Ageusia | 1 | ageusia[MeSH Terms:noexp] | 220 | 24825557|24782205|24191925|24137848|24088167|2... |
# Reload both PubMed result tables from disk. Each call returns a DataFrame
# plus a mapping from the given key column to that term's PubMed IDs.
symptom_df, symptom_to_pmids = cooccurrence.read_pmids_tsv('data/symptom-pmids.tsv.gz', key='mesh_id')
disease_df, disease_to_pmids = cooccurrence.read_pmids_tsv('data/disease-pmids.tsv.gz', key='doid_code')
# All PubMed IDs that mention at least one symptom. Starting from an empty set
# gives the same union as set.union(*values) but also works when the mapping
# is empty, where the unbound form raises TypeError.
symptom_pmids = set().union(*symptom_to_pmids.values())
len(symptom_pmids)
1741776
# All PubMed IDs that mention at least one disease. Starting from an empty set
# gives the same union as set.union(*values) but also works when the mapping
# is empty, where the unbound form raises TypeError.
disease_pmids = set().union(*disease_to_pmids.values())
len(disease_pmids)
3686312
# Score the co-occurrence of every disease/symptom pair from the two
# term -> PubMed ID mappings; the last two arguments name the key columns
# used for diseases and symptoms in the resulting table.
cooc_df = cooccurrence.score_pmid_cooccurrence(
    disease_to_pmids,
    symptom_to_pmids,
    'doid_code',
    'mesh_id',
)
Total articles containing a doid_code: 3686312 Total articles containing a mesh_id: 1741776 Total articles containing both a doid_code and mesh_id: 405539 After removing terms without any cooccurences: + 133 doid_codes remain + 426 mesh_ids remain Cooccurrence scores calculated for 56658 doid_code -- mesh_id pairs
# Attach human-readable names to the scored pairs. drop_duplicates() keeps one
# row per identifier/name pair so the merges do not multiply score rows.
cooc_df = symptom_df[['mesh_id', 'mesh_name']].drop_duplicates().merge(cooc_df)
cooc_df = disease_df[['doid_code', 'doid_name']].drop_duplicates().merge(cooc_df)
# Order by disease name, then by ascending Fisher p-value within each disease.
# DataFrame.sort() was removed from pandas; sort_values() is its replacement
# and produces the same ordering.
cooc_df = cooc_df.sort_values(['doid_name', 'p_fisher'])
cooc_df.to_csv('data/disease-symptom-cooccurrence.tsv', index=False, sep='\t')
# Preview the first rows (displayed when run as a notebook cell).
cooc_df.head()
doid_code | doid_name | mesh_id | mesh_name | cooccurrence | expected | enrichment | odds_ratio | p_fisher | |
---|---|---|---|---|---|---|---|---|---|
30318 | DOID:10652 | Alzheimer's disease | D004314 | Down Syndrome | 800 | 31.340068 | 25.526428 | 45.694983 | 0.000000e+00 |
30408 | DOID:10652 | Alzheimer's disease | D008569 | Memory Disorders | 1564 | 66.801960 | 23.412487 | 47.098472 | 0.000000e+00 |
30452 | DOID:10652 | Alzheimer's disease | D011595 | Psychomotor Agitation | 331 | 13.502524 | 24.513936 | 39.286225 | 0.000000e+00 |
30257 | DOID:10652 | Alzheimer's disease | D000647 | Amnesia | 303 | 12.408109 | 24.419515 | 38.858995 | 4.940656e-324 |
30381 | DOID:10652 | Alzheimer's disease | D006816 | Huntington Disease | 251 | 10.617248 | 23.640778 | 36.650552 | 4.011670e-264 |
import numpy
import scipy
import seaborn
import matplotlib.pyplot as plt
%matplotlib inline
# Keep the pairs whose Fisher p-value is below 0.05, then plot the distribution
# of their natural-log enrichment in 50 bins. The trailing semicolon stops the
# notebook from echoing the return value of plt.hist.
sig_df = cooc_df[cooc_df['p_fisher'] < 0.05]
log_enrichment = numpy.log(sig_df['enrichment'])
plt.hist(list(log_enrichment), bins=50);