#!/usr/bin/env python
# coding: utf-8
"""Disease-symptom co-occurrence from PubMed co-citation.

Converted from a Jupyter notebook. The script:

1. maps Disease Ontology (DO Slim) diseases to MeSH terms,
2. queries PubMed (through the project-local ``eutility`` module) for the
   articles indexed under each disease and each symptom MeSH term,
3. scores every disease-symptom pair by how often the two terms are
   co-mentioned, using enrichment over the expected co-occurrence and a
   one-sided Fisher's exact test.

All outputs are written under ``data/``.
"""

import functools
import gzip
import io
import itertools
import os

import matplotlib.pyplot as plt
import numpy
import pandas
import scipy.stats  # must be imported before score_pmid_cooccurrence runs
                    # (was previously imported only after its first use)
import seaborn

import eutility  # project-local wrapper around NCBI E-utilities

# Enable inline plots when running under IPython/Jupyter; a plain `python`
# run has no get_ipython(), so skip silently instead of crashing.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Every output below lands in data/; create it so to_csv cannot fail on a
# fresh checkout.
os.makedirs('data', exist_ok=True)

# --- Map DO Slim diseases to MeSH -------------------------------------------

# MeSH term ids to MeSH names
url = 'https://raw.githubusercontent.com/dhimmel/mesh/e561301360e6de2140dedeaa7c7e17ce4714eb7f/data/terms.tsv'
mesh_df = pandas.read_csv(url, sep='\t')

# MeSH terms cross-referenced to DO Slim terms; keep only the MeSH resource
# rows and rename resource_id so it merges against mesh_df.
url = 'https://raw.githubusercontent.com/dhimmel/disease-ontology/9fd75f14b17e01bebc97faf1bfa1b9025e9ce4de/data/xrefs-slim.tsv'
doslim_xref_df = pandas.read_csv(url, sep='\t')
doslim_xref_df = doslim_xref_df[doslim_xref_df.resource == 'MSH'][
    ['doid_code', 'doid_name', 'resource_id']
].rename(columns={'resource_id': 'mesh_id'})

disease_df = doslim_xref_df.merge(mesh_df)
disease_df.to_csv('data/DO-slim-to-mesh.tsv', sep='\t', index=False)

# --- Disease PMIDs ----------------------------------------------------------

rows_out = list()
for i, row in disease_df.iterrows():
    term_query = '{disease}[MeSH Major Topic]'.format(disease=row.mesh_name.lower())
    payload = {'db': 'pubmed', 'term': term_query}
    pmids = eutility.esearch_query(payload, retmax=10000)
    # Augment the row (a Series copy) with the query and its results.
    row['term_query'] = term_query
    row['n_articles'] = len(pmids)
    row['pubmed_ids'] = '|'.join(pmids)
    rows_out.append(row)
    print('{} articles for {}'.format(len(pmids), row.mesh_name))

disease_pmids_df = pandas.DataFrame(rows_out)

# 'wt' opens a text-mode gzip stream directly. The previous pattern --
# wrapping the binary handle in io.TextIOWrapper inside the with block --
# never flushed/closed the wrapper before the gzip file closed, which can
# silently truncate the output.
with gzip.open('data/disease-pmids.tsv.gz', 'wt') as write_file:
    disease_pmids_df.to_csv(write_file, sep='\t', index=False)

# --- Symptom PMIDs ----------------------------------------------------------

url = 'https://raw.githubusercontent.com/dhimmel/mesh/e561301360e6de2140dedeaa7c7e17ce4714eb7f/data/symptoms.tsv'
symptom_df = pandas.read_csv(url, sep='\t')

rows_out = list()
for i, row in symptom_df.iterrows():
    # :noexp restricts the search to the exact term (no MeSH explosion).
    term_query = '{symptom}[MeSH Terms:noexp]'.format(symptom=row.mesh_name.lower())
    payload = {'db': 'pubmed', 'term': term_query}
    pmids = eutility.esearch_query(payload, retmax=5000, sleep=2)
    row['term_query'] = term_query
    row['n_articles'] = len(pmids)
    row['pubmed_ids'] = '|'.join(pmids)
    rows_out.append(row)
    print('{} articles for {}'.format(len(pmids), row.mesh_name))

symptom_pmids_df = pandas.DataFrame(rows_out)
with gzip.open('data/symptom-pmids.tsv.gz', 'wt') as write_file:
    symptom_pmids_df.to_csv(write_file, sep='\t', index=False)

# --- Co-occurrence scoring --------------------------------------------------


def read_pmids_tsv(path, key, min_articles=5):
    """Read a ``*-pmids.tsv.gz`` file produced above.

    Parameters
    ----------
    path : str
        Path to a gzipped TSV with ``n_articles`` and ``pubmed_ids`` columns.
    key : str
        Column whose values identify a term (``'mesh_id'`` or ``'doid_code'``).
    min_articles : int
        Terms with fewer than this many articles are discarded.

    Returns
    -------
    (pandas.DataFrame, dict)
        The filtered frame (with the bulky ``pubmed_ids`` column dropped)
        and a mapping of term -> set of PubMed ID strings.
    """
    term_to_pmids = dict()
    pmids_df = pandas.read_csv(path, sep='\t', compression='gzip')
    pmids_df = pmids_df[pmids_df.n_articles >= min_articles]
    for i, row in pmids_df.iterrows():
        term = row[key]
        pmids = row.pubmed_ids.split('|')
        term_to_pmids[term] = set(pmids)
    pmids_df.drop('pubmed_ids', axis=1, inplace=True)
    return pmids_df, term_to_pmids


# Note: these rebind disease_df / symptom_df to the filtered frames.
symptom_df, symptom_to_pmids = read_pmids_tsv('data/symptom-pmids.tsv.gz', key='mesh_id')
disease_df, disease_to_pmids = read_pmids_tsv('data/disease-pmids.tsv.gz', key='doid_code')

symptom_pmids = set.union(*symptom_to_pmids.values())
disease_pmids = set.union(*disease_to_pmids.values())
print('{} articles tagged with a symptom; {} tagged with a disease'.format(
    len(symptom_pmids), len(disease_pmids)))


def score_pmid_cooccurrence(term0_to_pmids, term1_to_pmids,
                            term0_name='term_0', term1_name='term_1'):
    """Score the co-occurrence of two term sets across PubMed articles.

    Parameters
    ----------
    term0_to_pmids, term1_to_pmids : dict
        Mappings of term -> set of PMID strings.
    term0_name, term1_name : str
        Column names for the two term identifiers in the output.

    Returns
    -------
    pandas.DataFrame
        One row per (term0, term1) pair with the observed co-occurrence
        count, the count expected under independence, the enrichment
        (observed / expected), and Fisher's exact odds ratio / p-value.
    """
    all_pmids0 = set.union(*term0_to_pmids.values())
    all_pmids1 = set.union(*term1_to_pmids.values())
    # Only articles covered by BOTH term sets form the comparison universe.
    pmids_in_both = all_pmids0 & all_pmids1
    total_pmids = len(pmids_in_both)

    # Restrict each term's articles to that universe and drop terms that
    # lose all their articles. Copy first so the caller's dicts survive.
    term0_to_pmids = term0_to_pmids.copy()
    term1_to_pmids = term1_to_pmids.copy()
    for d in term0_to_pmids, term1_to_pmids:
        for key, value in list(d.items()):
            d[key] = value & pmids_in_both
            if not d[key]:
                del d[key]

    rows = list()
    for term0, term1 in itertools.product(term0_to_pmids, term1_to_pmids):
        pmids0 = term0_to_pmids[term0]
        pmids1 = term1_to_pmids[term1]
        count = len(pmids0 & pmids1)
        expected = len(pmids0) * len(pmids1) / total_pmids
        enrichment = count / expected
        # NOTE(review): this 2x2 table compares observed vs expected rather
        # than the conventional marginal layout, and `expected` is a float;
        # newer scipy releases require integer tables -- confirm before
        # upgrading scipy, since changing the table changes the p-values.
        contingency_table = [[count, total_pmids - count],
                             [expected, total_pmids - expected]]
        oddsratio, pvalue = scipy.stats.fisher_exact(
            contingency_table, alternative='greater')
        rows.append([term0, term1, count, expected, enrichment, oddsratio, pvalue])

    columns = [term0_name, term1_name, 'cooccurrence', 'expected',
               'enrichment', 'odds_ratio', 'p_fisher']
    df = pandas.DataFrame(rows, columns=columns)
    return df


cooc_df = score_pmid_cooccurrence(disease_to_pmids, symptom_to_pmids,
                                  'doid_code', 'mesh_id')
# Attach human-readable names for both term vocabularies.
cooc_df = symptom_df[['mesh_id', 'mesh_name']].merge(cooc_df)
cooc_df = disease_df[['doid_code', 'doid_name']].merge(cooc_df)
# DataFrame.sort was removed in pandas 0.20; sort_values is the replacement.
cooc_df = cooc_df.sort_values(['doid_name', 'p_fisher'])
cooc_df.to_csv('data/disease-symptom-cooccurrence.tsv', index=False, sep='\t')

# --- Quick sanity plot: log-enrichment of significant pairs -----------------

sig_df = cooc_df[cooc_df.p_fisher < 0.05]
plt.hist(numpy.log(sig_df.enrichment), bins=50)