#!/usr/bin/env python
# coding: utf-8
"""Disease-symptom co-occurrence from PubMed co-citation.

Converted from a Jupyter notebook. The script:

1. maps Disease Ontology (DO Slim) diseases to MeSH terms,
2. queries PubMed (through the project-local ``eutility`` module) for the
   articles indexed under each disease and each symptom MeSH term,
3. scores every disease-symptom pair by how often the two terms are
   co-mentioned, using enrichment over the expected co-occurrence and a
   one-sided Fisher's exact test.

All outputs are written under ``data/``.
"""

import functools
import gzip
import io
import itertools
import os

import matplotlib.pyplot as plt
import numpy
import pandas
import scipy.stats  # must be imported before score_pmid_cooccurrence runs
                    # (was previously imported only after its first use)
import seaborn

import eutility  # project-local wrapper around NCBI E-utilities

# Enable inline plots when running under IPython/Jupyter; a plain `python`
# run has no get_ipython(), so skip silently instead of crashing.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Every output below lands in data/; create it so to_csv cannot fail on a
# fresh checkout.
os.makedirs('data', exist_ok=True)

# --- Map DO Slim diseases to MeSH -------------------------------------------

# MeSH term ids to MeSH names
url = 'https://raw.githubusercontent.com/dhimmel/mesh/e561301360e6de2140dedeaa7c7e17ce4714eb7f/data/terms.tsv'
mesh_df = pandas.read_csv(url, sep='\t')

# MeSH terms cross-referenced to DO Slim terms; keep only the MeSH resource
# rows and rename resource_id so it merges against mesh_df.
url = 'https://raw.githubusercontent.com/dhimmel/disease-ontology/9fd75f14b17e01bebc97faf1bfa1b9025e9ce4de/data/xrefs-slim.tsv'
doslim_xref_df = pandas.read_csv(url, sep='\t')
doslim_xref_df = doslim_xref_df[doslim_xref_df.resource == 'MSH'][
    ['doid_code', 'doid_name', 'resource_id']
].rename(columns={'resource_id': 'mesh_id'})

disease_df = doslim_xref_df.merge(mesh_df)
disease_df.to_csv('data/DO-slim-to-mesh.tsv', sep='\t', index=False)

# --- Disease PMIDs ----------------------------------------------------------

rows_out = list()
for i, row in disease_df.iterrows():
    term_query = '{disease}[MeSH Major Topic]'.format(disease=row.mesh_name.lower())
    payload = {'db': 'pubmed', 'term': term_query}
    pmids = eutility.esearch_query(payload, retmax=10000)
    # Augment the row (a Series copy) with the query and its results.
    row['term_query'] = term_query
    row['n_articles'] = len(pmids)
    row['pubmed_ids'] = '|'.join(pmids)
    rows_out.append(row)
    print('{} articles for {}'.format(len(pmids), row.mesh_name))

disease_pmids_df = pandas.DataFrame(rows_out)

# 'wt' opens a text-mode gzip stream directly. The previous pattern --
# wrapping the binary handle in io.TextIOWrapper inside the with block --
# never flushed/closed the wrapper before the gzip file closed, which can
# silently truncate the output.
with gzip.open('data/disease-pmids.tsv.gz', 'wt') as write_file:
    disease_pmids_df.to_csv(write_file, sep='\t', index=False)

# --- Symptom PMIDs ----------------------------------------------------------

url = 'https://raw.githubusercontent.com/dhimmel/mesh/e561301360e6de2140dedeaa7c7e17ce4714eb7f/data/symptoms.tsv'
symptom_df = pandas.read_csv(url, sep='\t')

rows_out = list()
for i, row in symptom_df.iterrows():
    # :noexp restricts the search to the exact term (no MeSH explosion).
    term_query = '{symptom}[MeSH Terms:noexp]'.format(symptom=row.mesh_name.lower())
    payload = {'db': 'pubmed', 'term': term_query}
    pmids = eutility.esearch_query(payload, retmax=5000, sleep=2)
    row['term_query'] = term_query
    row['n_articles'] = len(pmids)
    row['pubmed_ids'] = '|'.join(pmids)
    rows_out.append(row)
    print('{} articles for {}'.format(len(pmids), row.mesh_name))

symptom_pmids_df = pandas.DataFrame(rows_out)
with gzip.open('data/symptom-pmids.tsv.gz', 'wt') as write_file:
    symptom_pmids_df.to_csv(write_file, sep='\t', index=False)

# --- Co-occurrence scoring --------------------------------------------------


def read_pmids_tsv(path, key, min_articles=5):
    """Read a ``*-pmids.tsv.gz`` file produced above.

    Parameters
    ----------
    path : str
        Path to a gzipped TSV with ``n_articles`` and ``pubmed_ids`` columns.
    key : str
        Column whose values identify a term (``'mesh_id'`` or ``'doid_code'``).
    min_articles : int
        Terms with fewer than this many articles are discarded.

    Returns
    -------
    (pandas.DataFrame, dict)
        The filtered frame (with the bulky ``pubmed_ids`` column dropped)
        and a mapping of term -> set of PubMed ID strings.
    """
    term_to_pmids = dict()
    pmids_df = pandas.read_csv(path, sep='\t', compression='gzip')
    pmids_df = pmids_df[pmids_df.n_articles >= min_articles]
    for i, row in pmids_df.iterrows():
        term = row[key]
        pmids = row.pubmed_ids.split('|')
        term_to_pmids[term] = set(pmids)
    pmids_df.drop('pubmed_ids', axis=1, inplace=True)
    return pmids_df, term_to_pmids


# Note: these rebind disease_df / symptom_df to the filtered frames.
symptom_df, symptom_to_pmids = read_pmids_tsv('data/symptom-pmids.tsv.gz', key='mesh_id')
disease_df, disease_to_pmids = read_pmids_tsv('data/disease-pmids.tsv.gz', key='doid_code')

symptom_pmids = set.union(*symptom_to_pmids.values())
disease_pmids = set.union(*disease_to_pmids.values())
print('{} articles tagged with a symptom; {} tagged with a disease'.format(
    len(symptom_pmids), len(disease_pmids)))


def score_pmid_cooccurrence(term0_to_pmids, term1_to_pmids,
                            term0_name='term_0', term1_name='term_1'):
    """Score the co-occurrence of two term sets across PubMed articles.

    Parameters
    ----------
    term0_to_pmids, term1_to_pmids : dict
        Mappings of term -> set of PMID strings.
    term0_name, term1_name : str
        Column names for the two term identifiers in the output.

    Returns
    -------
    pandas.DataFrame
        One row per (term0, term1) pair with the observed co-occurrence
        count, the count expected under independence, the enrichment
        (observed / expected), and Fisher's exact odds ratio / p-value.
    """
    all_pmids0 = set.union(*term0_to_pmids.values())
    all_pmids1 = set.union(*term1_to_pmids.values())
    # Only articles covered by BOTH term sets form the comparison universe.
    pmids_in_both = all_pmids0 & all_pmids1
    total_pmids = len(pmids_in_both)

    # Restrict each term's articles to that universe and drop terms that
    # lose all their articles. Copy first so the caller's dicts survive.
    term0_to_pmids = term0_to_pmids.copy()
    term1_to_pmids = term1_to_pmids.copy()
    for d in term0_to_pmids, term1_to_pmids:
        for key, value in list(d.items()):
            d[key] = value & pmids_in_both
            if not d[key]:
                del d[key]

    rows = list()
    for term0, term1 in itertools.product(term0_to_pmids, term1_to_pmids):
        pmids0 = term0_to_pmids[term0]
        pmids1 = term1_to_pmids[term1]
        count = len(pmids0 & pmids1)
        expected = len(pmids0) * len(pmids1) / total_pmids
        enrichment = count / expected
        # NOTE(review): this 2x2 table compares observed vs expected rather
        # than the conventional marginal layout, and `expected` is a float;
        # newer scipy releases require integer tables -- confirm before
        # upgrading scipy, since changing the table changes the p-values.
        contingency_table = [[count, total_pmids - count],
                             [expected, total_pmids - expected]]
        oddsratio, pvalue = scipy.stats.fisher_exact(
            contingency_table, alternative='greater')
        rows.append([term0, term1, count, expected, enrichment, oddsratio, pvalue])

    columns = [term0_name, term1_name, 'cooccurrence', 'expected',
               'enrichment', 'odds_ratio', 'p_fisher']
    df = pandas.DataFrame(rows, columns=columns)
    return df


cooc_df = score_pmid_cooccurrence(disease_to_pmids, symptom_to_pmids,
                                  'doid_code', 'mesh_id')
# Attach human-readable names for both term vocabularies.
cooc_df = symptom_df[['mesh_id', 'mesh_name']].merge(cooc_df)
cooc_df = disease_df[['doid_code', 'doid_name']].merge(cooc_df)
# DataFrame.sort was removed in pandas 0.20; sort_values is the replacement.
cooc_df = cooc_df.sort_values(['doid_name', 'p_fisher'])
cooc_df.to_csv('data/disease-symptom-cooccurrence.tsv', index=False, sep='\t')

# --- Quick sanity plot: log-enrichment of significant pairs -----------------

sig_df = cooc_df[cooc_df.p_fisher < 0.05]
plt.hist(numpy.log(sig_df.enrichment), bins=50)