#!/usr/bin/env python
# coding: utf-8

"""Collect PubMed article IDs per tissue and score tissue-disease cooccurrence.

Exported from a notebook; the ``# In[N]:`` markers are the original cells.

Steps, in order:

1. Read the MeSH-to-Uberon mapping table from the pinned ``dhimmel/uberon``
   commit.
2. For every row, query the ``pubmed`` database (through the project-local
   ``eutility.esearch_query`` helper) with the lower-cased MeSH name as a
   non-exploded ``MeSH Terms`` search, and record the query, the number of
   returned IDs and the pipe-joined IDs.
3. Write the result to ``data/uberon-pmids.tsv.gz``.
4. Read that file back together with ``data/disease-pmids.tsv.gz`` (which is
   not produced here and must already exist), score the cooccurrence with
   ``cooccurrence.score_pmid_cooccurrence`` and write
   ``data/disease-uberon-cooccurrence.tsv``.
"""

# In[1]:

import io
import gzip

import pandas
import requests
import networkx

import eutility
import cooccurrence


# In[ ]:


# In[ ]:


# # Tissues

# In[3]:

# Read MeSH UBERON Anatomical structures
url = 'https://raw.githubusercontent.com/dhimmel/uberon/7de0ed6238c26ea82accae6fc57accd1b845111d/data/mesh-map.tsv'
uberon_df = pandas.read_table(url)
uberon_df.head()


# In[5]:

rows_out = list()
for i, row in uberon_df.iterrows():
    # iterrows() may hand back a view of the frame; work on an explicit copy
    # so that adding fields never touches (or warns about) uberon_df.
    record = row.copy()
    # [MeSH Terms:noexp] restricts the search to the term itself,
    # without expanding to its descendants in the MeSH tree.
    term_query = '{tissue}[MeSH Terms:noexp]'.format(tissue=record.mesh_name.lower())
    payload = {'db': 'pubmed', 'term': term_query}
    # NOTE(review): retmax/sleep are passed straight to the project helper;
    # presumably a page size and a delay in seconds between requests -- confirm
    # against eutility.esearch_query.
    pmids = eutility.esearch_query(payload, retmax=5000, sleep=2)
    record['term_query'] = term_query
    record['n_articles'] = len(pmids)
    record['pubmed_ids'] = '|'.join(pmids)
    rows_out.append(record)
    print('{} articles for {}'.format(len(pmids), record.mesh_name))

uberon_pmids_df = pandas.DataFrame(rows_out)


# In[6]:

# Open the gzip stream directly in text mode. Wrapping a binary handle in an
# io.TextIOWrapper that is never flushed/closed before the underlying gzip file
# is closed can drop the buffered tail of the output.
with gzip.open('data/uberon-pmids.tsv.gz', 'wt') as write_file:
    uberon_pmids_df.to_csv(write_file, sep='\t', index=False)

uberon_pmids_df.head()


# # Tissue-Disease Cooccurrence

# In[2]:

uberon_df, uberon_to_pmids = cooccurrence.read_pmids_tsv('data/uberon-pmids.tsv.gz', key='mesh_id')
disease_df, disease_to_pmids = cooccurrence.read_pmids_tsv('data/disease-pmids.tsv.gz', key='doid_code')


# In[7]:

cooc_df = cooccurrence.score_pmid_cooccurrence(disease_to_pmids, uberon_to_pmids, 'doid_code', 'mesh_id')


# In[ ]:


# In[8]:

# Attach human-readable names; merge() joins on the columns the frames share.
cooc_df = uberon_df[['mesh_id', 'mesh_name']].drop_duplicates().merge(cooc_df)
cooc_df = disease_df[['doid_code', 'doid_name']].drop_duplicates().merge(cooc_df)
# DataFrame.sort() was removed from pandas; sort_values() is the replacement.
cooc_df = cooc_df.sort_values(['doid_name', 'p_fisher'])
cooc_df.to_csv('data/disease-uberon-cooccurrence.tsv', index=False, sep='\t')
cooc_df.head()


# In[ ]: