In [61]:
import gzip
import io
import json
import os
import xml.etree.ElementTree as ET

import networkx
import pandas
In [2]:
 
In [3]:
# Load the gzip-compressed MeSH descriptor release into an ElementTree
# Source: ftp://nlmpubs.nlm.nih.gov/online/mesh/.xmlmesh/desc2015.gz
xml_path = os.path.join('download', 'desc2015.gz')
with gzip.open(xml_path) as compressed_xml:
    # The archive is decompressed on the fly while the parser reads from it
    tree = ET.parse(source=compressed_xml)
# Top-level element; its children are iterated when the terms are parsed
root = tree.getroot()
In [55]:
# Parse MeSH xml release: build one record (dict) per child element of the root
terms = list()

for elem in root:
    term = dict()
    term['mesh_id'] = elem.findtext('DescriptorUI')
    term['mesh_name'] = elem.findtext('DescriptorName/String')
    # Deduplicate through a set, then sort: iteration order of a set of strings
    # can differ between interpreter sessions, which would make the exported
    # list order unstable. key=str keeps the sort well-defined even if an
    # element has no text (None).
    term['semantic_types'] = sorted({x.text for x in elem.findall(
        'ConceptList/Concept/SemanticTypeList/SemanticType/SemanticTypeUI')}, key=str)
    # Tree numbers are kept in document order
    term['tree_numbers'] = [x.text for x in elem.findall('TreeNumberList/TreeNumber')]
    terms.append(term)

# Number of parsed records (displayed as the cell output)
len(terms)
Out[55]:
27455
In [56]:
# Determine ontology parents
# Map every tree number to the id of the term that carries it
tree_number_to_id = {tn: term['mesh_id'] for term in terms for tn in term['tree_numbers']}

for term in terms:
    parents = set()
    for tree_number in term['tree_numbers']:
        # The parent's tree number is everything before the last '.'
        parent_tn, dot, _ = tree_number.rpartition('.')
        if not dot:
            # No '.' present: this tree number has no parent to look up
            continue
        # A parent tree number missing from the mapping raises KeyError
        parents.add(tree_number_to_id[parent_tn])
    # Sorted (rather than list(set)) so the stored order is reproducible
    # across runs; key=str keeps the sort well-defined if an id is None
    term['parents'] = sorted(parents, key=str)
In [ ]:
 
In [59]:
# Export the parsed terms as indented JSON
path = os.path.join('data', 'mesh.json')
# Create the output directory when it is missing so the cell also runs on a
# fresh checkout instead of failing when the file is opened for writing
output_dir = os.path.dirname(path)
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)
with open(path, 'w') as write_file:
    json.dump(terms, write_file, indent=2)
In [64]:
# Build a networkx directed graph representing MeSH (edges run parent -> child)
network = networkx.DiGraph()

# One node per term, keyed by its id and carrying the term name as an attribute
network.add_nodes_from(
    (term['mesh_id'], {'name': term['mesh_name']}) for term in terms)

# One edge from each parent to the term itself
network.add_edges_from(
    (parent, term['mesh_id']) for term in terms for parent in term['parents'])

# Stop here if the parent links contain a cycle
assert networkx.is_directed_acyclic_graph(network)

# Export the graph in GEXF format
networkx.write_gexf(network, 'data/ontology.gexf.gz')
In [49]:
# Read UMLS semantic types
# The file is fetched from the remote URL and parsed as pipe-delimited text
# without a header row, so the columns are numbered rather than named.
url = 'http://semanticnetwork.nlm.nih.gov/Download/RelationalFiles/SRDEF'
sty_df = pandas.read_table(url, sep='|', header=None)
In [ ]:
 
In [66]:
# Reload the exported MeSH terms and write an id/name table as TSV
path = os.path.join('data', 'mesh.json')
with open(path) as json_file:
    mesh = json.load(json_file)

# Build a frame from all records, then keep only the two identifying columns
all_columns_df = pandas.DataFrame(mesh)
mesh_df = all_columns_df[['mesh_id', 'mesh_name']]
mesh_df.to_csv('data/terms.tsv', sep='\t', index=False)
In [ ]:
 
In [ ]:
# Symptoms
In [ ]:
# Fetch the HSDN symptom occurrence table (tab-separated, first column as index)
url = 'https://raw.githubusercontent.com/LABrueggs/HSDN/master/Symptom-Occurence-Output.tsv'
hsdn_symptom_df = pandas.read_csv(url, sep='\t', index_col=0)
# Only the MeSH symptom id column is kept for the membership check
hsdn_symptoms = hsdn_symptom_df.loc[:, 'MeSH Symptom ID']
In [ ]:
# All graph descendants of 'D012816' (signs and symptoms)
symptoms = networkx.descendants(network, 'D012816')
# Take an explicit copy of the filtered rows so that adding a column below is
# an unambiguous write to a new frame; this removes the need to disable
# pandas' chained-assignment warning for the whole session.
symptom_df = mesh_df[mesh_df.mesh_id.isin(symptoms)].copy()
# 1 when the symptom id also occurs in the HSDN symptom ids, else 0
symptom_df['in_hsdn'] = symptom_df.mesh_id.isin(hsdn_symptoms).astype(int)
symptom_df.to_csv('data/symptoms.tsv', index=False, sep='\t')
# Number of symptoms also found in HSDN (displayed as the cell output)
symptom_df.in_hsdn.sum()
In [ ]:
# Side Effects: all graph descendants of 'D064420'
# (Drug-Related Side Effects and Adverse Reactions)
side_effects = networkx.descendants(network, 'D064420')
is_side_effect = mesh_df['mesh_id'].isin(side_effects)
side_effect_df = mesh_df[is_side_effect]
# Row count of the side-effect subset (displayed as the cell output)
side_effect_df.shape[0]