import os
import gzip
import io
import json
import xml.etree.ElementTree as ET
import networkx
# Load the gzip-compressed MeSH descriptor XML release from the local
# download folder and keep both the parsed tree and its root element.
# Release file: ftp://nlmpubs.nlm.nih.gov/online/mesh/.xmlmesh/desc2015.gz
xml_path = os.path.join('download', 'desc2015.gz')
with gzip.open(xml_path, 'rb') as compressed_xml:
    # ElementTree reads the decompressed byte stream directly
    tree = ET.parse(compressed_xml)
root = tree.getroot()
# Parse MeSH xml release: build one dict per child element of the root,
# holding the descriptor id, its name, its semantic type UIs and its tree
# numbers.
terms = list()
for elem in root:
    # Semantic type UIs are gathered across every concept of the descriptor,
    # so the same UI can appear more than once. dict.fromkeys drops the
    # duplicates while keeping document order, which makes the serialized
    # output identical from run to run (iterating a plain set of strings
    # depends on hash randomization).
    semantic_type_uis = (x.text for x in elem.findall(
        'ConceptList/Concept/SemanticTypeList/SemanticType/SemanticTypeUI'))
    term = {
        'mesh_id': elem.findtext('DescriptorUI'),
        'mesh_name': elem.findtext('DescriptorName/String'),
        'semantic_types': list(dict.fromkeys(semantic_type_uis)),
        'tree_numbers': [x.text for x in elem.findall('TreeNumberList/TreeNumber')],
    }
    terms.append(term)
# Notebook residue: the original run displayed len(terms) as 27455.
# Determine ontology parents
# A tree number such as 'A.B.C' has the parent tree number 'A.B'; the parent
# descriptors of a term are the owners of the parent tree numbers of all of
# its tree numbers.
# Map every tree number to the id of the descriptor that lists it.
tree_number_to_id = {tn: term['mesh_id'] for term in terms for tn in term['tree_numbers']}
for term in terms:
    # dict keys act as an insertion-ordered set, so the resulting list has a
    # reproducible order instead of depending on set iteration order.
    parents = dict()
    for tree_number in term['tree_numbers']:
        parent_tn, dot, _ = tree_number.rpartition('.')
        if not dot:
            # No '.' in the tree number: it has no parent tree number.
            continue
        # A parent tree number missing from the map still raises KeyError.
        parents[tree_number_to_id[parent_tn]] = None
    term['parents'] = list(parents)
# Save the term records (including their parents) as 2-space indented JSON
# in data/mesh.json.
path = os.path.join('data', 'mesh.json')
with open(path, 'w') as json_handle:
    json.dump(obj=terms, fp=json_handle, indent=2)
# Create a networkx directed graph representing MeSH: one node per term and
# one edge from each parent term to its child.
network = networkx.DiGraph()
# add every node first, carrying the term name as the `name` attribute
network.add_nodes_from(
    (record['mesh_id'], {'name': record['mesh_name']}) for record in terms
)
# then add the parent -> child edges
network.add_edges_from(
    (parent_id, record['mesh_id'])
    for record in terms
    for parent_id in record['parents']
)
# the hierarchy is expected to be free of cycles
assert networkx.is_directed_acyclic_graph(network)
networkx.write_gexf(network, 'data/ontology.gexf.gz')
import pandas
# Read UMLS semantic types: the file is parsed as pipe-delimited text without
# a header row.
url = 'http://semanticnetwork.nlm.nih.gov/Download/RelationalFiles/SRDEF'
sty_df = pandas.read_csv(url, sep='|', header=None)
# Read mesh: reload the term records from data/mesh.json
path = os.path.join('data', 'mesh.json')
with open(path) as json_handle:
    mesh = json.loads(json_handle.read())
# Keep only the id and name columns and export them as a tab-separated file
term_columns = ['mesh_id', 'mesh_name']
mesh_df = pandas.DataFrame(mesh)[term_columns]
mesh_df.to_csv('data/terms.tsv', sep='\t', index=False)
# Symptoms
# Symptom occurrence table from the HSDN repository on GitHub; only its
# 'MeSH Symptom ID' column is used below.
url = 'https://raw.githubusercontent.com/LABrueggs/HSDN/master/Symptom-Occurence-Output.tsv'
hsdn_symptom_df = pandas.read_table(url, index_col=0)
hsdn_symptoms = hsdn_symptom_df['MeSH Symptom ID']
symptoms = networkx.descendants(network, 'D012816') # signs and symptoms
# Take an explicit copy of the filtered rows so the new column is written to
# an independent frame. This replaces globally disabling pandas'
# chained-assignment warning, which would also hide genuine mistakes
# elsewhere in the process.
symptom_df = mesh_df[mesh_df.mesh_id.isin(symptoms)].copy()
# 1 when the symptom term also appears in the HSDN table, otherwise 0
symptom_df['in_hsdn'] = symptom_df.mesh_id.isin(hsdn_symptoms).astype(int)
symptom_df.to_csv('data/symptoms.tsv', index=False, sep='\t')
# Notebook display: number of symptom terms flagged as present in HSDN
sum(symptom_df.in_hsdn)
# Side Effects
# Select the terms that are descendants of D064420
# (Drug-Related Side Effects and Adverse Reactions) in the ontology graph.
side_effects = networkx.descendants(network, 'D064420')
is_side_effect = mesh_df['mesh_id'].isin(side_effects)
side_effect_df = mesh_df[is_side_effect]
# Notebook display: number of side effect terms
len(side_effect_df)