Run using Python 3 to avoid a non-ASCII character error when writing to a file with the csv module.
import os
import csv
import gzip
import collections
import re
import io
import xml.etree.ElementTree as ET
import requests
import pandas
# Parse the gzip-compressed DrugBank XML file and keep a handle on its root
# element; later cells iterate over the root's children.
xml_path = os.path.join('download', 'drugbank.xml.gz')
with gzip.open(filename=xml_path, mode='rb') as xml_file:
    tree = ET.parse(source=xml_file)
root = tree.getroot()
# Namespace prefix (ElementTree "{uri}" notation) applied to every tag lookup.
ns = '{http://www.drugbank.ca}'
inchikey_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChIKey']/{ns}value"
inchi_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChI']/{ns}value"

# The namespaced search paths do not depend on the drug being processed, so
# they are resolved once here instead of being re-formatted on each iteration.
groups_path = "{ns}groups/{ns}group".format(ns=ns)
atc_codes_path = "{ns}atc-codes/{ns}atc-code".format(ns=ns)
categories_path = "{ns}categories/{ns}category".format(ns=ns)
inchi_path = inchi_template.format(ns=ns)
inchikey_path = inchikey_template.format(ns=ns)

# Build one ordered record per child of the root element. List-valued fields
# (groups, atc_codes, categories) are kept as lists at this stage.
rows = list()
for drug in root:
    # Every direct child of the root is expected to be a <drug> element.
    assert drug.tag == ns + 'drug', 'unexpected element: {}'.format(drug.tag)
    row = collections.OrderedDict()
    row['type'] = drug.get('type')
    row['drugbank_id'] = drug.findtext(ns + "drugbank-id[@primary='true']")
    row['name'] = drug.findtext(ns + "name")
    row['groups'] = [group.text for group in drug.findall(groups_path)]
    # ATC codes are read from the 'code' attribute, not from element text.
    row['atc_codes'] = [code.get('code') for code in drug.findall(atc_codes_path)]
    # Each matched category element wraps a nested <category> child whose
    # text is the value collected here.
    row['categories'] = [x.findtext(ns + 'category') for x in drug.findall(categories_path)]
    # findtext returns None when the drug has no matching calculated property.
    row['inchi'] = drug.findtext(inchi_path)
    row['inchikey'] = drug.findtext(inchikey_path)
    rows.append(row)
def collapse_list_values(row):
    """Join every list-valued entry of *row* into a '|'-delimited string.

    The mapping is modified in place and also returned, so the function can
    be used with ``map``. Values that are not lists are left untouched.
    """
    list_keys = [key for key, value in row.items() if isinstance(value, list)]
    for key in list_keys:
        row[key] = '|'.join(row[key])
    return row
# Flatten list-valued fields to pipe-delimited strings, then assemble the
# drug table with an explicit column order.
rows = [collapse_list_values(row) for row in rows]
columns = [
    'drugbank_id', 'name', 'type', 'groups',
    'atc_codes', 'categories', 'inchikey', 'inchi',
]
drugbank_df = pandas.DataFrame.from_dict(rows)[columns]
drugbank_df.head()
drugbank_id | name | type | groups | atc_codes | categories | inchikey | inchi | |
---|---|---|---|---|---|---|---|---|
0 | DB00001 | Lepirudin | biotech | approved | B01AE02 | Antithrombins|Fibrinolytic Agents | None | None |
1 | DB00002 | Cetuximab | biotech | approved | L01XC06 | Antineoplastic Agents | None | None |
2 | DB00003 | Dornase alfa | biotech | approved | R05CB13 | Enzymes | None | None |
3 | DB00004 | Denileukin diftitox | biotech | approved|investigational | L01XX29 | Antineoplastic Agents | None | None |
4 | DB00005 | Etanercept | biotech | approved|investigational | L04AB01 | Immunosuppressive Agents | None | None |
# Slim table: approved small molecules that have an InChI structure.
# 'groups' holds a pipe-delimited string, so membership is tested on the split
# tokens; a plain substring test would also accept any group whose name merely
# contains "approved" (for example "vet_approved").
is_approved = drugbank_df['groups'].map(lambda groups: 'approved' in groups.split('|'))
# Drugs without a calculated InChI carry None in this column.
has_inchi = drugbank_df['inchi'].map(lambda inchi: inchi is not None)
is_small_molecule = drugbank_df['type'] == 'small molecule'
drugbank_slim_df = drugbank_df[is_approved & has_inchi & is_small_molecule]
drugbank_slim_df.head()
drugbank_id | name | type | groups | atc_codes | categories | inchikey | inchi | |
---|---|---|---|---|---|---|---|---|
13 | DB00014 | Goserelin | small molecule | approved | L02AE03 | InChIKey=BLCLNMBMMGCOAS-URPVMXJPSA-N | InChI=1S/C59H84N18O14/c1-31(2)22-40(49(82)68-3... | |
34 | DB00035 | Desmopressin | small molecule | approved | H01BA02 | Antidiuretic Agents|Hemostatics|Renal Agents | InChIKey=NFLWUMRGJYTJIN-NXBWRCJVSA-N | InChI=1S/C46H64N14O12S2/c47-35(62)15-14-29-40(... |
48 | DB00050 | Cetrorelix | small molecule | approved|investigational | H01CC02 | Hormone Antagonists|Fertility Agents | InChIKey=SBNPWPIBESPSIF-MHWMIDJBSA-N | InChI=1S/C70H92ClN17O14/c1-39(2)31-52(61(94)82... |
86 | DB00091 | Cyclosporine | small molecule | approved|investigational | L04AD01|S01XA18 | Antirheumatic Agents|Dermatologic Agents|Immun... | InChIKey=PMATZTZNYRCHOR-IMVLJIQENA-N | InChI=1/C62H111N11O12/c1-25-27-28-40(15)52(75)... |
88 | DB00093 | Felypressin | small molecule | approved | Vasoconstrictor Agents|Renal Agents | InChIKey=SFKQVVDKFKYTNA-YVGXZPIDNA-N | InChI=1/C46H65N13O11S2/c47-18-8-7-14-29(40(64)... |
# Write the full and the slim drug tables as tab-separated files under data/,
# without the DataFrame index.
for filename, frame in [('drugbank.tsv', drugbank_df),
                        ('drugbank-slim.tsv', drugbank_slim_df)]:
    path = os.path.join('data', filename)
    frame.to_csv(path, sep='\t', index=False)
# Collect one record per (drug, protein) relationship across the four protein
# categories. Proteins that do not resolve to exactly one UniProtKB identifier
# are skipped.
protein_rows = list()

# Loop-invariant lookups, built once.
uniprot_path = (
    "{ns}polypeptide/{ns}external-identifiers/"
    "{ns}external-identifier[{ns}resource='UniProtKB']/{ns}identifier"
).format(ns=ns)
references_path = "{ns}references[@format='textile']".format(ns=ns)
actions_path = '{ns}actions/{ns}action'.format(ns=ns)
pubmed_pattern = re.compile(r'pubmed/([0-9]+)')

for drug in root:
    drugbank_id = drug.findtext(ns + "drugbank-id[@primary='true']")
    for category in ['target', 'enzyme', 'carrier', 'transporter']:
        # Each category lives under a pluralised container, e.g. targets/target.
        proteins = drug.findall('{ns}{cat}s/{ns}{cat}'.format(ns=ns, cat=category))
        for protein in proteins:
            row = {'drugbank_id': drugbank_id, 'category': category}
            row['organism'] = protein.findtext('{}organism'.format(ns))
            row['known_action'] = protein.findtext('{}known-action'.format(ns))
            actions = protein.findall(actions_path)
            # Ignore action elements without text; joining None would raise.
            row['actions'] = '|'.join(action.text for action in actions if action.text)
            uniprot_ids = [polypep.text for polypep in protein.findall(uniprot_path)]
            if len(uniprot_ids) != 1:
                continue
            row['uniprot_id'] = uniprot_ids[0]
            # Default to '' so a protein with no textile-format references
            # yields an empty pubmed_ids field instead of a TypeError in the
            # regex search (findtext returns None when nothing matches).
            ref_text = protein.findtext(references_path, default='')
            pmids = pubmed_pattern.findall(ref_text)
            row['pubmed_ids'] = '|'.join(pmids)
            protein_rows.append(row)

protein_df = pandas.DataFrame.from_dict(protein_rows)
# Read our uniprot to entrez_gene mapping
response = requests.get('http://git.dhimmel.com/uniprot/data/map/GeneID.tsv.gz', stream=True)
# Stop on an HTTP error status rather than handing an error page to the gzip
# reader, which would fail with a misleading decompression error.
response.raise_for_status()
try:
    # Decompress and decode the raw response stream on the fly.
    text = io.TextIOWrapper(gzip.GzipFile(fileobj=response.raw))
    uniprot_df = pandas.read_table(text, engine='python')
finally:
    # A streamed response keeps its connection open until it is closed.
    response.close()
# Align column names with protein_df so the two tables share 'uniprot_id'.
uniprot_df = uniprot_df.rename(columns={'uniprot': 'uniprot_id', 'GeneID': 'entrez_gene_id'})
# Inner-join the protein records with the UniProt mapping, keep a fixed column
# order, and write the result as a tab-separated file.
# No join key is given, so pandas joins on the columns the two frames share
# (presumably only 'uniprot_id' -- verify against the mapping file's header).
columns = [
    'drugbank_id', 'category', 'uniprot_id', 'entrez_gene_id',
    'organism', 'known_action', 'actions', 'pubmed_ids',
]
entrez_df = pandas.merge(protein_df, uniprot_df, how='inner')[columns]
path = os.path.join('data', 'proteins.tsv')
entrez_df.to_csv(path, sep='\t', index=False)