%matplotlib inline
%pylab inline
from __future__ import print_function, division
import warnings
import psycopg2
import pandas as pd
pd.set_option('display.max_colwidth', 1000)
from IPython.display import HTML, Javascript
Populating the interactive namespace from numpy and matplotlib
# Get the base URL of the VM:
display(Javascript('IPython.notebook.kernel.execute("current_url = " + "\'"+document.URL+"\'");'))
# Base URL for report cards...
# NB This currently uses the main (external) ChEMBL instance, as myChEMBL doesn't yet offer report cards
report_url = 'https://www.ebi.ac.uk/chembl'
cur_base_url = current_url.split('http://')[1].split('/')[0]
base_url = 'localhost:8000' if (cur_base_url == 'localhost:9612' or cur_base_url == '127.0.0.1:9612') else current_url.split('http://')[1].split(':')[0] + ':8000'
img_url = 'http://' + base_url + '/chemblws'
img_url
'http://tannin.windows.ebi.ac.uk:8000/chemblws'
# PostgreSQL connection details...
username, password, hostname, database = 'mychembl', 'read', 'localhost', 'chembl_21'
# Connect to PostgreSQL...
conn = psycopg2.connect(host=hostname, database=database, user=username, password=password)
cursor = conn.cursor()
Here, we query myChEMBL for a list of marketed drugs, and for any alternative salt/solvate forms of those drugs. This is done by obtaining the parent form (i.e. the bioactive component) for each drug, and requesting all versions (i.e. salt/solvate forms) containing that parent.
This is done because data may be recorded in the literature for versions of the drug other than the marketed form. Interest is usually in the parent form (i.e. stripped of any salt/solvate components), as this is the bioactive component (ignoring prodrugs and active metabolites for the present). Thus, to ensure the greatest amount of data is obtained for a given bioactive compound, it normally makes sense to extend the query out to versions sharing the same parent.
Note that in some cases this might not be appropriate, as different salt forms might not behave the same in certain assays (e.g. dissolution rates and hence oral bioavailabilities might differ). If this was of concern, the query below could be simplified so as only to retrieve versions flagged as the marketed drugs (a simplified version is included but commented out).
In practice, the differences between the two approaches appear to be fairly small, and the more inclusive query is used mainly for illustrative purposes.
# Get list of marketed drugs, including any alternative salt forms...
drugs_sql = """
select
c.chembl_id as parent_chembl_id
, a.chembl_id
, a.pref_name
, d.canonical_smiles as parent_smiles
from
molecule_dictionary a
, molecule_hierarchy b
, chembl_id_lookup c
, compound_structures d
, ( -- List of parent structures (i.e. the bioactive form) of marketed drugs
select distinct
b.parent_molregno
from
molecule_dictionary a
, molecule_hierarchy b
where
a.molregno = b.molregno
and a.max_phase = 4
) e
where
a.molregno = b.molregno
and b.parent_molregno = c.entity_id and c.entity_type = 'COMPOUND'
and b.parent_molregno = d.molregno
and b.parent_molregno = e.parent_molregno -- restrict to cases where parent is that of as marketed drug
order by
parent_chembl_id
, chembl_id
"""
# Simplified query that doesn't pull in other salt forms...
# drugs_sql = """
# select
# c.chembl_id as parent_chembl_id
# , a.chembl_id
# , a.pref_name
# , d.canonical_smiles as parent_smiles
# from
# molecule_dictionary a
# , molecule_hierarchy b
# , chembl_id_lookup c
# , compound_structures d
# where
# a.molregno = b.molregno
# and b.parent_molregno = c.entity_id and c.entity_type = 'COMPOUND'
# and b.parent_molregno = d.molregno
# and a.max_phase = 4
# order by
# parent_chembl_id
# , chembl_id
# """
# Run query...
cursor.execute(drugs_sql)
# Get column names...
columns = [x[0] for x in cursor.description]
# Retrieve data...
rows = cursor.fetchall()
# Convert to Pandas dataframe...
drugs = pd.DataFrame(rows, columns=columns)
drugs.shape
(2947, 4)
drugs.head(10)
parent_chembl_id | chembl_id | pref_name | parent_smiles | |
---|---|---|---|---|
0 | CHEMBL1000 | CHEMBL1000 | CETIRIZINE | OC(=O)COCCN1CCN(CC1)C(c2ccccc2)c3ccc(Cl)cc3 |
1 | CHEMBL1000 | CHEMBL1201113 | CETIRIZINE HYDROCHLORIDE | OC(=O)COCCN1CCN(CC1)C(c2ccccc2)c3ccc(Cl)cc3 |
2 | CHEMBL1000 | CHEMBL1607273 | CETIRIZINE DIHYDROCHLORIDE | OC(=O)COCCN1CCN(CC1)C(c2ccccc2)c3ccc(Cl)cc3 |
3 | CHEMBL1002 | CHEMBL1002 | LEVOSALBUTAMOL | CC(C)(C)NC[C@H](O)c1ccc(O)c(CO)c1 |
4 | CHEMBL1002 | CHEMBL1201061 | LEVALBUTEROL HYDROCHLORIDE | CC(C)(C)NC[C@H](O)c1ccc(O)c(CO)c1 |
5 | CHEMBL1002 | CHEMBL2062258 | LEVALBUTEROL TARTRATE | CC(C)(C)NC[C@H](O)c1ccc(O)c(CO)c1 |
6 | CHEMBL1002 | CHEMBL2106337 | LEVALBUTEROL SULFATE | CC(C)(C)NC[C@H](O)c1ccc(O)c(CO)c1 |
7 | CHEMBL1004 | CHEMBL1004 | DOXYLAMINE | CN(C)CCOC(C)(c1ccccc1)c2ccccn2 |
8 | CHEMBL1004 | CHEMBL1200392 | DOXYLAMINE SUCCINATE | CN(C)CCOC(C)(c1ccccc1)c2ccccn2 |
9 | CHEMBL1005 | CHEMBL1005 | REMIFENTANIL | CCC(=O)N(c1ccccc1)C2(CCN(CCC(=O)OC)CC2)C(=O)OC |
# Take a copy, as the dataframe will be altered...
drugs2 = drugs.copy()
# Add images for parent structure and salt form(s)...
drugs2['parent_img'] = drugs2['parent_chembl_id'].apply(lambda x: '<img src="{}/image/{}?dimensions=250" >'.format(img_url, x))
drugs2['version_img'] = drugs2['chembl_id'].apply(lambda x: '<img src="{}/image/{}?dimensions=250" >'.format(img_url, x))
# Add links to report cards (NB these are currently calls out to the main ChEMBL instance, as myChEMBL doesn't yet offer report cards)...
drugs2['parent_chembl_id'] = drugs2['parent_chembl_id'].apply(lambda x: '<a target="_blank" href="{}/compound/inspect/{}">{}</a>'.format(report_url, x, x))
drugs2['chembl_id'] = drugs2['chembl_id'].apply(lambda x: '<a target="_blank" href="{}/compound/inspect/{}">{}</a>'.format(report_url, x, x))
# Subset and reorder columns for easier inspection...
drugs2 = drugs2[['parent_chembl_id', 'parent_img', 'chembl_id', 'version_img', 'pref_name', 'parent_smiles']]
HTML(drugs2.head().drop('parent_smiles', 1).to_html(escape=False))
# HTML(drugs2.drop('parent_smiles', 1).to_html(escape=False)) # Uncomment to view full table
parent_chembl_id | parent_img | chembl_id | version_img | pref_name | |
---|---|---|---|---|---|
0 | CHEMBL1000 | CHEMBL1000 | CETIRIZINE | ||
1 | CHEMBL1000 | CHEMBL1201113 | CETIRIZINE HYDROCHLORIDE | ||
2 | CHEMBL1000 | CHEMBL1607273 | CETIRIZINE DIHYDROCHLORIDE | ||
3 | CHEMBL1002 | CHEMBL1002 | LEVOSALBUTAMOL | ||
4 | CHEMBL1002 | CHEMBL1201061 | LEVALBUTEROL HYDROCHLORIDE |
This query pulls back in vivo PK data for humans and a variety of species used in drug development and safety studies.
# ADME data query...
data_sql = """
SELECT
cil.chembl_id as chembl_id
, act.standard_type as data_type
, act.standard_relation as modifier
, act.standard_value as value
, act.standard_units as units
, ass.assay_organism as organism
, ass.assay_tissue as tissue
, td.chembl_id as target_id
FROM
activities act
, assays ass
, target_dictionary td
, chembl_id_lookup cil
WHERE
act.assay_id = ass.assay_id
AND td.tid = ass.tid
AND act.standard_value IS NOT NULL
AND act.data_validity_comment IS NULL
AND((act.standard_type = 'Cmax'
AND act.standard_units = 'nM')
OR (act.standard_type = 'F'
AND act.standard_units = '%%')
OR(act.standard_type = 'Tmax'
AND act.standard_units = 'hr')
OR (act.standard_type = 'T1/2'
AND act.standard_units = 'hr')
OR (act.standard_type = 'Vd'
AND act.standard_units = 'L.kg-1')
OR (act.standard_type = 'Vdss'
AND act.standard_units = 'L.kg-1')
OR (act.standard_type = 'CL'
AND act.standard_units = 'mL.min-1.kg-1'))
AND ass.assay_test_type = 'In vivo'
AND (ass.assay_tissue IS NULL
OR lower(ass.assay_tissue) NOT IN ('blood', 'liver', 'adipose', 'spleen', 'prostate', 'skin', 'jejunum', 'lung', 'duodenum', 'hypothalamus', 'cortex', 'pancreas', 'interstitial fluid', 'lung epithelial lining fluid', 'small intestine', 'adrenal gland', 'heart', 'retina', 'brain', 'tumour', 'interstitial fluid (thigh)', 'stomach', 'kidney', 'intestinal segment', 'brain', 'csf', 'liver', 'Pancreas', 'testes', 'hypothalamus', 'peritoneal fluid', 'adrenals', 'intestine', 'gi tract', 'choroid/sclera', 'not recorded', 'muscle', 'kidney', 'bile', 'cornea', 'ovary'))
AND upper(ass.assay_organism) IN ('HOMO SAPIENS', 'RATTUS NORVEGICUS', 'MUS MUSCULUS', 'CANIS LUPUS FAMILIARIS', 'MACACA MULATTA', 'MACACA FASCICULARIS','MACACA','CERCOPITHECIDAE','MARMOSETS', 'PRIMATES','MONKEY')
AND act.molregno = cil.entity_id AND cil.entity_type = 'COMPOUND'
ORDER BY
chembl_id
, target_id
"""
# Run query...
cursor.execute(data_sql)
# Get column names...
columns = [x[0] for x in cursor.description]
# Retrieve data...
rows = cursor.fetchall()
# Convert to Pandas dataframe...
data = pd.DataFrame(rows, columns=columns)
data.shape
(41380, 8)
data.head()
chembl_id | data_type | modifier | value | units | organism | tissue | target_id | |
---|---|---|---|---|---|---|---|---|
0 | CHEMBL100049 | T1/2 | = | 3.28 | hr | Canis lupus familiaris | None | CHEMBL373 |
1 | CHEMBL100181 | T1/2 | = | 0.6 | hr | Canis lupus familiaris | None | CHEMBL373 |
2 | CHEMBL100367 | Vd | = | 0.29 | L.kg-1 | Rattus norvegicus | None | CHEMBL376 |
3 | CHEMBL100367 | T1/2 | = | 5.7 | hr | Rattus norvegicus | None | CHEMBL376 |
4 | CHEMBL100367 | Vd | = | 0.34 | L.kg-1 | Rattus norvegicus | None | CHEMBL376 |
# Take a copy, as the dataframe will be altered...
data2 = data.copy()
# Add images...
data2['img'] = data2['chembl_id'].apply(lambda x: '<img src="{}/image/{}?dimensions=250" >'.format(img_url, x))
# Add links to report cards (NB these are currently calls out to the main ChEMBL instance, as myChEMBL doesn't yet offer report cards)...
data2['chembl_id'] = data2['chembl_id'].apply(lambda x: '<a target="_blank" href="{}/compound/inspect/{}">{}</a>'.format(report_url, x, x))
data2['target_id'] = data2['target_id'].apply(lambda x: '<a target="_blank" href="{}/target/inspect/{}">{}</a>'.format(report_url, x, x))
# Subset and reorder columns for easier inspection...
data2 = data2[['chembl_id', 'img', 'data_type', 'modifier', 'value', 'units', 'organism', 'tissue', 'target_id']]
HTML(data2.head(5).to_html(escape=False))
# HTML(data2.head(5).to_html(escape=False)) # NB Uncomment to render all rows
chembl_id | img | data_type | modifier | value | units | organism | tissue | target_id | |
---|---|---|---|---|---|---|---|---|---|
0 | CHEMBL100049 | T1/2 | = | 3.28 | hr | Canis lupus familiaris | None | CHEMBL373 | |
1 | CHEMBL100181 | T1/2 | = | 0.6 | hr | Canis lupus familiaris | None | CHEMBL373 | |
2 | CHEMBL100367 | Vd | = | 0.29 | L.kg-1 | Rattus norvegicus | None | CHEMBL376 | |
3 | CHEMBL100367 | T1/2 | = | 5.7 | hr | Rattus norvegicus | None | CHEMBL376 | |
4 | CHEMBL100367 | Vd | = | 0.34 | L.kg-1 | Rattus norvegicus | None | CHEMBL376 |
# Merge Pandas data frames...
# NB This could obviously also be done in the database using SQL.
df = pd.merge(drugs, data, how='inner', left_on='chembl_id', right_on='chembl_id')
df.shape
(5013, 11)
df.head()
parent_chembl_id | chembl_id | pref_name | parent_smiles | data_type | modifier | value | units | organism | tissue | target_id | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | CHEMBL1005 | CHEMBL1005 | REMIFENTANIL | CCC(=O)N(c1ccccc1)C2(CCN(CCC(=O)OC)CC2)C(=O)OC | CL | = | 37 | mL.min-1.kg-1 | Homo sapiens | None | CHEMBL372 |
1 | CHEMBL1005 | CHEMBL1005 | REMIFENTANIL | CCC(=O)N(c1ccccc1)C2(CCN(CCC(=O)OC)CC2)C(=O)OC | Vdss | = | 0.4 | L.kg-1 | Homo sapiens | None | CHEMBL612558 |
2 | CHEMBL1005 | CHEMBL1005 | REMIFENTANIL | CCC(=O)N(c1ccccc1)C2(CCN(CCC(=O)OC)CC2)C(=O)OC | T1/2 | = | 0.8 | hr | Homo sapiens | None | CHEMBL612558 |
3 | CHEMBL1005 | CHEMBL1005 | REMIFENTANIL | CCC(=O)N(c1ccccc1)C2(CCN(CCC(=O)OC)CC2)C(=O)OC | CL | = | 37 | mL.min-1.kg-1 | Homo sapiens | None | CHEMBL612558 |
4 | CHEMBL1009 | CHEMBL1009 | LEVODOPA | N[C@@H](Cc1ccc(O)c(O)c1)C(=O)O | CL | = | 23 | mL.min-1.kg-1 | Homo sapiens | None | CHEMBL372 |
# Uncomment to save data as spreadsheet...
# df.to_csv('myChEMBL_drugs_ADME.csv')
# Take a copy, as the dataframe will be altered...
df2 = df.copy()
# Add images...
df2['parent_img'] = df2['parent_chembl_id'].apply(lambda x: '<img src="{}/image/{}?dimensions=250" >'.format(img_url, x))
# Add links to report cards (NB these are currently calls out to the main ChEMBL instance, as myChEMBL doesn't yet offer report cards)...
df2['parent_chembl_id'] = df2['parent_chembl_id'].apply(lambda x: '<a target="_blank" href="{}/compound/inspect/{}">{}</a>'.format(report_url, x, x))
df2['chembl_id'] = df2['chembl_id'].apply(lambda x: '<a target="_blank" href="{}/compound/inspect/{}">{}</a>'.format(report_url, x, x))
df2['target_id'] = df2['target_id'].apply(lambda x: '<a target="_blank" href="{}/target/inspect/{}">{}</a>'.format(report_url, x, x))
# Subset and reorder columns for easier inspection...
df2 = df2[['parent_chembl_id', 'parent_img', 'chembl_id', 'pref_name', 'parent_smiles', 'data_type', 'modifier', 'value', 'units', 'organism', 'tissue', 'target_id']]
HTML(df2.head().drop('parent_smiles', 1).to_html(escape=False)) # NB smiles column excluded from rendering of table only
# HTML(df2.drop('parent_smiles', 1).to_html(escape=False)) # NB Uncomment to render all rows
parent_chembl_id | parent_img | chembl_id | pref_name | data_type | modifier | value | units | organism | tissue | target_id | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | CHEMBL1005 | CHEMBL1005 | REMIFENTANIL | CL | = | 37 | mL.min-1.kg-1 | Homo sapiens | None | CHEMBL372 | |
1 | CHEMBL1005 | CHEMBL1005 | REMIFENTANIL | Vdss | = | 0.4 | L.kg-1 | Homo sapiens | None | CHEMBL612558 | |
2 | CHEMBL1005 | CHEMBL1005 | REMIFENTANIL | T1/2 | = | 0.8 | hr | Homo sapiens | None | CHEMBL612558 | |
3 | CHEMBL1005 | CHEMBL1005 | REMIFENTANIL | CL | = | 37 | mL.min-1.kg-1 | Homo sapiens | None | CHEMBL612558 | |
4 | CHEMBL1009 | CHEMBL1009 | LEVODOPA | CL | = | 23 | mL.min-1.kg-1 | Homo sapiens | None | CHEMBL372 |
# An example of a case where extra data is obtained by looking at other salt forms: CHEMBL539077 (the hydrochloride salt)
# would not have been found using the simple query.
HTML(df2[df2['parent_chembl_id'].str.contains('CHEMBL1422')].drop('parent_smiles', 1).to_html(escape=False))
parent_chembl_id | parent_img | chembl_id | pref_name | data_type | modifier | value | units | organism | tissue | target_id | |
---|---|---|---|---|---|---|---|---|---|---|---|
1006 | CHEMBL1422 | CHEMBL1422 | SITAGLIPTIN | Vdss | = | 2.8 | L.kg-1 | Homo sapiens | None | CHEMBL372 | |
1007 | CHEMBL1422 | CHEMBL1422 | SITAGLIPTIN | CL | = | 6 | mL.min-1.kg-1 | Homo sapiens | None | CHEMBL372 | |
1008 | CHEMBL1422 | CHEMBL1422 | SITAGLIPTIN | CL | = | 6 | mL.min-1.kg-1 | Homo sapiens | None | CHEMBL372 | |
1009 | CHEMBL1422 | CHEMBL1422 | SITAGLIPTIN | CL | = | 1.3 | mL.min-1.kg-1 | Homo sapiens | None | CHEMBL372 | |
1010 | CHEMBL1422 | CHEMBL1422 | SITAGLIPTIN | CL | = | 6 | mL.min-1.kg-1 | Canis lupus familiaris | None | CHEMBL373 | |
1011 | CHEMBL1422 | CHEMBL1422 | SITAGLIPTIN | CL | = | 60 | mL.min-1.kg-1 | Rattus norvegicus | None | CHEMBL376 | |
1012 | CHEMBL1422 | CHEMBL1422 | SITAGLIPTIN | CL | = | 60 | mL.min-1.kg-1 | Rattus norvegicus | None | CHEMBL376 | |
1013 | CHEMBL1422 | CHEMBL1422 | SITAGLIPTIN | T1/2 | = | 1.7 | hr | Rattus norvegicus | Plasma | CHEMBL376 | |
1014 | CHEMBL1422 | CHEMBL1422 | SITAGLIPTIN | T1/2 | = | 1.7 | hr | Rattus norvegicus | None | CHEMBL376 | |
1015 | CHEMBL1422 | CHEMBL1422 | SITAGLIPTIN | Cmax | = | 330 | nM | Rattus norvegicus | None | CHEMBL376 | |
1016 | CHEMBL1422 | CHEMBL1422 | SITAGLIPTIN | CL | = | 60 | mL.min-1.kg-1 | Rattus norvegicus | None | CHEMBL376 | |
1017 | CHEMBL1422 | CHEMBL1422 | SITAGLIPTIN | T1/2 | = | 12 | hr | Homo sapiens | None | CHEMBL612558 | |
1018 | CHEMBL1422 | CHEMBL1422 | SITAGLIPTIN | CL | = | 6 | mL.min-1.kg-1 | Homo sapiens | None | CHEMBL612558 | |
1019 | CHEMBL1422 | CHEMBL1422 | SITAGLIPTIN | Vdss | = | 2.8 | L.kg-1 | Homo sapiens | None | CHEMBL612558 | |
1020 | CHEMBL1422 | CHEMBL1422 | SITAGLIPTIN | CL | = | 28 | mL.min-1.kg-1 | Macaca mulatta | None | CHEMBL614875 | |
1021 | CHEMBL1422 | CHEMBL539077 | None | Cmax | = | 330 | nM | Rattus norvegicus | None | CHEMBL376 | |
1022 | CHEMBL1422 | CHEMBL539077 | None | T1/2 | = | 1.7 | hr | Rattus norvegicus | None | CHEMBL376 | |
1023 | CHEMBL1422 | CHEMBL539077 | None | CL | = | 60 | mL.min-1.kg-1 | Rattus norvegicus | None | CHEMBL376 | |
1024 | CHEMBL1422 | CHEMBL539077 | None | Cmax | = | 330 | nM | Cercopithecidae | Plasma | CHEMBL612558 | |
1025 | CHEMBL1422 | CHEMBL539077 | None | CL | = | 60 | mL.min-1.kg-1 | Rattus norvegicus | None | CHEMBL612558 | |
1026 | CHEMBL1422 | CHEMBL539077 | None | Cmax | = | 330 | nM | Rattus norvegicus | Plasma | CHEMBL612558 | |
1027 | CHEMBL1422 | CHEMBL539077 | None | CL | = | 6 | mL.min-1.kg-1 | Canis lupus familiaris | None | CHEMBL612558 | |
1028 | CHEMBL1422 | CHEMBL539077 | None | Cmax | = | 2200 | nM | Canis lupus familiaris | Plasma | CHEMBL612558 | |
1029 | CHEMBL1422 | CHEMBL539077 | None | CL | = | 28 | mL.min-1.kg-1 | Cercopithecidae | None | CHEMBL612558 |