#!/usr/bin/env python
# coding: utf-8

# # myChEMBL webservices version 2.x
# 
# ### myChEMBL team, ChEMBL Group, EMBL-EBI.

# ## Introduction
# 
# This notebook will provide some examples of using the **`myChEMBL`** webservices.
# 
# The web services have recently been updated to the 2.x version and are **not** backwards compatible.
# The main features introduced by this latest version are:
# 
#  - more resources
#  - filtering
#  - paging
#  - ordering
#  
# You can call the web services in the following two ways: 
#  
# 1) Directly _via_ URLs (see the 'Web Services' link on the **`myChEMBL`** LaunchPad for a list of the available endpoints). The advantage of using the URLs is that it is language-agnostic: although the examples below use Python, any other language with a library for executing HTTP requests would do just as well.
# 
# 
# 2) Using the API provided by the Python package **`chembl_webresource_client`**. This has the following advantages:
#  - the usage is simpler
#  - some extra functionality is available
#  - there are performance benefits
# 
# For the reasons above, we recommend using the API where possible.
# 
# Note that the **`chembl_webresource_client`** module is already installed on the **`myChEMBL`** VM; if you wish to use it on other machines, it can be installed using **`pip`**.
# 
# Please note that the code below attempts to balance clarity and brevity, and is not intended to be a template for production code: error checking, for example, should be much more thorough in practice. 

# ## Configuration and setup

# In[1]:


import logging

from collections import Counter
from operator import itemgetter

from lxml import etree

from rdkit import Chem
from rdkit.Chem import Draw 
from rdkit.Chem.Draw import IPythonConsole

from IPython.display import Image, display


# In[2]:


# Python modules used for API access...
# By default, the API connects to the main ChEMBL database; set it to use the local version (i.e. myChEMBL) instead...
# NOTE(review): the URL is assigned before `new_client` is imported below; presumably the
# client reads this setting when it is created, so confirm before reordering these lines.
from chembl_webresource_client.settings import Settings
Settings.Instance().NEW_CLIENT_URL = 'http://localhost/chemblws'

from chembl_webresource_client.new_client import new_client


# ## List of available resources
# It's easy to get a list of available resources by invoking:

# In[3]:


# Collect every attribute name of `new_client` that does not begin with an
# underscore, then show the names and how many there are.
available_resources = [resource for resource in dir(new_client) if resource[:1] != '_']
print(available_resources)
print(len(available_resources))


# Which means there are 20 different types of resources available _via_ web services. In this notebook only the most important of these are covered.

# ## Molecules
# 
# Molecule records may be retrieved in a number of ways, such as lookup of single molecules using various identifiers or searching for compounds _via_ substructure or similarity.

# In[4]:


# Obtain the handler for the `molecule` resource, ask it for JSON output and
# confirm the database is reachable by counting the molecules it holds...

molecule = new_client.molecule
molecule.set_format('json')
molecule_count = len(molecule.all())
print("%s molecules available in myChEMBL_20" % molecule_count)


# ### Getting a single molecule
# 
# In order to retrieve a single molecule from the web services, you need to know its unique and unambiguous identifier. In the case of the molecule resource, this can be one of three types:
# 
#  1. ChEMBL_ID
#  2. InChI Key
#  3. Canonical SMILES (non-canonical SMILES will be covered later in this notebook)

# In[5]:


# One identifier of each accepted kind, in the order listed above:
identifiers = (
    'CHEMBL25',                     # 1. ChEMBL ID
    'BSYNRYMUTXBXSQ-UHFFFAOYSA-N',  # 2. InChI Key
    'CC(=O)Oc1ccccc1C(=O)O',        # 3. Canonical SMILES
)
# Look each one up in turn...
m1, m2, m3 = map(molecule.get, identifiers)
# ...and compare the three records, which are expected to hold the same data:
m1 == m2 and m2 == m3


# ### ChEMBL ID
# 
# All the main entities in the ChEMBL database have a ChEMBL ID. It is a stable identifier designed for straightforward lookup of data.

# In[6]:


# Lapatinib, the bioactive component of the anti-cancer drug Tykerb

chembl_id = "CHEMBL554" 


# In[7]:


# Get compound record using client...

record_via_client = molecule.get(chembl_id)

# Bare expression: in a notebook this displays the record as the cell output.
record_via_client


# As noted above, URLs may also be used to access the data, and, although the examples here use Python, any other language with a library for executing HTTP requests would do as well.

# In[8]:


# Import a Python module to allow URL-based access...
# `requests` issues the HTTP calls; `quote` is used further down to escape
# SMILES strings before they are placed in a URL.
# NOTE: `urllib.quote` is the Python 2 location of this function.

import requests
from urllib import quote

# Stem of URL for local version of web services...

url_stem = "http://localhost/chemblws"


# In[9]:


# Note that, for historical reasons, the URL-based webservices return XML by default, so JSON
# must be requested explicitly by appending '.json' to the URL.

# Assemble the URL and fetch it (despite its name, `request` holds the response)...
url = "".join([url_stem, "/molecule/", chembl_id, ".json"])
request = requests.get(url)

print(url)

# Check the status of the response: should be 200 if everything went OK...
print(request.status_code)


# In[10]:


# Decode the JSON body of the response obtained from the URL request.
record_via_url = request.json()
record_via_url 


# Note that in both cases we are getting exactly the same results:

# In[11]:


record_via_client == record_via_url


# When retrieved in JSON format, a record is a nested dictionary, so to get, say, a SMILES string we have to write:

# In[12]:


# Two levels of keys: the structures sub-dictionary, then the SMILES inside it.
smiles_from_json = record_via_client['molecule_structures']['canonical_smiles']


# It is possible to retrieve data in XML format as well:

# In[13]:


# Switch the handler to XML output and fetch the same compound again...

molecule.set_format('xml')
raw_record = molecule.get(chembl_id)
xml = raw_record.encode('utf-8')
# (uncomment the next line to look at the raw document)
#print xml

# At this point the XML is only text; it has to be parsed (here with the
# lxml.etree module) before any data can be pulled out of it...
root = etree.fromstring(xml).getroottree()


# In[14]:


# Extract SMILES via xpath: the query yields a list of matches, of which the
# first is taken...

smiles_matches = root.xpath("/molecule/molecule_structures/canonical_smiles/text()")
smiles_from_xml = smiles_matches[0]

print(smiles_from_xml)
print(smiles_from_xml == smiles_from_json)


# In[15]:


# Serialise the parsed tree again, this time indented for reading...

print(etree.tostring(root, pretty_print=True))


# ### InChIKey
# 
# Compound records may also be retrieved _via_ InChI Key lookup.

# In[16]:


# InChI Key for Lapatinib
inchi_key = "BCFGMOOMADDAQU-UHFFFAOYSA-N"

# Lookup through the client (switching it back to JSON output first)...
molecule.set_format('json')
record_via_client = molecule.get(inchi_key)

# ...and the same lookup through the URL endpoint
url = "".join([url_stem, "/molecule/", inchi_key, ".json"])
response = requests.get(url)
record_via_url = response.json()

print(url)

# Compare the records obtained by the two routes
print(record_via_url == record_via_client)


# ### SMILES
# 
# Compound records may also be retrieved _via_ SMILES lookup.
# 
# The purpose of the `get` method is to return objects identified by their unique and unambiguous properties.
# This is why SMILES provided as arguments to the `get` method need to be canonical.
# But you can still search for molecules using non-canonical SMILES - this functionality will be covered later in this notebook.

# In[17]:


# Canonical SMILES for Lapatinib
canonical_smiles = "CS(=O)(=O)CCNCc1oc(cc1)c2ccc3ncnc(Nc4ccc(OCc5cccc(F)c5)c(Cl)c4)c3c2"

# Lookup through the client, with JSON output selected...
molecule.set_format('json')
record_via_client = molecule.get(canonical_smiles)

# ...and through the URL endpoint; the SMILES is passed through `quote`
# before it is placed in the URL
url = "".join([url_stem, "/molecule/", quote(canonical_smiles), ".json"])
response = requests.get(url)
record_via_url = response.json()

print(url)

# Compare the records obtained by the two routes
record_via_url == record_via_client


# ### Batch queries
# 
# Multiple records may be requested at once. The `get` method can accept a list of homogeneous identifiers.

# In[18]:


# Three batches of identifiers, one batch per identifier kind. Each list is
# homogeneous, as `get` requires.
chembl_id_batch = ['CHEMBL6498', 'CHEMBL6499', 'CHEMBL6505']
inchi_key_batch = [
    'XSQLHVPPXBBUPP-UHFFFAOYSA-N',
    'JXHVRXRRSSBGPY-UHFFFAOYSA-N',
    'TUHYVXGNMOGVMR-GASGPIRDSA-N',
]
smiles_batch = [
    'CNC(=O)c1ccc(cc1)N(CC#C)Cc2ccc3nc(C)nc(O)c3c2',
    'Cc1cc2SC(C)(C)CC(C)(C)c2cc1\\N=C(/S)\\Nc3ccc(cc3)S(=O)(=O)N',
    'CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@H]3CCCN3C(=O)C(CCCCN)CCCCN)C(C)(C)C)C(=O)O',
]

records1 = molecule.get(chembl_id_batch)
records2 = molecule.get(inchi_key_batch)
records3 = molecule.get(smiles_batch)

# Compare the three result lists with one another
records1 == records2 and records2 == records3


# The same can be done _via_ urls:

# In[19]:


# In the URL form, the identifiers follow '/molecule/set/' separated by ';'.

chembl_id_set = ('CHEMBL6498', 'CHEMBL6499', 'CHEMBL6505')
url1 = url_stem + "/molecule/set/%s;%s;%s" % chembl_id_set + ".json"
records1 = requests.get(url1).json()

inchi_key_set = (
    'XSQLHVPPXBBUPP-UHFFFAOYSA-N',
    'JXHVRXRRSSBGPY-UHFFFAOYSA-N',
    'TUHYVXGNMOGVMR-GASGPIRDSA-N',
)
url2 = url_stem + "/molecule/set/%s;%s;%s" % inchi_key_set + ".json"
records2 = requests.get(url2).json()

# Each SMILES is passed through `quote` before being inserted into the URL
smiles_set = (
    'CNC(=O)c1ccc(cc1)N(CC#C)Cc2ccc3nc(C)nc(O)c3c2',
    'Cc1cc2SC(C)(C)CC(C)(C)c2cc1\\N=C(/S)\\Nc3ccc(cc3)S(=O)(=O)N',
    'CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@H]3CCCN3C(=O)C(CCCCN)CCCCN)C(C)(C)C)C(=O)O',
)
url3 = url_stem + "/molecule/set/%s;%s;%s" % tuple(map(quote, smiles_set)) + ".json"
records3 = requests.get(url3).json()

print(url1)
print(url2)
print(url3)

# Compare the three responses with one another
records1 == records2 and records2 == records3


# Please note that the length of a URL can't be more than 4000 characters. This is why the URL-based approach should not be used for very long lists of identifiers. Also, the `molecule.get` call needs to be modified slightly in that case.

# In[20]:


# Generate a list of 300 ChEMBL IDs (N.B. not all will be valid)...
# `range(1, 301)` covers the numbers 1 to 300 inclusive, giving CHEMBL1 ... CHEMBL300.

chembl_ids = ['CHEMBL{}'.format(x) for x in range(1, 301)]

# Get compound records, note `molecule_chembl_id` named parameter.
# Named parameters should always be used for longer lists

records = molecule.get(molecule_chembl_id=chembl_ids)

# Number of records actually returned (displayed as the cell output in a notebook).
len(records)


# Note that we expect to see a number that is less than 300 (169). This is because for some identifiers in the range `(CHEMBL1, ..., CHEMBL300)` there are no molecules mapped to them.

# ### Filtering
# All resources available through ChEMBL web services can be filtered.
# Some examples of filtering applied to molecules:
# 
# 1. Get all approved drugs
# 2. Get all molecules in ChEMBL with no Rule-of-Five violations
# 3. Get all biotherapeutic molecules
# 4. Return molecules with molecular weight <= 300
# 5. Return molecules with molecular weight <= 300 AND pref_name ends with -nib

# In[21]:


# First, filtering using the client:
# Keyword names are built from field names joined by double underscores, with an
# optional trailing operator such as `__lte`, `__isnull` or `__iendswith`.

# 1. Get all approved drugs
approved_drugs = molecule.filter(max_phase=4)

# 2. Get all molecules in ChEMBL with no Rule-of-Five violations
no_violations = molecule.filter(molecule_properties__num_ro5_violations=0)

# 3. Get all biotherapeutic molecules
biotherapeutics = molecule.filter(biotherapeutic__isnull=False)

# 4. Return molecules with molecular weight <= 300
light_molecules = molecule.filter(molecule_properties__mw_freebase__lte=300)

# 5. Return molecules with molecular weight <= 300 AND pref_name ends with nib
#    (the two conditions are applied by chaining two `filter` calls)
light_nib_molecules = molecule.filter(molecule_properties__mw_freebase__lte=300).filter(pref_name__iendswith="nib")


# In[22]:


# Secondly, filtering using the url endpoint:
# The same five filters as with the client, expressed as query-string
# parameters on the `/molecule.json` resource. Each response is decoded from
# JSON straight away.

# 1. Get all approved drugs
url_1 = url_stem + "/molecule.json?max_phase=4"
url_approved_drugs = requests.get(url_1).json()

# 2. Get all molecules in ChEMBL with no Rule-of-Five violations
url_2 = url_stem + "/molecule.json?molecule_properties__num_ro5_violations=0"
url_no_violations = requests.get(url_2).json()
# The result used to be stored under the misspelt name below; it is kept as an
# alias so that anything still referring to the old name keeps working.
ulr_no_violations = url_no_violations

# 3. Get all biotherapeutic molecules
url_3 = url_stem + "/molecule.json?biotherapeutic__isnull=false"
url_biotherapeutics = requests.get(url_3).json()

# 4. Return molecules with molecular weight <= 300
url_4 = url_stem + "/molecule.json?molecule_properties__mw_freebase__lte=300"
url_light_molecules = requests.get(url_4).json()

# 5. Return molecules with molecular weight <= 300 AND pref_name ends with nib
#    (conditions are combined by joining the parameters with '&')
url_5 = url_stem + "/molecule.json?molecule_properties__mw_freebase__lte=300&pref_name__iendswith=nib"
url_light_nib_molecules = requests.get(url_5).json()

print(url_1)
print(url_2)
print(url_3)
print(url_4)
print(url_5)


# ### Differences between filtering with client and url endpoint - paging
# There are some important differences between filtering results returned by the client and those generated using the URL endpoint.
# Let's have a look at them.

# In[23]:


# First off, the two results do not compare equal:
print(approved_drugs == url_approved_drugs)

# Not surprisingly, the url endpoint produced JSON data, which has been parsed into a python dict:
print(type(url_approved_drugs))

# Whereas the client has returned an object of type `QuerySet`
print(type(approved_drugs))


# In[24]:


# Let's examine what data the python dict contains:
url_approved_drugs


# ### Page structure
# 
# The dictionary contains two top-level keys:
# 
# 1. `molecules` array
# 2. `page_meta` dictionary
# 
# This  means that by requesting data from the url-endpoint we are not getting the whole result set but a single page.
# The page consists of a single portion of data (`molecules` array) and some meta information about the page and whole result set (`page_meta` dictionary).

# In[25]:


# The default size of a single page is 20 results:
len(url_approved_drugs['molecules'])


# In[26]:


# But it can be extended up to 1000 results by providing the `limit` argument:
url = url_stem + "/molecule.json?max_phase=4&limit=200"
response = requests.get(url)
bigger_page = response.json()

print(url)
print(len(bigger_page['molecules']))


# In[27]:


# Let's see what data is provided in the `page_meta` dictionary:
url_approved_drugs['page_meta']


# It gives following information:
# 1. `limit` - current size of the page (the actual amount of data can be smaller if the whole result set is smaller than page size or we are looking at the last page)
# 2. `offset` - the difference between first element in the whole result set and the first element on current page 
# 3. `next` - url pointing to the next page (if it exists)
# 4. `previous` - url pointing to the previous page (if it exists)
# 5. `total_count` - number of elements in the whole result set
# 
# This means that in order to get the whole result set we need to loop through the pages:

# In[28]:


# Getting all approved drugs using the url endpoint: fetch the first page with
# the largest page size, then keep following `page_meta['next']` until the
# server reports no further page.
localhost = "http://localhost/"
url_approved_drugs = requests.get(localhost + "chemblws/molecule.json?max_phase=4&limit=1000").json()

# Start from a copy so that accumulating later pages does not also grow the
# 'molecules' list that belongs to the first page's dictionary.
results = list(url_approved_drugs['molecules'])

while url_approved_drugs['page_meta']['next']:
    # `next` is a link relative to the host. Whether it starts with '/' is not
    # visible here, so join it to the host with exactly one slash in either
    # case instead of producing 'http://localhost//...'.
    next_link = url_approved_drugs['page_meta']['next']
    next_url = localhost.rstrip('/') + '/' + next_link.lstrip('/')
    url_approved_drugs = requests.get(next_url).json()
    results.extend(url_approved_drugs['molecules'])

print(len(results))
# Sanity check: the number of collected records should match the total
# reported by the server on the last page.
print(len(results) == url_approved_drugs['page_meta']['total_count'])


# With the client-generated results, we no longer have to worry about pagination:

# In[29]:


# The QuerySet object returned by the client is a lazily-evaluated iterator
# This means that it's ready to use and it will try to reduce the amount of server requests
# All results are cached as well so they are fetched from server only once.
approved_drugs = molecule.filter(max_phase=4)

# Getting the length of the whole result set is easy:
print(len(approved_drugs))

# So is getting a single element:
print(approved_drugs[123])

# Or a chunk of elements:
print(approved_drugs[2:5])

# It can also be iterated over directly; here the SMILES of every drug that
# has structure data is collected:
drug_smiles = []
for drug in approved_drugs:
    structures = drug['molecule_structures']
    if structures:
        drug_smiles.append(structures['canonical_smiles'])
print(len(drug_smiles))


# ### Ordering results
# Similar to filtering, it's also possible to order the result set; there is a parameter called `order_by` that is responsible for ordering:

# In[30]:


# Approved drugs ordered by ascending molecular weight (lightest first); the
# first element is therefore the lightest drug
approved_lightest_first = molecule.filter(max_phase=4).order_by('molecule_properties__mw_freebase')
lightest_drug = approved_lightest_first[0]
lightest_drug['pref_name']


# In[31]:


# A leading '-' reverses the order (heaviest first); the first element is
# therefore the heaviest drug
approved_heaviest_first = molecule.filter(max_phase=4).order_by('-molecule_properties__mw_freebase')
heaviest_drug = approved_heaviest_first[0]
heaviest_drug['pref_name']


# In[32]:


# The same two queries through the url endpoint, using the `order_by` parameter
url_1 = url_stem + "/molecule.json?max_phase=4&order_by=molecule_properties__mw_freebase"
lightest_page = requests.get(url_1).json()
lightest_drug = lightest_page['molecules'][0]
print(url_1)
print(lightest_drug['pref_name'])

url_2 = url_stem + "/molecule.json?max_phase=4&order_by=-molecule_properties__mw_freebase"
heaviest_page = requests.get(url_2).json()
heaviest_drug = heaviest_page['molecules'][0]
print(url_2)
print(heaviest_drug['pref_name'])


# ### Filtering molecules using SMILES
# It is possible to filter molecules by SMILES

# In[33]:


# Atorvastatin...
smiles = "CC(C)c1c(C(=O)Nc2ccccc2)c(c3ccccc3)c(c4ccc(F)cc4)n1CC[C@@H](O)C[C@@H](O)CC(=O)O"

# By default, the type of search used is 'exact search', which means that only compounds with exactly the same SMILES string will be picked:
result = molecule.filter(molecule_structures__canonical_smiles=smiles)
print(len(result))

# This is equivalent to spelling out the `exact` operator:
result1 = molecule.filter(molecule_structures__canonical_smiles__exact=smiles)
print(len(result1))

# For convenience, there is a shortcut keyword as well:
result2 = molecule.filter(smiles=smiles)
print(len(result2))

# Check that the first hit of each query carries the same name:
print(result[0]['pref_name'] == result1[0]['pref_name'] == result2[0]['pref_name'])

# And because SMILES strings are unique in ChEMBL, this is similar to:
result3 = molecule.get(smiles)
print(result[0]['pref_name'] == result3['pref_name'])


# There are however different filtering operators that can be applied to SMILES; the most important one is called `flexmatch`, which will return all structures described by given SMILES string even if this is non-canonical SMILES.

# In[34]:


# Flexmatch will look for structures that match the given SMILES, ignoring stereo:
records = molecule.filter(molecule_structures__canonical_smiles__flexmatch=smiles)
print(len(records))

# One line per hit: ChEMBL ID followed by the stored canonical SMILES
for record in records:
    hit_id = record["molecule_chembl_id"]
    hit_smiles = record['molecule_structures']['canonical_smiles']
    print("{:15s} : {}".format(hit_id, hit_smiles))


# Unlike with the exact string match, it is possible to retrieve multiple records when a SMILES is used for the `flexmatch` lookup (_i.e._ it is potentially one-to-many instead of one-to-one as the ID lookups are). This is due to the nature of `flexmatch`.
# 
# In our case two structures are returned, CHEMBL1487 (Atorvastatin) and CHEMBL1207181, which is the same structure as the former but with one of the two stereocentres undefined.

# In[35]:


# The same can be achieved using the url endpoint. The SMILES is passed
# through `quote` once and reused in all four query strings:

quoted_smiles = quote(smiles)

url_1 = url_stem + "/molecule.json?molecule_structures__canonical_smiles=" + quoted_smiles
url_2 = url_stem + "/molecule.json?molecule_structures__canonical_smiles__exact=" + quoted_smiles
url_3 = url_stem + "/molecule.json?smiles=" + quoted_smiles
url_4 = url_stem + "/molecule.json?molecule_structures__canonical_smiles__flexmatch=" + quoted_smiles

# Run all four queries first...
exact_match = requests.get(url_1).json()
explicit_exact_match = requests.get(url_2).json()
convenient_shortcut = requests.get(url_3).json()
flexmatch = requests.get(url_4).json()

# ...then report each URL together with the number of molecules on its page
print(url_1)
print(len(exact_match['molecules']))

print(url_2)
print(len(explicit_exact_match['molecules']))

print(url_3)
print(len(convenient_shortcut['molecules']))

print(url_4)
print(len(flexmatch['molecules']))

# The implicit and the explicit exact match are compared with each other
print(exact_match == explicit_exact_match)


# #### A further note on SMILES searches
# 
# The URL-based example above used the HTTP GET method, which means the SMILES are passed _via_ the URL. This can cause problems where the SMILES includes the '/', '\' or '#' characters, for example:

# In[36]:


# CHEMBL477889
# The single backslash in this SMILES is written as an escaped `\\`, matching
# the other SMILES literals in this file, rather than relying on Python
# passing the unrecognised escape sequence `\C` through unchanged.
smiles = "[Na+].CO[C@@H](CCC#C\\C=C/CCCC(C)CCCCC=C)C(=O)[O-]"

# The SMILES is deliberately placed in the URL as-is (no quoting) to show how
# the request fares.
url = "".join([url_stem, "/molecule/", smiles, ".json"])
result = requests.get(url)

print(url)
print(result.ok)
print(result.status_code)


# There are two solutions to this problem:
# 
# 1. When using GET, use `urllib.quote` function
# 2. Use POST with `X-HTTP-Method-Override`: `GET` header

# In[37]:


# Method one: keep using GET, but pass the SMILES through `quote` first
url = "".join([url_stem, "/molecule/", quote(smiles), ".json"])
result_by_get = requests.get(url)

print(url)
print(result_by_get.ok)
print(result_by_get.status_code)


# In[38]:


# Method two: send the SMILES as POST data and add a header asking for the
# request to be treated as a GET
url = url_stem + "/molecule.json"
override_headers = {"X-HTTP-Method-Override": "GET"}
result_by_post = requests.post(url, data={"smiles": smiles}, headers=override_headers)

print(result_by_post.ok)
print(result_by_post.status_code)


# In[39]:


print(smiles)
print(result_by_post.json())
# The GET response is a single record, whereas the POST response is a page,
# so the record is compared with the first entry of that page
print(result_by_get.json() == result_by_post.json()['molecules'][0])


# ### Substructure-searching
# 
# As well as ID lookups, the web services may also be used to perform substructure searches. Currently, only SMILES-based searches are supported, although this could change if there is a need for more powerful search abilities (_e.g._ SMARTS searching).

# In[40]:


# Lapatinib contains the following core...

query = "c4ccc(Nc2ncnc3ccc(c1ccco1)cc23)cc4"

# Parse the query with RDKit (in a notebook the molecule is drawn as the cell output)
Chem.MolFromSmiles(query)


# In[41]:


# Perform the substructure search on the query using the client

substructure = new_client.substructure
records = substructure.filter(smiles=query)


# In[42]:


# ... and using the raw url endpoint, with the query passed through `quote`

url = "".join([url_stem, "/substructure/", quote(query), ".json"])
result = requests.get(url).json()

print(url)
print(result['page_meta']['total_count'])


# In[43]:


# Only the first six hits are drawn, so take that slice once and build both
# the structures and their legends from it. Building the legends from the
# whole of `records` would walk the complete result set and yield more
# legends than molecules.
top_records = records[:6]
mols = [Chem.MolFromSmiles(x['molecule_structures']['canonical_smiles']) for x in top_records]
legends = [str(x["molecule_chembl_id"]) for x in top_records]
Draw.MolsToGridImage(mols, legends=legends, subImgSize=(400, 200), useSVG=False)


# ### Similarity searching
# 
# The web services may also be used to perform SMILES-based similarity searches.

# In[44]:


# Lapatinib
smiles = "CS(=O)(=O)CCNCc1oc(cc1)c2ccc3ncnc(Nc4ccc(OCc5cccc(F)c5)c(Cl)c4)c3c2"


# In[45]:


# Perform similarity search on molecule using client...

# Note that a percentage similarity must be supplied.
similarity = new_client.similarity
res = similarity.filter(smiles=smiles, similarity=85)

len(res)


# In[46]:


# ... and using the raw url endpoint: the quoted SMILES is followed by the
# percentage similarity

url = "".join([url_stem, "/similarity/", quote(smiles), "/85.json"])
result = requests.get(url).json()

print(url)
print(result['page_meta']['total_count'])


# In[47]:


# Only the first six hits are drawn, so take that slice once and build both
# the structures and their legends from it. Building the legends from the
# whole of `res` would walk the complete result set and yield more legends
# than molecules.
top_hits = res[:6]
mols = [Chem.MolFromSmiles(x['molecule_structures']['canonical_smiles']) for x in top_hits]
legends = [str(x["molecule_chembl_id"]) for x in top_hits]
Draw.MolsToGridImage(mols, legends=legends, subImgSize=(400, 200), useSVG=False)


# ### Versions for a parent structure
# 
# The versions (_e.g._ salt forms) for a parent compound may be retrieved for a ChEMBL ID. Keep in mind that a parent structure is one that has had salt/solvate components removed; it corresponds to the bioactive moiety and its use facilitates structure searching, comparison _etc_. A compound without salt/solvate components is its own parent.

# In[48]:


# Neostigmine (a parent)...

chembl_id = "CHEMBL278020" 


# In[49]:


# Ask the `molecule_form` resource for this compound and keep the list stored
# under the 'molecule_forms' key of the response.
records = new_client.molecule_form.get(chembl_id)['molecule_forms']

records


# The ChEMBL ID lookup service may now be used to get the full records for the salt forms...

# In[50]:


# Keep the IDs of the forms whose `parent` flag equals the string 'False'.
# NOTE(review): the flag is compared as text; confirm the service returns a
# string here and not a boolean.
salt_form_ids = [x["molecule_chembl_id"] for x in records if x["parent"] == 'False']

# Fetch the full record of each of those forms and print its SMILES.
# (The loop variable rebinds the notebook-level `chembl_id`.)
for chembl_id in salt_form_ids:
    record = new_client.molecule.get(chembl_id)
    print("{:10s} : {}".format(chembl_id, record['molecule_structures']['canonical_smiles']))


# ### Drug mechanism(s) of action
# 
# The mechanisms of action of marketed drugs may be retrieved.
# 
# Note that this data may not be recorded for the parent structure, but rather for one of its versions. For example, the marketed drug, Tykerb, containing the active ingredient Lapatinib (CHEMBL554) is actually the ditosylate monohydrate (CHEMBL1201179).

# In[51]:


# Molecule forms for Lapatinib are used here: fetch the list of forms once,
# then query the `mechanism` resource for every form in turn...

lapatinib_forms = new_client.molecule_form.get("CHEMBL554")['molecule_forms']

for chembl_id in (x["molecule_chembl_id"] for x in lapatinib_forms):

    print("The recorded mechanisms of action of '{}' are...".format(chembl_id))

    mechanism_records = new_client.mechanism.filter(molecule_chembl_id=chembl_id)

    # Nothing is listed for a form without recorded mechanisms
    if mechanism_records:
        for mech_rec in mechanism_records:
            print("{:10s} : {}".format(mech_rec["molecule_chembl_id"], mech_rec["mechanism_of_action"]))

    # Rule separating one form from the next
    print("-" * 50)


# ### Image query
# 
# The webservice may be used to obtain a PNG image of a compound.

# In[52]:


# Lapatinib ditosylate monohydrate (Tykerb)

chembl_id = "CHEMBL1201179" 


# In[53]:


# Fetch the picture from the `image` resource and wrap it in IPython's `Image`
# so that a notebook renders it as the cell output.
png = new_client.image.get(chembl_id)

Image(png)


# ### Bioactivities
# 
# All bioactivity records for a compound may be retrieved _via_ its ChEMBL ID.
# 

# In[54]:


# Lapatinib

chembl_id = "CHEMBL554" 


# In[55]:


# Filter the `activity` resource by the compound's ChEMBL ID.
records = new_client.activity.filter(molecule_chembl_id=chembl_id)

# Number of matching records, together with the first two of them.
len(records), records[:2]


# ## Targets
# 
# The webservices may also be used to obtain information on biological targets, _i.e._ the entities, such as proteins, cells or organisms, with which compounds interact.
# 

# In[56]:


# Like with any other resource type, a complete list of targets can be requested using the client:
records = new_client.target.all()
len(records)


# In[57]:


# Peek at the first four targets
records[:4]


# In[58]:


# Count target types, skipping targets whose type is empty...

target_types = [x["target_type"] for x in records if x["target_type"]]
counts = Counter(target_types)

# ...and list them from the most to the least frequent
types_by_frequency = sorted(counts.items(), key=itemgetter(1), reverse=True)
for targetType, n in types_by_frequency:
    print("{:30s} {:-4d}".format(targetType, n))


# ### ChEMBL ID
# 
# Data on any target type may be obtained _via_ a lookup of its ChEMBL ID.
# 

# In[59]:


# Receptor protein-tyrosine kinase erbB-2

chembl_id = "CHEMBL1824"


# In[60]:


# Look the target up by its ChEMBL ID; the bare name below displays it in a notebook.
record = new_client.target.get(chembl_id)

record


# Remember that all targets have ChEMBL IDs, not just proteins...

# In[61]:


# SK-BR-3, a cell line over-expressing erbB-2

chembl_id = "CHEMBL613834" 


# In[62]:


# Same lookup as for the protein target, this time with the ID of a cell line.
record = new_client.target.get(chembl_id)

record


# ### UniProt ID
# 
# Data on protein targets may also be obtained using UniProt ID.

# In[63]:


# UniProt ID for erbB-2, a target of Lapatinib

uniprot_id = "P04626"


# In[64]:


# Targets are matched on the accession of their components; the ChEMBL ID and
# preferred name of every match are then listed
records = new_client.target.filter(target_components__accession=uniprot_id)
id_name_pairs = [(x['target_chembl_id'], x['pref_name']) for x in records]
print(id_name_pairs)


# ### Bioactivities
# 
# All bioactivities for a target may be retrieved.

# In[65]:


# Receptor protein-tyrosine kinase erbB-2

chembl_id = "CHEMBL1824"


# In[66]:


# All activity records filed against this target
records = new_client.activity.filter(target_chembl_id=chembl_id)

len(records)


# In[67]:


# Show assays with most recorded bioactivities...

# Tally the records per (assay ID, assay description) pair
assay_counts = Counter((x["assay_chembl_id"], x["assay_description"]) for x in records)

# Order the pairs by falling count and print the five most frequent
assays_by_count = sorted(assay_counts.items(), key=itemgetter(1), reverse=True)
for assay, n in assays_by_count[:5]:
    assay_id, assay_description = assay
    print("{:-4d} {:14s} {}".format(n, assay_id, assay_description))


# ### Approved Drugs
# 
# The approved drugs for a target may be retrieved.

# In[68]:


# Receptor protein-tyrosine kinase erbB-2

chembl_id = "CHEMBL1824"


# In[69]:


# Step 1: mechanism records for the target (note that, despite its name,
#         `activities` holds results from the `mechanism` resource).
# Step 2: the ChEMBL IDs of the compounds named in those records.
# Step 3: of those compounds, keep the ones with max_phase == 4.
activities = new_client.mechanism.filter(target_chembl_id=chembl_id)
compound_ids = [x['molecule_chembl_id'] for x in activities]
approved_drugs = new_client.molecule.filter(molecule_chembl_id__in=compound_ids).filter(max_phase=4)

# One line per drug: ChEMBL ID and preferred name.
for record in approved_drugs:
    
    print("{:10s} : {}".format(record["molecule_chembl_id"], record["pref_name"]))


# ## Assays
# 
# Information about assays may also be retrieved by the web services.

# ### Assay details
# 
# Details of an assay may be retrieved _via_ its ChEMBL ID.

# In[70]:


# Inhibitory activity against epidermal growth factor receptor

chembl_id = "CHEMBL674106"


# In[71]:


# Look the assay up by its ChEMBL ID; the bare name below displays it in a notebook.
record = new_client.assay.get(chembl_id)

record


# ### Bioactivities
# 
# All bioactivity records for an assay may be requested.

# In[72]:


# Filter the `activity` resource by the assay's ChEMBL ID (`chembl_id` still
# holds the assay ID assigned in cell 70).
records = new_client.activity.filter(assay_chembl_id=chembl_id)

# Number of matching records, together with the first two of them.
len(records), records[:2]


# ## Other resources
# 
# As noted previously, there are many other resources that can be useful. They won't be covered in this document in great detail, but some examples may be helpful.

# In[73]:


# Documents - retrieve all publications published after 1985 in 5th volume.
print(
    new_client.document
    .filter(doc_type='PUBLICATION')
    .filter(year__gt=1985)
    .filter(volume=5)
)


# In[74]:


# Cell lines: lookup by ChEMBL ID
print(new_client.cell_line.get('CHEMBL3307242'))


# In[75]:


# Protein class: filter on the `l6` field
print(new_client.protein_class.filter(l6="CAMK protein kinase AMPK subfamily"))


# In[76]:


# Source: filter by short name
print(new_client.source.filter(src_short_name="ATLAS"))


# In[77]:


# Target component: lookup by numeric ID
print(new_client.target_component.get(375))


# In[78]:


# ChEMBL ID Lookup: check if CHEMBL1 is a molecule, assay or target:
print(new_client.chembl_id_lookup.get("CHEMBL1")['entity_type'])


# In[79]:


# ATC class: lookup by code
print(new_client.atc_class.get('H03AA03'))