#!/usr/bin/env python
# coding: utf-8

# # Ligand-based Target Prediction Tutorial
# 
# ### myChEMBL team, ChEMBL Group, EMBL-EBI.
# 
# This is a short tutorial on getting target predictions for a single input molecule from the current multi-category Naive Bayesian models ([described in this ChEMBL blog post](http://chembl.blogspot.co.uk/2014/04/ligand-based-target-predictions-in.html) for ChEMBL_20; the ChEMBL_21 versions of the models are used below). The predictions are ranked by the classifier's output probability or score.
# 
# Outline of the tutorial:
# 
# * Input a molecule to get target predictions for it
# * Assess the predictions
# * Retrieve individual feature contributions for a prediction
# 
# The two models (built with 1 µM and 10 µM activity cut-offs) are under ~/models_21.
# 
# First, import the IPython helpers:

# In[1]:


get_ipython().run_line_magic('pylab', 'inline')
from IPython.display import Image


# Now, import the necessary modules:

# In[2]:


from rdkit.Chem import AllChem as Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import PandasTools
from rdkit.Chem import Draw
import pandas as pd
from pandas import concat
from collections import OrderedDict

# By default, the API connects to the main ChEMBL database; set it to use the local version (i.e. myChEMBL) instead...
from chembl_webresource_client.settings import Settings
Settings.Instance().NEW_CLIENT_URL = 'http://localhost/chemblws'
from chembl_webresource_client.new_client import new_client


# In[3]:


# Note: in scikit-learn >= 0.23 this import was removed; use the standalone `joblib` package there.
from sklearn.externals import joblib


# In[4]:


rcParams['figure.figsize'] = 10, 10


# A couple of utility functions:

# In[5]:


def calc_scores(classes):
    """Compute the Naive Bayes score of the query fingerprint for every target."""
    p = []
    for c in classes:
        p.append(pred_score(c))
    return p


# In[6]:


def pred_score(trgt):
    """Sum the per-bit log-likelihood ratios (active vs. inactive) over the query fingerprint `fp`."""
    est = morgan_nb.estimators_[classes.index(trgt)]
    diff = est.feature_log_prob_[1] - est.feature_log_prob_[0]
    return sum(diff * fp)


# Load one of the two models:

# In[7]:


morgan_nb = joblib.load('/home/chembl/models_21/10uM/mNB_10uM_all.pkl')


# In[8]:


classes = list(morgan_nb.targets)


# Check the number of classes/targets:

# In[9]:


len(classes)


# We have to suppress some warnings because of: https://github.com/python-pillow/Pillow/issues/63

# In[10]:


import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=DeprecationWarning)


# ### Get predictions for a compound
# 
# Let's use a molecule (GS-5759) found in [this](http://www.ncbi.nlm.nih.gov/pubmed/24513870) publication. It is a novel compound (not in ChEMBL) reported to have dual activity against PDE4 and the beta-2 adrenergic receptor.

# In[11]:


smiles = 'O[C@@H](CNCCCC#CC1=CC=C(C=C1)NC(=O)C=1C=C(C=CC1)S(=O)(=O)C=1C=C2C(=C(C=NC2=C(C1)C)C(=O)N)NC1=CC(=CC=C1)OC)C1=C2C=CC(NC2=C(C=C1)O)=O'


# In[12]:


mol = Chem.MolFromSmiles(smiles)


# In[13]:


mol


# Compute the 2048-bit Morgan fingerprint (radius 2) and save the bit info dictionary for later...

# In[14]:


info = {}
fp = Chem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048, bitInfo=info)


# Get the predictions in a pandas DataFrame:

# In[15]:


predictions = pd.DataFrame(list(zip(classes, calc_scores(classes), list(morgan_nb.predict_proba(fp)[0]))),
                           columns=['id', 'score', 'proba'])


# In[16]:


predictions.head()


# Let's check the distributions of prediction probability and score for the 1244 targets.

# In[17]:


predictions['proba'].hist()


# In[18]:


predictions['score'].hist()
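
# As a quick aside (not part of the original tutorial), the `score` column is simply the sum of
# per-bit log-likelihood ratios computed by `pred_score()`. The minimal sketch below recomputes it
# for one target on an explicit numpy copy of the fingerprint; `target_id = classes[0]` is only a
# placeholder, and the result should match the corresponding entry in `predictions['score']`.

# In[ ]:


import numpy as np
from rdkit import DataStructs

fp_arr = np.zeros((0,), dtype=np.int8)            # will be resized to a dense 0/1 copy of fp
DataStructs.ConvertToNumpyArray(fp, fp_arr)

target_id = classes[0]                            # placeholder: any target ID known to the model
est = morgan_nb.estimators_[classes.index(target_id)]
log_ratio = est.feature_log_prob_[1] - est.feature_log_prob_[0]   # log P(bit|active) - log P(bit|inactive)
manual_score = float((log_ratio * fp_arr).sum())  # should equal pred_score(target_id)
manual_score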
# Sort by probability and take the top 10.

# In[19]:


# Note: `DataFrame.sort(columns=...)` was removed in pandas >= 0.20; `sort_values` is the current equivalent.
top_preds = predictions.sort_values(by='proba', ascending=False).head(10)


# In[20]:


top_preds


# Use the ChEMBL web services to merge in the target information:

# In[21]:


def fetch_WS(trgt):
    """Fetch the target record for a ChEMBL target ID from the web services."""
    targets = new_client.target
    return targets.get(trgt)


# In[22]:


plist = []
for i, e in enumerate(top_preds['id']):
    p = pd.DataFrame(fetch_WS(e), index=(i,))
    plist.append(p)

target_info = concat(plist)


# In[23]:


target_info.shape


# In[24]:


target_info


# Finally, let's get the complete top 10 target predictions in decreasing order of probability.

# In[25]:


result = pd.merge(top_preds, target_info, left_on='id', right_on='target_chembl_id')


# In[26]:


result


# Not bad! The model recovered both reported targets within its top 5 predictions, out of a pool of 1342 targets.

# ### Feature contribution
# 
# Now let's look at the contributions of specific substructural features to the human β2 adrenergic receptor prediction. Which features of the active molecules in the training set were the most important?

# In[27]:


bit_scores = (morgan_nb.estimators_[classes.index(result['id'][1])].feature_log_prob_[1] -
              morgan_nb.estimators_[classes.index(result['id'][1])].feature_log_prob_[0]) * fp


# The snippet below maps the high-scoring bits back to their atomic environments:

# In[28]:


frags = OrderedDict()
for k in info.keys():
    if bit_scores[k] > 0.1:
        atomId, radius = info[k][0]
        env = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, atomId)
        ats = set([atomId])
        for bidx in env:
            bond = mol.GetBondWithIdx(bidx)
            ats.add(bond.GetBeginAtomIdx())
            ats.add(bond.GetEndAtomIdx())
        frag = Chem.MolFragmentToSmiles(mol, atomsToUse=list(ats), bondsToUse=env, rootedAtAtom=atomId)
        # Store the numeric score so the fragments can be ranked numerically rather than as strings.
        frags[k] = (round(bit_scores[k], 2), frag)


# In[29]:


top_frags = sorted(frags.items(), key=lambda t: t[1][0], reverse=True)[:10]
legends = [str(f[1][0]) for f in top_frags]
ffrags = [f[1][1] for f in top_frags]


# In[30]:


fmols = [Chem.MolFromSmarts(s) for s in ffrags]


# This is the original molecule:

# In[31]:


mol


# And these are its most important features, based on their enrichment in human β2 AR actives in ChEMBL. The legend shows the Bayesian score for each feature.

# In[33]:


Draw.MolsToGridImage(fmols, molsPerRow=5, legends=legends, useSVG=False)
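
# As an optional extra step (not part of the original tutorial), the same high-scoring atom
# environments can be highlighted directly on the parent molecule instead of being drawn as
# separate fragments. A minimal sketch, reusing the `frags` and `info` dictionaries built above:

# In[ ]:


highlight_atoms = set()
for k in frags:                                   # keys of frags are the high-scoring bit IDs
    atomId, radius = info[k][0]
    env = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, atomId)
    highlight_atoms.add(atomId)
    for bidx in env:
        bond = mol.GetBondWithIdx(bidx)
        highlight_atoms.add(bond.GetBeginAtomIdx())
        highlight_atoms.add(bond.GetEndAtomIdx())

Draw.MolToImage(mol, size=(500, 500), highlightAtoms=list(highlight_atoms))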