#!/usr/bin/env python
# coding: utf-8

# # Ligand-based Target Prediction Tutorial
# 
# ### myChEMBL team, ChEMBL Group, EMBL-EBI.
# 
# This is a short tutorial on getting target predictions for a single input molecule from the current multi-category Naive Bayesian models ([described in this ChEMBL blog post](http://chembl.blogspot.co.uk/2014/04/ligand-based-target-predictions-in.html) for ChEMBL_20; the ChEMBL_21 versions of the models are used below). The predictions are ranked by the classifier's output probability or score.
# 
# Outline of the tutorial:
# 
# * Input a molecule to get target predictions for it
# * Assess the predictions
# * Retrieve individual feature contributions for a prediction
# 
# The two models (built with 1 µM and 10 µM activity cut-offs) are under ~/models_21.
# 
# First, import the IPython helpers:

# In[1]:


get_ipython().run_line_magic('pylab', 'inline')
from IPython.display import Image


# Now, import the necessary modules:

# In[2]:


from rdkit.Chem import AllChem as Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import PandasTools
from rdkit.Chem import Draw
import pandas as pd
from pandas import concat
from collections import OrderedDict

# By default, the API connects to the main ChEMBL database; set it to use the local version (i.e. myChEMBL) instead...
from chembl_webresource_client.settings import Settings
Settings.Instance().NEW_CLIENT_URL = 'http://localhost/chemblws'
from chembl_webresource_client.new_client import new_client


# In[3]:


# Note: in scikit-learn >= 0.23 this import was removed; use the standalone `joblib` package there.
from sklearn.externals import joblib


# In[4]:


rcParams['figure.figsize'] = 10, 10


# A couple of utility functions:

# In[5]:


def calc_scores(classes):
    """Compute the Naive Bayes score of the query fingerprint for every target."""
    p = []
    for c in classes:
        p.append(pred_score(c))
    return p


# In[6]:


def pred_score(trgt):
    """Sum the per-bit log-likelihood ratios (active vs. inactive) over the query fingerprint `fp`."""
    est = morgan_nb.estimators_[classes.index(trgt)]
    diff = est.feature_log_prob_[1] - est.feature_log_prob_[0]
    return sum(diff * fp)


# Load one of the two models:

# In[7]:


morgan_nb = joblib.load('/home/chembl/models_21/10uM/mNB_10uM_all.pkl')


# In[8]:


classes = list(morgan_nb.targets)


# Check the number of classes/targets:

# In[9]:


len(classes)


# We have to suppress some warnings because of: https://github.com/python-pillow/Pillow/issues/63

# In[10]:


import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=DeprecationWarning)


# ### Get predictions for a compound
# 
# Let's use a molecule (GS-5759) found in [this](http://www.ncbi.nlm.nih.gov/pubmed/24513870) publication. It is a novel compound (not in ChEMBL) reported to have dual activity against PDE4 and the beta-2 adrenergic receptor.

# In[11]:


smiles = 'O[C@@H](CNCCCC#CC1=CC=C(C=C1)NC(=O)C=1C=C(C=CC1)S(=O)(=O)C=1C=C2C(=C(C=NC2=C(C1)C)C(=O)N)NC1=CC(=CC=C1)OC)C1=C2C=CC(NC2=C(C=C1)O)=O'


# In[12]:


mol = Chem.MolFromSmiles(smiles)


# In[13]:


mol


# Compute the 2048-bit Morgan fingerprint (radius 2) and save the bit info dictionary for later...

# In[14]:


info = {}
fp = Chem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048, bitInfo=info)


# Get the predictions in a pandas DataFrame:

# In[15]:


predictions = pd.DataFrame(list(zip(classes, calc_scores(classes), list(morgan_nb.predict_proba(fp)[0]))),
                           columns=['id', 'score', 'proba'])


# In[16]:


predictions.head()


# Let's check the distributions of prediction probability and score for the 1244 targets.

# In[17]:


predictions['proba'].hist()


# In[18]:


predictions['score'].hist()
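
# As a quick aside (not part of the original tutorial), the `score` column is simply the sum of
# per-bit log-likelihood ratios computed by `pred_score()`. The minimal sketch below recomputes it
# for one target on an explicit numpy copy of the fingerprint; `target_id = classes[0]` is only a
# placeholder, and the result should match the corresponding entry in `predictions['score']`.

# In[ ]:


import numpy as np
from rdkit import DataStructs

fp_arr = np.zeros((0,), dtype=np.int8)            # will be resized to a dense 0/1 copy of fp
DataStructs.ConvertToNumpyArray(fp, fp_arr)

target_id = classes[0]                            # placeholder: any target ID known to the model
est = morgan_nb.estimators_[classes.index(target_id)]
log_ratio = est.feature_log_prob_[1] - est.feature_log_prob_[0]   # log P(bit|active) - log P(bit|inactive)
manual_score = float((log_ratio * fp_arr).sum())  # should equal pred_score(target_id)
manual_score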
# Sort by probability and take the top 10.

# In[19]:


# Note: `DataFrame.sort(columns=...)` was removed in pandas >= 0.20; `sort_values` is the current equivalent.
top_preds = predictions.sort_values(by='proba', ascending=False).head(10)


# In[20]:


top_preds


# Use the ChEMBL web services to merge in the target information:

# In[21]:


def fetch_WS(trgt):
    """Fetch the target record for a ChEMBL target ID from the web services."""
    targets = new_client.target
    return targets.get(trgt)


# In[22]:


plist = []
for i, e in enumerate(top_preds['id']):
    p = pd.DataFrame(fetch_WS(e), index=(i,))
    plist.append(p)

target_info = concat(plist)


# In[23]:


target_info.shape


# In[24]:


target_info


# Finally, let's get the complete top 10 target predictions in decreasing order of probability.

# In[25]:


result = pd.merge(top_preds, target_info, left_on='id', right_on='target_chembl_id')


# In[26]:


result


# Not bad! The model recovered both reported targets within its top 5 predictions, out of a pool of 1342 targets.

# ### Feature contribution
# 
# Now let's look at the contributions of specific substructural features to the human β2 adrenergic receptor prediction. Which features of the active molecules in the training set were the most important?

# In[27]:


bit_scores = (morgan_nb.estimators_[classes.index(result['id'][1])].feature_log_prob_[1] -
              morgan_nb.estimators_[classes.index(result['id'][1])].feature_log_prob_[0]) * fp


# The snippet below maps the high-scoring bits back to their atomic environments:

# In[28]:


frags = OrderedDict()
for k in info.keys():
    if bit_scores[k] > 0.1:
        atomId, radius = info[k][0]
        env = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, atomId)
        ats = set([atomId])
        for bidx in env:
            bond = mol.GetBondWithIdx(bidx)
            ats.add(bond.GetBeginAtomIdx())
            ats.add(bond.GetEndAtomIdx())
        frag = Chem.MolFragmentToSmiles(mol, atomsToUse=list(ats), bondsToUse=env, rootedAtAtom=atomId)
        # Store the numeric score so the fragments can be ranked numerically rather than as strings.
        frags[k] = (round(bit_scores[k], 2), frag)


# In[29]:


top_frags = sorted(frags.items(), key=lambda t: t[1][0], reverse=True)[:10]
legends = [str(f[1][0]) for f in top_frags]
ffrags = [f[1][1] for f in top_frags]


# In[30]:


fmols = [Chem.MolFromSmarts(s) for s in ffrags]


# This is the original molecule:

# In[31]:


mol


# And these are its most important features, based on their enrichment in human β2 AR actives in ChEMBL. The legend shows the Bayesian score for each feature.

# In[33]:


Draw.MolsToGridImage(fmols, molsPerRow=5, legends=legends, useSVG=False)
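
# As an optional extra step (not part of the original tutorial), the same high-scoring atom
# environments can be highlighted directly on the parent molecule instead of being drawn as
# separate fragments. A minimal sketch, reusing the `frags` and `info` dictionaries built above:

# In[ ]:


highlight_atoms = set()
for k in frags:                                   # keys of frags are the high-scoring bit IDs
    atomId, radius = info[k][0]
    env = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, atomId)
    highlight_atoms.add(atomId)
    for bidx in env:
        bond = mol.GetBondWithIdx(bidx)
        highlight_atoms.add(bond.GetBeginAtomIdx())
        highlight_atoms.add(bond.GetEndAtomIdx())

Draw.MolToImage(mol, size=(500, 500), highlightAtoms=list(highlight_atoms))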