#!/usr/bin/env python
# coding: utf-8

# # SureChEMBL iPython Notebook Tutorial 2

# ## An introduction to patent chemoinformatics using SureChEMBL data and the RDKit toolkit

# ### George Papadatos, ChEMBL group, EMBL-EBI

# ##In this tutorial:
# 1. Read a file that contains all chemistry extracted from the Levitra US patent (US6566360) along with *all* the other members of the same patent family.
# 2. Filter by different text-mining and chemoinformatics properties to remove noise and enrich the genuinely novel structures claimed in the patent documents. 
# 3. Visualise the chemical space using MDS and dimensionality reduction.  
# i.  Identify outliers in the chemistry space.  
# ii. Fix wrongly extracted structures.  
# 4. Find the Murcko and MCS scaffolds of the claimed compounds in the patent family.  
# i.  Compare the derived core with the actual Markush structure as reported in the original patent document.
# 6. Identify **key** compounds using structural information only. This is based on the publications by [Hattori et al.](http://pubs.acs.org/doi/abs/10.1021/ci7002686) and [Tyrchan et al.](http://pubs.acs.org/doi/abs/10.1021/ci3001293).   
# i.  Count the number of NNs per compound.  
# ii. Perform R-Group decomposition.

# In[1]:


get_ipython().run_line_magic('pylab', 'inline')
import pandas as pd

from IPython.display import display
from IPython.display import display_pretty, display_html, HTML, Javascript, Image

from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import DataStructs
from rdkit.Chem import rdFMCS as MCS
from rdkit.Chem import Draw
Draw.DrawingOptions.elemDict[0]=(0.,0.,0.)  # draw dummy atoms in black

from itertools import cycle
from sklearn import manifold
from scipy.spatial.distance import *

import mpld3
mpld3.enable_notebook()

import warnings
warnings.filterwarnings('ignore')

from rdkit import RDLogger
lg = RDLogger.logger()
lg.setLevel(RDLogger.ERROR)

# Get the base URL of the VM:
display(Javascript('IPython.notebook.kernel.execute("current_url = " + "\'"+document.URL+"\'");'))


# In[2]:


from rdkit import rdBase
rdBase.rdkitVersion


# This is the base URL of the local Beaker server running on myChEMBL.

# In[3]:


cur_base_url = current_url.split('http://')[1].split('/')[0]
base_url = 'localhost:8000' if (cur_base_url == 'localhost:9612' or cur_base_url == '127.0.0.1:9612') else current_url.split('http://')[1].split(':')[0] + ':8000'
base_url


# In[4]:


pd.options.display.mpl_style = 'default'
pd.options.display.float_format = '{:.2f}'.format


# In[5]:


rcParams['figure.figsize'] = 16,10


# ## 1. Read and filter the input file

# The file was generated by extracting all chemistry from a list of patent documents in [SureChEMBL](https://www.surechembl.org). The Lucene query used to retrieve the list of relevant patents was: pn:"US6566360".

# In[6]:


df = pd.read_csv('/home/chembl/ipynb_workbench/document_chemistry_20141011_114421_271.csv',sep=',')


# Let's check the contents:

# In[7]:


df.info()


# In[8]:


df.shape


# Add a SCHEMBL ID column and sort:

# In[9]:


df['SCHEMBL ID'] = df['Chemical ID'].map(lambda x: 'SCHEMBL{0}'.format(x))
df = df.sort('Chemical ID')


# In[10]:


df.iloc[600:610]


# **First round of filtering: Novel compounds appear in the description or claims section of the document. Alternatively, they are extracted from images or mol files**

# In[11]:


dff = df[(df['Claims Count'] > 0) | (df['Description Count'] > 0) | (df['Type'] != "TEXT")]


# In[12]:


dff.shape


# **Second round of filtering: Simple physicochemical properties and counts**

# In[13]:


dff = dff[(dff['Rotatable Bound Count'] < 15) & (dff['Ring Count'] > 0) & (df['Radical'] == 0) & (df['Singleton'] == 0) & (df['Simple'] == 0)]


# In[14]:


dff.shape


# In[15]:


dff = dff[(dff['Molecular Weight'] >= 300) & (dff['Molecular Weight'] <= 800) & (dff['Log P'] > 0) & (dff['Log P'] < 7)]


# In[16]:


dff.shape


# **Third round of filtering: Using Global Corpus Count**

# In[17]:


dff = dff[(dff['Chemical Corpus Count'] < 400) & ((dff['Annotation Corpus Count'] < 400) | (dff['Annotation Corpus Count'].isnull()))]


# In[18]:


dff.shape


# **Keep largest fragment and convert SMILES to RDKit molecules**

# In[19]:


dff['SMILES'] = dff['SMILES'].map(lambda x: max(x.split('.'), key=len))


# In[20]:


PandasTools.AddMoleculeColumnToFrame(dff, smilesCol = 'SMILES')


# In[21]:


#dff.head(10)


# **Fourth round of filtering: Remove duplicates based on Chemical IDs and InChI keys**

# In[22]:


dff['InChI'] = dff['ROMol'].map(Chem.MolToInchi)
dff['InChIKey'] = dff['InChI'].map(Chem.InchiToInchiKey)


# In[23]:


#dff.head()


# In[24]:


dff = dff.drop_duplicates('Chemical ID')


# In[25]:


dff.shape


# In[26]:


dff = dff.drop_duplicates('InChIKey')


# In[27]:


dff.shape


# Wow, that was a lot of duplicates. This is because the same compounds come from different patents in the same patent family. In addition, in US patents a compound may come from 3 different sources: text, image and mol file. 

# **Fifth round of filtering: Remove Boron-containing compounds as they are likely to be reactants.**

# In[28]:


dff = dff.ix[~(dff['ROMol'] >= Chem.MolFromSmarts('[#5]'))]


# In[29]:


dff.shape


# In[30]:


dff = dff.set_index('Chemical ID', drop=False)


# That was the end of the filters - let's see a summary of the remaining compounds and their patent document source:

# In[31]:


dff_counts = dff[['Patent ID','ROMol']].groupby('Patent ID').count()


# In[32]:


dff_counts['Link'] = dff_counts.index.map(lambda x: '<a href="https://www.surechembl.org/document/{0}/" target="_blank">{0}</a>'.format(x))


# In[33]:


dff_counts = dff_counts.rename(columns={'ROMol':'# Compounds'})


# In[34]:


dff_counts


# Right, is Levitra included in this set? ChEMBL tells me that its InChiKey is 'SECKRCOLJRRGGV-UHFFFAOYSA-N'.

# In[35]:


dff.ix[dff.InChIKey == 'SECKRCOLJRRGGV-UHFFFAOYSA-N',['SCHEMBL ID','ROMol']]


# Good - it is...

# ## 2. Visualise the patent chemical space - MDS

# Preparation of the pairwise distance matrix.

# In[36]:


fps = [Chem.GetMorganFingerprintAsBitVect(m,2,nBits=2048) for m in dff['ROMol']]


# In[37]:


dist_mat = squareform(pdist(fps,'jaccard'))


# In[38]:


mds = manifold.MDS(n_components=2, dissimilarity="precomputed", random_state=3, n_jobs = 2)


# In[39]:


results = mds.fit(dist_mat)


# In[40]:


coords = results.embedding_


# In[41]:


dff['X'] = [c[0] for c in coords]


# In[42]:


dff['Y'] = [c[1] for c in coords]


# In[43]:


csss = """
table
{
  border-collapse: collapse;
}
th
{
  color: #ffffff;
  background-color: #848482;
}
td
{
  background-color: #f2f3f4;
}
table, th, td
{
  font-family:Arial, Helvetica, sans-serif;
  border: 1px solid black;
  text-align: right;
}
"""


# In[44]:


import base64


# In[45]:


dff['base64smi'] = dff['SMILES'].map(base64.b64encode)


# Let's produce a scatter plot of the chemical space - points represent compounds, color-coded by the patent document they were found in. Thanks to [mpld3](http://mpld3.github.io/), the scatter plot is interactive with *live* structure rendering calls to the *local* myChEMBL Beaker server. 

# In[46]:


fig, ax = plt.subplots()
fig.set_size_inches(14.0, 12.0)
#colors = cycle('bgrcmykwbgrcmykbgrcmykwbgrcmykw')
colors = cycle(cm.Dark2(np.linspace(0,1,len(dff_counts.index))))
for name, group in dff.groupby('Patent ID'):
    labels = []
    for i in group.index:
        zz = group.ix[[i],['Patent ID','SCHEMBL ID','base64smi']]
        zz['mol'] = zz['base64smi'].map(lambda x: '<img src="http://{0}/utils/smiles2image/{1}?size=300">'.format(base_url,x))
        del zz['base64smi']
        label = zz.T
        del zz
        label.columns = ['Row: {}'.format(i)]
        labels.append(str(label.to_html()))
    points = ax.scatter(group['X'], group['Y'],c=colors.next(), s=80, alpha=0.8)
    tooltip = mpld3.plugins.PointHTMLTooltip(points, labels, voffset=10, hoffset=10, css=csss)
    mpld3.plugins.connect(fig,tooltip)                                          


# Nice, we get Beaker rendering on the fly. However, there is a bunch of compounds on the left that are distinctively different. This is because of an image extraction error.  
# At least the error is systematic: extraction breaks the imidazole ring. Let's try to 'repair' the ring and rescue these compounds - we'll use an appropriate intramolecular reaction SMARTS.

# In[47]:


resmarts = "([R:5][C;!R:1]=[C;H2].[C;H0,H1;!R:4]-[N;H2:2])>>[R:5][*:1]=[*:2]-[*:4]"


# In[48]:


Draw.ReactionToImage(Chem.ReactionFromSmarts(resmarts))


# In[49]:


def fixRing(mol,resmarts=resmarts):
    rxn = Chem.ReactionFromSmarts(resmarts)
    ps = rxn.RunReactants((mol,))
    if ps:
        m = ps[0][0]
        Chem.SanitizeMol(m)
        return m
    else:
        return mol


# So this...

# In[50]:


mm = dff.ix[12823029]['ROMol']
mm


# ...will become this:

# In[51]:


fixRing(mm)


# Let's apply the repair to all affected structures:

# In[52]:


for i, p, m in zip(dff.index, dff['Patent ID'], dff['ROMol']):
    if p == 'EP-2295436-A1':
        dff.ix[i,'ROMol'] = fixRing(m)


# In[53]:


dff['InChI'] = dff['ROMol'].map(Chem.MolToInchi)
dff['InChIKey'] = dff['InChI'].map(Chem.InchiToInchiKey)
dff = dff.drop_duplicates('InChIKey')


# So after the fix, we have even less structures because some of the fixed ones already existed. Hopefully, however, we have less false positives too:

# In[54]:


dff.shape


# In[55]:


dff['RD_SMILES'] = dff['ROMol'].map(Chem.MolToSmiles)
dff['base64rdsmi'] = dff['RD_SMILES'].map(base64.b64encode)


# We're going to re-calculate the distance matrix and plot the MCS just to double-check they were no side-effects from the applications of the reaction.

# In[56]:


fps = [Chem.GetMorganFingerprintAsBitVect(m,2,nBits=2048) for m in dff['ROMol']]
dist_mat = squareform(pdist(fps,'jaccard'))
mds = manifold.MDS(n_components=2, dissimilarity="precomputed", random_state=3, n_jobs = 2)
results = mds.fit(dist_mat)
coords = results.embedding_
dff['X'] = [c[0] for c in coords]
dff['Y'] = [c[1] for c in coords]


# In[57]:


fig, ax = plt.subplots()
fig.set_size_inches(14.0, 12.0)
#colors = cycle('bgrcmykwbgrcmykbgrcmykwbgrcmykw')
colors = cycle(cm.Dark2(np.linspace(0,1,len(dff_counts.index))))
for name, group in dff.groupby('Patent ID'):
    labels = []
    for i in group.index:
        zz = group.ix[[i],['Patent ID','SCHEMBL ID','base64rdsmi']]
        zz['mol'] = zz['base64rdsmi'].map(lambda x: '<img src="http://{0}/utils/smiles2image/{1}?size=300">'.format(base_url,x))
        del zz['base64rdsmi']
        label = zz.T
        del zz
        label.columns = ['Row: {}'.format(i)]
        labels.append(str(label.to_html()))
    points = ax.scatter(group['X'], group['Y'],c=colors.next(), s=80, alpha=0.8)
    tooltip = mpld3.plugins.PointHTMLTooltip(points, labels, voffset=10, hoffset=10, css=csss)
    mpld3.plugins.connect(fig,tooltip)  


# So the "island" of outliers has disappeared. There's only one obvious outlier on the far left. This might be a reactant which wasn't filtered out. 

# ## 3. Define the MCS for this set of claimed molecules

# A couple of helper functions, inspired by Greg Landrum's post [here](http://rdkit.blogspot.co.uk/2015/05/a-set-of-scaffolds-from-chembl-papers.html).

# In[58]:


from IPython.display import SVG
from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
def moltosvg(mol,molSize=(450,350),kekulize=True):
    mc = Chem.Mol(mol.ToBinary())
    if kekulize:
        try:
            Chem.Kekulize(mc)
        except:
            mc = Chem.Mol(mol.ToBinary())
    
    rdDepictor.Compute2DCoords(mc)
    drawer = rdMolDraw2D.MolDraw2DSVG(molSize[0],molSize[1])
    # the MolDraw2D code is not very good at the moment at dealing with atom queries,
    # this is a workaround until that's fixed.
    # The rendering is still not going to be perfect because query bonds are not properly indicated
    opts = drawer.drawOptions()
    for atom in mc.GetAtoms():
        if atom.HasQuery() and atom.DescribeQuery().find('AtomAtomicNum')!=0:
            opts.atomLabels[atom.GetIdx()]=atom.GetSmarts()
    drawer.DrawMolecule(mc)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText()
    return svg.replace('svg:','')


# In[59]:


def MCS_Report(ms,atomCompare=MCS.AtomCompare.CompareAny,**kwargs):
    mcs = MCS.FindMCS(ms,atomCompare=atomCompare,completeRingsOnly=True,timeout=120,**kwargs)
    nAts = np.array([x.GetNumAtoms() for x in ms])
    print 'Mean nAts: {0}, mcs nAts: {1}'.format(nAts.mean(),mcs.numAtoms)
    print 'MCS SMARTS: {0}'.format(mcs.smartsString)
    mcsM = Chem.MolFromSmarts(mcs.smartsString)
    
    mcsM.UpdatePropertyCache(False)
    Chem.SetHybridization(mcsM)
    Chem.Compute2DCoords(mcsM)
    smi = Chem.MolToSmiles(mcsM,isomericSmiles=True)
    print "MCS SMILES: {0}".format(smi)
    img=Draw.MolToImage(mcsM,kekulize=False)
    return mcs.smartsString,smi,img,mcsM


# Let's add the Murcko framework to our data frame:

# In[60]:


PandasTools.AddMurckoToFrame(dff)


# In[61]:


PandasTools.AddMoleculeColumnToFrame(dff,smilesCol='Murcko_SMILES', molCol='MurMol')


# In[62]:


PandasTools.AlignToScaffold(dff)


# In[63]:


dff[['ROMol','MurMol']].head()


# And now we'll extract the MCS core:

# In[64]:


mols = list(dff.MurMol)
smarts,smi,img,mcsM = MCS_Report(mols,threshold=0.8,ringMatchesRingOnly=True)
SVG(moltosvg(mcsM))


# ## 4. Compare the extracted MCS core with the actual claimed Markush structure.

# Given the MCS core, we'll do an R-Group decomposition first to define the R-Groups and their attachement points.

# In[65]:


core = Chem.MolFromSmarts(smarts)    


# In[66]:


pind = []
rgroups = []
for i, m in zip(dff.index, dff.ROMol):
    rgrps = Chem.ReplaceCore(m,core,labelByIndex=True)
    if rgrps:
        s = Chem.MolToSmiles(rgrps, True)
        for e in s.split("."):
            pind.append(e[1:4].replace('*','').replace(']',''))
            rgroups.append(e)
        #print "R-Groups: ", i, s


# In[67]:


pind = sorted(list(set(pind)),key=int)


# In[68]:


pind = map(int,pind)


# In[69]:


Draw.MolToImage(core,includeAtomNumbers=True,highlightAtoms=pind)


# Is the MCS similar to the claimed Markush structure?
# Let's fetch the original document in PDF and check in page 4:

# In[70]:


HTML('<iframe src=https://dl.dropboxusercontent.com/u/22273283/US6566360B1.pdf width=1000 height=500></iframe>')


# Pretty close!

# ## 5. Identify key compounds - in this case, vardenafil itself

# This is based on the publication by [Hattori et al.](http://pubs.acs.org/doi/abs/10.1021/ci7002686) and [Tyrchan et al.](http://pubs.acs.org/doi/abs/10.1021/ci3001293). First we'll try the Hattori et al. version of finding the compounds with the highest number of nearest neighbours (NNs) for a given similarity threshold. We will need to calculate the similarity matrix.

# In[71]:


sim_mat = []
CUTOFF = 0.8
for i,fp in enumerate(fps):
    tt = DataStructs.BulkTanimotoSimilarity(fps[i],fps)
    for i,t in enumerate(tt):
        if tt[i] < CUTOFF:
            tt[i] = None
    sim_mat.append(tt)
sim_mat=numpy.array(sim_mat) 


# In[72]:


sim_df = pd.DataFrame(sim_mat, columns = dff['Chemical ID'], index=dff['Chemical ID'])


# In[73]:


sim_df.head()


# In[74]:


d = sim_df.count().to_dict()
nn_df = pd.DataFrame(d.items(),columns=['Chemical ID', '# NNs'])
nn_df.set_index('Chemical ID',inplace=True)
nn_df.head()


# In[75]:


pd.merge(dff[['Chemical ID','ROMol']],nn_df,left_index=True, right_index=True, sort=False).sort('# NNs',ascending=False).head()


# So, the method seems to be working in this case. Vardenafil has the second largest number of NNs with a cutoff of 0.8. Next, we'll try one of the Tyrchan et al. methods (Frequency of Groups, FOG) and count the frequency of the derived R-Groups:

# In[76]:


from collections import Counter
rgroups = [r.replace('[9*]','[5*]').replace('[8*]','[6*]') for r in rgroups]
c = Counter(rgroups)


# In[77]:


rgroup_counts = pd.DataFrame(c.most_common(20),columns=['RGroups','Frequency'])
rgroup_counts.head(10)


# In[78]:


Draw.MolToImage(core,includeAtomNumbers=True,highlightAtoms=pind)


# In[79]:


PandasTools.AddMoleculeColumnToFrame(rgroup_counts, smilesCol = 'RGroups')
rgroup_counts.head(10)


# In[80]:


rgroup_counts[[r.startswith('[6*]') for r in rgroup_counts.RGroups]].head(10)


# Although the most frequent R-groups in positions 9 and 11 correspond to those in Vardenafil (see below), the exact piperazine R-group in position 6 is the 5th most frequent one in that position.

# In[81]:


dff.ix[dff.InChIKey == 'SECKRCOLJRRGGV-UHFFFAOYSA-N',['SCHEMBL ID','ROMol']]