#!/usr/bin/env python
# coding: utf-8

# # SureChEMBL Tutorial

# ## An introduction to patent chemoinformatics using SureChEMBL data and RDKit

# ### myChEMBL team, ChEMBL group, EMBL-EBI.

# ### Outline:
# 1. Read a file that contains small molecule antimalarials reported in US patents in 2010. The file was downloaded from [SureChEMBL](https://www.surechembl.org).
# 2. Filter by different text-mining and chemoinformatics properties to remove noise and enrich the genuinely novel structures claimed in the patent document. 
# 3. Visualise the chemical space using MDS and dimensionality reduction.
# 4. Find the MCS of the claimed compounds in each patent.
# 5. Compare the derived MCS with the actual Markush structure as reported in the original patent document.

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('pylab', 'inline')
import pandas as pd

from IPython.display import display
from IPython.display import display_pretty, display_html, HTML, Javascript

from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import DataStructs
from rdkit.Chem import rdFMCS as MCS
from rdkit.Chem import Draw
Draw.DrawingOptions.elemDict[0]=(0.,0.,0.)  # draw dummy atoms in black

from itertools import cycle
from sklearn import manifold
from scipy.spatial.distance import *

import mpld3
mpld3.enable_notebook()

import warnings
warnings.filterwarnings('ignore')

from rdkit import RDLogger
lg = RDLogger.logger()
lg.setLevel(RDLogger.ERROR)

# Get the base URL of the VM:
display(Javascript('IPython.notebook.kernel.execute("current_url = " + "\'"+document.URL+"\'");'))


# In[2]:


cur_base_url = current_url.split('http://')[1].split('/')[0]
base_url = 'localhost:8000' if (cur_base_url == 'localhost:9612' or cur_base_url == '127.0.0.1:9612') else current_url.split('http://')[1].split(':')[0] + ':8000'
base_url


# In[3]:


pd.options.display.mpl_style = 'default'


# In[4]:


rcParams['figure.figsize'] = 16,10


# ## 1. Read and filter the input file

# The file was manually collated by extracting all chemistry from a list of patent documents in [SureChEMBL](https://www.surechembl.org). The Lucene query used to retrieve the list of relevant patents was: "ic:C07D AND ic:A61P003306 AND pnctry:US AND pdyear:2010 AND (ttl:\*malaria\* OR ab:\*malaria\* OR ttl:\*parasit\*)"

# In[7]:


df = pd.read_csv('/home/chembl/ipynb_workbench/US_antimalarial_patents_cmpds.txt',sep='\t')


# Let's check the contents:

# In[8]:


df.info()


# In[9]:


df.head()


# In[10]:


df.shape


# In[11]:


df['chemical_id'] = df['chemical_id'].map(lambda x: 'SCHEMBL{0}'.format(x))


# **First round of filtering: Novel compounds appear in the description or claims section of the document. Alternatively, they are extracted from images or mol files**

# In[12]:


dff = df[(df['claims_count'] > 0) | (df['description_count'] > 0) | (df['type'] != "TEXT")]


# In[13]:


dff.shape


# **Second round of filtering: Simple physicochemical properties and counts**

# In[14]:


dff = dff[(dff['rotatable_bond_count'] < 15) & (dff['ring_count'] > 0) & (df['radical'] == 0) & (df['singleton'] == 0) & (df['simple'] == 0)]


# In[15]:


dff.shape


# In[16]:


dff = dff[(dff['molecular_weight'] >= 300) & (dff['molecular_weight'] <= 800) & (dff['log_p'] > 0) & (dff['log_p'] < 6)]


# In[17]:


dff.shape


# In[18]:


dff = dff[(dff['chemical_corpus_count'] < 400) & ((dff['annotation_corpus_count'] < 400) | (dff['annotation_corpus_count'].isnull()))]


# In[19]:


dff.shape


# **Convert SMILES to RDKit molecules**

# In[20]:


PandasTools.AddMoleculeColumnToFrame(dff, smilesCol = 'smiles')


# **Third round of filtering: Remove salts and duplicates, based on InChI keys**

# In[21]:


PandasTools.RemoveSaltsFromFrame(dff)


# In[22]:


dff['InChI'] = dff['ROMol'].map(Chem.MolToInchi)


# In[23]:


dff['InChIKey'] = dff['InChI'].map(Chem.InchiToInchiKey)


# In[24]:


dff.head()


# In[25]:


dff = dff.drop_duplicates(['SCPN','InChIKey'])


# In[26]:


dff.shape


# Wow, that was a lot of duplicates. This is because in US patents a compound may come from 3 different sources: text, image and mol file. 

# In[27]:


dff = dff.ix[[d.count('.') < 2 for d in dff['smiles']]]


# In[28]:


dff.shape


# **Fourth round of filtering: Remove Boron-containing compounds as they are likely to be reactants.**

# In[29]:


dff = dff.ix[~(dff['ROMol'] >= Chem.MolFromSmiles('B'))]


# In[30]:


dff.shape


# **Fifth round of filtering: Remove patents with less than 10 compounds**

# In[31]:


dff_counts = dff[['SCPN','ROMol']].groupby('SCPN').count()


# In[32]:


tokeep = list(dff_counts.ix[dff_counts.ROMol >= 10].index)


# In[33]:


dff = dff.ix[dff.SCPN.isin(tokeep)]


# In[34]:


dff.shape


# OK, filtering is over. Let's prepare a summary table of the remaining patent documents and their compounds:

# In[35]:


dff_counts = dff[['SCPN','ROMol']].groupby('SCPN').count()


# In[36]:


dff_counts['Link'] = dff_counts.index.map(lambda x: '<a href="https://www.surechembl.org/document/{0}/" target="_blank">{0}</a>'.format(x))


# In[37]:


dff_counts = dff_counts.rename(columns={'ROMol':'# Compounds'})


# In[38]:


dff_counts #NB: The links in this table are external.


# ## 2. Visualise the patent chemical space - MDS

# We will calculate fingerprints and the distance matrix and feed this to the MDS algorithm.

# In[39]:


fps = [Chem.GetMorganFingerprintAsBitVect(m,2,nBits=2048) for m in dff['ROMol']]


# In[40]:


dist_mat = squareform(pdist(fps,'jaccard'))


# In[41]:


mds = manifold.MDS(n_components=2, dissimilarity="precomputed", random_state=3, n_jobs = 2)


# In[42]:


results = mds.fit(dist_mat)


# In[43]:


coords = results.embedding_


# In[44]:


dff['X'] = [c[0] for c in coords]


# In[45]:


dff['Y'] = [c[1] for c in coords]


# A little bit of css for the pop-up tables

# In[46]:


csss = """
table
{
  border-collapse: collapse;
}
th
{
  color: #ffffff;
  background-color: #848482;
}
td
{
  background-color: #f2f3f4;
}
table, th, td
{
  font-family:Arial, Helvetica, sans-serif;
  border: 1px solid black;
  text-align: right;
}
"""


# In[47]:


# Encode SMILES in base64 for the Beaker calls.
import base64
dff['base64smi'] = dff['smiles'].map(base64.b64encode)


# Let's produce a scatter plot of the chemical space - points represent compounds, color-coded by the patent document they were found in. Thanks to [mpld3](http://mpld3.github.io/), the scatter plot is interactive with *live* structure rendering calls to the *local* myChEMBL Beaker server. 

# In[48]:


fig, ax = plt.subplots()
fig.set_size_inches(14.0, 12.0)
#colors = cycle('bgrcmykwbgrcmykbgrcmykwbgrcmykw')
colors = cycle(cm.Dark2(np.linspace(0,1,13)))
for name, group in dff.groupby('SCPN'):
    labels = []
    for i in group.index:
        zz = group.ix[[i],['SCPN','chemical_id','base64smi']]
        zz['mol'] = zz['base64smi'].map(lambda x: '<img src="http://{0}/utils/smiles2image/{1}?size=300" >'.format(base_url,x))
        del zz['base64smi']
        label = zz.T
        del zz
        label.columns = ['Row: {}'.format(i)]
        labels.append(str(label.to_html()))
    #labels.append(str(label.to_html()))
    points = ax.scatter(group['X'], group['Y'],c=colors.next(), s=80, alpha=0.8)
    tooltip = mpld3.plugins.PointHTMLTooltip(points, labels, voffset=10, hoffset=10, css=csss)
    mpld3.plugins.connect(fig,tooltip)                                          


# Compounds in the same region of space come from the same patent document - that means that the MDS makes some sense. Plus we get [Beaker](https://www.ebi.ac.uk/chembl/api/utils/docs) structure renderings on the fly.

# ## 3. Define the MCS for each patent

# A couple of helper functions, inspired by Greg Landrum's post [here](http://rdkit.blogspot.co.uk/2015/05/a-set-of-scaffolds-from-chembl-papers.html).

# In[49]:


from IPython.display import SVG
from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
def moltosvg(mol,molSize=(450,350),kekulize=True):
    mc = Chem.Mol(mol.ToBinary())
    if kekulize:
        try:
            Chem.Kekulize(mc)
        except:
            mc = Chem.Mol(mol.ToBinary())
    
    rdDepictor.Compute2DCoords(mc)
    drawer = rdMolDraw2D.MolDraw2DSVG(molSize[0],molSize[1])
    # the MolDraw2D code is not very good at the moment at dealing with atom queries,
    # this is a workaround until that's fixed.
    # The rendering is still not going to be perfect because query bonds are not properly indicated
    opts = drawer.drawOptions()
    for atom in mc.GetAtoms():
        if atom.HasQuery() and atom.DescribeQuery().find('AtomAtomicNum')!=0:
            opts.atomLabels[atom.GetIdx()]=atom.GetSmarts()
    drawer.DrawMolecule(mc)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText()
    return svg.replace('svg:','')


# In[50]:


def MCS_Report(ms,atomCompare=MCS.AtomCompare.CompareAny,**kwargs):
    mcs = MCS.FindMCS(ms,atomCompare=atomCompare,timeout=120,**kwargs)
    nAts = np.array([x.GetNumAtoms() for x in ms])
    print 'Mean nAts: {0}, mcs nAts: {1}'.format(nAts.mean(),mcs.numAtoms)
    print 'MCS SMARTS: {0}'.format(mcs.smartsString)
    mcsM = Chem.MolFromSmarts(mcs.smartsString)
    
    mcsM.UpdatePropertyCache(False)
    Chem.SetHybridization(mcsM)
    Chem.Compute2DCoords(mcsM)
    smi = Chem.MolToSmiles(mcsM,isomericSmiles=True)
    print "MCS SMILES: {0}".format(smi)
    img=Draw.MolToImage(mcsM,kekulize=False)
    return mcs.smartsString,smi,img,mcsM


# Add Murcko scaffolds in the frame, just in case we need them.

# In[51]:


PandasTools.AddMurckoToFrame(dff)
PandasTools.AddMoleculeColumnToFrame(dff,smilesCol='Murcko_SMILES', molCol='MurMol')
PandasTools.AlignToScaffold(dff)
dff[['ROMol','MurMol']].head()


# Let's now visualise the extracted compounds for a single patent, say US20100056494A1:

# In[54]:


PandasTools.FrameToGridImage(dff.ix[dff['SCPN'] == 'US-20100056494-A1'],legendsCol='chemical_id', molsPerRow=4, subImgSize=(300, 300), useSVG=False)


# OK, the structures seem consistent with a well defined MCS. Let's see if that's the case for two thresholds

# In[57]:


mols = list(dff.ix[dff['SCPN'] == 'US-20100056494-A1'].ROMol)
smarts,smi,img,mcsM = MCS_Report(mols,threshold=0.6,ringMatchesRingOnly=True)
SVG(moltosvg(mcsM))


# In[58]:


smarts,smi,img,mcsM = MCS_Report(mols,threshold=0.8,ringMatchesRingOnly=True)
SVG(moltosvg(mcsM))


# So, the higher the threshold, the more generic the MCS. But is the MCS similar to the claimed Markush structure?
# Let's fetch the original document in PDF and check in page 2: (**NB:** The link below is external.)

# In[59]:


HTML('<iframe src=https://dl.dropboxusercontent.com/u/22273283/US20100056494A1.pdf width=1000 height=500></iframe>')


# Pretty close! Now let's do the same thing for all the patents:

# In[60]:


smartss = []
smis = []
imgs = []
patents = []
mcss = []


# In[61]:


for patent, group in dff.groupby('SCPN'):
    mols = list(group['ROMol'])
    print "Patent: {0}".format(patent)
    smarts,smi,img,mcs = MCS_Report(mols,threshold=0.8,ringMatchesRingOnly=True)
    smartss.append(smarts)
    smis.append(smi)
    imgs.append(img)
    patents.append(patent)
    mcss.append(mcs)


# Tidy up this info in a data frame:

# In[62]:


dd = pd.DataFrame(zip(patents,smartss,smis),columns=['SCPN','MCS_SMARTS','MCS_SMILES'])
dd.set_index('SCPN', inplace=True)
dd


# Visualise the MCS per patent document:

# In[63]:


Draw.MolsToGridImage(mcss,legends=patents, molsPerRow=3, subImgSize=(400, 400),kekulize=False)


# The previous summary table was:

# In[64]:


dff_counts #NB: The links in this table are external.


# Finally, merge the two frames together

# In[65]:


pd.merge(dd, dff_counts, left_index=True, right_index=True)