#!/usr/bin/env python # coding: utf-8 # # SureChEMBL iPython Notebook Tutorial 2 # ## An introduction to patent chemoinformatics using SureChEMBL data and the RDKit toolkit # ### George Papadatos, ChEMBL group, EMBL-EBI # ##In this tutorial: # 1. Read a file that contains all chemistry extracted from the Levitra US patent (US6566360) along with *all* the other members of the same patent family. # 2. Filter by different text-mining and chemoinformatics properties to remove noise and enrich the genuinely novel structures claimed in the patent documents. # 3. Visualise the chemical space using MDS and dimensionality reduction. # i. Identify outliers in the chemistry space. # ii. Fix wrongly extracted structures. # 4. Find the Murcko and MCS scaffolds of the claimed compounds in the patent family. # i. Compare the derived core with the actual Markush structure as reported in the original patent document. # 6. Identify **key** compounds using structural information only. This is based on the publications by [Hattori et al.](http://pubs.acs.org/doi/abs/10.1021/ci7002686) and [Tyrchan et al.](http://pubs.acs.org/doi/abs/10.1021/ci3001293). # i. Count the number of NNs per compound. # ii. Perform R-Group decomposition. # In[1]: get_ipython().run_line_magic('pylab', 'inline') import pandas as pd from IPython.display import display from IPython.display import display_pretty, display_html, HTML, Javascript, Image from rdkit.Chem import PandasTools from rdkit.Chem.Draw import IPythonConsole from rdkit.Chem import AllChem as Chem from rdkit.Chem import DataStructs from rdkit.Chem import rdFMCS as MCS from rdkit.Chem import Draw Draw.DrawingOptions.elemDict[0]=(0.,0.,0.) # draw dummy atoms in black from itertools import cycle from sklearn import manifold from scipy.spatial.distance import * import mpld3 mpld3.enable_notebook() import warnings warnings.filterwarnings('ignore') from rdkit import RDLogger lg = RDLogger.logger() lg.setLevel(RDLogger.ERROR) # Get the base URL of the VM: display(Javascript('IPython.notebook.kernel.execute("current_url = " + "\'"+document.URL+"\'");')) # In[2]: from rdkit import rdBase rdBase.rdkitVersion # This is the base URL of the local Beaker server running on myChEMBL. # In[3]: cur_base_url = current_url.split('http://')[1].split('/')[0] base_url = 'localhost:8000' if (cur_base_url == 'localhost:9612' or cur_base_url == '127.0.0.1:9612') else current_url.split('http://')[1].split(':')[0] + ':8000' base_url # In[4]: pd.options.display.mpl_style = 'default' pd.options.display.float_format = '{:.2f}'.format # In[5]: rcParams['figure.figsize'] = 16,10 # ## 1. Read and filter the input file # The file was generated by extracting all chemistry from a list of patent documents in [SureChEMBL](https://www.surechembl.org). The Lucene query used to retrieve the list of relevant patents was: pn:"US6566360". # In[6]: df = pd.read_csv('/home/chembl/ipynb_workbench/document_chemistry_20141011_114421_271.csv',sep=',') # Let's check the contents: # In[7]: df.info() # In[8]: df.shape # Add a SCHEMBL ID column and sort: # In[9]: df['SCHEMBL ID'] = df['Chemical ID'].map(lambda x: 'SCHEMBL{0}'.format(x)) df = df.sort('Chemical ID') # In[10]: df.iloc[600:610] # **First round of filtering: Novel compounds appear in the description or claims section of the document. Alternatively, they are extracted from images or mol files** # In[11]: dff = df[(df['Claims Count'] > 0) | (df['Description Count'] > 0) | (df['Type'] != "TEXT")] # In[12]: dff.shape # **Second round of filtering: Simple physicochemical properties and counts** # In[13]: dff = dff[(dff['Rotatable Bound Count'] < 15) & (dff['Ring Count'] > 0) & (df['Radical'] == 0) & (df['Singleton'] == 0) & (df['Simple'] == 0)] # In[14]: dff.shape # In[15]: dff = dff[(dff['Molecular Weight'] >= 300) & (dff['Molecular Weight'] <= 800) & (dff['Log P'] > 0) & (dff['Log P'] < 7)] # In[16]: dff.shape # **Third round of filtering: Using Global Corpus Count** # In[17]: dff = dff[(dff['Chemical Corpus Count'] < 400) & ((dff['Annotation Corpus Count'] < 400) | (dff['Annotation Corpus Count'].isnull()))] # In[18]: dff.shape # **Keep largest fragment and convert SMILES to RDKit molecules** # In[19]: dff['SMILES'] = dff['SMILES'].map(lambda x: max(x.split('.'), key=len)) # In[20]: PandasTools.AddMoleculeColumnToFrame(dff, smilesCol = 'SMILES') # In[21]: #dff.head(10) # **Fourth round of filtering: Remove duplicates based on Chemical IDs and InChI keys** # In[22]: dff['InChI'] = dff['ROMol'].map(Chem.MolToInchi) dff['InChIKey'] = dff['InChI'].map(Chem.InchiToInchiKey) # In[23]: #dff.head() # In[24]: dff = dff.drop_duplicates('Chemical ID') # In[25]: dff.shape # In[26]: dff = dff.drop_duplicates('InChIKey') # In[27]: dff.shape # Wow, that was a lot of duplicates. This is because the same compounds come from different patents in the same patent family. In addition, in US patents a compound may come from 3 different sources: text, image and mol file. # **Fifth round of filtering: Remove Boron-containing compounds as they are likely to be reactants.** # In[28]: dff = dff.ix[~(dff['ROMol'] >= Chem.MolFromSmarts('[#5]'))] # In[29]: dff.shape # In[30]: dff = dff.set_index('Chemical ID', drop=False) # That was the end of the filters - let's see a summary of the remaining compounds and their patent document source: # In[31]: dff_counts = dff[['Patent ID','ROMol']].groupby('Patent ID').count() # In[32]: dff_counts['Link'] = dff_counts.index.map(lambda x: '{0}'.format(x)) # In[33]: dff_counts = dff_counts.rename(columns={'ROMol':'# Compounds'}) # In[34]: dff_counts # Right, is Levitra included in this set? ChEMBL tells me that its InChiKey is 'SECKRCOLJRRGGV-UHFFFAOYSA-N'. # In[35]: dff.ix[dff.InChIKey == 'SECKRCOLJRRGGV-UHFFFAOYSA-N',['SCHEMBL ID','ROMol']] # Good - it is... # ## 2. Visualise the patent chemical space - MDS # Preparation of the pairwise distance matrix. # In[36]: fps = [Chem.GetMorganFingerprintAsBitVect(m,2,nBits=2048) for m in dff['ROMol']] # In[37]: dist_mat = squareform(pdist(fps,'jaccard')) # In[38]: mds = manifold.MDS(n_components=2, dissimilarity="precomputed", random_state=3, n_jobs = 2) # In[39]: results = mds.fit(dist_mat) # In[40]: coords = results.embedding_ # In[41]: dff['X'] = [c[0] for c in coords] # In[42]: dff['Y'] = [c[1] for c in coords] # In[43]: csss = """ table { border-collapse: collapse; } th { color: #ffffff; background-color: #848482; } td { background-color: #f2f3f4; } table, th, td { font-family:Arial, Helvetica, sans-serif; border: 1px solid black; text-align: right; } """ # In[44]: import base64 # In[45]: dff['base64smi'] = dff['SMILES'].map(base64.b64encode) # Let's produce a scatter plot of the chemical space - points represent compounds, color-coded by the patent document they were found in. Thanks to [mpld3](http://mpld3.github.io/), the scatter plot is interactive with *live* structure rendering calls to the *local* myChEMBL Beaker server. # In[46]: fig, ax = plt.subplots() fig.set_size_inches(14.0, 12.0) #colors = cycle('bgrcmykwbgrcmykbgrcmykwbgrcmykw') colors = cycle(cm.Dark2(np.linspace(0,1,len(dff_counts.index)))) for name, group in dff.groupby('Patent ID'): labels = [] for i in group.index: zz = group.ix[[i],['Patent ID','SCHEMBL ID','base64smi']] zz['mol'] = zz['base64smi'].map(lambda x: ''.format(base_url,x)) del zz['base64smi'] label = zz.T del zz label.columns = ['Row: {}'.format(i)] labels.append(str(label.to_html())) points = ax.scatter(group['X'], group['Y'],c=colors.next(), s=80, alpha=0.8) tooltip = mpld3.plugins.PointHTMLTooltip(points, labels, voffset=10, hoffset=10, css=csss) mpld3.plugins.connect(fig,tooltip) # Nice, we get Beaker rendering on the fly. However, there is a bunch of compounds on the left that are distinctively different. This is because of an image extraction error. # At least the error is systematic: extraction breaks the imidazole ring. Let's try to 'repair' the ring and rescue these compounds - we'll use an appropriate intramolecular reaction SMARTS. # In[47]: resmarts = "([R:5][C;!R:1]=[C;H2].[C;H0,H1;!R:4]-[N;H2:2])>>[R:5][*:1]=[*:2]-[*:4]" # In[48]: Draw.ReactionToImage(Chem.ReactionFromSmarts(resmarts)) # In[49]: def fixRing(mol,resmarts=resmarts): rxn = Chem.ReactionFromSmarts(resmarts) ps = rxn.RunReactants((mol,)) if ps: m = ps[0][0] Chem.SanitizeMol(m) return m else: return mol # So this... # In[50]: mm = dff.ix[12823029]['ROMol'] mm # ...will become this: # In[51]: fixRing(mm) # Let's apply the repair to all affected structures: # In[52]: for i, p, m in zip(dff.index, dff['Patent ID'], dff['ROMol']): if p == 'EP-2295436-A1': dff.ix[i,'ROMol'] = fixRing(m) # In[53]: dff['InChI'] = dff['ROMol'].map(Chem.MolToInchi) dff['InChIKey'] = dff['InChI'].map(Chem.InchiToInchiKey) dff = dff.drop_duplicates('InChIKey') # So after the fix, we have even less structures because some of the fixed ones already existed. Hopefully, however, we have less false positives too: # In[54]: dff.shape # In[55]: dff['RD_SMILES'] = dff['ROMol'].map(Chem.MolToSmiles) dff['base64rdsmi'] = dff['RD_SMILES'].map(base64.b64encode) # We're going to re-calculate the distance matrix and plot the MCS just to double-check they were no side-effects from the applications of the reaction. # In[56]: fps = [Chem.GetMorganFingerprintAsBitVect(m,2,nBits=2048) for m in dff['ROMol']] dist_mat = squareform(pdist(fps,'jaccard')) mds = manifold.MDS(n_components=2, dissimilarity="precomputed", random_state=3, n_jobs = 2) results = mds.fit(dist_mat) coords = results.embedding_ dff['X'] = [c[0] for c in coords] dff['Y'] = [c[1] for c in coords] # In[57]: fig, ax = plt.subplots() fig.set_size_inches(14.0, 12.0) #colors = cycle('bgrcmykwbgrcmykbgrcmykwbgrcmykw') colors = cycle(cm.Dark2(np.linspace(0,1,len(dff_counts.index)))) for name, group in dff.groupby('Patent ID'): labels = [] for i in group.index: zz = group.ix[[i],['Patent ID','SCHEMBL ID','base64rdsmi']] zz['mol'] = zz['base64rdsmi'].map(lambda x: ''.format(base_url,x)) del zz['base64rdsmi'] label = zz.T del zz label.columns = ['Row: {}'.format(i)] labels.append(str(label.to_html())) points = ax.scatter(group['X'], group['Y'],c=colors.next(), s=80, alpha=0.8) tooltip = mpld3.plugins.PointHTMLTooltip(points, labels, voffset=10, hoffset=10, css=csss) mpld3.plugins.connect(fig,tooltip) # So the "island" of outliers has disappeared. There's only one obvious outlier on the far left. This might be a reactant which wasn't filtered out. # ## 3. Define the MCS for this set of claimed molecules # A couple of helper functions, inspired by Greg Landrum's post [here](http://rdkit.blogspot.co.uk/2015/05/a-set-of-scaffolds-from-chembl-papers.html). # In[58]: from IPython.display import SVG from rdkit.Chem import rdDepictor from rdkit.Chem.Draw import rdMolDraw2D def moltosvg(mol,molSize=(450,350),kekulize=True): mc = Chem.Mol(mol.ToBinary()) if kekulize: try: Chem.Kekulize(mc) except: mc = Chem.Mol(mol.ToBinary()) rdDepictor.Compute2DCoords(mc) drawer = rdMolDraw2D.MolDraw2DSVG(molSize[0],molSize[1]) # the MolDraw2D code is not very good at the moment at dealing with atom queries, # this is a workaround until that's fixed. # The rendering is still not going to be perfect because query bonds are not properly indicated opts = drawer.drawOptions() for atom in mc.GetAtoms(): if atom.HasQuery() and atom.DescribeQuery().find('AtomAtomicNum')!=0: opts.atomLabels[atom.GetIdx()]=atom.GetSmarts() drawer.DrawMolecule(mc) drawer.FinishDrawing() svg = drawer.GetDrawingText() return svg.replace('svg:','') # In[59]: def MCS_Report(ms,atomCompare=MCS.AtomCompare.CompareAny,**kwargs): mcs = MCS.FindMCS(ms,atomCompare=atomCompare,completeRingsOnly=True,timeout=120,**kwargs) nAts = np.array([x.GetNumAtoms() for x in ms]) print 'Mean nAts: {0}, mcs nAts: {1}'.format(nAts.mean(),mcs.numAtoms) print 'MCS SMARTS: {0}'.format(mcs.smartsString) mcsM = Chem.MolFromSmarts(mcs.smartsString) mcsM.UpdatePropertyCache(False) Chem.SetHybridization(mcsM) Chem.Compute2DCoords(mcsM) smi = Chem.MolToSmiles(mcsM,isomericSmiles=True) print "MCS SMILES: {0}".format(smi) img=Draw.MolToImage(mcsM,kekulize=False) return mcs.smartsString,smi,img,mcsM # Let's add the Murcko framework to our data frame: # In[60]: PandasTools.AddMurckoToFrame(dff) # In[61]: PandasTools.AddMoleculeColumnToFrame(dff,smilesCol='Murcko_SMILES', molCol='MurMol') # In[62]: PandasTools.AlignToScaffold(dff) # In[63]: dff[['ROMol','MurMol']].head() # And now we'll extract the MCS core: # In[64]: mols = list(dff.MurMol) smarts,smi,img,mcsM = MCS_Report(mols,threshold=0.8,ringMatchesRingOnly=True) SVG(moltosvg(mcsM)) # ## 4. Compare the extracted MCS core with the actual claimed Markush structure. # Given the MCS core, we'll do an R-Group decomposition first to define the R-Groups and their attachement points. # In[65]: core = Chem.MolFromSmarts(smarts) # In[66]: pind = [] rgroups = [] for i, m in zip(dff.index, dff.ROMol): rgrps = Chem.ReplaceCore(m,core,labelByIndex=True) if rgrps: s = Chem.MolToSmiles(rgrps, True) for e in s.split("."): pind.append(e[1:4].replace('*','').replace(']','')) rgroups.append(e) #print "R-Groups: ", i, s # In[67]: pind = sorted(list(set(pind)),key=int) # In[68]: pind = map(int,pind) # In[69]: Draw.MolToImage(core,includeAtomNumbers=True,highlightAtoms=pind) # Is the MCS similar to the claimed Markush structure? # Let's fetch the original document in PDF and check in page 4: # In[70]: HTML('') # Pretty close! # ## 5. Identify key compounds - in this case, vardenafil itself # This is based on the publication by [Hattori et al.](http://pubs.acs.org/doi/abs/10.1021/ci7002686) and [Tyrchan et al.](http://pubs.acs.org/doi/abs/10.1021/ci3001293). First we'll try the Hattori et al. version of finding the compounds with the highest number of nearest neighbours (NNs) for a given similarity threshold. We will need to calculate the similarity matrix. # In[71]: sim_mat = [] CUTOFF = 0.8 for i,fp in enumerate(fps): tt = DataStructs.BulkTanimotoSimilarity(fps[i],fps) for i,t in enumerate(tt): if tt[i] < CUTOFF: tt[i] = None sim_mat.append(tt) sim_mat=numpy.array(sim_mat) # In[72]: sim_df = pd.DataFrame(sim_mat, columns = dff['Chemical ID'], index=dff['Chemical ID']) # In[73]: sim_df.head() # In[74]: d = sim_df.count().to_dict() nn_df = pd.DataFrame(d.items(),columns=['Chemical ID', '# NNs']) nn_df.set_index('Chemical ID',inplace=True) nn_df.head() # In[75]: pd.merge(dff[['Chemical ID','ROMol']],nn_df,left_index=True, right_index=True, sort=False).sort('# NNs',ascending=False).head() # So, the method seems to be working in this case. Vardenafil has the second largest number of NNs with a cutoff of 0.8. Next, we'll try one of the Tyrchan et al. methods (Frequency of Groups, FOG) and count the frequency of the derived R-Groups: # In[76]: from collections import Counter rgroups = [r.replace('[9*]','[5*]').replace('[8*]','[6*]') for r in rgroups] c = Counter(rgroups) # In[77]: rgroup_counts = pd.DataFrame(c.most_common(20),columns=['RGroups','Frequency']) rgroup_counts.head(10) # In[78]: Draw.MolToImage(core,includeAtomNumbers=True,highlightAtoms=pind) # In[79]: PandasTools.AddMoleculeColumnToFrame(rgroup_counts, smilesCol = 'RGroups') rgroup_counts.head(10) # In[80]: rgroup_counts[[r.startswith('[6*]') for r in rgroup_counts.RGroups]].head(10) # Although the most frequent R-groups in positions 9 and 11 correspond to those in Vardenafil (see below), the exact piperazine R-group in position 6 is the 5th most frequent one in that position. # In[81]: dff.ix[dff.InChIKey == 'SECKRCOLJRRGGV-UHFFFAOYSA-N',['SCHEMBL ID','ROMol']]