#!/usr/bin/env python # coding: utf-8 # # SureChEMBL Tutorial # ## An introduction to patent chemoinformatics using SureChEMBL data and RDKit # ### myChEMBL team, ChEMBL group, EMBL-EBI. # ### Outline: # 1. Read a file that contains small molecule antimalarials reported in US patents in 2010. The file was downloaded from [SureChEMBL](https://www.surechembl.org). # 2. Filter by different text-mining and chemoinformatics properties to remove noise and enrich the genuinely novel structures claimed in the patent document. # 3. Visualise the chemical space using MDS and dimensionality reduction. # 4. Find the MCS of the claimed compounds in each patent. # 5. Compare the derived MCS with the actual Markush structure as reported in the original patent document. # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') get_ipython().run_line_magic('pylab', 'inline') import pandas as pd from IPython.display import display from IPython.display import display_pretty, display_html, HTML, Javascript from rdkit.Chem import PandasTools from rdkit.Chem.Draw import IPythonConsole from rdkit.Chem import AllChem as Chem from rdkit.Chem import DataStructs from rdkit.Chem import rdFMCS as MCS from rdkit.Chem import Draw Draw.DrawingOptions.elemDict[0]=(0.,0.,0.) # draw dummy atoms in black from itertools import cycle from sklearn import manifold from scipy.spatial.distance import * import mpld3 mpld3.enable_notebook() import warnings warnings.filterwarnings('ignore') from rdkit import RDLogger lg = RDLogger.logger() lg.setLevel(RDLogger.ERROR) # Get the base URL of the VM: display(Javascript('IPython.notebook.kernel.execute("current_url = " + "\'"+document.URL+"\'");')) # In[2]: cur_base_url = current_url.split('http://')[1].split('/')[0] base_url = 'localhost:8000' if (cur_base_url == 'localhost:9612' or cur_base_url == '127.0.0.1:9612') else current_url.split('http://')[1].split(':')[0] + ':8000' base_url # In[3]: pd.options.display.mpl_style = 'default' # In[4]: rcParams['figure.figsize'] = 16,10 # ## 1. Read and filter the input file # The file was manually collated by extracting all chemistry from a list of patent documents in [SureChEMBL](https://www.surechembl.org). The Lucene query used to retrieve the list of relevant patents was: "ic:C07D AND ic:A61P003306 AND pnctry:US AND pdyear:2010 AND (ttl:\*malaria\* OR ab:\*malaria\* OR ttl:\*parasit\*)" # In[7]: df = pd.read_csv('/home/chembl/ipynb_workbench/US_antimalarial_patents_cmpds.txt',sep='\t') # Let's check the contents: # In[8]: df.info() # In[9]: df.head() # In[10]: df.shape # In[11]: df['chemical_id'] = df['chemical_id'].map(lambda x: 'SCHEMBL{0}'.format(x)) # **First round of filtering: Novel compounds appear in the description or claims section of the document. Alternatively, they are extracted from images or mol files** # In[12]: dff = df[(df['claims_count'] > 0) | (df['description_count'] > 0) | (df['type'] != "TEXT")] # In[13]: dff.shape # **Second round of filtering: Simple physicochemical properties and counts** # In[14]: dff = dff[(dff['rotatable_bond_count'] < 15) & (dff['ring_count'] > 0) & (df['radical'] == 0) & (df['singleton'] == 0) & (df['simple'] == 0)] # In[15]: dff.shape # In[16]: dff = dff[(dff['molecular_weight'] >= 300) & (dff['molecular_weight'] <= 800) & (dff['log_p'] > 0) & (dff['log_p'] < 6)] # In[17]: dff.shape # In[18]: dff = dff[(dff['chemical_corpus_count'] < 400) & ((dff['annotation_corpus_count'] < 400) | (dff['annotation_corpus_count'].isnull()))] # In[19]: dff.shape # **Convert SMILES to RDKit molecules** # In[20]: PandasTools.AddMoleculeColumnToFrame(dff, smilesCol = 'smiles') # **Third round of filtering: Remove salts and duplicates, based on InChI keys** # In[21]: PandasTools.RemoveSaltsFromFrame(dff) # In[22]: dff['InChI'] = dff['ROMol'].map(Chem.MolToInchi) # In[23]: dff['InChIKey'] = dff['InChI'].map(Chem.InchiToInchiKey) # In[24]: dff.head() # In[25]: dff = dff.drop_duplicates(['SCPN','InChIKey']) # In[26]: dff.shape # Wow, that was a lot of duplicates. This is because in US patents a compound may come from 3 different sources: text, image and mol file. # In[27]: dff = dff.ix[[d.count('.') < 2 for d in dff['smiles']]] # In[28]: dff.shape # **Fourth round of filtering: Remove Boron-containing compounds as they are likely to be reactants.** # In[29]: dff = dff.ix[~(dff['ROMol'] >= Chem.MolFromSmiles('B'))] # In[30]: dff.shape # **Fifth round of filtering: Remove patents with less than 10 compounds** # In[31]: dff_counts = dff[['SCPN','ROMol']].groupby('SCPN').count() # In[32]: tokeep = list(dff_counts.ix[dff_counts.ROMol >= 10].index) # In[33]: dff = dff.ix[dff.SCPN.isin(tokeep)] # In[34]: dff.shape # OK, filtering is over. Let's prepare a summary table of the remaining patent documents and their compounds: # In[35]: dff_counts = dff[['SCPN','ROMol']].groupby('SCPN').count() # In[36]: dff_counts['Link'] = dff_counts.index.map(lambda x: '{0}'.format(x)) # In[37]: dff_counts = dff_counts.rename(columns={'ROMol':'# Compounds'}) # In[38]: dff_counts #NB: The links in this table are external. # ## 2. Visualise the patent chemical space - MDS # We will calculate fingerprints and the distance matrix and feed this to the MDS algorithm. # In[39]: fps = [Chem.GetMorganFingerprintAsBitVect(m,2,nBits=2048) for m in dff['ROMol']] # In[40]: dist_mat = squareform(pdist(fps,'jaccard')) # In[41]: mds = manifold.MDS(n_components=2, dissimilarity="precomputed", random_state=3, n_jobs = 2) # In[42]: results = mds.fit(dist_mat) # In[43]: coords = results.embedding_ # In[44]: dff['X'] = [c[0] for c in coords] # In[45]: dff['Y'] = [c[1] for c in coords] # A little bit of css for the pop-up tables # In[46]: csss = """ table { border-collapse: collapse; } th { color: #ffffff; background-color: #848482; } td { background-color: #f2f3f4; } table, th, td { font-family:Arial, Helvetica, sans-serif; border: 1px solid black; text-align: right; } """ # In[47]: # Encode SMILES in base64 for the Beaker calls. import base64 dff['base64smi'] = dff['smiles'].map(base64.b64encode) # Let's produce a scatter plot of the chemical space - points represent compounds, color-coded by the patent document they were found in. Thanks to [mpld3](http://mpld3.github.io/), the scatter plot is interactive with *live* structure rendering calls to the *local* myChEMBL Beaker server. # In[48]: fig, ax = plt.subplots() fig.set_size_inches(14.0, 12.0) #colors = cycle('bgrcmykwbgrcmykbgrcmykwbgrcmykw') colors = cycle(cm.Dark2(np.linspace(0,1,13))) for name, group in dff.groupby('SCPN'): labels = [] for i in group.index: zz = group.ix[[i],['SCPN','chemical_id','base64smi']] zz['mol'] = zz['base64smi'].map(lambda x: ''.format(base_url,x)) del zz['base64smi'] label = zz.T del zz label.columns = ['Row: {}'.format(i)] labels.append(str(label.to_html())) #labels.append(str(label.to_html())) points = ax.scatter(group['X'], group['Y'],c=colors.next(), s=80, alpha=0.8) tooltip = mpld3.plugins.PointHTMLTooltip(points, labels, voffset=10, hoffset=10, css=csss) mpld3.plugins.connect(fig,tooltip) # Compounds in the same region of space come from the same patent document - that means that the MDS makes some sense. Plus we get [Beaker](https://www.ebi.ac.uk/chembl/api/utils/docs) structure renderings on the fly. # ## 3. Define the MCS for each patent # A couple of helper functions, inspired by Greg Landrum's post [here](http://rdkit.blogspot.co.uk/2015/05/a-set-of-scaffolds-from-chembl-papers.html). # In[49]: from IPython.display import SVG from rdkit.Chem import rdDepictor from rdkit.Chem.Draw import rdMolDraw2D def moltosvg(mol,molSize=(450,350),kekulize=True): mc = Chem.Mol(mol.ToBinary()) if kekulize: try: Chem.Kekulize(mc) except: mc = Chem.Mol(mol.ToBinary()) rdDepictor.Compute2DCoords(mc) drawer = rdMolDraw2D.MolDraw2DSVG(molSize[0],molSize[1]) # the MolDraw2D code is not very good at the moment at dealing with atom queries, # this is a workaround until that's fixed. # The rendering is still not going to be perfect because query bonds are not properly indicated opts = drawer.drawOptions() for atom in mc.GetAtoms(): if atom.HasQuery() and atom.DescribeQuery().find('AtomAtomicNum')!=0: opts.atomLabels[atom.GetIdx()]=atom.GetSmarts() drawer.DrawMolecule(mc) drawer.FinishDrawing() svg = drawer.GetDrawingText() return svg.replace('svg:','') # In[50]: def MCS_Report(ms,atomCompare=MCS.AtomCompare.CompareAny,**kwargs): mcs = MCS.FindMCS(ms,atomCompare=atomCompare,timeout=120,**kwargs) nAts = np.array([x.GetNumAtoms() for x in ms]) print 'Mean nAts: {0}, mcs nAts: {1}'.format(nAts.mean(),mcs.numAtoms) print 'MCS SMARTS: {0}'.format(mcs.smartsString) mcsM = Chem.MolFromSmarts(mcs.smartsString) mcsM.UpdatePropertyCache(False) Chem.SetHybridization(mcsM) Chem.Compute2DCoords(mcsM) smi = Chem.MolToSmiles(mcsM,isomericSmiles=True) print "MCS SMILES: {0}".format(smi) img=Draw.MolToImage(mcsM,kekulize=False) return mcs.smartsString,smi,img,mcsM # Add Murcko scaffolds in the frame, just in case we need them. # In[51]: PandasTools.AddMurckoToFrame(dff) PandasTools.AddMoleculeColumnToFrame(dff,smilesCol='Murcko_SMILES', molCol='MurMol') PandasTools.AlignToScaffold(dff) dff[['ROMol','MurMol']].head() # Let's now visualise the extracted compounds for a single patent, say US20100056494A1: # In[54]: PandasTools.FrameToGridImage(dff.ix[dff['SCPN'] == 'US-20100056494-A1'],legendsCol='chemical_id', molsPerRow=4, subImgSize=(300, 300), useSVG=False) # OK, the structures seem consistent with a well defined MCS. Let's see if that's the case for two thresholds # In[57]: mols = list(dff.ix[dff['SCPN'] == 'US-20100056494-A1'].ROMol) smarts,smi,img,mcsM = MCS_Report(mols,threshold=0.6,ringMatchesRingOnly=True) SVG(moltosvg(mcsM)) # In[58]: smarts,smi,img,mcsM = MCS_Report(mols,threshold=0.8,ringMatchesRingOnly=True) SVG(moltosvg(mcsM)) # So, the higher the threshold, the more generic the MCS. But is the MCS similar to the claimed Markush structure? # Let's fetch the original document in PDF and check in page 2: (**NB:** The link below is external.) # In[59]: HTML('') # Pretty close! Now let's do the same thing for all the patents: # In[60]: smartss = [] smis = [] imgs = [] patents = [] mcss = [] # In[61]: for patent, group in dff.groupby('SCPN'): mols = list(group['ROMol']) print "Patent: {0}".format(patent) smarts,smi,img,mcs = MCS_Report(mols,threshold=0.8,ringMatchesRingOnly=True) smartss.append(smarts) smis.append(smi) imgs.append(img) patents.append(patent) mcss.append(mcs) # Tidy up this info in a data frame: # In[62]: dd = pd.DataFrame(zip(patents,smartss,smis),columns=['SCPN','MCS_SMARTS','MCS_SMILES']) dd.set_index('SCPN', inplace=True) dd # Visualise the MCS per patent document: # In[63]: Draw.MolsToGridImage(mcss,legends=patents, molsPerRow=3, subImgSize=(400, 400),kekulize=False) # The previous summary table was: # In[64]: dff_counts #NB: The links in this table are external. # Finally, merge the two frames together # In[65]: pd.merge(dd, dff_counts, left_index=True, right_index=True)