#!/usr/bin/env python # coding: utf-8 # # Thresholds for "random" in fingerprints the RDKit supports # # This is an updated version of a post. The original version of the notebook can be [found in github](https://github.com/greglandrum/rdkit_blog/blob/936b35b77ddca3e843991d1c4b1e3532e754b734/notebooks/Fingerprint%20Thresholds.ipynb). # # A frequent question that comes up when considering fingerprint similarity is: "What threshold should I use to determine what a neighbor is?" The answer is poorly defined. Of course it depends heavily on the details of the fingerprint, but there's also a very subjective component: you want to pick a low enough threshold that you're sure you won't miss anything, but you don't want to pick up too much noise. # # The goal here is to systematically come up with some guidelines that can be used for fingerprints supported within the RDKit. We will do that by looking a similarities between random "drug-like" (MW<600) molecules picked from ChEMBL. # # For the analysis, the 25K similarity values are sorted and the values at particular threshold are examined. # # There's a fair amount of code and results below, so here's the summary table. To help interpret this: 22500 of the 25000 pairs (90%) have a MACCS keys similarity value less than 0.528. # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
FingerprintMetric70% level80% level90% level95% level99% level
MACCSTanimoto0.4310.4710.5280.5750.655
Morgan0 (counts)Tanimoto0.4290.4710.5250.5680.651
Morgan1 (counts)Tanimoto0.2650.2930.3330.3640.429
Morgan2 (counts)Tanimoto0.1810.2010.2290.2520.305
Morgan3 (counts)Tanimoto0.1410.1560.1780.1960.238
Morgan0 (bits)Tanimoto0.4350.4750.5290.5710.656
Morgan1 (bits)Tanimoto0.2730.3010.3410.3710.434
Morgan2 (bits)Tanimoto0.1970.2170.2460.2690.322
Morgan3 (bits)Tanimoto0.1650.1810.2030.2220.264
FeatMorgan0 (counts)Tanimoto0.5830.6300.6900.7370.818
FeatMorgan1 (counts)Tanimoto0.3900.4250.4740.5110.581
FeatMorgan2 (counts)Tanimoto0.2720.2980.3330.3640.424
FeatMorgan3 (counts)Tanimoto0.2090.2280.2560.2790.328
FeatMorgan0 (bits)Tanimoto0.5830.6300.6900.7370.818
FeatMorgan1 (bits)Tanimoto0.3950.4290.4770.5140.585
FeatMorgan2 (bits)Tanimoto0.2840.3100.3470.3760.434
FeatMorgan3 (bits)Tanimoto0.2280.2480.2760.2990.349
RDKit 4 (bits)Tanimoto0.2090.2390.2850.3250.426
RDKit 5 (bits)Tanimoto0.1970.2190.2530.2870.368
RDKit 6 (bits)Tanimoto0.2300.2500.2800.3080.369
RDKit 7 (bits)Tanimoto0.3130.3460.3890.4290.507
linear RDKit 4 (bits)Tanimoto0.2250.2580.3090.3540.462
linear RDKit 5 (bits)Tanimoto0.1980.2250.2690.3090.404
linear RDKit 6 (bits)Tanimoto0.1870.2100.2460.2820.365
linear RDKit 7 (bits)Tanimoto0.1820.2030.2340.2640.337
Atom Pairs (counts)Tanimoto0.1800.2040.2370.2650.325
Torsions (counts)Tanimoto0.1070.1300.1650.1940.266
Atom Pairs (bits)Tanimoto0.2750.3010.3350.3630.415
Torsions (bits)Tanimoto0.1330.1550.1880.2190.288
Avalon 512 (bits)Tanimoto0.3690.4070.4610.5050.575
Avalon 1024 (bits)Tanimoto0.2690.2970.3400.3750.449
Avalon 512 (counts)Tanimoto0.3000.3330.3790.4180.491
Avalon 1024 (counts)Tanimoto0.2670.2990.3440.3840.462
# # In[1]: from rdkit import Chem from rdkit.Chem import rdMolDescriptors from rdkit.Avalon import pyAvalonTools from rdkit.Chem import Draw from rdkit.Chem.Draw import IPythonConsole from rdkit import rdBase from rdkit import DataStructs from collections import defaultdict import pickle,random,gzip print(rdBase.rdkitVersion) import time print(time.asctime()) get_ipython().run_line_magic('pylab', 'inline') # # Read in the data # # We're using the set of 25K reference pairs generated in an earlier post: http://rdkit.blogspot.ch/2013/10/building-similarity-comparison-set-goal.html # # As a quick reminder: these are pairs of molecules taken from ChEMBL with MW<600 and a count-based MFP0 similarity of at least 0.7 to each other. # In[2]: ind = [x.split() for x in gzip.open('../data/chembl16_25K.pairs.txt.gz')] ms1 = [] ms2 = [] for i,row in enumerate(ind): m1 = Chem.MolFromSmiles(row[1]) ms1.append((row[0],m1)) m2 = Chem.MolFromSmiles(row[3]) ms2.append((row[2],m2)) # Those pairs are related to each other, but we want random pairs, so shuffle the second list: # In[3]: random.seed(23) random.shuffle(ms2) # In[4]: try: import ipyparallel as ipp rc = ipp.Client() dview = rc[:] dview.execute('from rdkit import Chem') dview.execute('from rdkit import Descriptors') dview.execute('from rdkit.Chem import rdMolDescriptors') dview.execute('from rdkit.Avalon import pyAvalonTools') except: print("could not use ipyparallel") dview = None results_accum = dict() def compareFPs(ms1,ms2,fpfn,fpName): if dview is not None: fps = dview.map_sync(lambda x:fpfn(x[1]),ms1) fp2s = dview.map_sync(lambda x:fpfn(x[1]),ms2) else: fps = [fpfn(x[1]) for x in ms1] fp2s = [fpfn(x[1]) for x in ms2] sims = [DataStructs.TanimotoSimilarity(x,y) for x,y in zip(fps,fp2s)] sl = sorted(sims) np = len(sl) with open('fp_results.txt','a+') as outf: outf.write(f'{fpName}Tanimoto\n') accum = {} for bin in (.7,.8,.9,.95,.99): simv = sl[int(bin*np)] print( bin,simv) outf.write(f' {simv:.3f}\n') accum[bin] = simv outf.write('') results_accum[fpName] = accum hist(sims,bins=20) xlabel(fpName) # # MACCS # In[5]: compareFPs(ms1,ms2,lambda x:rdMolDescriptors.GetMACCSKeysFingerprint(x),"MACCS") # # Morgan FPs # ## count based # In[6]: compareFPs(ms1,ms2,lambda x:rdMolDescriptors.GetMorganFingerprint(x,0),"Morgan0 (counts)") # In[7]: compareFPs(ms1,ms2,lambda x:rdMolDescriptors.GetMorganFingerprint(x,1),"Morgan1 (counts)") # In[8]: compareFPs(ms1,ms2,lambda x:rdMolDescriptors.GetMorganFingerprint(x,2),"Morgan2 (counts)") # In[9]: compareFPs(ms1,ms2,lambda x:rdMolDescriptors.GetMorganFingerprint(x,3),"Morgan3 (counts)") # ## bit-vector based # In[10]: compareFPs(ms1,ms2,lambda x:rdMolDescriptors.GetHashedMorganFingerprint(x,0,1024),"Morgan0 (bits)") # In[11]: compareFPs(ms1,ms2,lambda x:rdMolDescriptors.GetHashedMorganFingerprint(x,1,1024),"Morgan1 (bits)") # In[12]: compareFPs(ms1,ms2,lambda x:rdMolDescriptors.GetHashedMorganFingerprint(x,2,1024),"Morgan2 (bits)") # In[13]: compareFPs(ms1,ms2,lambda x:rdMolDescriptors.GetHashedMorganFingerprint(x,3,1024),"Morgan3 (bits)") # # FeatMorgan # ## count based # In[14]: compareFPs(ms1,ms2,lambda x:rdMolDescriptors.GetMorganFingerprint(x,0,useFeatures=True),"FeatMorgan0 (counts)") # In[15]: compareFPs(ms1,ms2,lambda x:rdMolDescriptors.GetMorganFingerprint(x,1,useFeatures=True),"FeatMorgan1 (counts)") # In[16]: compareFPs(ms1,ms2,lambda x:rdMolDescriptors.GetMorganFingerprint(x,2,useFeatures=True),"FeatMorgan2 (counts)") # In[17]: compareFPs(ms1,ms2,lambda x:rdMolDescriptors.GetMorganFingerprint(x,3,useFeatures=True),"FeatMorgan3 (counts)") # ## bit vectors # In[18]: compareFPs(ms1,ms2,lambda x:rdMolDescriptors.GetHashedMorganFingerprint(x,0,1024,useFeatures=True),"FeatMorgan0 (bits)") # In[19]: compareFPs(ms1,ms2,lambda x:rdMolDescriptors.GetHashedMorganFingerprint(x,1,1024,useFeatures=True),"FeatMorgan1 (bits)") # In[20]: compareFPs(ms1,ms2,lambda x:rdMolDescriptors.GetHashedMorganFingerprint(x,2,1024,useFeatures=True),"FeatMorgan2 (bits)") # In[21]: compareFPs(ms1,ms2,lambda x:rdMolDescriptors.GetHashedMorganFingerprint(x,3,1024,useFeatures=True),"FeatMorgan3 (bits)") # # RDKit # ## Branched (default) # In[22]: compareFPs(ms1,ms2,lambda x:Chem.RDKFingerprint(x,maxPath=4),"RDKit 4 (bits)") # In[23]: compareFPs(ms1,ms2,lambda x:Chem.RDKFingerprint(x,maxPath=5),"RDKit 5 (bits)") # In[24]: compareFPs(ms1,ms2,lambda x:Chem.RDKFingerprint(x,maxPath=6),"RDKit 6 (bits)") # In[25]: compareFPs(ms1,ms2,lambda x:Chem.RDKFingerprint(x,maxPath=7),"RDKit 7 (bits)") # ## linear # In[26]: compareFPs(ms1,ms2,lambda x:Chem.RDKFingerprint(x,maxPath=4,branchedPaths=False),"linear RDKit 4 (bits)") # In[27]: compareFPs(ms1,ms2,lambda x:Chem.RDKFingerprint(x,maxPath=5,branchedPaths=False),"linear RDKit 5 (bits)") # In[28]: compareFPs(ms1,ms2,lambda x:Chem.RDKFingerprint(x,maxPath=6,branchedPaths=False),"linear RDKit 6 (bits)") # In[29]: compareFPs(ms1,ms2,lambda x:Chem.RDKFingerprint(x,maxPath=7,branchedPaths=False),"linear RDKit 7 (bits)") # # Atom pairs and torsions # ## count-based # In[30]: compareFPs(ms1,ms2,lambda x:rdMolDescriptors.GetAtomPairFingerprint(x),"Atom Pairs (counts)") # In[31]: compareFPs(ms1,ms2,lambda x:rdMolDescriptors.GetTopologicalTorsionFingerprint(x),"Topological Torsions (counts)") # ## bit vectors # In[32]: compareFPs(ms1,ms2,lambda x:rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(x),"Atom Pairs (bits)") # In[33]: compareFPs(ms1,ms2,lambda x:rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(x),"Topological Torsions (bits)") # # Avalon # In[34]: compareFPs(ms1,ms2,lambda x:pyAvalonTools.GetAvalonFP(x,512),"Avalon 512 (bits)") # In[35]: compareFPs(ms1,ms2,lambda x:pyAvalonTools.GetAvalonFP(x,1024),"Avalon 1024 (bits)") # # Avalon Counts # In[36]: compareFPs(ms1,ms2,lambda x:pyAvalonTools.GetAvalonCountFP(x,512),"Avalon 512 (counts)") # In[37]: compareFPs(ms1,ms2,lambda x:pyAvalonTools.GetAvalonCountFP(x,1024),"Avalon 1024 (counts)") # In[38]: pickle.dump(results_accum,open('./results/fp_thresholds_random_accum.pkl','wb+'))