As we change the threshold, what fraction of the database do we retrieve when doing similarity searches with the RDKit's different fingerprint types?
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from rdkit.Avalon import pyAvalonTools
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit import rdBase
from rdkit import DataStructs
from collections import defaultdict
import pickle,random,gzip
import pandas as pd
import numpy as np
print(rdBase.rdkitVersion)
import time
print(time.asctime())
%pylab inline
2021.03.1 Fri May 21 15:42:26 2021 Populating the interactive namespace from numpy and matplotlib
/home/glandrum/miniconda3/envs/rdkit_blog/lib/python3.9/site-packages/IPython/core/magics/pylab.py:159: UserWarning: pylab import has clobbered these variables: ['random'] `%matplotlib` prevents importing * from pylab and numpy warn("pylab import has clobbered these variables: %s" % clobbered +
100K randomly chosen compounds from ChEMBL28
These are the ~20K compounds from ChEMBL documents which we used to determine how many related compounds are retrieved with particular similarity thresholds
# The records read below are accessed through MCSRes field names
# (numAtoms, avgNumMolAtoms), so the namedtuple has to exist before unpickling.
from collections import namedtuple
MCSRes=namedtuple('MCSRes',('smarts','numAtoms','numMols','avgNumMolAtoms','mcsTime'))
# NOTE: pickle.load can execute arbitrary code; only load files you generated yourself.
# Use a context manager so the file handle is closed as soon as the data is read.
with open('../data/scaffolds_revisited_again.pkl','rb') as pkl_file:
    data = pickle.load(pkl_file)
# keep the records whose MCS has at least half as many atoms as the mean molecule size
keep = [x for x in data if x[2].numAtoms>=np.mean(x[2].avgNumMolAtoms)/2]
# flatten the per-record SMILES collections into one list of queries
all_queries = []
for assay,smis,mcs,_ in keep:
    all_queries.extend(smis)
Limit it to 5K queries
# %pylab clobbered `random` (see the UserWarning in the first cell's output),
# so re-import the standard-library module before seeding it.
import random
random.seed(0xf00d)
random.shuffle(all_queries)
# keep the first 5000 shuffled entries and parse element [1] of each as SMILES
all_queries = [Chem.MolFromSmiles(x[1]) for x in all_queries[:5000]]
import pickle
# NOTE: pickle.load can execute arbitrary code; only load files you generated yourself.
# The context manager closes the file handle once the background set is read.
with open('./results/chembl28_background_set.pkl','rb') as pkl_file:
    chembl_data = pickle.load(pkl_file)
# Try to connect to a running ipyparallel cluster; `dview` is None when that is
# not possible, and compareFPs() then falls back to serial execution.
try:
    import ipyparallel as ipp
    rc = ipp.Client()
    dview = rc[:]
    # The fingerprint lambdas are executed on the engines, so the engines need
    # the same RDKit names that those lambdas reference.
    dview.execute('from rdkit import Chem')
    # Descriptors lives in rdkit.Chem; `from rdkit import Descriptors` fails on the engines
    dview.execute('from rdkit.Chem import Descriptors')
    dview.execute('from rdkit import DataStructs')
    dview.execute('from rdkit.Chem import rdMolDescriptors')
    dview.execute('from rdkit.Avalon import pyAvalonTools')
except Exception:
    # catch Exception rather than everything so Ctrl-C / SystemExit still propagate
    print("could not use ipyparallel")
    dview = None
def smi_to_fp(row,fpfn):
    """Parse ``row[1]`` as SMILES and return ``fpfn`` applied to the resulting molecule."""
    return fpfn(Chem.MolFromSmiles(row[1]))
def calc_sims(fp,db):
    """Return the Tanimoto similarities between ``fp`` and each fingerprint in ``db``."""
    similarities = DataStructs.BulkTanimotoSimilarity(fp,db)
    return similarities
from collections import defaultdict
import bisect
def compareFPs(queries,db,fpfn,fpName,accum):
    """Store the median fraction of ``db`` retrieved by ``queries`` at fixed similarity thresholds.

    Every query and every database entry is fingerprinted with ``fpfn`` (through the
    module-level ``dview`` when it is not None, otherwise serially). For each query the
    number of database fingerprints whose similarity, as computed by calc_sims(), is
    strictly greater than each threshold is counted. The median of those counts over
    all queries, divided by ``len(db)``, is written to ``accum[fpName][threshold]``.

    Arguments:
      queries: molecules that are passed directly to ``fpfn``
      db: sequence of rows; element [1] of each row is passed through ``Chem.Mol()``
          before fingerprinting (presumably a serialized molecule -- confirm against
          the file the rows are loaded from)
      fpfn: callable taking a molecule and returning a fingerprint
      fpName: key under which the results are stored in ``accum``
      accum: dict-of-dicts that is updated in place

    Returns nothing.
    """
    thresholds = (.2,.3,.4,.5,.6,.7,.8,.9,.95)
    if dview is None:
        qfps = [fpfn(x) for x in queries]
        dbfps = [fpfn(Chem.Mol(x[1])) for x in db]
    else:
        qfps = dview.map_sync(lambda x:fpfn(x),queries)
        dbfps = dview.map_sync(lambda x:fpfn(Chem.Mol(x[1])),db)
    dbsize = len(db)
    hit_counts = defaultdict(list)
    for qfp in qfps:
        # sort once per query so that each threshold is a binary search
        sims = sorted(calc_sims(qfp,dbfps))
        for threshold in thresholds:
            # bisect() returns the insertion point to the right of any ties, so
            # only similarities strictly above the threshold are counted
            hit_counts[threshold].append(dbsize - bisect.bisect(sims,threshold))
    # drop the references to the fingerprint lists before summarizing
    qfps = None
    dbfps = None
    for threshold in thresholds:
        accum[fpName][threshold] = np.median(hit_counts[threshold])/dbsize
accum = defaultdict(dict)
# (label, fingerprint function) pairs; they are evaluated in exactly this order
fingerprinters = [
    ("MACCS",lambda x:rdMolDescriptors.GetMACCSKeysFingerprint(x)),
    # Morgan fingerprints, radius 0-3
    ("Morgan0 (counts)",lambda x:rdMolDescriptors.GetMorganFingerprint(x,0)),
    ("Morgan1 (counts)",lambda x:rdMolDescriptors.GetMorganFingerprint(x,1)),
    ("Morgan2 (counts)",lambda x:rdMolDescriptors.GetMorganFingerprint(x,2)),
    ("Morgan3 (counts)",lambda x:rdMolDescriptors.GetMorganFingerprint(x,3)),
    ("Morgan0 (bits)",lambda x:rdMolDescriptors.GetHashedMorganFingerprint(x,0,1024)),
    ("Morgan1 (bits)",lambda x:rdMolDescriptors.GetHashedMorganFingerprint(x,1,1024)),
    ("Morgan2 (bits)",lambda x:rdMolDescriptors.GetHashedMorganFingerprint(x,2,1024)),
    ("Morgan3 (bits)",lambda x:rdMolDescriptors.GetHashedMorganFingerprint(x,3,1024)),
    # Morgan fingerprints with useFeatures=True, radius 0-3
    ("FeatMorgan0 (counts)",lambda x:rdMolDescriptors.GetMorganFingerprint(x,0,useFeatures=True)),
    ("FeatMorgan1 (counts)",lambda x:rdMolDescriptors.GetMorganFingerprint(x,1,useFeatures=True)),
    ("FeatMorgan2 (counts)",lambda x:rdMolDescriptors.GetMorganFingerprint(x,2,useFeatures=True)),
    ("FeatMorgan3 (counts)",lambda x:rdMolDescriptors.GetMorganFingerprint(x,3,useFeatures=True)),
    ("FeatMorgan0 (bits)",lambda x:rdMolDescriptors.GetHashedMorganFingerprint(x,0,1024,useFeatures=True)),
    ("FeatMorgan1 (bits)",lambda x:rdMolDescriptors.GetHashedMorganFingerprint(x,1,1024,useFeatures=True)),
    ("FeatMorgan2 (bits)",lambda x:rdMolDescriptors.GetHashedMorganFingerprint(x,2,1024,useFeatures=True)),
    ("FeatMorgan3 (bits)",lambda x:rdMolDescriptors.GetHashedMorganFingerprint(x,3,1024,useFeatures=True)),
    # RDKit fingerprints, maxPath 4-7
    ("RDKit 4 (bits)",lambda x:Chem.RDKFingerprint(x,maxPath=4)),
    ("RDKit 5 (bits)",lambda x:Chem.RDKFingerprint(x,maxPath=5)),
    ("RDKit 6 (bits)",lambda x:Chem.RDKFingerprint(x,maxPath=6)),
    ("RDKit 7 (bits)",lambda x:Chem.RDKFingerprint(x,maxPath=7)),
    ("linear RDKit 4 (bits)",lambda x:Chem.RDKFingerprint(x,maxPath=4,branchedPaths=False)),
    ("linear RDKit 5 (bits)",lambda x:Chem.RDKFingerprint(x,maxPath=5,branchedPaths=False)),
    ("linear RDKit 6 (bits)",lambda x:Chem.RDKFingerprint(x,maxPath=6,branchedPaths=False)),
    ("linear RDKit 7 (bits)",lambda x:Chem.RDKFingerprint(x,maxPath=7,branchedPaths=False)),
    # atom pairs and topological torsions
    ("Atom Pairs (counts)",lambda x:rdMolDescriptors.GetAtomPairFingerprint(x)),
    ("Topological Torsions (counts)",lambda x:rdMolDescriptors.GetTopologicalTorsionFingerprint(x)),
    ("Atom Pairs (bits)",lambda x:rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(x)),
    ("Topological Torsions (bits)",lambda x:rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(x)),
    # Avalon fingerprints
    ("Avalon 512 (bits)",lambda x:pyAvalonTools.GetAvalonFP(x,512)),
    ("Avalon 1024 (bits)",lambda x:pyAvalonTools.GetAvalonFP(x,1024)),
    ("Avalon 512 (counts)",lambda x:pyAvalonTools.GetAvalonCountFP(x,512)),
    ("Avalon 1024 (counts)",lambda x:pyAvalonTools.GetAvalonCountFP(x,1024)),
]
for fp_label,fp_func in fingerprinters:
    compareFPs(all_queries,chembl_data,fp_func,fp_label,accum)
# Save the results so the fingerprint comparison does not have to be re-run.
# The context manager guarantees the data is flushed and the handle closed
# before the file is read back.
with open('./results/fp_thresholds_database_accum.pkl','wb+') as pkl_file:
    pickle.dump(accum,pkl_file)
# NOTE: pickle.load can execute arbitrary code; only load files you generated yourself.
with open('./results/fp_thresholds_database_accum.pkl','rb') as pkl_file:
    accum = pickle.load(pkl_file)
accum['MACCS']
{0.2: 0.973165, 0.3: 0.84028, 0.4: 0.555265, 0.5: 0.20339, 0.6: 0.04265, 0.7: 0.00303, 0.8: 0.0001, 0.9: 1e-05, 0.95: 0.0}
# Plot, for every fingerprint, the stored fraction of the database retrieved
# at each similarity threshold (log-scaled y axis).
figsize(15,15)
for k in accum:
    bins = accum[k].keys()
    vs = accum[k].values()
    plot(vs,label=k)
xticks(rotation=45,ha='right');
# plot() was given only y values, so the points sit at x = 0..len(bins)-1;
# the threshold labels have to be placed at those same positions
xticks(range(len(bins)),bins);
#title('0.2');
yscale('log')
grid();
legend();
from ipywidgets import widgets,interact
@interact(accum=widgets.fixed(accum),
          ks=widgets.SelectMultiple(options=accum.keys(),
                                    value=['MACCS'],
                                    description="fingerprints"))
def plotvals(accum,ks):
    """Plot the stored retrieved-fraction curves for the selected fingerprints.

    Arguments:
      accum: dict mapping fingerprint name -> {threshold: fraction retrieved}
      ks: the fingerprint names (keys of ``accum``) to draw
    """
    if not ks:
        # nothing selected: there is nothing to draw and no bins to label
        return
    figsize(15,15)
    for k in ks:
        bins = accum[k].keys()
        vs = accum[k].values()
        plot(vs,label=k)
    xticks(rotation=45,ha='right');
    # plot() was given only y values, so the points sit at x = 0..len(bins)-1;
    # the threshold labels have to be placed at those same positions
    xticks(range(len(bins)),bins);
    #title('0.2');
    #ylim(0,1)
    yscale('log')
    grid();
    legend();
interactive(children=(SelectMultiple(description='fingerprints', index=(0,), options=('MACCS', 'Morgan0 (count…
# NOTE: pickle.load can execute arbitrary code; only load files you generated yourself.
# The context manager closes the file handle once the data is read.
with open('./results/fp_thresholds_scaffolds_accum.pkl','rb') as pkl_file:
    scaff_accum = pickle.load(pkl_file)
figsize(15,6)
# the fingerprint names are taken from the first entry of scaff_accum
fpnames = list(scaff_accum.values())[0].keys()
# one list of values (the 0.9 entry of every scaff_accum record) per fingerprint
d = []
for nm in fpnames:
    d.append([x[nm][0.9] for x in scaff_accum.values()])
violinplot(d,showmedians=True);
xticks(rotation=45,ha='right');
# violinplot() draws at positions 1..len(d) by default
xticks(range(1,len(d)+1),fpnames);
title('0.9');
fp,thresh = 'RDKit 7 (bits)',0.1
ds = [entry[fp][thresh] for entry in scaff_accum.values()]
med = np.median(ds)
# round the median to one decimal so it can be used directly as a key of accum[fp]
bin = round(10*med)/10
q20,q80 = np.quantile(ds,0.2),np.quantile(ds,0.8)
print(f'{q20:.2f} {med:.2f} {q80:.2f}')
frac_retrieved = accum[fp][bin]
print(f'{med} {bin} {frac_retrieved} {int(frac_retrieved*1e6)}')
0.42 0.58 0.71 0.5770049680624556 0.6 9e-05 90
def interp(where,lookup):
    """Linearly interpolate the table ``lookup`` at ``where``.

    Arguments:
      where: the x value at which to evaluate the table
      lookup: non-empty dict mapping x values to y values

    Returns the y value at ``where``. An exact hit on a tabulated x returns the
    tabulated y; values outside the tabulated range are clamped to the y value
    of the nearest end of the table.
    """
    bins = sorted(lookup.keys())
    if where <= bins[0]:
        return lookup[bins[0]]
    # walk over consecutive (lower, upper) pairs of tabulated x values
    for bl,b in zip(bins,bins[1:]):
        if where==b:
            return lookup[b]
        if where<b:
            vl = lookup[bl]
            vu = lookup[b]
            # fraction of the way from the lower bin to the upper bin:
            # 0 at bl (result is vl), 1 at b (result is vu)
            frac = (where-bl)/(b-bl)
            return vl+frac*(vu-vl)
    return lookup[bins[-1]]
# spot-check interp() on the current fingerprint's table at a value (0.5777)
# that is not a multiple of 0.05
interp(0.5777,accum[fp])
0.0009136200000000001
fp = 'Morgan1 (counts)'
for thresh in (0.1,0.2,0.5):
    ds = [entry[fp][thresh] for entry in scaff_accum.values()]
    med = np.median(ds)
    # snap the median to the nearest 0.05 before looking up the retrieved fraction
    bin = round(20*med)/20
    tval = interp(bin,accum[fp])
    #print(f'{np.quantile(ds,0.2):.2f} {med:.2f} {np.quantile(ds,0.8):.2f}')
    per_million = int(tval*1e6)
    print(f'{thresh} {med:.2f} {bin} {tval} {per_million}')
0.1 0.48 0.5 0.00032 320 0.2 0.53 0.55 0.00018000000000000017 180 0.5 0.63 0.65 2.5000000000000018e-05 25
# NOTE: pickle.load can execute arbitrary code; only load files you generated yourself.
# The context manager closes the file handle once the data is read.
with open('./results/fp_thresholds_random_accum.pkl','rb') as pkl_file:
    background_accum = pickle.load(pkl_file)
def summary_plot(thresh):
    """Draw the per-fingerprint summary figure for one ``thresh`` key of ``scaff_accum``.

    For every fingerprint name the figure shows:
      - a violin plot of the ``[name][thresh]`` values of all ``scaff_accum`` entries
        (left axis, labelled 'similarity')
      - a dark gray square at ``background_accum[name][0.95]``
      - a red line (right axis, log scale, labelled 'frac retrieved') giving
        ``accum[name]`` interpolated at the median of the violin's values after
        rounding that median to the nearest 0.05

    The title reports ``1-thresh`` as the fraction of pairs found.
    Reads the module-level ``scaff_accum``, ``background_accum`` and ``accum``.
    """
    figsize(15,6)
    fpnames = list(scaff_accum.values())[0].keys()
    d = []
    lbounds = []
    pcts = []
    for nm in fpnames:
        sims = [entry[nm][thresh] for entry in scaff_accum.values()]
        d.append(sims)
        lbounds.append(background_accum[nm][0.95])
        # snap the median to the nearest 0.05 before the lookup
        snapped = round(20*np.median(sims))/20
        pcts.append(interp(snapped,accum[nm]))
    # violinplot() draws at 1..n by default; the overlays use the same positions
    positions = range(1,len(fpnames)+1)
    fig, ax1 = plt.subplots()
    ax1.violinplot(d,showmedians=True)
    ax1.scatter(positions,lbounds,c='dimgray',marker='s')
    ax1.set_ylim(0,1)
    ax1.set_ylabel('similarity')
    # the tick labels are set before the twin axis is created
    xticks(rotation=45,ha='right')
    xticks(range(1,len(d)+1),fpnames)
    ax2 = ax1.twinx()
    ax2.plot(positions,pcts,c='red')
    # turn off the x ticks and x tick labels of the twin axis
    ax2.tick_params(axis='x',which='both',bottom=False,top=False,labelbottom=False)
    ax2.set_yscale('log')
    ax2.set_ylabel('frac retrieved')
    title(f'{1-thresh:.2f} of pairs found')
These plots show the distribution of paired similarities for each fingerprint (the blue violin plots), the "noise level" for the fingerprint as a dark gray square, and the median fraction of the database retrieved using that fingerprint (the red line and the right-hand y axis).
# summary figures titled 0.90, 0.50 and 0.10 "of pairs found" respectively
summary_plot(thresh=0.1)
summary_plot(thresh=0.5)
summary_plot(thresh=0.9)
# fingerprint names and threshold keys are taken from the first scaff_accum entry
first_entry = list(scaff_accum.values())[0]
fps = list(first_entry)
all_thresholds = list(first_entry[fps[0]])
thresholds = [0.05,0.1,0.2,0.5]
for fp in fps:
    print('------------------------')
    for thresh in thresholds:
        ds = [entry[fp][thresh] for entry in scaff_accum.values()]
        med = np.median(ds)
        # snap the median to the nearest 0.05 before looking up the retrieved fraction
        bin = round(20*med)/20
        tval = interp(bin,accum[fp])
        noise = background_accum[fp][0.95]
        print(f'{fp} {noise:.2f} {1.-thresh} {bin} {tval:.2g} {int(tval*1e6)}')
------------------------ MACCS 0.57 0.95 0.6 0.043 42650 MACCS 0.57 0.9 0.65 0.023 22840 MACCS 0.57 0.8 0.7 0.003 3030 MACCS 0.57 0.5 0.8 0.0001 100 ------------------------ Morgan0 (counts) 0.57 0.95 0.55 0.058 58467 Morgan0 (counts) 0.57 0.9 0.6 0.014 14240 Morgan0 (counts) 0.57 0.8 0.65 0.0076 7570 Morgan0 (counts) 0.57 0.5 0.75 0.00047 470 ------------------------ Morgan1 (counts) 0.36 0.95 0.45 0.0042 4230 Morgan1 (counts) 0.36 0.9 0.5 0.00032 320 Morgan1 (counts) 0.36 0.8 0.55 0.00018 180 Morgan1 (counts) 0.36 0.5 0.65 2.5e-05 25 ------------------------ Morgan2 (counts) 0.25 0.95 0.35 0.0018 1802 Morgan2 (counts) 0.25 0.9 0.4 0.00017 170 Morgan2 (counts) 0.25 0.8 0.45 0.00011 105 Morgan2 (counts) 0.25 0.5 0.55 2.5e-05 25 ------------------------ Morgan3 (counts) 0.20 0.95 0.3 0.00032 320 Morgan3 (counts) 0.20 0.9 0.3 0.00032 320 Morgan3 (counts) 0.20 0.8 0.35 0.00018 184 Morgan3 (counts) 0.20 0.5 0.45 3.5e-05 35 ------------------------ Morgan0 (bits) 0.57 0.95 0.55 0.062 61650 Morgan0 (bits) 0.57 0.9 0.6 0.016 15515 Morgan0 (bits) 0.57 0.8 0.65 0.0083 8250 Morgan0 (bits) 0.57 0.5 0.75 0.00051 512 ------------------------ Morgan1 (bits) 0.37 0.95 0.45 0.0052 5217 Morgan1 (bits) 0.37 0.9 0.5 0.00038 380 Morgan1 (bits) 0.37 0.8 0.55 0.00021 210 Morgan1 (bits) 0.37 0.5 0.65 2.5e-05 25 ------------------------ Morgan2 (bits) 0.27 0.95 0.35 0.0036 3567 Morgan2 (bits) 0.27 0.9 0.4 0.00023 230 Morgan2 (bits) 0.27 0.8 0.45 0.00014 135 Morgan2 (bits) 0.27 0.5 0.55 2.5e-05 25 ------------------------ Morgan3 (bits) 0.22 0.95 0.3 0.00063 630 Morgan3 (bits) 0.22 0.9 0.35 0.00034 344 Morgan3 (bits) 0.22 0.8 0.35 0.00034 344 Morgan3 (bits) 0.22 0.5 0.5 2e-05 20 ------------------------ FeatMorgan0 (counts) 0.74 0.95 0.6 0.25 246595 FeatMorgan0 (counts) 0.74 0.9 0.65 0.16 157367 FeatMorgan0 (counts) 0.74 0.8 0.7 0.068 68140 FeatMorgan0 (counts) 0.74 0.5 0.8 0.0082 8195 ------------------------ FeatMorgan1 (counts) 0.51 0.95 0.55 0.021 21037 FeatMorgan1 (counts) 0.51 0.9 
0.55 0.021 21037 FeatMorgan1 (counts) 0.51 0.8 0.6 0.0026 2610 FeatMorgan1 (counts) 0.51 0.5 0.7 0.00013 130 ------------------------ FeatMorgan2 (counts) 0.36 0.95 0.45 0.004 4050 FeatMorgan2 (counts) 0.36 0.9 0.5 0.00027 270 FeatMorgan2 (counts) 0.36 0.8 0.55 0.00016 155 FeatMorgan2 (counts) 0.36 0.5 0.65 2.5e-05 25 ------------------------ FeatMorgan3 (counts) 0.28 0.95 0.35 0.0054 5412 FeatMorgan3 (counts) 0.28 0.9 0.4 0.00028 280 FeatMorgan3 (counts) 0.28 0.8 0.45 0.00016 165 FeatMorgan3 (counts) 0.28 0.5 0.55 3.5e-05 35 ------------------------ FeatMorgan0 (bits) 0.74 0.95 0.6 0.25 246595 FeatMorgan0 (bits) 0.74 0.9 0.65 0.16 157367 FeatMorgan0 (bits) 0.74 0.8 0.7 0.068 68140 FeatMorgan0 (bits) 0.74 0.5 0.8 0.0082 8195 ------------------------ FeatMorgan1 (bits) 0.51 0.95 0.55 0.023 23195 FeatMorgan1 (bits) 0.51 0.9 0.55 0.023 23195 FeatMorgan1 (bits) 0.51 0.8 0.6 0.003 3010 FeatMorgan1 (bits) 0.51 0.5 0.7 0.00014 140 ------------------------ FeatMorgan2 (bits) 0.38 0.95 0.45 0.0065 6452 FeatMorgan2 (bits) 0.38 0.9 0.5 0.00036 360 FeatMorgan2 (bits) 0.38 0.8 0.55 0.00021 205 FeatMorgan2 (bits) 0.38 0.5 0.65 3e-05 30 ------------------------ FeatMorgan3 (bits) 0.30 0.95 0.4 0.00045 450 FeatMorgan3 (bits) 0.30 0.9 0.4 0.00045 450 FeatMorgan3 (bits) 0.30 0.8 0.45 0.00025 250 FeatMorgan3 (bits) 0.30 0.5 0.55 3.5e-05 35 ------------------------ RDKit 4 (bits) 0.33 0.95 0.5 0.0011 1120 RDKit 4 (bits) 0.33 0.9 0.55 0.00065 645 RDKit 4 (bits) 0.33 0.8 0.6 0.00017 170 RDKit 4 (bits) 0.33 0.5 0.7 4e-05 40 ------------------------ RDKit 5 (bits) 0.29 0.95 0.5 0.0004 400 RDKit 5 (bits) 0.29 0.9 0.55 0.00025 245 RDKit 5 (bits) 0.29 0.8 0.6 9e-05 90 RDKit 5 (bits) 0.29 0.5 0.7 3e-05 30 ------------------------ RDKit 6 (bits) 0.31 0.95 0.5 0.00027 270 RDKit 6 (bits) 0.31 0.9 0.55 0.00017 170 RDKit 6 (bits) 0.31 0.8 0.6 7e-05 70 RDKit 6 (bits) 0.31 0.5 0.7 3e-05 30 ------------------------ RDKit 7 (bits) 0.43 0.95 0.5 0.0011 1150 RDKit 7 (bits) 0.43 0.9 0.6 9e-05 90 RDKit 7 
(bits) 0.43 0.8 0.65 6e-05 60 RDKit 7 (bits) 0.43 0.5 0.75 2e-05 20 ------------------------ linear RDKit 4 (bits) 0.35 0.95 0.5 0.0023 2255 linear RDKit 4 (bits) 0.35 0.9 0.55 0.0013 1277 linear RDKit 4 (bits) 0.35 0.8 0.6 0.0003 300 linear RDKit 4 (bits) 0.35 0.5 0.75 4e-05 40 ------------------------ linear RDKit 5 (bits) 0.31 0.95 0.45 0.0036 3590 linear RDKit 5 (bits) 0.31 0.9 0.55 0.0004 400 linear RDKit 5 (bits) 0.31 0.8 0.6 0.00013 130 linear RDKit 5 (bits) 0.31 0.5 0.7 4e-05 40 ------------------------ linear RDKit 6 (bits) 0.28 0.95 0.45 0.0015 1525 linear RDKit 6 (bits) 0.28 0.9 0.5 0.00033 330 linear RDKit 6 (bits) 0.28 0.8 0.6 8e-05 80 linear RDKit 6 (bits) 0.28 0.5 0.7 3e-05 30 ------------------------ linear RDKit 7 (bits) 0.26 0.95 0.45 0.00076 755 linear RDKit 7 (bits) 0.26 0.9 0.5 0.0002 200 linear RDKit 7 (bits) 0.26 0.8 0.55 0.00013 130 linear RDKit 7 (bits) 0.26 0.5 0.7 2e-05 20 ------------------------ Atom Pairs (counts) 0.27 0.95 0.3 0.0057 5730 Atom Pairs (counts) 0.27 0.9 0.35 0.0029 2949 Atom Pairs (counts) 0.27 0.8 0.4 0.00017 170 Atom Pairs (counts) 0.27 0.5 0.5 3e-05 30 ------------------------ Topological Torsions (counts) 0.19 0.95 0.3 0.00078 780 Topological Torsions (counts) 0.19 0.9 0.35 0.00045 449 Topological Torsions (counts) 0.19 0.8 0.45 8e-05 80 Topological Torsions (counts) 0.19 0.5 0.55 2.5e-05 25 ------------------------ Atom Pairs (bits) 0.36 0.95 0.4 0.009 8960 Atom Pairs (bits) 0.36 0.9 0.45 0.0045 4545 Atom Pairs (bits) 0.36 0.8 0.5 0.00013 130 Atom Pairs (bits) 0.36 0.5 0.55 7.5e-05 75 ------------------------ Topological Torsions (bits) 0.22 0.95 0.35 0.00093 934 Topological Torsions (bits) 0.22 0.9 0.4 0.00017 170 Topological Torsions (bits) 0.22 0.8 0.45 0.00011 105 Topological Torsions (bits) 0.22 0.5 0.55 3e-05 30 ------------------------ Avalon 512 (bits) 0.51 0.95 0.6 0.00082 820 Avalon 512 (bits) 0.51 0.9 0.65 0.00046 455 Avalon 512 (bits) 0.51 0.8 0.7 9e-05 90 Avalon 512 (bits) 0.51 0.5 0.8 3e-05 30 
------------------------ Avalon 1024 (bits) 0.37 0.95 0.55 0.00083 830 Avalon 1024 (bits) 0.37 0.9 0.6 0.00017 170 Avalon 1024 (bits) 0.37 0.8 0.65 0.00011 105 Avalon 1024 (bits) 0.37 0.5 0.75 3e-05 30 ------------------------ Avalon 512 (counts) 0.42 0.95 0.5 0.0042 4180 Avalon 512 (counts) 0.42 0.9 0.55 0.0022 2215 Avalon 512 (counts) 0.42 0.8 0.65 0.00015 145 Avalon 512 (counts) 0.42 0.5 0.75 2.5e-05 25 ------------------------ Avalon 1024 (counts) 0.38 0.95 0.5 0.0019 1890 Avalon 1024 (counts) 0.38 0.9 0.55 0.001 1030 Avalon 1024 (counts) 0.38 0.8 0.6 0.00017 170 Avalon 1024 (counts) 0.38 0.5 0.75 2.5e-05 25
Same thing, formatted for the blog post
# fingerprint names and threshold keys are taken from the first scaff_accum entry
first_entry = list(scaff_accum.values())[0]
fps = list(first_entry)
all_thresholds = list(first_entry[fps[0]])
thresholds = [0.05,0.1,0.2,0.5]
# two header rows: a spanning title per threshold above a pair of column titles
headings1 = ["<th></th>","<th></th>"]
headings1 += [f'<th colspan="2">{1-thresh} of related compounds</th>' for thresh in thresholds]
headings2 = ["<th>Fingerprint</th>","<th>0.95 noise level</th>"]
headings2 += ['<th>threshold</th>','<th>db fraction / count per million</th>']*len(thresholds)
print("<table>")
print(f"<tr>{' '.join(headings1)}</tr>")
print(f"<tr>{' '.join(headings2)}</tr>")
for fp in fps:
    print('<tr>')
    cells = [f'<td><b>{fp}</b></td> <td>{background_accum[fp][0.95]:.2f}</td>']
    for thresh in thresholds:
        ds = [entry[fp][thresh] for entry in scaff_accum.values()]
        med = np.median(ds)
        # snap the median to the nearest 0.05 before looking up the retrieved fraction
        bin = round(20*med)/20
        tval = interp(bin,accum[fp])
        cells.append(f'<td>{bin}</td> <td>{tval:.2g} / {int(tval*1e6)}</td>')
    # the cells are separated by single spaces, with one more before the closing tag
    print(' '.join(cells),end=" ")
    print('</tr>')
print("</table>")
<table> <tr><th></th> <th></th> <th colspan="2">0.95 of related compounds</th> <th colspan="2">0.9 of related compounds</th> <th colspan="2">0.8 of related compounds</th> <th colspan="2">0.5 of related compounds</th></tr> <tr><th>Fingerprint</th> <th>0.95 noise level</th> <th>threshold</th> <th>db fraction / count per million</th> <th>threshold</th> <th>db fraction / count per million</th> <th>threshold</th> <th>db fraction / count per million</th> <th>threshold</th> <th>db fraction / count per million</th></tr> <tr> <td><b>MACCS</b></td> <td>0.57</td> <td>0.6</td> <td>0.043 / 42650</td> <td>0.65</td> <td>0.023 / 22840</td> <td>0.7</td> <td>0.003 / 3030</td> <td>0.8</td> <td>0.0001 / 100</td> </tr> <tr> <td><b>Morgan0 (counts)</b></td> <td>0.57</td> <td>0.55</td> <td>0.058 / 58467</td> <td>0.6</td> <td>0.014 / 14240</td> <td>0.65</td> <td>0.0076 / 7570</td> <td>0.75</td> <td>0.00047 / 470</td> </tr> <tr> <td><b>Morgan1 (counts)</b></td> <td>0.36</td> <td>0.45</td> <td>0.0042 / 4230</td> <td>0.5</td> <td>0.00032 / 320</td> <td>0.55</td> <td>0.00018 / 180</td> <td>0.65</td> <td>2.5e-05 / 25</td> </tr> <tr> <td><b>Morgan2 (counts)</b></td> <td>0.25</td> <td>0.35</td> <td>0.0018 / 1802</td> <td>0.4</td> <td>0.00017 / 170</td> <td>0.45</td> <td>0.00011 / 105</td> <td>0.55</td> <td>2.5e-05 / 25</td> </tr> <tr> <td><b>Morgan3 (counts)</b></td> <td>0.20</td> <td>0.3</td> <td>0.00032 / 320</td> <td>0.3</td> <td>0.00032 / 320</td> <td>0.35</td> <td>0.00018 / 184</td> <td>0.45</td> <td>3.5e-05 / 35</td> </tr> <tr> <td><b>Morgan0 (bits)</b></td> <td>0.57</td> <td>0.55</td> <td>0.062 / 61650</td> <td>0.6</td> <td>0.016 / 15515</td> <td>0.65</td> <td>0.0083 / 8250</td> <td>0.75</td> <td>0.00051 / 512</td> </tr> <tr> <td><b>Morgan1 (bits)</b></td> <td>0.37</td> <td>0.45</td> <td>0.0052 / 5217</td> <td>0.5</td> <td>0.00038 / 380</td> <td>0.55</td> <td>0.00021 / 210</td> <td>0.65</td> <td>2.5e-05 / 25</td> </tr> <tr> <td><b>Morgan2 (bits)</b></td> <td>0.27</td> <td>0.35</td> 
<td>0.0036 / 3567</td> <td>0.4</td> <td>0.00023 / 230</td> <td>0.45</td> <td>0.00014 / 135</td> <td>0.55</td> <td>2.5e-05 / 25</td> </tr> <tr> <td><b>Morgan3 (bits)</b></td> <td>0.22</td> <td>0.3</td> <td>0.00063 / 630</td> <td>0.35</td> <td>0.00034 / 344</td> <td>0.35</td> <td>0.00034 / 344</td> <td>0.5</td> <td>2e-05 / 20</td> </tr> <tr> <td><b>FeatMorgan0 (counts)</b></td> <td>0.74</td> <td>0.6</td> <td>0.25 / 246595</td> <td>0.65</td> <td>0.16 / 157367</td> <td>0.7</td> <td>0.068 / 68140</td> <td>0.8</td> <td>0.0082 / 8195</td> </tr> <tr> <td><b>FeatMorgan1 (counts)</b></td> <td>0.51</td> <td>0.55</td> <td>0.021 / 21037</td> <td>0.55</td> <td>0.021 / 21037</td> <td>0.6</td> <td>0.0026 / 2610</td> <td>0.7</td> <td>0.00013 / 130</td> </tr> <tr> <td><b>FeatMorgan2 (counts)</b></td> <td>0.36</td> <td>0.45</td> <td>0.004 / 4050</td> <td>0.5</td> <td>0.00027 / 270</td> <td>0.55</td> <td>0.00016 / 155</td> <td>0.65</td> <td>2.5e-05 / 25</td> </tr> <tr> <td><b>FeatMorgan3 (counts)</b></td> <td>0.28</td> <td>0.35</td> <td>0.0054 / 5412</td> <td>0.4</td> <td>0.00028 / 280</td> <td>0.45</td> <td>0.00016 / 165</td> <td>0.55</td> <td>3.5e-05 / 35</td> </tr> <tr> <td><b>FeatMorgan0 (bits)</b></td> <td>0.74</td> <td>0.6</td> <td>0.25 / 246595</td> <td>0.65</td> <td>0.16 / 157367</td> <td>0.7</td> <td>0.068 / 68140</td> <td>0.8</td> <td>0.0082 / 8195</td> </tr> <tr> <td><b>FeatMorgan1 (bits)</b></td> <td>0.51</td> <td>0.55</td> <td>0.023 / 23195</td> <td>0.55</td> <td>0.023 / 23195</td> <td>0.6</td> <td>0.003 / 3010</td> <td>0.7</td> <td>0.00014 / 140</td> </tr> <tr> <td><b>FeatMorgan2 (bits)</b></td> <td>0.38</td> <td>0.45</td> <td>0.0065 / 6452</td> <td>0.5</td> <td>0.00036 / 360</td> <td>0.55</td> <td>0.00021 / 205</td> <td>0.65</td> <td>3e-05 / 30</td> </tr> <tr> <td><b>FeatMorgan3 (bits)</b></td> <td>0.30</td> <td>0.4</td> <td>0.00045 / 450</td> <td>0.4</td> <td>0.00045 / 450</td> <td>0.45</td> <td>0.00025 / 250</td> <td>0.55</td> <td>3.5e-05 / 35</td> </tr> <tr> 
<td><b>RDKit 4 (bits)</b></td> <td>0.33</td> <td>0.5</td> <td>0.0011 / 1120</td> <td>0.55</td> <td>0.00065 / 645</td> <td>0.6</td> <td>0.00017 / 170</td> <td>0.7</td> <td>4e-05 / 40</td> </tr> <tr> <td><b>RDKit 5 (bits)</b></td> <td>0.29</td> <td>0.5</td> <td>0.0004 / 400</td> <td>0.55</td> <td>0.00025 / 245</td> <td>0.6</td> <td>9e-05 / 90</td> <td>0.7</td> <td>3e-05 / 30</td> </tr> <tr> <td><b>RDKit 6 (bits)</b></td> <td>0.31</td> <td>0.5</td> <td>0.00027 / 270</td> <td>0.55</td> <td>0.00017 / 170</td> <td>0.6</td> <td>7e-05 / 70</td> <td>0.7</td> <td>3e-05 / 30</td> </tr> <tr> <td><b>RDKit 7 (bits)</b></td> <td>0.43</td> <td>0.5</td> <td>0.0011 / 1150</td> <td>0.6</td> <td>9e-05 / 90</td> <td>0.65</td> <td>6e-05 / 60</td> <td>0.75</td> <td>2e-05 / 20</td> </tr> <tr> <td><b>linear RDKit 4 (bits)</b></td> <td>0.35</td> <td>0.5</td> <td>0.0023 / 2255</td> <td>0.55</td> <td>0.0013 / 1277</td> <td>0.6</td> <td>0.0003 / 300</td> <td>0.75</td> <td>4e-05 / 40</td> </tr> <tr> <td><b>linear RDKit 5 (bits)</b></td> <td>0.31</td> <td>0.45</td> <td>0.0036 / 3590</td> <td>0.55</td> <td>0.0004 / 400</td> <td>0.6</td> <td>0.00013 / 130</td> <td>0.7</td> <td>4e-05 / 40</td> </tr> <tr> <td><b>linear RDKit 6 (bits)</b></td> <td>0.28</td> <td>0.45</td> <td>0.0015 / 1525</td> <td>0.5</td> <td>0.00033 / 330</td> <td>0.6</td> <td>8e-05 / 80</td> <td>0.7</td> <td>3e-05 / 30</td> </tr> <tr> <td><b>linear RDKit 7 (bits)</b></td> <td>0.26</td> <td>0.45</td> <td>0.00076 / 755</td> <td>0.5</td> <td>0.0002 / 200</td> <td>0.55</td> <td>0.00013 / 130</td> <td>0.7</td> <td>2e-05 / 20</td> </tr> <tr> <td><b>Atom Pairs (counts)</b></td> <td>0.27</td> <td>0.3</td> <td>0.0057 / 5730</td> <td>0.35</td> <td>0.0029 / 2949</td> <td>0.4</td> <td>0.00017 / 170</td> <td>0.5</td> <td>3e-05 / 30</td> </tr> <tr> <td><b>Topological Torsions (counts)</b></td> <td>0.19</td> <td>0.3</td> <td>0.00078 / 780</td> <td>0.35</td> <td>0.00045 / 449</td> <td>0.45</td> <td>8e-05 / 80</td> <td>0.55</td> <td>2.5e-05 / 
25</td> </tr> <tr> <td><b>Atom Pairs (bits)</b></td> <td>0.36</td> <td>0.4</td> <td>0.009 / 8960</td> <td>0.45</td> <td>0.0045 / 4545</td> <td>0.5</td> <td>0.00013 / 130</td> <td>0.55</td> <td>7.5e-05 / 75</td> </tr> <tr> <td><b>Topological Torsions (bits)</b></td> <td>0.22</td> <td>0.35</td> <td>0.00093 / 934</td> <td>0.4</td> <td>0.00017 / 170</td> <td>0.45</td> <td>0.00011 / 105</td> <td>0.55</td> <td>3e-05 / 30</td> </tr> <tr> <td><b>Avalon 512 (bits)</b></td> <td>0.51</td> <td>0.6</td> <td>0.00082 / 820</td> <td>0.65</td> <td>0.00046 / 455</td> <td>0.7</td> <td>9e-05 / 90</td> <td>0.8</td> <td>3e-05 / 30</td> </tr> <tr> <td><b>Avalon 1024 (bits)</b></td> <td>0.37</td> <td>0.55</td> <td>0.00083 / 830</td> <td>0.6</td> <td>0.00017 / 170</td> <td>0.65</td> <td>0.00011 / 105</td> <td>0.75</td> <td>3e-05 / 30</td> </tr> <tr> <td><b>Avalon 512 (counts)</b></td> <td>0.42</td> <td>0.5</td> <td>0.0042 / 4180</td> <td>0.55</td> <td>0.0022 / 2215</td> <td>0.65</td> <td>0.00015 / 145</td> <td>0.75</td> <td>2.5e-05 / 25</td> </tr> <tr> <td><b>Avalon 1024 (counts)</b></td> <td>0.38</td> <td>0.5</td> <td>0.0019 / 1890</td> <td>0.55</td> <td>0.001 / 1030</td> <td>0.6</td> <td>0.00017 / 170</td> <td>0.75</td> <td>2.5e-05 / 25</td> </tr> </table>
# the value reported as the "0.95 noise level" in the tables above, unrounded
background_accum['Morgan2 (bits)'][0.95]
0.2689655172413793