I've done a number of posts looking at Morgan fingerprint statistics before, including:
I have done similar analysis for other fingerprint types, but it looks like I didn't post that (at least I can't find it if I did). It's useful to do this because, as we'll see, the different fingerprint types have very different numbers of bits set for typical molecules.
Here's the summary of the mean and standard deviation of the number of bits set, from an analysis of 5 million molecules with less than 50 heavy atoms extracted from ZINC:
Fingerprint | Type | Mean num_bits | SD num_bits |
---|---|---|---|
Morgan1 | sparse | 29.4 | 5.6 |
Morgan2 | sparse | 48.7 | 9.6 |
Morgan3 | sparse | 66.8 | 13.8 |
FeatMorgan1 | sparse | 20.1 | 3.9 |
FeatMorgan2 | sparse | 38.1 | 7.7 |
FeatMorgan3 | sparse | 56.0 | 11.8 |
RDKit5 | bitvect | 363 | 122 |
RDKit6 | bitvect | 621 | 233 |
RDKit7 | bitvect | 993 | 406 |
pattern | bitvect | 446 | 122 |
avalon | bitvect | 280 | 130 |
atom pairs | sparse | 167 | 56 |
TT | sparse | 33.4 | 9.8 |
atom pairs | bitvect | 267 | 90 |
TT | bitvect | 47.2 | 12.0 |
The bit vector fingerprints were all 4096 bits long.
from rdkit import Chem,DataStructs
import time,random,gzip,pickle,copy
import numpy as np
from collections import Counter,defaultdict
from rdkit.Chem import Draw
from rdkit.Chem import rdMolDescriptors
from rdkit.Avalon import pyAvalonTools
from rdkit.Chem.Draw import IPythonConsole
from rdkit import DataStructs
from rdkit import rdBase
%pylab inline
print(rdBase.rdkitVersion)
import time
print(time.asctime())
Populating the interactive namespace from numpy and matplotlib 2021.09.2 Sat Nov 27 05:43:29 2021
/home/glandrum/miniconda3/envs/rdkit_blog/lib/python3.9/site-packages/IPython/core/magics/pylab.py:159: UserWarning: pylab import has clobbered these variables: ['copy', 'random'] `%matplotlib` prevents importing * from pylab and numpy warn("pylab import has clobbered these variables: %s" % clobbered +
try:
import ipyparallel as ipp
rc = ipp.Client()
dview = rc[:]
dview.execute('from rdkit import Chem')
dview.execute('from rdkit import Descriptors')
dview.execute('from rdkit.Chem import rdMolDescriptors')
dview.execute('from rdkit.Avalon import pyAvalonTools')
except:
print("could not use ipyparallel")
dview = None
For test data I'll use the same 16 million ZINC compounds I used in the bit statistics post.
filen='/scratch/RDKit_git/LocalData/Zinc/zinc_all_clean.pkl.gz'
Loop over the molecules, skip anything with more than 50 atoms, and build fingerprints for all the others.
The fingerprints I generate for this analysis are:
All of the BitVect fingerprints are 4096 bits long
import copy
historyf = gzip.open('../data/fp_bit_counts.history.pkl.gz','wb+')
counts=defaultdict(Counter)
t1 = time.time()
with gzip.open(filen,'rb') as inf:
i = 0
ms = []
while 1:
try:
m,nm = pickle.load(inf)
except EOFError:
break
if not m or m.GetNumHeavyAtoms()>50: continue
ms.append(m)
i+=1
if len(ms)>=10000:
for v in 1,2,3:
cnts = dview.map_sync(lambda x,v=v:len(rdMolDescriptors.GetMorganFingerprint(x,v).GetNonzeroElements()),
ms)
for obc in cnts:
counts[('Morgan',v)][obc]+=1
for v in 1,2,3:
cnts = dview.map_sync(lambda x,v=v:len(rdMolDescriptors.GetMorganFingerprint(x,v,useFeatures=True).GetNonzeroElements()),
ms)
for obc in cnts:
counts[('FeatMorgan',v)][obc]+=1
for v in 5,6,7:
cnts = dview.map_sync(lambda x,v=v:Chem.RDKFingerprint(x,maxPath=v,fpSize=4096).GetNumOnBits(),
ms)
for obc in cnts:
counts[('RDKit',v)][obc]+=1
cnts = dview.map_sync(lambda x:Chem.PatternFingerprint(x,fpSize=4096).GetNumOnBits(),
ms)
for obc in cnts:
counts[('pattern',-1)][obc]+=1
cnts = dview.map_sync(lambda x:pyAvalonTools.GetAvalonFP(x,nBits=4096).GetNumOnBits(),
ms)
for obc in cnts:
counts[('avalon',-1)][obc]+=1
cnts = dview.map_sync(lambda x:len(rdMolDescriptors.GetAtomPairFingerprint(x).GetNonzeroElements()),
ms)
for obc in cnts:
counts[('ap-counts',-1)][obc]+=1
cnts = dview.map_sync(lambda x:len(rdMolDescriptors.GetTopologicalTorsionFingerprint(x).GetNonzeroElements()),
ms)
for obc in cnts:
counts[('tt-counts',-1)][obc]+=1
cnts = dview.map_sync(lambda x:rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(x,nBits=4096).GetNumOnBits(),
ms)
for obc in cnts:
counts[('ap-bv',-1)][obc]+=1
cnts = dview.map_sync(lambda x:rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(x,nBits=4096).GetNumOnBits(),
ms)
for obc in cnts:
counts[('tt-bv',-1)][obc]+=1
ms = []
if not i%50000:
t2 = time.time()
print("Done %d in %.2f sec"%(i,t2-t1))
if not i%500000:
pickle.dump(dict(counts),historyf)
if i>=5000000:
break
Done 50000 in 38.37 sec Done 100000 in 76.13 sec Done 150000 in 112.82 sec Done 200000 in 159.44 sec Done 250000 in 209.28 sec Done 300000 in 260.03 sec Done 350000 in 309.92 sec Done 400000 in 361.23 sec Done 450000 in 401.80 sec Done 500000 in 452.73 sec Done 550000 in 508.33 sec Done 600000 in 551.27 sec Done 650000 in 601.37 sec Done 700000 in 650.59 sec Done 750000 in 699.14 sec Done 800000 in 747.98 sec Done 850000 in 793.70 sec Done 900000 in 841.68 sec Done 950000 in 888.89 sec Done 1000000 in 934.87 sec Done 1050000 in 981.82 sec Done 1100000 in 1028.41 sec Done 1150000 in 1074.17 sec Done 1200000 in 1120.49 sec Done 1250000 in 1165.59 sec Done 1300000 in 1208.65 sec Done 1350000 in 1256.99 sec Done 1400000 in 1304.47 sec Done 1450000 in 1349.63 sec Done 1500000 in 1397.88 sec Done 1550000 in 1442.67 sec Done 1600000 in 1486.97 sec Done 1650000 in 1531.26 sec Done 1700000 in 1576.05 sec Done 1750000 in 1629.38 sec Done 1800000 in 1681.49 sec Done 1850000 in 1738.35 sec Done 1900000 in 1793.71 sec Done 1950000 in 1849.16 sec Done 2000000 in 1903.99 sec Done 2050000 in 1961.34 sec Done 2100000 in 2017.67 sec Done 2150000 in 2072.83 sec Done 2200000 in 2128.08 sec Done 2250000 in 2180.22 sec Done 2300000 in 2228.80 sec Done 2350000 in 2278.05 sec Done 2400000 in 2327.23 sec Done 2450000 in 2378.87 sec Done 2500000 in 2431.24 sec Done 2550000 in 2483.96 sec Done 2600000 in 2533.77 sec Done 2650000 in 2585.27 sec Done 2700000 in 2637.09 sec Done 2750000 in 2687.23 sec Done 2800000 in 2736.71 sec Done 2850000 in 2787.06 sec Done 2900000 in 2840.13 sec Done 2950000 in 2894.25 sec Done 3000000 in 2943.74 sec Done 3050000 in 2995.87 sec Done 3100000 in 3043.22 sec Done 3150000 in 3097.84 sec Done 3200000 in 3148.84 sec Done 3250000 in 3199.32 sec Done 3300000 in 3249.45 sec Done 3350000 in 3299.99 sec Done 3400000 in 3350.13 sec Done 3450000 in 3398.13 sec Done 3500000 in 3447.51 sec Done 3550000 in 3496.75 sec Done 3600000 in 3546.92 sec Done 3650000 in 3597.81 sec Done 3700000 in 3646.01 sec Done 3750000 in 3694.93 sec Done 3800000 in 3743.13 sec Done 3850000 in 3793.51 sec Done 3900000 in 3848.14 sec Done 3950000 in 3900.24 sec Done 4000000 in 3950.51 sec Done 4050000 in 4003.19 sec Done 4100000 in 4056.00 sec Done 4150000 in 4100.92 sec Done 4200000 in 4150.50 sec Done 4250000 in 4199.36 sec Done 4300000 in 4248.88 sec Done 4350000 in 4296.68 sec Done 4400000 in 4345.78 sec Done 4450000 in 4390.20 sec Done 4500000 in 4436.03 sec Done 4550000 in 4480.83 sec Done 4600000 in 4525.25 sec Done 4650000 in 4569.70 sec Done 4700000 in 4614.02 sec Done 4750000 in 4660.50 sec Done 4800000 in 4706.11 sec Done 4850000 in 4759.04 sec Done 4900000 in 4813.05 sec Done 4950000 in 4867.92 sec Done 5000000 in 4916.01 sec
pickle.dump(dict(counts),gzip.open('../data/fp_bit_counts.pkl.gz','wb+'))
Now plot the distributions of the number of bits set
morgan_ks = [x for x in counts.keys() if x[0] =='Morgan']
featmorgan_ks = [x for x in counts.keys() if x[0] =='FeatMorgan']
rdkit_ks = [x for x in counts.keys() if x[0] == 'RDKit']
figure(figsize=(15,15))
pidx=1
subplot(3,3,pidx)
for n,r in morgan_ks:
cnts = sorted(counts[(n,r)].items())
plot([x for x,y in cnts],[y for x,y in cnts],label=
f"r={r}")
_=title("Morgan")
_=xlabel("num bits set")
_=ylabel("count")
_=legend()
pidx=2
subplot(3,3,pidx)
for n,r in featmorgan_ks:
cnts = sorted(counts[(n,r)].items())
plot([x for x,y in cnts],[y for x,y in cnts],label=
f"r={r}")
_=title("FeatMorgan")
_=xlabel("num bits set")
_=ylabel("count")
_=legend()
pidx=3
subplot(3,3,pidx)
for n,r in rdkit_ks:
cnts = sorted(counts[(n,r)].items())
plot([x for x,y in cnts],[y for x,y in cnts],label=
f"r={r}")
_=title("RDKit")
_=xlabel("num bits set")
_=ylabel("count")
_=legend()
for k in counts.keys():
if k[0] in ('Morgan','FeatMorgan','RDKit'):
continue
pidx+=1
subplot(3,3,pidx)
cnts = sorted(counts[k].items())
plot([x for x,y in cnts],[y for x,y in cnts])
_=title(k[0])
_=xlabel("num bits set")
_=ylabel("count")
The avalon FP curve has an interesting shape
for k,cnts in counts.items():
accum = 0
denom = 0
for cnt,num in cnts.items():
accum += cnt*num
denom += num
mean = accum/denom
dev = 0
for cnt,num in cnts.items():
dev += num*(cnt-mean)**2
dev /= (denom-1)
dev = dev**0.5
label = k[0]
if k[1]!=-1:
label += str(k[1])
print(label,'\t%.1f'%mean,'%.1f'%dev)
Morgan1 29.4 5.6 Morgan2 48.7 9.6 Morgan3 66.8 13.8 FeatMorgan1 20.1 3.9 FeatMorgan2 38.1 7.7 FeatMorgan3 56.0 11.8 RDKit5 363.3 122.5 RDKit6 621.7 233.2 RDKit7 993.6 406.3 pattern 445.5 122.5 avalon 279.8 129.9 ap-counts 166.6 56.3 tt-counts 33.4 9.8 ap-bv 267.3 90.0 tt-bv 47.2 12.0
I did 5 million examples, which took a while (about 1.5 hours with 6 worker processes on my PC). Could I have analyzed less and gotten to the same results? Did the means converge? If so, how quickly?
historyf = gzip.open('../data/fp_bit_counts.history.pkl.gz','rb')
means = defaultdict(list)
devs = defaultdict(list)
nmols = []
while 1:
try:
lcounts = pickle.load(historyf)
except EOFError:
break
for k,cnts in lcounts.items():
accum = 0
denom = 0
for cnt,num in cnts.items():
accum += cnt*num
denom += num
mean = accum/denom
dev = 0
for cnt,num in cnts.items():
dev += num*(cnt-mean)**2
dev /= (denom-1)
dev = dev**0.5
if denom not in nmols:
nmols.append(denom)
means[k].append(mean)
devs[k].append(dev)
label = k[0]
if k[1]!=-1:
label += str(k[1])
print(denom,label,'\t%.1f'%mean,'%.1f'%dev)
500000 Morgan1 26.0 6.2 500000 Morgan2 42.8 10.7 500000 Morgan3 58.7 15.5 500000 FeatMorgan1 18.2 4.3 500000 FeatMorgan2 33.8 8.5 500000 FeatMorgan3 49.5 13.2 500000 RDKit5 324.6 133.9 500000 RDKit6 560.8 256.2 500000 RDKit7 902.9 445.7 500000 pattern 408.8 133.9 500000 avalon 241.8 133.8 500000 ap-counts 133.3 57.6 500000 tt-counts 28.6 10.2 500000 ap-bv 219.5 93.6 500000 tt-bv 41.9 12.9 1000000 Morgan1 27.1 6.1 1000000 Morgan2 44.6 10.5 1000000 Morgan3 61.2 15.2 1000000 FeatMorgan1 18.9 4.2 1000000 FeatMorgan2 35.2 8.4 1000000 FeatMorgan3 51.6 13.0 1000000 RDKit5 340.7 133.9 1000000 RDKit6 588.9 257.4 1000000 RDKit7 948.5 449.9 1000000 pattern 425.2 136.0 1000000 avalon 257.7 136.7 1000000 ap-counts 143.7 57.7 1000000 tt-counts 30.1 10.1 1000000 ap-bv 234.4 92.8 1000000 tt-bv 43.6 12.9 1500000 Morgan1 27.3 5.8 1500000 Morgan2 45.0 9.9 1500000 Morgan3 61.7 14.3 1500000 FeatMorgan1 19.0 4.1 1500000 FeatMorgan2 35.5 8.0 1500000 FeatMorgan3 52.0 12.3 1500000 RDKit5 340.3 127.8 1500000 RDKit6 587.1 246.2 1500000 RDKit7 944.8 432.0 1500000 pattern 424.0 129.4 1500000 avalon 260.5 133.7 1500000 ap-counts 145.1 54.8 1500000 tt-counts 30.5 9.8 1500000 ap-bv 234.9 87.3 1500000 tt-bv 43.7 12.3 2000000 Morgan1 28.0 5.7 2000000 Morgan2 46.2 9.8 2000000 Morgan3 63.4 14.1 2000000 FeatMorgan1 19.4 4.0 2000000 FeatMorgan2 36.3 7.9 2000000 FeatMorgan3 53.3 12.1 2000000 RDKit5 350.7 126.6 2000000 RDKit6 603.5 243.1 2000000 RDKit7 969.0 425.8 2000000 pattern 433.3 128.0 2000000 avalon 269.5 133.1 2000000 ap-counts 152.4 55.5 2000000 tt-counts 31.5 9.8 2000000 ap-bv 245.8 88.2 2000000 tt-bv 45.0 12.2 2500000 Morgan1 28.7 5.8 2500000 Morgan2 47.5 9.8 2500000 Morgan3 65.3 14.2 2500000 FeatMorgan1 19.7 4.0 2500000 FeatMorgan2 37.2 7.9 2500000 FeatMorgan3 54.7 12.1 2500000 RDKit5 361.5 126.3 2500000 RDKit6 621.2 241.1 2500000 RDKit7 996.0 420.5 2500000 pattern 443.2 126.4 2500000 avalon 278.4 132.6 2500000 ap-counts 160.1 56.9 2500000 tt-counts 32.6 9.9 2500000 ap-bv 257.9 90.2 2500000 tt-bv 46.3 12.2 3000000 Morgan1 29.1 5.7 3000000 Morgan2 48.1 9.8 3000000 Morgan3 66.1 14.1 3000000 FeatMorgan1 19.9 3.9 3000000 FeatMorgan2 37.6 7.8 3000000 FeatMorgan3 55.3 12.0 3000000 RDKit5 364.5 124.5 3000000 RDKit6 625.3 237.2 3000000 RDKit7 1001.4 413.2 3000000 pattern 446.5 124.1 3000000 avalon 280.5 131.5 3000000 ap-counts 163.7 57.0 3000000 tt-counts 33.1 9.8 3000000 ap-bv 263.5 90.5 3000000 tt-bv 46.9 12.1 3500000 Morgan1 29.2 5.7 3500000 Morgan2 48.3 9.7 3500000 Morgan3 66.4 14.0 3500000 FeatMorgan1 19.9 3.9 3500000 FeatMorgan2 37.7 7.8 3500000 FeatMorgan3 55.6 11.9 3500000 RDKit5 365.3 123.8 3500000 RDKit6 626.7 236.0 3500000 RDKit7 1003.7 411.3 3500000 pattern 448.4 123.3 3500000 avalon 280.3 131.1 3500000 ap-counts 165.1 56.7 3500000 tt-counts 33.3 9.8 3500000 ap-bv 265.9 90.1 3500000 tt-bv 47.2 12.1 4000000 Morgan1 29.4 5.7 4000000 Morgan2 48.6 9.8 4000000 Morgan3 66.7 14.1 4000000 FeatMorgan1 20.0 3.9 4000000 FeatMorgan2 38.0 7.8 4000000 FeatMorgan3 55.9 12.0 4000000 RDKit5 365.2 124.1 4000000 RDKit6 627.1 236.6 4000000 RDKit7 1005.0 412.4 4000000 pattern 448.6 124.0 4000000 avalon 281.4 131.3 4000000 ap-counts 165.7 56.9 4000000 tt-counts 33.4 9.9 4000000 ap-bv 266.8 90.6 4000000 tt-bv 47.3 12.2 4500000 Morgan1 29.4 5.6 4500000 Morgan2 48.7 9.6 4500000 Morgan3 66.8 13.9 4500000 FeatMorgan1 20.1 3.9 4500000 FeatMorgan2 38.0 7.7 4500000 FeatMorgan3 55.9 11.8 4500000 RDKit5 364.3 123.1 4500000 RDKit6 624.4 234.6 4500000 RDKit7 999.1 408.8 4500000 pattern 447.0 122.7 4500000 avalon 280.7 130.6 4500000 ap-counts 166.3 56.4 4500000 tt-counts 33.4 9.8 4500000 ap-bv 267.3 89.9 4500000 tt-bv 47.3 12.1 5000000 Morgan1 29.4 5.6 5000000 Morgan2 48.7 9.6 5000000 Morgan3 66.8 13.8 5000000 FeatMorgan1 20.1 3.9 5000000 FeatMorgan2 38.1 7.7 5000000 FeatMorgan3 56.0 11.8 5000000 RDKit5 363.3 122.5 5000000 RDKit6 621.7 233.2 5000000 RDKit7 993.6 406.3 5000000 pattern 445.5 122.5 5000000 avalon 279.8 129.9 5000000 ap-counts 166.6 56.3 5000000 tt-counts 33.4 9.8 5000000 ap-bv 267.3 90.0 5000000 tt-bv 47.2 12.0
Let's look at those graphically:
morgan_ks = [x for x in counts.keys() if x[0] =='Morgan']
featmorgan_ks = [x for x in counts.keys() if x[0] =='FeatMorgan']
rdkit_ks = [x for x in counts.keys() if x[0] == 'RDKit']
figure(figsize=(15,15))
nmols2 = [x/1000000 for x in nmols]
pidx=1
subplot(3,3,pidx)
for n,r in morgan_ks:
lmeans = means[(n,r)]
ldevs = devs[(n,r)]
errorbar(nmols2,lmeans,yerr=ldevs,capsize=3)
_=title("Morgan")
_=xlabel("num mols (millions)")
_=ylabel("count")
#_=legend()
pidx=2
subplot(3,3,pidx)
for n,r in featmorgan_ks:
lmeans = means[(n,r)]
ldevs = devs[(n,r)]
errorbar(nmols2,lmeans,yerr=ldevs,capsize=3)
_=title("FeatMorgan")
_=xlabel("num mols (millions)")
_=ylabel("count")
#_=legend()
pidx=3
subplot(3,3,pidx)
for n,r in rdkit_ks:
lmeans = means[(n,r)]
ldevs = devs[(n,r)]
errorbar(nmols2,lmeans,yerr=ldevs,capsize=3)
_=title("RDKit")
_=xlabel("num mols (millions)")
_=ylabel("count")
#_=legend()
for k in counts.keys():
if k[0] in ('Morgan','FeatMorgan','RDKit'):
continue
pidx+=1
subplot(3,3,pidx)
lmeans = means[k]
ldevs = devs[k]
errorbar(nmols2,lmeans,yerr=ldevs,capsize=3)
_=title(k[0])
_=xlabel("num mols (millions)")
_=ylabel("count")
Looks like we would have been fine with 3 million molecules.