This is an updated and revised version of an old post
I looked at the number of collisions in Morgan fingerprints in an earlier post. The topic came up again in discussions about the recent post on Morgan fingerprint stats, which used a much larger data set.
Here we repeat the earlier collision analysis, using the larger dataset. I will look at fingerprints with different radii -- 1, 2, and 3 -- folded to a set of different sizes -- 1K, 2K, 4K, 8K, 16K.
TL;DR version: The conclusions match what we observed before: there are a fair number of collisions at fingerprint sizes below 4K. As expected, higher radii have more collisions.
# Imports and notebook setup for the fingerprint-collision analysis.
from rdkit import Chem,DataStructs
import time,random,gzip,pickle,copy
import numpy as np
from collections import defaultdict
from rdkit.Chem import Draw
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem.Draw import IPythonConsole
from rdkit import DataStructs
import rdkit
# NOTE(review): %pylab is deprecated (the warning in the cell output below
# shows it clobbering `random` and `copy`); the modern form is
# `%matplotlib inline` + explicit imports, but the later plotting cells rely
# on the pylab namespace (figure, subplot, bar, plot, ...), so it stays.
%pylab inline
plt.style.use('tableau-colorblind10')
# record the RDKit version and the run date for reproducibility
print(rdkit.__version__)
import time
print(time.asctime())
%pylab is deprecated, use %matplotlib inline and import the required libraries. Populating the interactive namespace from numpy and matplotlib 2022.09.1 Sat Jan 7 05:31:07 2023
/localhome/glandrum/.conda/envs/py310_rdkit/lib/python3.10/site-packages/IPython/core/magics/pylab.py:162: UserWarning: pylab import has clobbered these variables: ['random', 'copy'] `%matplotlib` prevents importing * from pylab and numpy warn("pylab import has clobbered these variables: %s" % clobbered +
For test data I'll use 4 million random compounds from PubChem with <=50 heavy atoms. I constructed the candidate set by downloading the full PubChem Compound collection (on 8 Jan, 2023) and picking 10 million random lines using an awk script; the loop below then uses the first 4 million of those which can be parsed and pass the heavy-atom filter.
# 10M randomly chosen PubChem compounds; gzipped text, one
# "<name><whitespace><SMILES>" record per line (parsed by the loop below)
filen='/localhome/glandrum/Datasets/PubChem/pubchem_compound_random_10000000.txt.gz'
Loop over the molecules and build fingerprints of multiple radii and folded lengths.
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')
import copy

# Folded fingerprint sizes to examine; -1 is used below as the key for the
# unfolded (sparse) fingerprints.
fpSizes = (64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384)

# Build each fingerprint generator once instead of once per molecule: the
# generators are loop-invariant, and the original reconstructed all ten of
# them for every one of the ~4M molecules.
sparseGens = {v: rdFingerprintGenerator.GetMorganGenerator(radius=v)
              for v in (1, 2, 3)}
foldedGens = {(v, l): rdFingerprintGenerator.GetMorganGenerator(radius=v, fpSize=l)
              for v in (1, 2, 3) for l in fpSizes}

history = {}  # snapshots of `counts`, to see how quickly the results converge
# counts[(radius, size)][nCollisions] -> number of molecules;
# counts[(radius, -1)][nBitsSet]     -> bit-count distribution of the sparse FPs
counts = defaultdict(lambda: defaultdict(int))
t1 = time.time()
i = 0
with gzip.open(filen, 'rb') as inf:
    for inl in inf:
        try:
            nm, smi = inl.strip().split()
        except ValueError:
            # malformed line: treat it as end of usable input
            break
        m = Chem.MolFromSmiles(smi)
        if m is None or m.GetNumHeavyAtoms() > 50:
            continue
        i += 1
        for v in 1, 2, 3:
            onbits = len(sparseGens[v].GetSparseFingerprint(m).GetOnBits())
            counts[(v, -1)][onbits] += 1
            for l in fpSizes:
                # collisions = bits lost by folding to l bits
                dbits = onbits - foldedGens[(v, l)].GetFingerprint(m).GetNumOnBits()
                counts[(v, l)][dbits] += 1
        if not i % 50000:
            t2 = time.time()
            print(f"Done {i} in {t2-t1:.2f} sec")
        if not i % 100000:
            history[i] = copy.deepcopy(counts)
        if i >= 4000000:
            break
Done 50000 in 57.94 sec Done 100000 in 125.87 sec Done 150000 in 205.79 sec Done 200000 in 278.77 sec Done 250000 in 353.97 sec Done 300000 in 435.63 sec Done 350000 in 519.25 sec Done 400000 in 600.79 sec Done 450000 in 679.81 sec Done 500000 in 760.31 sec Done 550000 in 830.27 sec Done 600000 in 900.44 sec Done 650000 in 970.40 sec Done 700000 in 1040.55 sec Done 750000 in 1121.73 sec Done 800000 in 1196.58 sec Done 850000 in 1268.15 sec Done 900000 in 1343.85 sec Done 950000 in 1395.36 sec Done 1000000 in 1457.15 sec Done 1050000 in 1542.61 sec Done 1100000 in 1620.64 sec Done 1150000 in 1695.22 sec Done 1200000 in 1773.86 sec Done 1250000 in 1854.29 sec Done 1300000 in 1925.72 sec Done 1350000 in 1995.17 sec Done 1400000 in 2062.94 sec Done 1450000 in 2133.66 sec Done 1500000 in 2206.72 sec Done 1550000 in 2290.43 sec Done 1600000 in 2375.93 sec Done 1650000 in 2457.73 sec Done 1700000 in 2536.96 sec Done 1750000 in 2605.62 sec Done 1800000 in 2680.01 sec Done 1850000 in 2751.66 sec Done 1900000 in 2833.90 sec Done 1950000 in 2907.74 sec Done 2000000 in 2958.12 sec Done 2050000 in 3047.43 sec Done 2100000 in 3127.37 sec Done 2150000 in 3209.95 sec Done 2200000 in 3283.15 sec Done 2250000 in 3361.46 sec Done 2300000 in 3433.11 sec Done 2350000 in 3508.50 sec Done 2400000 in 3575.93 sec Done 2450000 in 3645.63 sec Done 2500000 in 3706.89 sec Done 2550000 in 3789.58 sec Done 2600000 in 3870.42 sec Done 2650000 in 3939.69 sec Done 2700000 in 4021.64 sec Done 2750000 in 4105.62 sec Done 2800000 in 4188.76 sec Done 2850000 in 4270.74 sec Done 2900000 in 4346.26 sec Done 2950000 in 4398.31 sec Done 3000000 in 4447.39 sec Done 3050000 in 4496.38 sec Done 3100000 in 4544.76 sec Done 3150000 in 4594.69 sec Done 3200000 in 4643.74 sec Done 3250000 in 4692.25 sec Done 3300000 in 4740.53 sec Done 3350000 in 4789.92 sec Done 3400000 in 4838.41 sec Done 3450000 in 4916.83 sec Done 3500000 in 4999.03 sec Done 3550000 in 5079.96 sec Done 3600000 in 5162.39 sec Done 3650000 in 
5244.71 sec Done 3700000 in 5327.69 sec Done 3750000 in 5402.79 sec Done 3800000 in 5482.04 sec Done 3850000 in 5555.05 sec Done 3900000 in 5631.47 sec Done 3950000 in 5704.01 sec Done 4000000 in 5769.00 sec
# Persist the results so the analysis below can be re-run without repeating
# the (expensive) fingerprinting loop. Use 'wb' (not the original 'wb+':
# read access is never needed) and context managers so the compressed
# streams are flushed and closed deterministically.
with gzip.open('../data/fp_collision_counts.pkl.gz', 'wb') as outf:
    pickle.dump(dict(counts), outf)
# the inner defaultdict(int)s pickle fine, but the outer defaultdict's
# lambda factory does not, so convert the history snapshots to plain dicts
for k, d in history.items():
    history[k] = dict(d)
with gzip.open('../data/fp_collision_counts.history.pkl.gz', 'wb') as outf:
    pickle.dump(dict(history), outf)
with gzip.open('../data/fp_collision_counts.pkl.gz', 'rb') as inf:
    counts = pickle.load(inf)
with gzip.open('../data/fp_collision_counts.history.pkl.gz', 'rb') as inf:
    history = pickle.load(inf)
Now plot histograms of the numbers of collisions along with the distributions of the number of bits set in the non-folded FPs
# Left column: histogram of per-molecule collision counts for each radius at
# the given folded size. Right column: distribution of the number of bits
# set in the unfolded fingerprints (identical in every row, for reference).
figure(figsize=(16, 25))
pidx = 1
# ----------------------------
for nbits in (16384, 8192, 4096, 2048, 1024):
    subplot(5, 2, pidx)
    pidx += 1
    maxCollisions = max(counts[3, nbits].keys()) + 1
    dists = []
    for rad in (1, 2, 3):
        dist = np.zeros(maxCollisions, int)
        for nColl, cnt in counts[rad, nbits].items():
            dist[nColl] = cnt
        dists.append(dist)
    barWidth = .25
    locs = np.array(range(maxCollisions))
    for offset, (rad, dist) in enumerate(zip((1, 2, 3), dists)):
        bar(locs + offset * barWidth, dist, barWidth, label=f"r={rad}")
    title('%d bits' % nbits)
    _ = yscale("log")
    _ = legend()

    subplot(5, 2, pidx)
    pidx += 1
    for rad in (1, 2, 3):
        itms = sorted(counts[rad, -1].items())
        plot([x for x, y in itms], [y for x, y in itms], '.-', label=f"r={rad}")
    _ = xlabel("num bits set")
    _ = ylabel("count")
    _ = legend()
So, there are definitely some collisions. But not a huge number.
Look at the shorter fingerprints
# Same pair of plots as above, for the shorter folded fingerprints.
bcounts = (512, 256, 128, 64)
figure(figsize=(16, 5 * len(bcounts)))
pidx = 1
# ----------------------------
for nbits in bcounts:
    subplot(len(bcounts), 2, pidx)
    pidx += 1
    maxCollisions = max(counts[3, nbits].keys()) + 1
    dists = []
    for rad in (1, 2, 3):
        dist = np.zeros(maxCollisions, int)
        for nColl, cnt in counts[rad, nbits].items():
            dist[nColl] = cnt
        dists.append(dist)
    barWidth = .25
    locs = np.array(range(maxCollisions))
    for offset, (rad, dist) in enumerate(zip((1, 2, 3), dists)):
        bar(locs + offset * barWidth, dist, barWidth, label=f"r={rad}")
    title('%d bits' % nbits)
    _ = yscale("log")
    _ = legend()

    subplot(len(bcounts), 2, pidx)
    pidx += 1
    for rad in (1, 2, 3):
        itms = sorted(counts[rad, -1].items())
        plot([x for x, y in itms], [y for x, y in itms], '.-', label=f"r={rad}")
    _ = xlabel("num bits set")
    _ = ylabel("count")
    _ = legend()
Look at that quantitatively
# Markdown table: for each folded size and radius, the cumulative fraction
# of molecules with at most N collisions, N = 0..10.
rads = (1, 2, 3)
bcounts = (64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384)
ccounts = list(range(11))
header = ('length', 'radius',) + tuple(str(x) for x in ccounts)
print('| ' + ' | '.join(header) + ' |')
print('| ' + ' | '.join('-' * len(col) for col in header) + ' |')
for bc in reversed(bcounts):
    for rad in rads:
        cells = [str(bc), str(rad)]
        nmols = sum(counts[rad, bc].values())
        accum = 0
        for ccount in ccounts:
            accum += counts[rad, bc].get(ccount, 0)
            cells.append(f'{accum/nmols:.3f}')
        print('| ' + ' | '.join(cells) + ' |')
| length | radius | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | | ------ | ------ | - | - | - | - | - | - | - | - | - | - | -- | | 16384 | 1 | 0.991 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | | 16384 | 2 | 0.956 | 0.999 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | | 16384 | 3 | 0.900 | 0.993 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | | 8192 | 1 | 0.969 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | | 8192 | 2 | 0.906 | 0.994 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | | 8192 | 3 | 0.808 | 0.974 | 0.997 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | | 4096 | 1 | 0.950 | 0.998 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | | 4096 | 2 | 0.829 | 0.980 | 0.998 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | | 4096 | 3 | 0.673 | 0.922 | 0.985 | 0.997 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | | 2048 | 1 | 0.740 | 0.975 | 0.998 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | | 2048 | 2 | 0.564 | 0.874 | 0.971 | 0.994 | 0.999 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | | 2048 | 3 | 0.385 | 0.720 | 0.892 | 0.963 | 0.988 | 0.996 | 0.999 | 1.000 | 1.000 | 1.000 | 1.000 | | 1024 | 1 | 0.674 | 0.942 | 0.992 | 0.999 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | | 1024 | 2 | 0.393 | 0.728 | 0.898 | 0.965 | 0.989 | 0.997 | 0.999 | 1.000 | 1.000 | 1.000 | 1.000 | | 1024 | 3 | 0.207 | 0.476 | 0.694 | 0.835 | 0.917 | 0.960 | 0.981 | 0.991 | 0.996 | 0.998 | 0.999 | | 512 | 1 | 0.501 | 0.835 | 0.956 | 0.989 | 0.998 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | | 512 | 2 | 0.184 | 0.445 | 0.671 | 0.824 | 0.911 | 0.957 | 0.980 | 0.991 | 0.996 | 0.999 | 0.999 | | 512 | 3 | 0.074 | 0.209 | 0.371 | 0.527 | 0.662 | 0.767 | 0.844 | 0.899 | 0.936 | 0.960 | 0.976 | | 256 | 1 | 0.312 | 0.659 | 0.864 | 0.952 | 0.984 | 
0.995 | 0.999 | 1.000 | 1.000 | 1.000 | 1.000 | | 256 | 2 | 0.060 | 0.186 | 0.348 | 0.511 | 0.653 | 0.764 | 0.846 | 0.902 | 0.939 | 0.963 | 0.978 | | 256 | 3 | 0.022 | 0.066 | 0.134 | 0.220 | 0.314 | 0.409 | 0.500 | 0.584 | 0.659 | 0.726 | 0.783 | | 128 | 1 | 0.117 | 0.324 | 0.546 | 0.726 | 0.846 | 0.918 | 0.958 | 0.980 | 0.991 | 0.996 | 0.998 | | 128 | 2 | 0.015 | 0.049 | 0.106 | 0.184 | 0.275 | 0.373 | 0.469 | 0.560 | 0.642 | 0.714 | 0.777 | | 128 | 3 | 0.007 | 0.018 | 0.036 | 0.062 | 0.095 | 0.136 | 0.183 | 0.235 | 0.290 | 0.345 | 0.399 | | 64 | 1 | 0.024 | 0.083 | 0.179 | 0.302 | 0.435 | 0.564 | 0.677 | 0.771 | 0.843 | 0.896 | 0.932 | | 64 | 2 | 0.004 | 0.011 | 0.025 | 0.044 | 0.072 | 0.107 | 0.149 | 0.199 | 0.254 | 0.312 | 0.371 | | 64 | 3 | 0.003 | 0.006 | 0.011 | 0.017 | 0.026 | 0.037 | 0.050 | 0.066 | 0.084 | 0.105 | 0.129 |
# Load 25K molecule pairs from ChEMBL (SMILES in columns 1 and 3 of each
# line) and compute pairwise Tanimoto similarities with both the unfolded
# (key (radius, -1)) and folded (key (radius, size)) fingerprints.
pairs = []
# context manager so the input handle is closed (the original iterated the
# gzip.open() result directly and never closed it)
with gzip.open('../data/chembl21_25K.mfp1.pairs.txt.gz', 'rt') as inf:
    for line in inf:
        fields = line.split()
        pairs.append([Chem.MolFromSmiles(fields[1]), Chem.MolFromSmiles(fields[3])])

sims = defaultdict(list)
for v in 1, 2, 3:
    fpg = rdFingerprintGenerator.GetMorganGenerator(radius=v)
    for m1, m2 in pairs:
        fp1 = fpg.GetSparseFingerprint(m1)
        fp2 = fpg.GetSparseFingerprint(m2)
        sims[(v, -1)].append(DataStructs.TanimotoSimilarity(fp1, fp2))
    for l in 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384:
        fpg = rdFingerprintGenerator.GetMorganGenerator(radius=v, fpSize=l)
        for m1, m2 in pairs:
            fp1 = fpg.GetFingerprint(m1)
            fp2 = fpg.GetFingerprint(m2)
            sims[(v, l)].append(DataStructs.TanimotoSimilarity(fp1, fp2))
# 'wb' instead of 'wb+' (no read access needed), and close deterministically
with gzip.open('../data/fp_collision_sims.pkl.gz', 'wb') as outf:
    pickle.dump(dict(sims), outf)
# Scatter each folded similarity against the corresponding unfolded one;
# the y=x line marks "folding changed nothing".
rads = (1, 2, 3)
bcounts = (64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384)
figsize(3 * 6, len(bcounts) * 6)
# BUG FIX: the original used `plt` as the subplot counter, clobbering
# matplotlib.pyplot (bound to `plt` by %pylab and used elsewhere in this
# notebook); use `pidx` like the other plotting cells.
pidx = 1
for i, cnt in enumerate(reversed(bcounts)):
    for j, rad in enumerate(rads):
        subplot(len(bcounts), 3, pidx)
        pidx += 1
        scatter(sims[rad, -1], sims[rad, cnt])
        plot((0, 1), (0, 1), 'k-')
        xlabel('no collisions')
        ylabel(f'{cnt} bits')
        title(f'radius = {rad}')
from scipy.stats import spearmanr

# Markdown table summarizing how folding perturbs pairwise similarities:
# Spearman correlation between folded and unfolded similarities plus stats
# on the difference d = folded - unfolded.
# BUG FIX: the header read ('radius','length',...) but the data rows print
# length first (see the output: 16384 appears under "radius"); the first
# two column labels were swapped.
row = ('length', 'radius', 'SpearmanR', 'mean(d)', 'std(d)', '90% |d|')
divider = ['-' * len(k) for k in row]
print('| ' + ' | '.join(row) + ' |')
print('| ' + ' | '.join(divider) + ' |')
rads = (1, 2, 3)
bcounts = (64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384)
for i, cnt in enumerate(reversed(bcounts)):
    for j, rad in enumerate(rads):
        # np.array, not the bare `numpy` name that only exists via %pylab's
        # star import -- consistent with the rest of the notebook
        d1 = np.array(sims[rad, -1])
        d2 = np.array(sims[rad, cnt])
        delt = d2 - d1
        row = [str(cnt), str(rad), f'{spearmanr(d1, d2).correlation:.3f}',
               f'{np.mean(delt):.2g}', f'{np.std(delt):.2g}',
               f'{np.quantile(np.abs(delt), 0.9):.2g}']
        print('| ' + ' | '.join(row) + ' |')
| radius | length | SpearmanR | mean(d) | std(d) | 90% |d| | | ------ | ------ | --------- | ------- | ------ | ------- | | 16384 | 1 | 1.000 | 0.0002 | 0.0025 | 0 | | 16384 | 2 | 1.000 | 0.00074 | 0.0037 | 0 | | 16384 | 3 | 1.000 | 0.0012 | 0.0038 | 0.0053 | | 8192 | 1 | 0.999 | 0.00071 | 0.0048 | 0 | | 8192 | 2 | 0.999 | 0.0014 | 0.0051 | 0.0068 | | 8192 | 3 | 0.999 | 0.0024 | 0.0054 | 0.011 | | 4096 | 1 | 0.998 | 0.0014 | 0.0067 | 0 | | 4096 | 2 | 0.998 | 0.0028 | 0.0072 | 0.012 | | 4096 | 3 | 0.998 | 0.0047 | 0.0076 | 0.016 | | 2048 | 1 | 0.993 | 0.0035 | 0.012 | 0.019 | | 2048 | 2 | 0.996 | 0.0059 | 0.012 | 0.023 | | 2048 | 3 | 0.996 | 0.0093 | 0.012 | 0.025 | | 1024 | 1 | 0.989 | 0.0056 | 0.015 | 0.025 | | 1024 | 2 | 0.992 | 0.012 | 0.016 | 0.033 | | 1024 | 3 | 0.991 | 0.019 | 0.017 | 0.041 | | 512 | 1 | 0.981 | 0.011 | 0.021 | 0.04 | | 512 | 2 | 0.983 | 0.024 | 0.023 | 0.055 | | 512 | 3 | 0.981 | 0.039 | 0.025 | 0.071 | | 256 | 1 | 0.964 | 0.022 | 0.028 | 0.06 | | 256 | 2 | 0.963 | 0.049 | 0.034 | 0.095 | | 256 | 3 | 0.953 | 0.077 | 0.039 | 0.13 | | 128 | 1 | 0.923 | 0.041 | 0.041 | 0.097 | | 128 | 2 | 0.917 | 0.094 | 0.051 | 0.16 | | 128 | 3 | 0.884 | 0.15 | 0.063 | 0.23 | | 64 | 1 | 0.840 | 0.084 | 0.061 | 0.16 | | 64 | 2 | 0.808 | 0.18 | 0.078 | 0.28 | | 64 | 3 | 0.713 | 0.27 | 0.1 | 0.4 |