from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
# Example molecules, parsed from SMILES strings.
m1 = Chem.MolFromSmiles('O=C1CN(N=Cc2ccc([N+](=O)[O-])o2)C(=O)N1')
m2 = Chem.MolFromSmiles('CCCC1COC(Cn2cncn2)(c2ccc(Cl)cc2Cl)O1')
m3 = Chem.MolFromSmiles('CCCCCC=O')
# Close analogue of m2 (not m1): same scaffold, with one aromatic Cl
# replaced by O.
m4 = Chem.MolFromSmiles('CCCC1COC(Cn2cncn2)(c2ccc(Cl)cc2O)O1')
mols = [m1, m2, m3, m4]

# One radius-2, 1024-bit Morgan bit-vector fingerprint per molecule.
# NOTE(review): recent RDKit releases appear to deprecate
# GetMorganFingerprintAsBitVect in favour of rdFingerprintGenerator --
# confirm before upgrading RDKit.
fps = [
    AllChem.GetMorganFingerprintAsBitVect(
        mol, 2, nBits=1024, useFeatures=False
    )
    for mol in mols
]

# Map each molecule's position in `mols` to its fingerprint so results can be
# reported by index.
indx_fps = dict(enumerate(fps))
def diversity(mols_fp, threshold):
    """Split fingerprints into "diverse" and "similar" by nearest neighbour.

    For every entry, the Tanimoto similarity to every *other* entry is
    computed and the maximum is compared with ``threshold``.

    Args:
        mols_fp: Mapping of an identifier (e.g. an index) to a fingerprint
            accepted by ``DataStructs.FingerprintSimilarity``.
        threshold: Similarity cut-off. An entry whose highest similarity to
            any other entry is ``>= threshold`` is reported as similar.

    Returns:
        A ``(diverse, similar)`` tuple of two lists of identifiers, each in
        the iteration order of ``mols_fp``. An entry that has no other entry
        to compare against (single-entry input) is reported as diverse.
    """
    diverse = []
    similar = []
    for key, fp in mols_fp.items():
        # Entries are told apart by key, not by fingerprint, so identical
        # fingerprints stored under different keys still count as neighbours.
        neighbour_sims = [
            DataStructs.FingerprintSimilarity(
                fp, other_fp, metric=DataStructs.TanimotoSimilarity
            )
            for other_key, other_fp in mols_fp.items()
            if other_key != key
        ]
        # Guard the empty case: max() of an empty list raises ValueError, and
        # an entry with no neighbours cannot be similar to anything.
        if neighbour_sims and max(neighbour_sims) >= threshold:
            similar.append(key)
        else:
            diverse.append(key)
    return (diverse, similar)
# Partition the example molecules using a Tanimoto threshold of 0.8.
(diverse, similar) = diversity(indx_fps, 0.8)
# Print the pair as one tuple so the output is the same under Python 2 and
# Python 3 (a bare `print (a, b)` prints two space-separated values on 3).
print((diverse, similar))
# Output recorded in the original script:
# ([0, 2], [1, 3])