Inspired by this RDKit-discuss thread started by Liz Wylie: http://www.mail-archive.com/rdkit-discuss@lists.sourceforge.net/msg03676.html
from rdkit import Chem
from rdkit.Chem import MCS
from rdkit.Chem.Draw import IPythonConsole
Start with two sets of simple molecules
smis = ['CC(C)=C','CC(C)C']
ms = [Chem.MolFromSmiles(x) for x in smis]
smis2 = ['CC(C)=C','CC(C)=N']
ms2 = [Chem.MolFromSmiles(x) for x in smis2]
default MCS behavior:
mcs=MCS.FindMCS(ms)
mcs.smarts
'[#6]-[#6]-[#6]'
mcs=MCS.FindMCS(ms2)
mcs.smarts
'[#6]-[#6]-[#6]'
define a simple hash that combines atomic num and hybridization
def label(a):
return 100*int(a.GetHybridization())+a.GetAtomicNum()
and now use that to label the atoms of the first set of molecules:
nms = [Chem.Mol(x) for x in ms]
for nm in nms:
for at in nm.GetAtoms():
at.SetIsotope(label(at))
Chem.MolToSmiles(nms[0],True)
'[406CH3][306C]([406CH3])=[306CH2]'
If we now use "isotopes" as the MCS atom comparator, we get nothing for the first set of molecules:
mcs=MCS.FindMCS(nms,atomCompare='isotopes')
mcs.smarts
The second set returns a result:
nms2 = [Chem.Mol(x) for x in ms2]
for nm in nms2:
for at in nm.GetAtoms():
at.SetIsotope(label(at))
mcs=MCS.FindMCS(nms2,atomCompare='isotopes')
mcs.smarts
'[406*]-[306*]-[406*]'
But that's kind of ugly... let's get the SMILES for it.
We start by finding the matching atoms in the first of the modified molecules:
mcsp = Chem.MolFromSmarts(mcs.smarts)
match = nms2[0].GetSubstructMatch(mcsp)
match
(0, 1, 2)
And now generate smiles for the first unmodified molecule, but only including those atoms. We can do this because we know that the atom numbers didn't change.
print Chem.MolFragmentToSmiles(ms2[0],atomsToUse=match,isomericSmiles=True,canonical=False)
CCC
Ok, let's try it with Liz's test case:
smis=["COc1ccc(C(Nc2nc3c(ncn3COCC=O)c(=O)[nH]2)(c2ccccc2)c2ccccc2)cc1",
"COc1ccc(C(Nc2nc3c(ncn3COC(CO)(CO)CO)c(=O)[nH]2)(c2ccccc2)c2ccccc2)cc1"]
ms = [Chem.MolFromSmiles(x) for x in smis]
ms[0]
ms[1]
nms = [Chem.Mol(x) for x in ms]
for nm in nms:
for at in nm.GetAtoms():
at.SetIsotope(label(at))
mcs=MCS.FindMCS(nms,atomCompare='isotopes')
mcs.smarts
'[406*]-[308*]-[306*]:1:[306*]:[306*]:[306*](:[306*]:[306*]:1)-[406*](-[306*]:1:[306*]:[306*]:[306*]:[306*]:[306*]:1)(-[306*]:1:[306*]:[306*]:[306*]:[306*]:[306*]:1)-[307*]-[306*]:1:[307*]:[306*]:2:[306*](:[306*](:[307*]:1)=[308*]):[307*]:[306*]:[307*]:2-[406*]-[408*]-[406*]'
mcsp = Chem.MolFromSmarts(mcs.smarts)
match = nms[0].GetSubstructMatch(mcsp)
smi=Chem.MolFragmentToSmiles(ms[0],atomsToUse=match,isomericSmiles=True,canonical=False)
print smi
COc1ccc(C(Nc2nc3c(ncn3COC)c(=O)[nH]2)(c2ccccc2)c2ccccc2)cc1
core = Chem.MolFromSmiles(smi)
core