from rdkit import Chem
def clogSw(molObj):
"""
Calculate an estimation of water solubility.
Input: RDKit molecule object
Output: clogSw estimation
"""
from rdkit import Chem
from rdkit.Chem import Descriptors
# calculate MolWeight
MolWeight = Descriptors.MolWt(molObj)
# calculate clogP
clogP = Descriptors.MolLogP(molObj)
# calculate RotBonds
RotBonds = Descriptors.NumRotatableBonds(molObj)
# calculate the number of aromatic heavyatoms in the molecule
aromaticHeavyatoms = len(molObj.GetSubstructMatches(
Chem.MolFromSmarts("[a]")))
# calculate total number of atoms in the molecule
numAtoms = molObj.GetNumAtoms()
# calculate Aromatic Proportion
AromProp = float(aromaticHeavyatoms) / numAtoms
# then calculate clogSw...
#clogSw_value = 0.16 \
# - 0.63 * clogP \
# - 0.0062 * MolWeight \
# + 0.066 * RotBonds \
# - 0.74 * AromProp
# New clogSw with coefficients calculated using Ridge Regression with Crossvalidation
clogSw_value = 0.233743817233 \
-0.74253027 * clogP \
-0.00676305 * MolWeight \
+0.01580559 * RotBonds \
-0.35483585 * AromProp
return clogSw_value
logSw = dict()
supplier = Chem.SmilesMolSupplier("Delaney_SupplData.smi", delimiter="\t", titleLine=True)
for mol in supplier:
if mol:
mol_id = "ESOL_" + mol.GetProp("_Name")
mol.SetProp("_Name", mol_id)
logSw[mol_id] = dict()
logSw[mol_id]["CANSMILES"] = Chem.MolToSmiles(mol)
# RDKit clogSw
logSw[mol_id]["RDKit_clogSw"] = clogSw(mol)
mol.SetProp("RDKit_clogSw", str(logSw[mol_id]["RDKit_clogSw"]))
# ESOL
logSw[mol_id]["ESOL predicted log(solubility:mol/L)"] = mol.GetProp("ESOL predicted log(solubility:mol/L)")
# Measured Solubility
logSw[mol_id]["measured log(solubility:mol/L)"] = mol.GetProp("measured log(solubility:mol/L)")
# end if
# end for
# Dump logSw in csv
import csv
f = open('ESOL_Results.csv','wb')
w = csv.writer(f)
w.writerow(["Canonical SMILES", "Mol_Id", "measured log(solubility:mol/L)", "ESOL predicted log(solubility:mol/L)", "RDKit_clogSw"])
for mol_id in logSw:
w.writerow([logSw[mol_id]["CANSMILES"],
mol_id,
logSw[mol_id]["measured log(solubility:mol/L)"],
logSw[mol_id]["ESOL predicted log(solubility:mol/L)"],
logSw[mol_id]["RDKit_clogSw"]])
# end for
f.flush()
f.close()
1143 1143