from rdkit import Chem def clogSw(molObj): """ Calculate an estimation of water solubility. Input: RDKit molecule object Output: clogSw estimation """ from rdkit import Chem from rdkit.Chem import Descriptors # calculate MolWeight MolWeight = Descriptors.MolWt(molObj) # calculate clogP clogP = Descriptors.MolLogP(molObj) # calculate RotBonds RotBonds = Descriptors.NumRotatableBonds(molObj) # calculate the number of aromatic heavyatoms in the molecule aromaticHeavyatoms = len(molObj.GetSubstructMatches( Chem.MolFromSmarts("[a]"))) # calculate total number of atoms in the molecule numAtoms = molObj.GetNumAtoms() # calculate Aromatic Proportion AromProp = float(aromaticHeavyatoms) / numAtoms # then calculate clogSw... #clogSw_value = 0.16 \ # - 0.63 * clogP \ # - 0.0062 * MolWeight \ # + 0.066 * RotBonds \ # - 0.74 * AromProp # New clogSw with coefficients calculated using Ridge Regression with Crossvalidation clogSw_value = 0.233743817233 \ -0.74253027 * clogP \ -0.00676305 * MolWeight \ +0.01580559 * RotBonds \ -0.35483585 * AromProp return clogSw_value logSw = dict() supplier = Chem.SmilesMolSupplier("Delaney_SupplData.smi", delimiter="\t", titleLine=True) for mol in supplier: if mol: mol_id = "ESOL_" + mol.GetProp("_Name") mol.SetProp("_Name", mol_id) logSw[mol_id] = dict() logSw[mol_id]["CANSMILES"] = Chem.MolToSmiles(mol) # RDKit clogSw logSw[mol_id]["RDKit_clogSw"] = clogSw(mol) mol.SetProp("RDKit_clogSw", str(logSw[mol_id]["RDKit_clogSw"])) # ESOL logSw[mol_id]["ESOL predicted log(solubility:mol/L)"] = mol.GetProp("ESOL predicted log(solubility:mol/L)") # Measured Solubility logSw[mol_id]["measured log(solubility:mol/L)"] = mol.GetProp("measured log(solubility:mol/L)") # end if # end for # Dump logSw in csv import csv f = open('ESOL_Results.csv','wb') w = csv.writer(f) w.writerow(["Canonical SMILES", "Mol_Id", "measured log(solubility:mol/L)", "ESOL predicted log(solubility:mol/L)", "RDKit_clogSw"]) for mol_id in logSw: w.writerow([logSw[mol_id]["CANSMILES"], mol_id, logSw[mol_id]["measured log(solubility:mol/L)"], logSw[mol_id]["ESOL predicted log(solubility:mol/L)"], logSw[mol_id]["RDKit_clogSw"]]) # end for f.flush() f.close()