Notebook
# Build a SubstructLibrary (cached trusted-SMILES mol holder + tautomer
# pattern holder + property-based keys) from the ChEMBL 29 SD file and
# pickle it to ./results/chembl29_ssslib.pkl.
RDLogger.DisableLog("rdApp.warning")


def _save_library(lib):
    # 'wb+' truncates the file, so each call replaces whatever was there.
    with open('./results/chembl29_ssslib.pkl','wb+') as outf:
        pickle.dump(lib, outf)


molholder = rdSubstructLibrary.CachedTrustedSmilesMolHolder()
patts = rdSubstructLibrary.TautomerPatternHolder()
# The key holder picks up the "_Name" property of each molecule on its own;
# in the ChEMBL SD file that property carries the molecule's ChEMBL ID.
keys = rdSubstructLibrary.KeyFromPropHolder()
slib = rdSubstructLibrary.SubstructLibrary(molholder, patts, keys)

t1 = time.time()
with gzip.GzipFile('/home/glandrum/Downloads/chembl_29.sdf.gz') as gz:
    with Chem.ForwardSDMolSupplier(gz) as suppl:
        nDone = 0
        for m in suppl:
            # Drop records that failed to parse, and skip huge molecules
            # (more than 75 heavy atoms).
            if m is None or m.GetNumHeavyAtoms() > 75:
                continue
            slib.AddMol(m)
            nDone += 1
            if nDone % 50000 == 0:
                # Report progress and write a checkpoint pickle alongside it.
                print(f' did {nDone} in {time.time()-t1:.2f}s')
                _save_library(slib)

print(f'That took {time.time()-t1:.2f}s in total.')
# Final dump of the complete library (overwrites the last checkpoint).
_save_library(slib)
# Compute the library indices ordered by heavy-atom count (ascending; equal
# counts fall back to ascending index) and store that ordering next to the
# library itself.
holder = slib.GetMolHolder()
nats = [
    (holder.GetMol(idx).GetNumHeavyAtoms(), idx)
    for idx in range(len(slib))
]
nats.sort()
order = [idx for _heavy_atoms, idx in nats]

# Append the ordering to the pickle file that holds the substruct lib; mode
# 'ab' adds it after the existing contents instead of truncating the file.
with open('./results/chembl29_ssslib.pkl', 'ab') as outf:
    pickle.dump(order, outf)