Notebook
# here's how I constructed the dataset
#
# Pulls every ChEMBL 26 activity row matching the filters in the query below
# (standard_type 'Ki', units 'nM', relation '=', standard_value < 1) and
# writes one SD record per row to a gzipped SD file, attaching every selected
# column except the molfile itself as an SD property.
#
# NOTE: `psycopg2` and `Chem` (rdkit) are expected to be imported already,
# presumably by an earlier notebook cell.
import gzip

conn2 = psycopg2.connect("dbname=chembl_26 host=localhost")
curs2 = conn2.cursor()
# The query text is split across adjacent literals purely for readability;
# the concatenated string is identical to the original single-line query.
curs2.execute(
    "select cid1.chembl_id as compound_chembl_id,cid2.chembl_id as assay_chembl_id, "
    "target_dictionary.chembl_id as target_chembl_id,target_dictionary.pref_name as pref_name, "
    "standard_relation,standard_value,standard_units,standard_type,molfile "
    "from activities acts join assays using (assay_id) "
    "join compound_structures using (molregno) "
    "join chembl_id_lookup cid1 on (molregno=entity_id and entity_type='COMPOUND') "
    "join chembl_id_lookup cid2 on (assay_id=cid2.entity_id and cid2.entity_type='ASSAY') "
    "join target_dictionary using (tid) "
    "where standard_type='Ki' and standard_units='nM' and standard_value is not null "
    "and standard_relation='=' and standard_value<1"
)
data = curs2.fetchall()

# Column names in select order; the last one (molfile) is the structure
# itself and is not stored as a property.
cnames = [x.name for x in curs2.description]
prop_names = cnames[:-1]

# Close the writer *before* the gzip handle so buffered records are flushed
# into the stream and the gzip trailer is written; relying on garbage
# collection (rebinding `w` to None) can leave a truncated archive.
with gzip.open('/home/glandrum/RDKit_blog/data/chembl26_very_active.sdf.gz', 'wt+') as outf:
    w = Chem.SDWriter(outf)
    try:
        for row in data:
            m = Chem.MolFromMolBlock(row[-1])
            if m is None:
                # MolFromMolBlock returns None when the molblock cannot be
                # parsed; skip the row instead of failing on m.SetProp.
                continue
            # zip stops at the shorter sequence, so the trailing molfile
            # value in `row` is never paired with a property name.
            for prop_name, value in zip(prop_names, row):
                m.SetProp(prop_name, str(value))
            w.write(m)
    finally:
        w.close()
w = None