from rdkit import Chem,DataStructs import time,random from collections import defaultdict import psycopg2 from rdkit.Chem import Draw,PandasTools,rdMolDescriptors from rdkit.Chem.Draw import IPythonConsole from rdkit import rdBase from __future__ import print_function import requests from xml.etree import ElementTree import pandas as pd %load_ext sql print(rdBase.rdkitVersion) %sql postgresql://localhost/chembl_19 \ select * from chembl_id_lookup where chembl_id = 'CHEMBL240'; %sql select count(*) from activities join assays using (assay_id) where tid=165; %sql select distinct(standard_type) from activities join assays using (assay_id) where tid=165; %sql select count(*) from activities join assays using (assay_id) where tid=165 and standard_type='Ki'; data = %sql select canonical_smiles,molregno,activity_id,standard_value,standard_units from activities \ join assays using (assay_id) \ join compound_structures using (molregno) \ where tid=165 and standard_type='Ki' and standard_value is not null and standard_relation='=' \ and canonical_smiles not like '%.%'; df = data.DataFrame() df.to_csv('../data/herg_data.txt',sep=" ",index=False) !python $RDBASE/Contrib/mmpa/rfrag.py < ../data/herg_data.txt > ../data/herg_fragmented.txt !python $RDBASE/Contrib/mmpa/indexing.py -s -r 0.1 < ../data/herg_fragmented.txt > ../data/mmps_default.txt mmps = pd.read_csv('../data/mmps_default.txt',header=None,names=('smiles1','smiles2','molregno1','molregno2','tform','core')) mmps[mmps.molregno1==290813] mmps=mmps.drop_duplicates(subset=("molregno1","molregno2")) PandasTools.AddMoleculeColumnToFrame(mmps,'smiles1','mol1') PandasTools.AddMoleculeColumnToFrame(mmps,'smiles2','mol2') mmps = mmps[['mol1','mol2','molregno1','molregno2','tform','core']] mmps.head() t1=df[['molregno','standard_value']] mmpdds = mmps.merge(t1,left_on='molregno1',right_on='molregno',suffixes=("_1","_2")).\ merge(t1,left_on='molregno2',right_on='molregno',suffixes=("_1","_2")) mmpdds.head() import math mmpdds['pKi_1']=mmpdds.apply(lambda row:-1*math.log10(float(row['standard_value_1'])*1e-9),axis=1) mmpdds['pKi_2']=mmpdds.apply(lambda row:-1*math.log10(float(row['standard_value_2'])*1e-9),axis=1) mmpdds['delta']=mmpdds['pKi_2']-mmpdds['pKi_1'] mmpdds=mmpdds[['mol1','mol2','molregno1','molregno2','pKi_1','pKi_2','delta','tform','core']] mmpdds.head() gs=mmpdds.groupby('tform') vs = [(len(y),x) for x,y in gs] vs.sort(reverse=True) vs[:5] gs['delta'].describe()['[*:1]F>>[*:1]Cl'] rows=[] for c,k in vs: if c>=5: descr=gs['delta'].describe()[k] rows.append((k,descr['count'],descr['mean'],descr['std'])) ndf = pd.DataFrame(rows,columns=('tform','count_val','mean_val','std_val')) ndf.head() ndf['react']=ndf.apply(lambda row:row['tform'].split('>>')[0],axis=1) ndf['prod']=ndf.apply(lambda row:row['tform'].split('>>')[1],axis=1) PandasTools.AddMoleculeColumnToFrame(ndf,'react','reactmol') PandasTools.AddMoleculeColumnToFrame(ndf,'prod','prodmol') ndf.head() ndf[ndf.mean_val<-.3] mmpdds[mmpdds['tform']=='[*:1]F>>[*:1]OC'][['mol1','mol2','pKi_1','pKi_2','delta']] !python $RDBASE/Contrib/mmpa/indexing.py -s -r 0.25 < ../data/herg_fragmented.txt > ../data/mmps_larger.txt mmps = pd.read_csv('../data/mmps_larger.txt',header=None,names=('smiles1','smiles2','molregno1','molregno2','tform','core')) mmps=mmps.drop_duplicates(subset=("molregno1","molregno2")) PandasTools.AddMoleculeColumnToFrame(mmps,'smiles1','mol1') PandasTools.AddMoleculeColumnToFrame(mmps,'smiles2','mol2') mmps = mmps[['mol1','mol2','molregno1','molregno2','tform','core']] t1=df[['molregno','standard_value']] mmpdds = mmps.merge(t1,left_on='molregno1',right_on='molregno',suffixes=("_1","_2")).\ merge(t1,left_on='molregno2',right_on='molregno',suffixes=("_1","_2")) import math mmpdds['pKi_1']=mmpdds.apply(lambda row:-1*math.log10(float(row['standard_value_1'])*1e-9),axis=1) mmpdds['pKi_2']=mmpdds.apply(lambda row:-1*math.log10(float(row['standard_value_2'])*1e-9),axis=1) mmpdds['delta']=mmpdds['pKi_2']-mmpdds['pKi_1'] mmpdds=mmpdds[['mol1','mol2','molregno1','molregno2','pKi_1','pKi_2','delta','tform','core']] mmpdds.head() gs=mmpdds.groupby('tform') vs = [(len(y),x) for x,y in gs] vs.sort(reverse=True) vs[:5] rows=[] for c,k in vs: if c>=5: descr=gs['delta'].describe()[k] rows.append((k,descr['count'],descr['mean'],descr['std'])) ndf = pd.DataFrame(rows,columns=('tform','count_val','mean_val','std_val')) ndf['react']=ndf.apply(lambda row:row['tform'].split('>>')[0],axis=1) ndf['prod']=ndf.apply(lambda row:row['tform'].split('>>')[1],axis=1) PandasTools.AddMoleculeColumnToFrame(ndf,'react','reactmol') PandasTools.AddMoleculeColumnToFrame(ndf,'prod','prodmol') ndf[ndf.mean_val<-.3].sort(columns='mean_val') tform='[*:1]c1ccc(Cl)cc1>>[*:1]c1ccccc1F' mmpdds[mmpdds['tform']==tform][['molregno1','molregno2','mol1','mol2','pKi_1','pKi_2','delta']] regnos = list(mmpdds[mmpdds['tform']==tform]['molregno1']) regnos += list(mmpdds[mmpdds['tform']==tform]['molregno2']) regnos=tuple(set(regnos)) %sql select distinct(activities.doc_id) from activities join assays using (assay_id) \ where tid=165 and standard_type='Ki' and molregno in :regnos; docid = _[0]['doc_id'] %sql select * from docs where doc_id=:docid; pmid = _[0]['pubmed_id'] txt=requests.get('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=%d'%pmid).text et = ElementTree.fromstring(txt.encode('utf-8')) et.findall(".//*[@Name='Title']")[0].text %sql select * from assays where assay_id in (select distinct(assay_id) from activities where doc_id = :docid); assayData=%sql select * from activities join assays using (assay_id) \ where activities.doc_id=:docid \ and molregno in :regnos \ and standard_value is not null \ and assay_id!=454227; assayData