from rdkit import Chem from rdkit.Chem import rdMolDescriptors from rdkit.Avalon import pyAvalonTools from rdkit.Chem import Draw from rdkit.Chem.Fraggle import FraggleSim from rdkit.Chem.Draw import IPythonConsole from rdkit import rdBase from rdkit import DataStructs from collections import defaultdict import cPickle,random,gzip,time import scipy as sp import pandas from rdkit.Chem import PandasTools PandasTools.RenderImagesInAllDataFrames() from scipy import stats from IPython.core.display import display,HTML,Javascript print rdBase.rdkitVersion ind = [x.split() for x in gzip.open('../data/chembl16_25K.pairs.txt.gz')] ms1 = [] ms2 = [] for i,row in enumerate(ind): m1 = Chem.MolFromSmiles(row[1]) ms1.append((row[0],m1)) m2 = Chem.MolFromSmiles(row[3]) ms2.append((row[2],m2)) random.seed(23) random.shuffle(ms2) t1=time.time() sims=[] for i,(m1,m2) in enumerate(zip(ms1,ms2)): sim,frag= FraggleSim.GetFraggleSimilarity(m1[-1],m2[-1]) sims.append((sim,i)) if not (i%200): print 'Done: %d in %.2f seconds'%(i,time.time()-t1) t2=time.time() print 'Finished in %.2f seconds'%(t2-t1) cPickle.dump(sims,gzip.open('../data/chembl16_25K.fraggle_randompairs.sims.pkl.gz','wb+')) sl = sorted(sims) np = len(sl) for bin in (.7,.8,.9,.95,.99): print bin,sl[int(bin*np)] hist([x[0] for x in sims],bins=20) xlabel("Fraggle") scoredLists = cPickle.load(gzip.open('../data/chembl16_25K.pairs.sims.pkl.gz','rb')) t1=time.time() rl=[] for i,(m1,m2) in enumerate(zip(ms1,ms2)): sim,frag= FraggleSim.GetFraggleSimilarity(m1[-1],m2[-1]) rl.append((sim,i)) if not (i%200): print 'Done: %d in %.2f seconds'%(i,time.time()-t1) t2=time.time() print 'Finished in %.2f seconds'%(t2-t1) scoredLists['Fraggle']=rl cPickle.dump(scoredLists,gzip.open('../data/chembl16_25K.pairs.sims2.pkl.gz','wb+')) scoredLists = cPickle.load(gzip.open('../data/chembl16_25K.pairs.sims2.pkl.gz','rb')) def directCompare(scoredLists,fp1,fp2,plotIt=True,silent=False): l1 = scoredLists[fp1] l2 = scoredLists[fp2] rl1=[x[-1] for x in l1] rl2=[x[-1] for x in l2] vl1=[x[0] for x in l1] vl2=[x[0] for x in l2] if plotIt: _=scatter(vl1,vl2,edgecolors='none') maxvl1=max(vl1) minvl1=min(vl1) maxvl2=max(vl2) minvl2=min(vl2) _=plot((minvl1,maxvl1),(minvl2,maxvl2),color='k',linestyle='-') xlabel(fp1) ylabel(fp2) tau,tau_p=stats.kendalltau(vl1,vl2) spearman_rho,spearman_p=stats.spearmanr(vl1,vl2) pearson_r,pearson_p = stats.pearsonr(vl1,vl2) if not silent: print fp1,fp2,tau,tau_p,spearman_rho,spearman_p,pearson_r,pearson_p return tau,spearman_rho,pearson_r _=directCompare(scoredLists,'Fraggle','RDKit5') df = pandas.DataFrame(index=range(len(ms1)),columns=['mol1','mol2','Fraggle','RDKit5']) df.mol1 = [x[1] for x in ms1] df.mol2 = [x[1] for x in ms2] df.Fraggle = [x[0] for x in scoredLists['Fraggle']] df.RDKit5 = [x[0] for x in scoredLists['RDKit5']] subset = df[df.RDKit5<0.2][df.Fraggle>0.8] subset.sort(columns=['Fraggle'],ascending=False,inplace=True) len(subset) frags = [] for row in subset.itertuples(): m1 = row[1] m2 = row[2] sim,frag= FraggleSim.GetFraggleSimilarity(m1,m2) frags.append(frag) mfrags = [Chem.MolFromSmiles(x) for x in frags] subset['Fragment']=frags subset['FragMol']=mfrags subset subset[subset.index==15634] Chem.MolToSmiles(subset.ix[15634]['mol2'],True) tmol = Chem.MolFromSmiles('CCCNc1ncnc2[nH]ncc21') fp1 = Chem.RDKFingerprint(subset.ix[15634]['mol1'],maxPath=5) fp2 = Chem.RDKFingerprint(tmol,maxPath=5) print 'RDKit5: ',DataStructs.TanimotoSimilarity(fp1,fp2) print 'Fraggle: ',FraggleSim.GetFraggleSimilarity(subset.ix[15634]['mol1'],tmol) subset2 = df[df.RDKit5>.5][df.Fraggle<0.1] subset2.sort(columns=['RDKit5'],ascending=False,inplace=True) len(subset2) frags = [] for row in subset2.itertuples(): m1 = row[1] m2 = row[2] sim,frag= FraggleSim.GetFraggleSimilarity(m1,m2) frags.append(frag) subset2['Fragment']=frags subset2 subset2.ix[21906]['mol1'] FraggleSim.generate_fraggle_fragmentation(subset2.ix[21906]['mol1']) nToDo=200 apl = sorted(scoredLists['AP'],reverse=True)[:nToDo] ttl = sorted(scoredLists['TT'],reverse=True)[:nToDo] avl = sorted(scoredLists['Avalon-1024'],reverse=True)[:nToDo] rdkl = sorted(scoredLists['RDKit5'],reverse=True)[:nToDo] fragl = sorted(scoredLists['Fraggle'],reverse=True)[:nToDo] idsToKeep=set() idsToKeep.update([x[1] for x in apl]) idsToKeep.update([x[1] for x in ttl]) idsToKeep.update([x[1] for x in avl]) idsToKeep.update([x[1] for x in rdkl]) idsToKeep.update([x[1] for x in fragl]) print 'Overall number:',len(idsToKeep) ids={} ids['AP']=set([x[1] for x in apl]) ids['TT']=set([x[1] for x in ttl]) ids['Avalon-1024']=set([x[1] for x in avl]) ids['RDKit5']=set([x[1] for x in rdkl]) ids['Fraggle']=set([x[1] for x in fragl]) ks = sorted(ids.keys()) for i,k in enumerate(ks): for j in range(i+1,len(ks)): overlap=len(ids[k].intersection(ids[ks[j]])) print ks[i],ks[j],overlap,'%.2f'%(float(overlap)/len(apl)) nToDo=100 apl = sorted(scoredLists['AP'],reverse=True)[:nToDo] ttl = sorted(scoredLists['TT'],reverse=True)[:nToDo] avl = sorted(scoredLists['Avalon-1024'],reverse=True)[:nToDo] rdkl = sorted(scoredLists['RDKit5'],reverse=True)[:nToDo] fragl = sorted(scoredLists['Fraggle'],reverse=True)[:nToDo] idsToKeep=set() idsToKeep.update([x[1] for x in apl]) idsToKeep.update([x[1] for x in ttl]) idsToKeep.update([x[1] for x in avl]) idsToKeep.update([x[1] for x in rdkl]) idsToKeep.update([x[1] for x in fragl]) print 'Overall number:',len(idsToKeep) ids={} ids['AP']=set([x[1] for x in apl]) ids['TT']=set([x[1] for x in ttl]) ids['Avalon-1024']=set([x[1] for x in avl]) ids['RDKit5']=set([x[1] for x in rdkl]) ids['Fraggle']=set([x[1] for x in fragl]) ks = sorted(ids.keys()) for i,k in enumerate(ks): for j in range(i+1,len(ks)): overlap=len(ids[k].intersection(ids[ks[j]])) print ks[i],ks[j],overlap,'%.2f'%(float(overlap)/len(apl))