# Notebook-style script: pull IC50 binding data for one ChEMBL target, derive
# pIC50 and LE for every compound, group the compounds by Murcko scaffold and
# compare the most populated scaffolds (plus the known drugs) in box plots.
#
# Bare expressions such as ``targetsDF`` are kept on purpose: they are what the
# notebook displays when the code is run cell by cell.

import pandas as pd  # Import pandas
# NOTE(review): the wildcard import is kept (and kept ahead of the imports
# below, so their names still win on a clash) because other parts of the file
# may rely on names it provides. This section only uses TargetResource and
# CompoundResource from it.
from chembl_webresource_client import *
import numpy as np
import rdkit.Chem as Chem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole  # Enables RDKit IPython integration
from rdkit.Chem.Scaffolds import MurckoScaffold

UNIPROT_ACCESSION = 'P00519'   # accession used to look the target up
TARGET_CHEMBL_ID = 'CHEMBL1862'  # presumably the ChEMBL id of that target - confirm
# Scaffold whose compound series is inspected in detail further down.
EXAMPLE_SCAFFOLD = 'O=c1[nH]c2nc(Nc3ccccc3)ncc2cc1-c1ccccc1'
TOP_SCAFFOLDS = 8              # number of most populated scaffolds shown/plotted

# Column names of the compound table.
molnames = 'chemblId'
smiles = 'smiles'


# --------------------------------------------------------------------------
# Target and bioactivity data
# --------------------------------------------------------------------------
targets = TargetResource()
print(targets.status())  # call syntax works under both Python 2 and Python 3

# Query the service once and reuse the answer for display and for the frame.
targetHits = targets.get(uniprot=[UNIPROT_ACCESSION])
targetHits
targetsDF = pd.DataFrame.from_dict(targetHits)
targetsDF

bioactsDF = pd.DataFrame.from_dict(targets.bioactivities(TARGET_CHEMBL_ID))
bioactsDF.head(1)

keepRows = (
    (bioactsDF['bioactivity_type'] == 'IC50')    # keep only IC50
    & (bioactsDF['operator'] == '=')             # only exact measurements
    & (bioactsDF['assay_type'] == 'B')           # only binding data
    & (bioactsDF['target_confidence'] == 9)      # only high target confidence
)
# .copy() so that the 'value' assignment below writes to an independent frame
# instead of a slice of the unfiltered one (avoids SettingWithCopy warnings).
bioactsDF = bioactsDF[keepRows].copy()
compoundIds = bioactsDF['ingredient_cmpd_chemblid'].unique()
len(bioactsDF), len(compoundIds)


# --------------------------------------------------------------------------
# Compound data
# --------------------------------------------------------------------------
compounds = CompoundResource()
print(compounds.status())
cpdsDF = pd.DataFrame.from_dict(compounds.get(list(compoundIds)))
cpdsDF.head()

bioactsDF['value'] = bioactsDF['value'].astype(float)  # making sure everything is float

PandasTools.AddMoleculeColumnToFrame(cpdsDF, smilesCol=smiles)
# .copy() for the same reason as above: new columns are added to this frame.
cpdsDF = cpdsDF[[molnames, smiles, 'ROMol', 'knownDrug', 'preferredCompoundName']].copy()


def getBioacts(cpd, target):
    """Return the pIC50 of compound *cpd* against *target*.

    All rows of the module-level ``bioactsDF`` whose
    ``ingredient_cmpd_chemblid`` equals *cpd* and whose ``target_chemblid``
    equals *target* are selected, their ``value`` entries are averaged, and
    the mean is converted with ``-log10(mean * 10**-9)``.

    The ``10**-9`` factor treats the stored values as nanomolar.
    NOTE(review): nothing in this script filters on the unit of ``value`` -
    confirm that every retained measurement really is in nM.

    Returns NaN when no row matches (mean of an empty selection).
    """
    rows = ((bioactsDF['ingredient_cmpd_chemblid'] == cpd)   # all rows of compound AND
            & (bioactsDF['target_chemblid'] == target))      # target
    value = bioactsDF.loc[rows, 'value'].mean()              # mean of the values
    return -np.log10(value * 10 ** -9)                       # pIC50


def getLE(mol, pIC50):
    """Return ``1.4 * pIC50 / number of heavy atoms`` for *mol*.

    This is the value stored in the ``LE`` column (presumably ligand
    efficiency, with 1.4 as the usual pIC50-to-kcal/mol factor - confirm).

    Returns NaN when *mol* is ``None`` (e.g. a SMILES that could not be
    parsed) or has no heavy atoms, instead of raising or dividing by zero.
    """
    if mol is None:
        return np.nan
    heavyAtoms = mol.GetNumHeavyAtoms()
    if not heavyAtoms:
        return np.nan
    return 1.4 * pIC50 / heavyAtoms


cpdsDF['pIC50'] = [getBioacts(cpd, TARGET_CHEMBL_ID) for cpd in cpdsDF[molnames]]
cpdsDF['LE'] = [getLE(mol, pIC50) for mol, pIC50 in zip(cpdsDF['ROMol'], cpdsDF['pIC50'])]
cpdsDF.head(1)
len(cpdsDF)


# --------------------------------------------------------------------------
# Murcko scaffolds
# --------------------------------------------------------------------------
# Worked example on the row labelled 2. .loc replaces the removed .ix indexer;
# this assumes the frame still carries its default integer index.
mol = Chem.MolFromSmiles(cpdsDF.loc[2, smiles])
scaffold = MurckoScaffold.GetScaffoldForMol(mol)
generic = MurckoScaffold.MakeScaffoldGeneric(scaffold)
Draw.MolsToGridImage([mol, scaffold, generic])

PandasTools.AddMurckoToFrame(cpdsDF)
PandasTools.AlignToScaffold(cpdsDF, molCol='ROMol', scaffoldCol='Murcko_SMILES')
cpdsDF.head(1)

# Number of compounds per scaffold, most populated scaffold first.
# sort_values replaces DataFrame.sort, which newer pandas no longer provides.
sortedScaffolds = (
    cpdsDF.groupby(['Murcko_SMILES']).count()
    .sort_values(smiles, ascending=False)[[smiles]]   # keep only smiles column
    .rename(columns={smiles: 'count'})                # rename smiles column to count
)
# The actual scaffold SMILES only live in the index; copy them into a column
# and replace the index by a 1-based rank.
sortedScaffolds['Murcko_SMILES'] = sortedScaffolds.index
sortedScaffolds.index = range(1, len(sortedScaffolds) + 1)
sortedScaffolds.head()
len(sortedScaffolds)

PandasTools.AddMoleculeColumnToFrame(sortedScaffolds, smilesCol='Murcko_SMILES')
# dropna() drops compounds without scaffold
PandasTools.FrameToGridImage(sortedScaffolds.dropna().head(TOP_SCAFFOLDS),
                             legendsCol='count', molsPerRow=4)

exampleSeries = cpdsDF[cpdsDF['Murcko_SMILES'] == EXAMPLE_SCAFFOLD]
PandasTools.FrameToGridImage(exampleSeries.head(4), legendsCol=molnames, molsPerRow=4)
exampleSeries['pIC50'].max()  # min(), mean(), sum(), anything you can do with pandas Series


# --------------------------------------------------------------------------
# Box plots per scaffold
# --------------------------------------------------------------------------
# Equivalent of the notebook's ``%matplotlib inline`` magic, written as plain
# Python so the file parses; skipped when not running under IPython.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass
import pylab  # imported after the magic, in the same order as the notebook


def _scaffoldBoxplotData(column, topN=TOP_SCAFFOLDS):
    """Collect *column* of ``cpdsDF`` for each of the first *topN* scaffolds.

    Returns a list with one Series per scaffold (in ``sortedScaffolds`` order)
    followed by one Series for the compounds whose ``knownDrug`` is ``'Yes'``.
    """
    groups = [cpdsDF.loc[cpdsDF['Murcko_SMILES'] == scaffoldSmiles, column]
              for scaffoldSmiles in sortedScaffolds['Murcko_SMILES'].head(topN)]
    groups.append(cpdsDF.loc[cpdsDF['knownDrug'] == 'Yes', column])
    return groups


data = _scaffoldBoxplotData('pIC50')
pylab.boxplot(data)
pylab.show()

data = _scaffoldBoxplotData('LE')
pylab.boxplot(data)
pylab.show()