import pandas as pd # Import pandas
from chembl_webresource_client import *

# Client object for the target endpoint of the ChEMBL web service.
targets = TargetResource()

# Show the status reported by the service.  The function-call form of print
# is valid on both Python 2 and Python 3 (the statement form is Python 2 only).
print(targets.status())

# Look up the target record(s) filed under UniProt accession P00519.  The
# response is fetched once and reused for the DataFrame below, instead of
# sending the identical web request twice.
target_hits = targets.get(uniprot=['P00519'])
target_hits

targetsDF = pd.DataFrame.from_dict(target_hits)

targetsDF

# Download the bioactivity records the web service returns for CHEMBL1862.
bioactsDF = pd.DataFrame.from_dict(targets.bioactivities('CHEMBL1862'))

bioactsDF.head(1)

# Keep only the rows that pass all four filters.  The trailing .copy() turns
# the filtered selection into an independent frame, so the later in-place
# assignment to bioactsDF['value'] does not emit SettingWithCopyWarning.
bioactsDF = bioactsDF[(bioactsDF['bioactivity_type'] == 'IC50') &  # keep only IC50
                      (bioactsDF['operator'] == '=') &  # only exact measurements
                      (bioactsDF['assay_type'] == 'B') &  # only binding data
                      (bioactsDF['target_confidence'] == 9)  # only high target confidence
                      ].copy()

# (number of remaining records, number of distinct compounds among them)
len(bioactsDF), len(bioactsDF['ingredient_cmpd_chemblid'].unique())

# Client object for the compound endpoint of the web service.
compounds = CompoundResource()

compounds.status()

# Request each compound once, no matter how many measurements refer to it.
unique_cpd_ids = list(bioactsDF['ingredient_cmpd_chemblid'].unique())
cpd_records = compounds.get(unique_cpd_ids)
cpdsDF = pd.DataFrame.from_dict(cpd_records)

cpdsDF.head()

# Force the activity values to float so they can be treated numerically.
values_as_float = bioactsDF['value'].astype(float)
bioactsDF['value'] = values_as_float

import rdkit.Chem as Chem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole # Enables RDKit IPython integration

# Names of the identifier and SMILES columns, reused by the code below.
molnames = 'chemblId'
smiles = 'smiles'

# Add an RDKit molecule column ('ROMol') built from the SMILES column.  The
# column is referenced through `smiles` rather than a repeated literal so the
# name is defined in exactly one place.
PandasTools.AddMoleculeColumnToFrame(cpdsDF, smilesCol=smiles)

# Keep only the columns used later on.  .copy() makes the selection an
# independent frame, so the columns added to cpdsDF afterwards ('pIC50', 'LE')
# do not emit SettingWithCopyWarning.
cpdsDF = cpdsDF[[molnames, smiles, 'ROMol', 'knownDrug', 'preferredCompoundName']].copy()

import numpy as np
def getBioacts(cpd, target):
    """Return the pIC50 of compound ``cpd`` against ``target``.

    Reads the module-level ``bioactsDF`` frame: every row whose
    ``ingredient_cmpd_chemblid`` equals ``cpd`` and whose
    ``target_chemblid`` equals ``target`` is selected, and the mean of the
    ``value`` column over those rows is taken.  The mean is scaled by
    10**-9 before the logarithm (i.e. the values are treated as nanomolar;
    presumably that is the unit delivered by the service -- confirm).

    Returns ``-log10(mean * 10**-9)``.
    """
    same_compound = bioactsDF['ingredient_cmpd_chemblid'] == cpd
    same_target = bioactsDF['target_chemblid'] == target
    mean_value = bioactsDF[same_compound & same_target]['value'].mean()
    return -np.log10(mean_value * 10**-9)

def getLE(mol, pIC50):
    """Return the ligand efficiency ``1.4 * pIC50 / number of heavy atoms``.

    ``mol`` must provide ``GetNumHeavyAtoms()`` (an RDKit molecule in this
    script); ``pIC50`` is the activity to normalise.  The factor 1.4 is
    presumably the usual approximation of RT*ln(10) in kcal/mol, which
    would make the result kcal/mol per heavy atom -- confirm.

    Returns NaN when ``mol`` is None (e.g. a structure that could not be
    parsed) or has no heavy atoms, so that one bad row does not abort a
    whole ``DataFrame.apply`` with AttributeError / ZeroDivisionError.
    """
    if mol is None:
        return float('nan')
    heavy_atoms = mol.GetNumHeavyAtoms()
    if not heavy_atoms:
        # Nothing to normalise by; avoid dividing by zero.
        return float('nan')
    return 1.4 * pIC50 / heavy_atoms

# pIC50 of every compound against CHEMBL1862, computed one row at a time.
cpdsDF['pIC50'] = cpdsDF.apply(
    lambda row: getBioacts(row[molnames], 'CHEMBL1862'),
    axis=1,
)

# Ligand efficiency from each row's molecule and the pIC50 stored just above.
cpdsDF['LE'] = cpdsDF.apply(
    lambda row: getLE(row['ROMol'], row['pIC50']),
    axis=1,
)

cpdsDF.head(1)

len(cpdsDF)

from rdkit.Chem.Scaffolds import MurckoScaffold

# Worked example on a single compound: the row labelled 2.  The label-based
# .loc indexer replaces .ix, which was deprecated in pandas 0.20 and removed
# in pandas 1.0; on this integer-labelled frame both select the same row.
mol = Chem.MolFromSmiles(cpdsDF.loc[2, 'smiles'])

# Murcko scaffold of the example molecule, and the generic form of that scaffold.
scaffold = MurckoScaffold.GetScaffoldForMol(mol)
generic = MurckoScaffold.MakeScaffoldGeneric(MurckoScaffold.GetScaffoldForMol(mol))

# Molecule, scaffold and generic scaffold side by side.
Draw.MolsToGridImage([mol, scaffold, generic])

# Annotate every compound with its scaffold; the scaffold SMILES is read back
# below from the 'Murcko_SMILES' column.
PandasTools.AddMurckoToFrame(cpdsDF)

# Align the molecule depictions in 'ROMol' to their scaffold.
PandasTools.AlignToScaffold(cpdsDF, molCol='ROMol', scaffoldCol='Murcko_SMILES')

cpdsDF.head(1)

# Count compounds per scaffold, most frequent scaffold first.
# DataFrame.sort() was removed in pandas 0.20; sort_values() is its replacement.
sortedScaffolds = (cpdsDF.groupby(['Murcko_SMILES'])
                   .count()
                   .sort_values(smiles, ascending=False))

sortedScaffolds = sortedScaffolds[[smiles]]  # keep only the smiles column
sortedScaffolds = sortedScaffolds.rename(columns={smiles: 'count'})  # it holds the counts
# After groupby the scaffold SMILES live only in the index; copy them into a column.
sortedScaffolds['Murcko_SMILES'] = sortedScaffolds.index
# Replace the index by a 1-based rank.
sortedScaffolds.index = range(1, len(sortedScaffolds) + 1)
sortedScaffolds.head()

len(sortedScaffolds)

# Give the scaffold table a molecule column built from the scaffold SMILES.
PandasTools.AddMoleculeColumnToFrame(sortedScaffolds, smilesCol='Murcko_SMILES')

# Depict the eight most frequent scaffolds, labelled with their compound
# count; dropna() drops compounds without scaffold.
top_scaffold_rows = sortedScaffolds.dropna().head(8)
PandasTools.FrameToGridImage(top_scaffold_rows, legendsCol='count', molsPerRow=4)

# One particular scaffold, inspected in more detail below.
scaffold_of_interest = 'O=c1[nH]c2nc(Nc3ccccc3)ncc2cc1-c1ccccc1'
has_scaffold = cpdsDF['Murcko_SMILES'] == scaffold_of_interest

# First four compounds carrying that scaffold, labelled by compound id.
PandasTools.FrameToGridImage(cpdsDF[has_scaffold].head(4),
                             legendsCol=molnames, molsPerRow=4)

# Highest pIC50 among them; min(), mean(), sum() etc. work the same way,
# as this is an ordinary pandas Series.
cpdsDF[has_scaffold]['pIC50'].max()

# Turn on inline matplotlib output when running under IPython/Jupyter.  The
# bare "%matplotlib inline" magic is a SyntaxError in a .py file, so it is
# issued through the IPython API and skipped under a plain interpreter,
# where get_ipython does not exist.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

import pylab

# The eight most frequent scaffolds, in the order of the scaffold table.
top_scaffolds = sortedScaffolds['Murcko_SMILES'].head(8)

# One box plot per property: a box for each of the top scaffolds, followed by
# a final box for the compounds flagged as known drugs.
for column in ('pIC50', 'LE'):
    data = [cpdsDF[cpdsDF['Murcko_SMILES'] == x][column] for x in top_scaffolds]
    data.append(cpdsDF[cpdsDF['knownDrug'] == 'Yes'][column])

    pylab.boxplot(data)
    pylab.show()