"""Project the compounds tested against one ChEMBL target into 2D.

Pipeline: download the target's bioactivities, keep one row per parent
compound, keep only assays with enough compounds, fetch a SMILES string
for every compound, compute Morgan fingerprints, build a pairwise Jaccard
distance matrix, embed it in two dimensions with MDS and plot the result
coloured by assay.

The bare expressions (``data.head()``, ``data.shape`` ...) come from the
original notebook cells: they display a value when they are the last
statement of a notebook cell and have no effect when run as a script.
"""
import requests
import pandas as pd
# IPythonConsole is not referenced below; presumably imported for its
# notebook rendering side effects -- confirm before removing.
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import PandasTools
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import DataStructs
from scipy.spatial.distance import *
import numpy as np
from sklearn import manifold
from ggplot import *

# ChEMBL identifier of the target whose bioactivities are analysed.
TRGT = 'CHEMBL5024'
# An assay is kept only if at least this many (de-duplicated) compounds
# were measured in it.
MIN_COMPOUNDS_PER_ASSAY = 4
# Seconds to wait for the web service before giving up instead of hanging.
REQUEST_TIMEOUT = 30

# NOTE(review): these are the legacy "chemblws" endpoints -- confirm they
# are still served before relying on this script.
response = requests.get(
    'https://www.ebi.ac.uk/chemblws/targets/{0}/bioactivities.json'.format(TRGT),
    timeout=REQUEST_TIMEOUT)
# Fail with a clear HTTP error rather than an obscure JSON/KeyError below.
response.raise_for_status()
data = pd.DataFrame(response.json()['bioactivities'])
data.head()

# Keep a single bioactivity row per parent compound.
data = data.drop_duplicates(['parent_cmpd_chemblid'])
data.shape

# Number of remaining rows (compounds) per assay.  ``size()`` is used
# because the grouping key is not available as a column after
# ``groupby(...).count()`` on current pandas.
assays = data.groupby('assay_chemblid').size()
goodassays = list(assays[assays >= MIN_COMPOUNDS_PER_ASSAY].index)
# ``.loc`` replaces the removed ``.ix`` indexer; ``.copy()`` makes the
# filtered frame independent so the column added below is not written
# into a view of the unfiltered frame.
data = data.loc[data['assay_chemblid'].isin(goodassays)].copy()
data.shape


def fetch_SMILES(chemblid):
    """Return the SMILES string of the ChEMBL compound *chemblid*.

    Performs one HTTP GET against the ChEMBL compound web service and
    reads ``['compound']['smiles']`` from the JSON reply.

    Raises:
        requests.HTTPError: if the service answers with an error status.
        requests.Timeout: if no answer arrives within REQUEST_TIMEOUT.
        KeyError: if the reply does not contain a SMILES entry.
    """
    reply = requests.get(
        'https://www.ebi.ac.uk/chemblws/compounds/{}.json'.format(chemblid),
        timeout=REQUEST_TIMEOUT)
    reply.raise_for_status()
    return str(reply.json()['compound']['smiles'])


# One web request per compound.
data['SMILES'] = data['parent_cmpd_chemblid'].map(fetch_SMILES)
# Adds the molecule column ('ROMol') that is selected just below.
PandasTools.AddMoleculeColumnToFrame(data, smilesCol='SMILES')

# Independent copy: the X/Y columns are added to ``mols`` further down.
mols = data[['parent_cmpd_chemblid', 'name_in_reference', 'SMILES',
             'ROMol', 'assay_chemblid']].copy()
mols.shape
mols.head()

# Morgan fingerprints (radius 2, 2048 bits), one per molecule.
fps = [Chem.GetMorganFingerprintAsBitVect(m, 2, nBits=2048)
       for m in mols['ROMol']]
# Square matrix of pairwise Jaccard distances between the fingerprints.
dist_mat = squareform(pdist(fps, 'jaccard'))
pd.DataFrame(dist_mat,
             columns=mols['parent_cmpd_chemblid'],
             index=mols['parent_cmpd_chemblid']).head()

# 2D embedding of the precomputed distances; fixed seed for repeatability.
mds = manifold.MDS(n_components=2, dissimilarity="precomputed",
                   random_state=3, n_jobs=2)
results = mds.fit(dist_mat)
coords = results.embedding_

# First / second embedding coordinate of every compound, in row order.
mols['X'] = [c[0] for c in coords]
mols['Y'] = [c[1] for c in coords]
mols.head()

# NOTE(review): ``rcParams`` and ``scatter`` are not imported explicitly in
# this file; presumably they came from ``%pylab inline`` in the original
# notebook (or from one of the wildcard imports) -- confirm.
rcParams['figure.figsize'] = 12, 12
scatter(mols['X'], mols['Y'])
ggplot(aes(x='X', y='Y', colour='assay_chemblid'), data=mols) + geom_point()

# TODO: D3 interactive visualisation