#!/usr/bin/env python
# coding: utf-8

# Notebook export: computes pairwise Dice similarities between Morgan
# fingerprints (radius 2) of the molecules in the DrugBank structure SDF and
# writes them as gzipped TSV files (all pairs, and the "slim" subset).
# The `get_ipython()` calls require the script to run under IPython/Jupyter.

# In[27]:

import itertools
import gzip

import pandas
import rdkit.Chem
import rdkit.Chem.AllChem
import rdkit.DataStructs


# In[ ]:


# In[4]:

# Download DrugBank SDF file of structures
get_ipython().system(' wget --timestamping --directory-prefix download http://www.drugbank.ca/system/downloads/current/structures/all.sdf.zip')
get_ipython().system(' unzip -d download download/all.sdf.zip')
get_ipython().system(' rm download/all.sdf.zip')


# In[9]:

# Read SDF File
supplier = rdkit.Chem.SDMolSupplier('download/all.sdf')
# Entries that come back from the supplier as None are dropped.
molecules = [mol for mol in supplier if mol is not None]
len(molecules)


# In[21]:

# Calculate fingerprints, keyed by the DATABASE_ID property of each molecule.
# If two molecules share an ID, the later one wins (plain dict assignment).
# NOTE(review): newer RDKit releases appear to deprecate GetMorganFingerprint
# in favour of rdFingerprintGenerator -- confirm against the installed version.
fingerprints = {
    mol.GetProp('DATABASE_ID'): rdkit.Chem.AllChem.GetMorganFingerprint(mol, 2)
    for mol in molecules
}


# In[29]:

# Calculate pairwise compound similarities (each unordered pair once),
# rounded to 4 decimal places.
similarity_rows = [
    [id0, id1, round(rdkit.DataStructs.DiceSimilarity(fp0, fp1), 4)]
    for (id0, fp0), (id1, fp1) in itertools.combinations(fingerprints.items(), 2)
]


# In[30]:

# Create a DataFrame of pairwise similarities
similarity_df = pandas.DataFrame(similarity_rows, columns=['compound0', 'compound1', 'similarity'])
# Open the gzip stream in *text* mode: to_csv emits str, and a binary handle
# ('w') fails on Python 3 with pandas versions that do not accept binary
# handles. newline='' leaves line terminators exactly as pandas writes them.
with gzip.open('data/similarity.tsv.gz', 'wt', newline='') as write_file:
    similarity_df.to_csv(write_file, sep='\t', index=False)
similarity_df.head()


# In[44]:

# Save a similarity tsv with only compounds in our slim drugbank set
drugbank_slim_df = pandas.read_table('data/drugbank-slim.tsv')
slim_ids = set(drugbank_slim_df.drugbank_id)
# Keep a pair only when both of its compounds are in the slim set.
both_in_slim = similarity_df.compound0.isin(slim_ids) & similarity_df.compound1.isin(slim_ids)
similarity_slim_df = similarity_df[both_in_slim]
with gzip.open('data/similarity-slim.tsv.gz', 'wt', newline='') as write_file:
    similarity_slim_df.to_csv(write_file, sep='\t', index=False)
# Fraction of all pairs that survive the slim filter (notebook cell output).
float(len(similarity_slim_df)) / len(similarity_df)


# In[ ]:


# In[39]:

# histogram of similarities
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
plt.hist(similarity_df.similarity, 100);


# In[49]:

# histogram of similarities in slim subset
plt.hist(list(similarity_slim_df.similarity), 100);


# In[ ]: