import itertools
import gzip
import pandas
import rdkit.Chem
import rdkit.Chem.AllChem
import rdkit.DataStructs
# Download DrugBank SDF file of structures
# (IPython shell escapes: each `!` line is handed to the system shell, not run as Python.)
# Fetch the zipped SDF into ./download; with --timestamping wget skips the
# transfer when the server copy is no newer than the local file.
! wget --timestamping --directory-prefix download http://www.drugbank.ca/system/downloads/current/structures/all.sdf.zip
# Extract the archive into ./download (this produces download/all.sdf, read below).
! unzip -d download download/all.sdf.zip
# Remove the archive once it has been extracted.
# NOTE(review): deleting the zip leaves --timestamping nothing to compare against
# on the next run, so the file would be downloaded again -- confirm this is intended.
! rm download/all.sdf.zip
--2015-05-18 14:03:23-- http://www.drugbank.ca/system/downloads/current/structures/all.sdf.zip Resolving www.drugbank.ca (www.drugbank.ca)... 162.243.242.197 Connecting to www.drugbank.ca (www.drugbank.ca)|162.243.242.197|:80... connected. HTTP request sent, awaiting response... 200 OK Length: 4362948 (4.2M) [application/zip] Server file no newer than local file ‘download/all.sdf.zip’ -- not retrieving. Archive: download/all.sdf.zip inflating: download/all.sdf
# Load the DrugBank structures from the extracted SDF file.
supplier = rdkit.Chem.SDMolSupplier('download/all.sdf')

# Keep only entries the supplier actually returned; entries that come back as
# None (presumably records RDKit could not parse) are skipped.
molecules = []
for entry in supplier:
    if entry is not None:
        molecules.append(entry)

# Number of molecules retained
len(molecules)
6743
# Build a mapping from each molecule's DATABASE_ID property to its Morgan
# fingerprint (second argument 2, i.e. the radius per RDKit's API).
fingerprints = {
    mol.GetProp('DATABASE_ID'): rdkit.Chem.AllChem.GetMorganFingerprint(mol, 2)
    for mol in molecules
}
# For every unordered pair of compounds, record both ids together with the
# Dice similarity of their fingerprints, rounded to four decimal places.
similarity_rows = [
    [id_a, id_b, round(rdkit.DataStructs.DiceSimilarity(fp_a, fp_b), 4)]
    for (id_a, fp_a), (id_b, fp_b)
    in itertools.combinations(fingerprints.items(), 2)
]
# Create a DataFrame of pairwise similarities: one row per compound pair with
# the two compound ids and their similarity score.
similarity_df = pandas.DataFrame(
    similarity_rows,
    columns=['compound0', 'compound1', 'similarity'],
)

# Write the table as a gzip-compressed TSV. Compression is delegated to pandas
# rather than passing a handle from gzip.open(..., 'w'): that handle is binary,
# and to_csv emits text, which raises TypeError on Python 3 with older pandas.
similarity_df.to_csv(
    'data/similarity.tsv.gz',
    sep='\t',
    index=False,
    compression='gzip',
)

# Preview the first rows
similarity_df.head()
| | compound0 | compound1 | similarity |
---|---|---|---|
0 | DB05107 | DB08426 | 0.0966 |
1 | DB05107 | DB05105 | 0.0441 |
2 | DB05107 | DB05104 | 0.1457 |
3 | DB05107 | DB08423 | 0.1268 |
4 | DB05107 | DB05102 | 0.1000 |
5 rows × 3 columns
# Save a similarity tsv with only compounds in our slim drugbank set
drugbank_slim_df = pandas.read_table('data/drugbank-slim.tsv')
slim_ids = set(drugbank_slim_df.drugbank_id)

# Keep a pair only when BOTH of its compounds belong to the slim set.
both_in_slim = (
    similarity_df.compound0.isin(slim_ids)
    & similarity_df.compound1.isin(slim_ids)
)
similarity_slim_df = similarity_df[both_in_slim]

# Write the subset as a gzip-compressed TSV. Compression is delegated to pandas
# rather than passing a handle from gzip.open(..., 'w'): that handle is binary,
# and to_csv emits text, which raises TypeError on Python 3 with older pandas.
similarity_slim_df.to_csv(
    'data/similarity-slim.tsv.gz',
    sep='\t',
    index=False,
    compression='gzip',
)

# Fraction of all compound pairs that survive the slim filter
float(len(similarity_slim_df)) / len(similarity_df)
0.05267679727458775
# histogram of similarities
import matplotlib.pyplot as plt
%matplotlib inline
plt.hist(similarity_df.similarity, 100);
# Histogram (100 bins) of the similarity scores within the slim subset.
# The column is handed to matplotlib as a plain list rather than a Series
# (presumably to avoid label-based indexing on the filtered frame -- confirm).
plt.hist(similarity_slim_df['similarity'].tolist(), bins=100);