Back in September there were a couple of posts on the ChEMBL blog about identifying related documents. George and Mark started with an overview of the method and a great graphic of the results, and then John followed up with slides for those of us who are more graphically oriented. The general idea was to look at overlap (Tanimoto score) in compounds, targets, and text between the documents to get something of a holistic view.
Here I'm going to explore a different approach: just looking at similarity between compounds in the papers.
I'll use the RDKit Morgan2 fingerprint for this example
This is going to use a mix of Python and SQL and will take advantage of Catherine Devlin's excellent SQL magic for IPython. It's available from PyPI. This is my first time playing with ipython-sql and I'm super impressed.
# Notebook setup: RDKit for cheminformatics, psycopg2/ipython-sql for
# talking to the local ChEMBL PostgreSQL instance, and requests +
# ElementTree for the PubMed lookups further down.
from rdkit import Chem
import psycopg2
from rdkit.Chem import Draw,PandasTools
from rdkit.Chem.Draw import IPythonConsole
from rdkit import rdBase
from __future__ import print_function
import requests
from xml.etree import ElementTree
import pandas as pd
# Load the ipython-sql extension that provides the %sql magic used below.
%load_ext sql
print(rdBase.rdkitVersion)
2014.03.1pre
I started out by generating a table in my chembl_16 instance containing all pairs of molecules from papers in 2012 that have at least 0.8 MFP2 Tanimoto similarity to each other:
-- All 2012 activities joined to their documents.
create temporary table acts_2012 as select * from (select doc_id from docs where year=2012) t1 join (select doc_id,activity_id,molregno from activities) t2 using (doc_id);
-- Morgan (MFP2) fingerprints for just the compounds appearing in 2012 papers.
create temporary table fps_2012 as select molregno,mfp2 from rdk.fps join (select distinct molregno from acts_2012) tbl using (molregno);
-- GiST index so the % similarity operator below can use the fingerprint index.
create index ffp2_idx on fps_2012 using gist (mfp2);
-- The % operator only returns pairs at or above this Tanimoto threshold.
set rdkit.tanimoto_threshold=0.8;
create schema papers_pairs;
-- Cross join: every compound pair with MFP2 Tanimoto >= 0.8, with its similarity.
create table papers_pairs.pairs_2012 as select t1.molregno molregno_1, t2.molregno molregno_2,tanimoto_sml(t1.mfp2,t2.mfp2) sim from (select * from fps_2012) t1 cross join fps_2012 t2 where t1.mfp2%t2.mfp2;
This last bit, which in principle needs to do around 5.1 billion similarity comparisons, takes a while. For me it was about an hour and 20 minutes. This would be a great opportunity to use a special-purpose similarity calculator like Andrew Dalke's chemfp. That's for another blog post.
Now that we have the pairs, add the document info back in:
-- Attach document ids to each high-similarity pair. doc_id_1 > doc_id_2
-- keeps each document pair only once; molregno_1 != molregno_2 drops
-- matches where the same compound appears in both papers.
create table papers_pairs.pairs_and_docs_2012 as select distinct * from (select p1.doc_id doc_id_1,p2.doc_id doc_id_2,p1.molregno molregno_1,p2.molregno molregno_2 from acts_2012 p1 cross join acts_2012 p2 where p1.doc_id>p2.doc_id and p1.molregno!=p2.molregno) ttbl join papers_pairs.pairs_2012 using (molregno_1,molregno_2);
And now let's see what we got:
# How many 0.8+ similarity compound pairs were found overall?
%sql postgresql://localhost/chembl_16 \
select count(*) from papers_pairs.pairs_2012;
1 rows affected.
count |
---|
225703 |
# How many of those pairs span two different 2012 documents?
%sql postgresql://localhost/chembl_16 \
select count(*) from papers_pairs.pairs_and_docs_2012;
1 rows affected.
count |
---|
13401 |
# Aggregate to the document-pair level: for each pair of documents, the
# number of high-similarity compound pairs and their average similarity.
data = %sql \
select (doc_id_1,doc_id_2) doc_ids,count(molregno_1) pair_count, avg(sim) avg_sim \
from papers_pairs.pairs_and_docs_2012 \
group by (doc_id_1, doc_id_2) \
order by pair_count desc
7073 rows affected.
The result object from the SQL query can be trivially converted into a Pandas data frame:
# ipython-sql result sets convert directly to pandas DataFrames.
df = data.DataFrame()
df.head()
doc_ids | pair_count | avg_sim | |
---|---|---|---|
0 | (67106,61208) | 248 | 0.854043190537 |
1 | (62996,60628) | 190 | 0.830029102851 |
2 | (61096,60864) | 161 | 0.852017644705 |
3 | (65439,65426) | 102 | 0.863242558624 |
4 | (67047,62125) | 102 | 0.841522325209 |
# Echo the frame's summary: 7073 document pairs, 3 columns.
df
<class 'pandas.core.frame.DataFrame'> Int64Index: 7073 entries, 0 to 7072 Data columns (total 3 columns): doc_ids 7073 non-null values pair_count 7073 non-null values avg_sim 7073 non-null values dtypes: object(3)
Get the number of unique molecules in each document:
# Distinct-compound count per 2012 document, kept as a
# doc_id -> count dict for the loops below.
doc_counts = %sql \
select doc_id,count(distinct molregno) num_cmpds from (select doc_id,molregno from docs join activities using (doc_id) where year=2012) tbl group by (doc_id);
doc_counts_d=dict(list(doc_counts))
3661 rows affected.
And the number of molecules in common in the document pairs.
# turn off row count printing
%config SqlMagic.feedback = False
# For each document pair, count the compounds (molregnos) the two papers
# actually share, using a SQL intersect.
nInCommon=[]
for k in df.doc_ids:
# doc_ids came back from the DB as text like "(67106,61208)"; eval() turns
# it into a tuple. NOTE(review): fine on our own query results, but eval
# on externally sourced text would be unsafe.
docid1,docid2 = eval(k)
d = %sql select count(*) from (select distinct molregno from activities where doc_id=:docid1 intersect\
select distinct molregno from activities where doc_id=:docid2)t
nInCommon.append(d[0][0])
df['nInCommon']=nInCommon
And add columns for the compound tanimoto (based on number of compounds in common) and the fraction of possible compound pairs that are highly similar.
# Look up, for each document pair, how many distinct compounds each of the
# two documents contains (from doc_counts_d built above).
# (The original cell also created an unused `tanis` list; removed.)
nDoc1 = []
nDoc2 = []
for k in df.doc_ids:
    # doc_ids holds text like "(67106,61208)" produced by our own query.
    docid1, docid2 = eval(k)
    nDoc1.append(doc_counts_d[docid1])
    nDoc2.append(doc_counts_d[docid2])
df['nDoc1'] = nDoc1
df['nDoc2'] = nDoc2
# Compound-identity Tanimoto: |intersection| / |union| of the two papers'
# compound sets.
df['compond_id_tani'] = df.apply(
    lambda row: float(row['nInCommon']) / (row['nDoc1'] + row['nDoc2'] - row['nInCommon']),
    axis=1)
# Fraction of possible cross-document compound pairs that are highly similar.
# NOTE(review): the `|10000` is a bitwise OR applied to the product in the
# denominator (| binds looser than *). It reproduces the published numbers
# (e.g. 248/((47-32)*(40-32)|10000) = 248/10104 = 0.024545) and appears to
# act as a guard against a zero denominator -- confirm the intent before
# reusing this formula.
df['frac_high_sim'] = df.apply(
    lambda row: float(row['pair_count']) / ((row['nDoc1'] - row['nInCommon']) * (row['nDoc2'] - row['nInCommon']) | 10000),
    axis=1)
Note that the possible presence of duplicates in the sets and the fact that duplicates were dropped when calculating the high-similarity pairs means that we need to subtract out the number in common from each term in the denominator of frac_high_sim.
# The frame now carries the shared-compound counts and the derived scores.
df.head()
doc_ids | pair_count | avg_sim | nInCommon | nDoc1 | nDoc2 | compond_id_tani | frac_high_sim | |
---|---|---|---|---|---|---|---|---|
0 | (67106,61208) | 248 | 0.854043190537 | 32 | 47 | 40 | 0.581818 | 0.024545 |
1 | (62996,60628) | 190 | 0.830029102851 | 40 | 49 | 40 | 0.816327 | 0.019000 |
2 | (61096,60864) | 161 | 0.852017644705 | 16 | 26 | 71 | 0.197531 | 0.016039 |
3 | (65439,65426) | 102 | 0.863242558624 | 1 | 22 | 15 | 0.027778 | 0.010161 |
4 | (67047,62125) | 102 | 0.841522325209 | 2 | 19 | 32 | 0.040816 | 0.009963 |
# Rank document pairs by the fraction of highly similar compound pairs.
# NOTE: DataFrame.sort() was removed in pandas 0.20; sort_values() is the
# modern equivalent.
df.sort('frac_high_sim',ascending=False).head(10)
doc_ids | pair_count | avg_sim | nInCommon | nDoc1 | nDoc2 | compond_id_tani | frac_high_sim | |
---|---|---|---|---|---|---|---|---|
0 | (67106,61208) | 248 | 0.854043190537 | 32 | 47 | 40 | 0.581818 | 0.024545 |
1 | (62996,60628) | 190 | 0.830029102851 | 40 | 49 | 40 | 0.816327 | 0.019000 |
2 | (61096,60864) | 161 | 0.852017644705 | 16 | 26 | 71 | 0.197531 | 0.016039 |
3 | (65439,65426) | 102 | 0.863242558624 | 1 | 22 | 15 | 0.027778 | 0.010161 |
5 | (61777,61771) | 100 | 0.875883535125 | 5 | 25 | 31 | 0.098039 | 0.009992 |
4 | (67047,62125) | 102 | 0.841522325209 | 2 | 19 | 32 | 0.040816 | 0.009963 |
7 | (61733,60448) | 76 | 0.868513977117 | 0 | 21 | 21 | 0.000000 | 0.007474 |
6 | (66889,65537) | 92 | 0.833532600736 | 7 | 138 | 39 | 0.041176 | 0.006483 |
8 | (66968,61801) | 66 | 0.815926490544 | 1 | 27 | 9 | 0.028571 | 0.006476 |
9 | (65787,65440) | 64 | 0.872092468112 | 3 | 36 | 32 | 0.046154 | 0.006291 |
# Same ranking, restricted to documents that each have more than 10 compounds.
# NOTE: DataFrame.sort() was removed in pandas 0.20; use sort_values() today.
df[(df.nDoc1>10)&(df.nDoc2>10)].sort('frac_high_sim',ascending=False).head(10)
doc_ids | pair_count | avg_sim | nInCommon | nDoc1 | nDoc2 | compond_id_tani | frac_high_sim | |
---|---|---|---|---|---|---|---|---|
0 | (67106,61208) | 248 | 0.854043190537 | 32 | 47 | 40 | 0.581818 | 0.024545 |
1 | (62996,60628) | 190 | 0.830029102851 | 40 | 49 | 40 | 0.816327 | 0.019000 |
2 | (61096,60864) | 161 | 0.852017644705 | 16 | 26 | 71 | 0.197531 | 0.016039 |
3 | (65439,65426) | 102 | 0.863242558624 | 1 | 22 | 15 | 0.027778 | 0.010161 |
5 | (61777,61771) | 100 | 0.875883535125 | 5 | 25 | 31 | 0.098039 | 0.009992 |
4 | (67047,62125) | 102 | 0.841522325209 | 2 | 19 | 32 | 0.040816 | 0.009963 |
7 | (61733,60448) | 76 | 0.868513977117 | 0 | 21 | 21 | 0.000000 | 0.007474 |
6 | (66889,65537) | 92 | 0.833532600736 | 7 | 138 | 39 | 0.041176 | 0.006483 |
9 | (65787,65440) | 64 | 0.872092468112 | 3 | 36 | 32 | 0.046154 | 0.006291 |
10 | (66833,66017) | 61 | 0.862428480606 | 21 | 42 | 75 | 0.218750 | 0.006034 |
Quick check: use the pubmed API to pull back titles
# For the top ten document pairs, fetch the two PubMed titles so we can
# eyeball whether the papers really are related.
titlePairs=[]
for row in df[(df.nDoc1>10)&(df.nDoc2>10)].sort('frac_high_sim',ascending=False).head(10).doc_ids:
# doc_ids is text like "(67106,61208)" produced by our own query above.
tpl = eval(row)
did1,did2=tpl
pmids=%sql select doc_id,pubmed_id,title from docs where doc_id in (:did1,:did2)
tpl = pmids[0][1],pmids[1][1]
# Some ChEMBL docs have no PubMed id; skip those pairs.
if tpl[0] is None:
print('no pmid found for item: %s'%(str(pmids[0])))
continue
if tpl[1] is None:
print('no pmid found for item: %s'%(str(pmids[1])))
continue
# NCBI esummary returns XML; pull out the two article titles by attribute.
txt=requests.get('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=%d,%d'%tpl).text
et = ElementTree.fromstring(txt.encode('utf-8'))
ts=[x.text for x in et.findall(".//*[@Name='Title']")]
titlePairs.append(ts)
print(str(tpl))
print(' '+ts[0])
print(' '+ts[1])
(22365562L, 23124213L) Fluorinated dual antithrombotic compounds based on 1,4-benzoxazine scaffold. Novel 1,4-benzoxazine and 1,4-benzodioxine inhibitors of angiogenesis. (22119125L, 22850214L) From COX-2 inhibitor nimesulide to potent anti-cancer agent: synthesis, in vitro, in vivo and pharmacokinetic evaluation. Identification of selective tubulin inhibitors as potential anti-trypanosomal agents. (22206869L, 22168134L) Semisynthetic neoboutomellerone derivatives as ubiquitin-proteasome pathway inhibitors. Proteasome inhibitors from Neoboutonia melleri. (22975302L, 22995619L) Serum stability of selected decapeptide agonists of KISS1R using pseudopeptides. Trypsin resistance of a decapeptide KISS1R agonist containing an Nω-methylarginine substitution. (22503248L, 22503453L) Identification of a series of 1,3,4-oxadiazol-2-amines as potent alpha-7 agonists with efficacy in the novel object recognition model of cognition. The discovery of 2-fluoro-N-(3-fluoro-4-(5-((4-morpholinobutyl)amino)-1,3,4-oxadiazol-2-yl)phenyl)benzamide, a full agonist of the alpha-7 nicotinic acetylcholine receptor showing efficacy in the novel object recognition model of cognition enhancement. (22471376L, 23142614L) Discovery of novel urea-based hepatitis C protease inhibitors with high potency against protease-inhibitor-resistant mutants. Synthesis and antiviral activity of novel HCV NS3 protease inhibitors with P4 capping groups. (22063755L, 22465634L) Anti-AIDS agents 85. Design, synthesis, and evaluation of 1R,2R-dicamphanoyl-3,3-dimethyldihydropyrano-[2,3-c]xanthen-7(1H)-one (DCX) derivatives as novel anti-HIV agents. Anti-AIDS agents 89. Identification of DCX derivatives as anti-HIV and chemosensitizing dual function agents to overcome P-gp-mediated drug resistance for AIDS therapy. 
no pmid found for item: (65537, None, None) (22995620L, 22650305L) Identification of a potent and metabolically stable series of fluorinated diphenylpyridylethanamine-based cholesteryl ester transfer protein inhibitors. Diphenylpyridylethanamine (DPPE) derivatives as cholesteryl ester transfer protein (CETP) inhibitors. (22989363L, 22320402L) Flavonoid dimers as novel, potent antileishmanial agents. Amine linked flavonoid dimers as modulators for P-glycoprotein-based multidrug resistance: structure-activity relationship and mechanism of modulation.
Those look pretty good.
Let's check ChEMBL to see what it says:
# Build ChEMBL document-report links for the same top ten pairs so the
# "related documents" flags can be checked by hand.
for row in df[(df.nDoc1>10)&(df.nDoc2>10)].sort('frac_high_sim',ascending=False).head(10).doc_ids:
tpl = eval(row)
did1,did2=tpl
pmids=%sql select doc_id,pubmed_id,chembl_id from docs where doc_id in (:did1,:did2)
print('https://www.ebi.ac.uk/chembl/doc/inspect/%s - https://www.ebi.ac.uk/chembl/doc/inspect/%s'%(pmids[0][2],pmids[1][2]))
https://www.ebi.ac.uk/chembl/doc/inspect/CHEMBL1949509 - https://www.ebi.ac.uk/chembl/doc/inspect/CHEMBL2216753 https://www.ebi.ac.uk/chembl/doc/inspect/CHEMBL1932938 - https://www.ebi.ac.uk/chembl/doc/inspect/CHEMBL2069209 https://www.ebi.ac.uk/chembl/doc/inspect/CHEMBL1938254 - https://www.ebi.ac.uk/chembl/doc/inspect/CHEMBL1949544 https://www.ebi.ac.uk/chembl/doc/inspect/CHEMBL2150961 - https://www.ebi.ac.uk/chembl/doc/inspect/CHEMBL2151012 https://www.ebi.ac.uk/chembl/doc/inspect/CHEMBL2021886 - https://www.ebi.ac.uk/chembl/doc/inspect/CHEMBL2021786 https://www.ebi.ac.uk/chembl/doc/inspect/CHEMBL2034968 - https://www.ebi.ac.uk/chembl/doc/inspect/CHEMBL2202978 https://www.ebi.ac.uk/chembl/doc/inspect/CHEMBL1926665 - https://www.ebi.ac.uk/chembl/doc/inspect/CHEMBL2021899 https://www.ebi.ac.uk/chembl/doc/inspect/CHEMBL2150910 - https://www.ebi.ac.uk/chembl/doc/inspect/CHEMBL2203235 https://www.ebi.ac.uk/chembl/doc/inspect/CHEMBL2150934 - https://www.ebi.ac.uk/chembl/doc/inspect/CHEMBL2163208 https://www.ebi.ac.uk/chembl/doc/inspect/CHEMBL2169835 - https://www.ebi.ac.uk/chembl/doc/inspect/CHEMBL2203168
Following those links (I did it manually... didn't feel like writing a web scraper), it looks like none of these documents are flagged as being related. Many of them don't have related docs flagged at all. So maybe this is another interesting way to find related docs.
# Drill into the top-ranked pair: pull SMILES for its high-similarity
# compound pairs and draw the first three pairs side by side
# (molsPerRow=2 puts each pair on its own row).
d1,d2=67106,61208
data = %sql\
select molregno_1,t1.m m1,molregno_2,t2.m m2,sim from papers_pairs.pairs_and_docs_2012 \
join rdk.mols t1 on (molregno_1=t1.molregno) \
join rdk.mols t2 on (molregno_2=t2.molregno) \
where doc_id_1=:d1 and doc_id_2=:d2
data = data.DataFrame()
# Add RDKit molecule columns built from the two SMILES columns.
PandasTools.AddMoleculeColumnToFrame(data,smilesCol='m1',molCol='mol1')
PandasTools.AddMoleculeColumnToFrame(data,smilesCol='m2',molCol='mol2')
# Interleave the two molecules of each pair for the grid drawing.
rows=[]
for m1,m2 in zip(data['mol1'],data['mol2']):
rows.append(m1)
rows.append(m2)
Draw.MolsToGridImage(rows[:6],molsPerRow=2)
Yeah, those are definitely related to each other.
Those two datasets share a lot of molecules in common, what about another one where there's less identity overlap?
# Same drill-down for the pair that shares zero identical compounds
# (nInCommon=0 above) but still has many highly similar pairs.
d1,d2=61733,60448
data = %sql\
select molregno_1,t1.m m1,molregno_2,t2.m m2,sim from papers_pairs.pairs_and_docs_2012 \
join rdk.mols t1 on (molregno_1=t1.molregno) \
join rdk.mols t2 on (molregno_2=t2.molregno) \
where doc_id_1=:d1 and doc_id_2=:d2
data = data.DataFrame()
PandasTools.AddMoleculeColumnToFrame(data,smilesCol='m1',molCol='mol1')
PandasTools.AddMoleculeColumnToFrame(data,smilesCol='m2',molCol='mol2')
# Interleave each pair's two molecules for the grid drawing.
rows=[]
for m1,m2 in zip(data['mol1'],data['mol2']):
rows.append(m1)
rows.append(m2)
Draw.MolsToGridImage(rows[:6],molsPerRow=2)
That one may be a curation problem.
# Same drill-down for another low-identity-overlap pair.
d1,d2=66968,61801
data = %sql\
select molregno_1,t1.m m1,molregno_2,t2.m m2,sim from papers_pairs.pairs_and_docs_2012 \
join rdk.mols t1 on (molregno_1=t1.molregno) \
join rdk.mols t2 on (molregno_2=t2.molregno) \
where doc_id_1=:d1 and doc_id_2=:d2
data = data.DataFrame()
PandasTools.AddMoleculeColumnToFrame(data,smilesCol='m1',molCol='mol1')
PandasTools.AddMoleculeColumnToFrame(data,smilesCol='m2',molCol='mol2')
# Interleave each pair's two molecules for the grid drawing.
rows=[]
for m1,m2 in zip(data['mol1'],data['mol2']):
rows.append(m1)
rows.append(m2)
Draw.MolsToGridImage(rows[:6],molsPerRow=2)
hmmm, maybe that one too
# Same drill-down for document pair 65787/65440.
d1,d2=65787,65440
data = %sql\
select molregno_1,t1.m m1,molregno_2,t2.m m2,sim from papers_pairs.pairs_and_docs_2012 \
join rdk.mols t1 on (molregno_1=t1.molregno) \
join rdk.mols t2 on (molregno_2=t2.molregno) \
where doc_id_1=:d1 and doc_id_2=:d2
data = data.DataFrame()
PandasTools.AddMoleculeColumnToFrame(data,smilesCol='m1',molCol='mol1')
PandasTools.AddMoleculeColumnToFrame(data,smilesCol='m2',molCol='mol2')
# Interleave each pair's two molecules for the grid drawing.
rows=[]
for m1,m2 in zip(data['mol1'],data['mol2']):
rows.append(m1)
rows.append(m2)
Draw.MolsToGridImage(rows[:6],molsPerRow=2)
# Same drill-down for document pair 66889/65537 (the one with no PubMed id).
d1,d2=66889,65537
data = %sql\
select molregno_1,t1.m m1,molregno_2,t2.m m2,sim from papers_pairs.pairs_and_docs_2012 \
join rdk.mols t1 on (molregno_1=t1.molregno) \
join rdk.mols t2 on (molregno_2=t2.molregno) \
where doc_id_1=:d1 and doc_id_2=:d2
data = data.DataFrame()
PandasTools.AddMoleculeColumnToFrame(data,smilesCol='m1',molCol='mol1')
PandasTools.AddMoleculeColumnToFrame(data,smilesCol='m2',molCol='mol2')
# Interleave each pair's two molecules for the grid drawing.
rows=[]
for m1,m2 in zip(data['mol1'],data['mol2']):
rows.append(m1)
rows.append(m2)
Draw.MolsToGridImage(rows[:6],molsPerRow=2)