import pickle
import pandas as pd
# NOTE(review): `py` is never imported in the visible source (presumably
# `plotly.offline` from an earlier cell -- confirm). As shown, this line
# raises NameError.
py.init_notebook_mode(connected=True)
# Load the two result archives. Each pickle unpacks into four per-dataset
# mappings, in the order (drugmatrix, pubchem, ds1, serotonin).
# The files are opened in `with` blocks so the handles are closed as soon as
# loading finishes (the bare open() calls used previously were never closed).
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('../data/unbalanced_data_results.pkl', 'rb') as inf:
    (drugmatrix_archive, pubchem_archive,
     ds1_archive, serotonin_archive) = pickle.load(inf)
with open('../data/unbalanced_data_results.sklearn.pkl', 'rb') as inf:
    (drugmatrix_archive2, pubchem_archive2,
     ds1_archive2, serotonin_archive2) = pickle.load(inf)
# Flatten the first set of archives into one table with one row per
# (dataset, model name). Every archive value unpacks as (init, final):
#   init[1]  -> kappa_init     init[-1] -> ROC
#   final[1] -> kappa_final    final[0] -> threshold
data = []
labelled_archives = (
    ('drugmatrix', drugmatrix_archive),
    ('pubchem', pubchem_archive),
    ('DS1', ds1_archive),
    ('serotonin', serotonin_archive),
)
for dataset_label, archive in labelled_archives:
    for nm, (init, final) in archive.items():
        data.append([dataset_label, nm, init[1], init[-1], final[1], final[0]])
df1 = pd.DataFrame(
    data,
    columns=['dataset', 'name', 'kappa_init', 'ROC', 'kappa_final', 'threshold'],
)
# Flatten the second set of archives the same way. Only the first element of
# each value is used: v[0][1] -> kappa_BRF, v[0][-1] -> ROC_BRF.
data = []
labelled_archives2 = (
    ('drugmatrix', drugmatrix_archive2),
    ('pubchem', pubchem_archive2),
    ('DS1', ds1_archive2),
    ('serotonin', serotonin_archive2),
)
for dataset_label, archive in labelled_archives2:
    for nm, v in archive.items():
        first = v[0]
        data.append([dataset_label, nm, first[1], first[-1]])
df2 = pd.DataFrame(data, columns=['dataset', 'name', 'kappa_BRF', 'ROC_BRF'])

# Join the two tables on (dataset, name); merge's default join keeps only
# the rows present in both.
df = df1.merge(df2, on=['dataset', 'name'])
df.head()
dataset | name | kappa_init | ROC | kappa_final | threshold | kappa_BRF | ROC_BRF | |
---|---|---|---|---|---|---|---|---|
0 | drugmatrix | CHEMBL1909215 | 0.000000 | 0.829830 | 0.416906 | 0.15 | 0.248659 | 0.989986 |
1 | drugmatrix | CHEMBL1909211 | 0.365774 | 0.935224 | 0.751379 | 0.25 | 0.551367 | 0.990435 |
2 | drugmatrix | CHEMBL1909210 | 0.000000 | 0.902642 | 0.281530 | 0.20 | 0.325429 | 0.987178 |
3 | drugmatrix | CHEMBL1909209 | 0.307850 | 0.920535 | 0.610599 | 0.20 | 0.409567 | 0.992174 |
4 | drugmatrix | CHEMBL1909204 | 0.000000 | 0.890900 | 0.384958 | 0.10 | 0.256293 | 0.985219 |
# Human-readable "<dataset>-<name>" label for each row (used in hover tooltips).
df['label'] = ["-".join(pair) for pair in zip(df['dataset'], df['name'])]
from bokeh.io import output_notebook, show
from bokeh.models import ColumnDataSource, HoverTool, ColorBar
from bokeh.palettes import plasma
from bokeh.plotting import figure
from bokeh.transform import transform, linear_cmap
from bokeh.io import export_png
# Send Bokeh output to the notebook.
output_notebook()

# Scatter plot of kappa_init (x) against kappa_final (y), one point per row
# of df, coloured by the ROC column.
source = ColumnDataSource(data=df)
hover = HoverTool(tooltips=[
    ("AUC", "(@ROC,@ROC_BRF)"),
    ("(x,y)", "(@kappa_init, @kappa_final)"),
    ('desc', '@label'),
])
# Series.min()/max() skip NaN; the builtin min()/max() give order-dependent
# results if any ROC value is NaN.
mapper = linear_cmap(field_name='ROC', palette=plasma(256),
                     low=df.ROC.min(), high=df.ROC.max())
# NOTE(review): plot_width/plot_height were replaced by width/height in
# Bokeh 3.0; kept as-is for the Bokeh version this notebook targets.
p = figure(plot_width=600, plot_height=400, tools=[hover], title="Kappas")
cp = p.circle('kappa_init', 'kappa_final', size=7, source=source,
              color=mapper, line_width=1, line_color='darkgray')
# y = x reference line: points above it have kappa_final > kappa_init.
p.line((0, 1), (0, 1))
p.xaxis.axis_label = 'Kappa(init)'
p.yaxis.axis_label = 'Kappa(shifted)'
color_bar = ColorBar(color_mapper=mapper['transform'], width=8, location=(0, 0))
p.add_layout(color_bar, 'right')
show(p)
# `filename` is passed by keyword: it is keyword-only in Bokeh 2.x, and the
# keyword form is also accepted by older releases. The trailing semicolon
# suppresses the returned path in the notebook output.
export_png(p, filename="/tmp/p1.png");
# Scatter plot of kappa_final (x) against kappa_BRF (y), one point per row
# of df, coloured by the ROC column.
source = ColumnDataSource(data=df)
hover = HoverTool(tooltips=[
    ("AUC", "(@ROC,@ROC_BRF)"),
    ("(x,y)", "(@kappa_final, @kappa_BRF)"),
    ('desc', '@label'),
])
# Series.min()/max() skip NaN; the builtin min()/max() give order-dependent
# results if any ROC value is NaN.
mapper = linear_cmap(field_name='ROC', palette=plasma(256),
                     low=df.ROC.min(), high=df.ROC.max())
# NOTE(review): plot_width/plot_height were replaced by width/height in
# Bokeh 3.0; kept as-is for the Bokeh version this notebook targets.
p = figure(plot_width=600, plot_height=400, tools=[hover], title="Kappas")
p.circle('kappa_final', 'kappa_BRF', size=7, source=source,
         color=mapper, line_width=1, line_color='darkgray')
# y = x reference line: points above it have kappa_BRF > kappa_final.
p.line((0, 1), (0, 1))
p.xaxis.axis_label = 'Kappa(shifted)'
p.yaxis.axis_label = 'Kappa(BRF)'
color_bar = ColorBar(color_mapper=mapper['transform'], width=8, location=(0, 0))
p.add_layout(color_bar, 'right')
# `filename` is passed by keyword: it is keyword-only in Bokeh 2.x, and the
# keyword form is also accepted by older releases.
export_png(p, filename="/tmp/p2.png")
show(p)
# Scatter plot of kappa_init (x) against kappa_BRF (y), one point per row
# of df, coloured by the ROC column.
source = ColumnDataSource(data=df)
hover = HoverTool(tooltips=[
    ("AUC", "(@ROC,@ROC_BRF)"),
    ("(x,y)", "(@kappa_init, @kappa_BRF)"),
    ('desc', '@label'),
])
# Series.min()/max() skip NaN; the builtin min()/max() give order-dependent
# results if any ROC value is NaN.
mapper = linear_cmap(field_name='ROC', palette=plasma(256),
                     low=df.ROC.min(), high=df.ROC.max())
# NOTE(review): plot_width/plot_height were replaced by width/height in
# Bokeh 3.0; kept as-is for the Bokeh version this notebook targets.
p = figure(plot_width=600, plot_height=400, tools=[hover], title="Kappas")
p.circle('kappa_init', 'kappa_BRF', size=7, source=source,
         color=mapper, line_width=1, line_color='darkgray')
# y = x reference line: points above it have kappa_BRF > kappa_init.
p.line((0, 1), (0, 1))
p.xaxis.axis_label = 'Kappa(init)'
p.yaxis.axis_label = 'Kappa(BRF)'
color_bar = ColorBar(color_mapper=mapper['transform'], width=8, location=(0, 0))
p.add_layout(color_bar, 'right')
# `filename` is passed by keyword: it is keyword-only in Bokeh 2.x, and the
# keyword form is also accepted by older releases.
export_png(p, filename="/tmp/p3.png")
show(p)
Look at the properties of the datasets
# Rows of [dataset, id, number of actives, number of inactives]; the
# sections that follow append to this same list.
data_summary = []

with open('../data/serotonin_data.pkl', 'rb') as inf:
    serotonin_d, assay_lookup = pickle.load(inf)

# (row count, target id) pairs, largest first.
tpls = sorted(
    [(len(rows), target)
     for target, rows in serotonin_d.groupby('target_chembl_id').groups.items()],
    reverse=True,
)
for n_rows, target in tpls:
    # Only targets with more than 900 rows are summarised.
    if n_rows <= 900:
        continue
    assay = serotonin_d.loc[serotonin_d['target_chembl_id'] == target]
    pchembl = assay['pchembl_value']
    # Strict definition first: active above 9.0, inactive below 8.5.
    acts = assay.loc[pchembl > 9.0]
    if len(acts) < 50:
        # Fewer than 50 actives: relax both cut-offs by one unit.
        acts = assay.loc[pchembl > 8.0]
        inacts = assay.loc[pchembl < 7.5]
    else:
        inacts = assay.loc[pchembl < 8.5]
    data_summary.append(['serotonin', target, len(acts), len(inacts)])
with open('../data/pubchem_data.pkl', 'rb') as inf:
    pubchem_d, pubchem_assay_lookup = pickle.load(inf)

# (row count, assay id) pairs, largest first.
tpls = sorted(
    [(len(rows), assay_id)
     for assay_id, rows in pubchem_d.groupby('assay_chembl_id').groups.items()],
    reverse=True,
)
# Exact activity_comment strings counted in each class. Inconclusive results
# are counted as inactive.
# NOTE(review): 'Inactive' (capitalised) and 'not active' are not matched;
# confirm those spellings do not occur in the data.
active_labels = ['Active', 'active']
inactive_labels = ['inactive', 'inconclusive', 'Inconclusive', 'Not Active']
for _, assay_id in tpls:
    assay = pubchem_d.loc[pubchem_d['assay_chembl_id'] == assay_id]
    comments = assay['activity_comment']
    acts = assay.loc[comments.isin(active_labels)]
    inacts = assay.loc[comments.isin(inactive_labels)]
    data_summary.append(['pubchem', assay_id, len(acts), len(inacts)])
with open('../data/drugmatrix_data.pkl', 'rb') as inf:
    drugmatrix_d, drugmatrix_assay_lookup = pickle.load(inf)

# (row count, assay id) pairs, largest first.
tpls = sorted(
    [(len(rows), assay_id)
     for assay_id, rows in drugmatrix_d.groupby('assay_chembl_id').groups.items()],
    reverse=True,
)
for _, assay_id in tpls:
    assay = drugmatrix_d.loc[drugmatrix_d['assay_chembl_id'] == assay_id]
    comments = assay['activity_comment']
    # Comments are classified by prefix: "Not Active..." is inactive and
    # "Active..." is active (the two prefixes cannot both match).
    n_inactive = sum(1 for c in comments if c.startswith('Not Active'))
    n_active = sum(1 for c in comments if c.startswith('Active'))
    # Assays with fewer than 40 actives are left out of the summary.
    if n_active < 40:
        continue
    # Labelled 'drugmatrix' to match the dataset name used for these assays
    # in df1/df2; this section previously used 'drugbank', so labels and
    # joins on `dataset` did not line up.
    data_summary.append(['drugmatrix', assay_id, n_active, n_inactive])
# Turn the collected rows into a DataFrame and add derived columns:
#   nPts       - total labelled points (actives + inactives)
#   fracActive - fraction of those points that are active
#   label      - "<dataset>-<name>" string for hover tooltips
#   index      - 1-based row position, used as the x axis of the line plot
data_summary = pd.DataFrame(
    data_summary,
    columns=['dataset', 'name', 'nActive', 'nInactive'],
)
data_summary['nPts'] = data_summary['nActive'] + data_summary['nInactive']
data_summary['fracActive'] = data_summary['nActive'] / data_summary['nPts']
data_summary['label'] = [
    "-".join(pair) for pair in zip(data_summary['dataset'], data_summary['name'])
]
data_summary['index'] = list(range(1, len(data_summary) + 1))
data_summary.describe()
nActive | nInactive | nPts | fracActive | index | |
---|---|---|---|---|---|
count | 58.000000 | 58.000000 | 58.000000 | 58.000000 | 58.000000 |
mean | 295.189655 | 4582.913793 | 4878.103448 | 0.085216 | 29.500000 |
std | 929.402080 | 13246.338590 | 13988.705037 | 0.042631 | 16.886879 |
min | 3.000000 | 662.000000 | 773.000000 | 0.000420 | 1.000000 |
25% | 53.250000 | 768.000000 | 842.000000 | 0.057304 | 15.250000 |
50% | 71.500000 | 784.000000 | 842.000000 | 0.077654 | 29.500000 |
75% | 103.750000 | 802.000000 | 842.000000 | 0.104810 | 43.750000 |
max | 5614.000000 | 78690.000000 | 83178.000000 | 0.223431 | 58.000000 |
# Summary statistics of every numeric column, broken down by dataset.
data_summary.groupby('dataset').describe()
nActive | nInactive | ... | fracActive | index | |||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | mean | std | min | 25% | 50% | 75% | max | count | mean | ... | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
dataset | |||||||||||||||||||||
drugbank | 44.0 | 68.000000 | 21.700927 | 40.0 | 49.75 | 64.5 | 80.00 | 122.0 | 44.0 | 774.000 | ... | 0.095012 | 0.144893 | 44.0 | 36.5 | 12.845233 | 15.0 | 25.75 | 36.5 | 47.25 | 58.0 |
pubchem | 8.0 | 1606.500000 | 2173.794247 | 3.0 | 212.75 | 736.5 | 1815.75 | 5614.0 | 8.0 | 27778.125 | ... | 0.130125 | 0.204370 | 8.0 | 10.5 | 2.449490 | 7.0 | 8.75 | 10.5 | 12.25 | 14.0 |
serotonin | 6.0 | 212.833333 | 118.295252 | 71.0 | 136.00 | 212.0 | 253.50 | 404.0 | 6.0 | 1588.000 | ... | 0.155143 | 0.223431 | 6.0 | 3.5 | 1.870829 | 1.0 | 2.25 | 3.5 | 4.75 | 6.0 |
3 rows × 32 columns
# Line plot of the active fraction against the 1-based row index of
# data_summary.
source = ColumnDataSource(data=data_summary)
hover = HoverTool(
    tooltips=[
        ("(nAct,nInact)", "(@nActive, @nInactive)"),
        ('desc', '@label'),
    ],
)
p = figure(title="fraction active", tools=[hover],
           plot_width=600, plot_height=400)
p.line(x='index', y='fracActive', source=source)
p.yaxis.axis_label = 'frac_active'
show(p)
# Scatter plot of the active fraction against dataset size (nPts) on a
# logarithmic x axis.
source = ColumnDataSource(data=data_summary)
hover = HoverTool(
    tooltips=[
        ("(nAct,nInact)", "(@nActive, @nInactive)"),
        ('desc', '@label'),
    ],
)
p = figure(title="fraction active", tools=[hover], x_axis_type='log',
           plot_width=600, plot_height=400)
p.circle(x='nPts', y='fracActive', size=7, source=source)
p.xaxis.axis_label = 'dataset size'
p.yaxis.axis_label = 'frac_active'
show(p)