def createImportancePlot(splt, desc, importances, caption):
    """Draw a bar chart of the most important descriptors on an Axes.

    Keeps the descriptors whose absolute weight is strictly greater than
    the 11th-largest absolute weight, i.e. roughly the top ten by
    magnitude (all of them when there are ten or fewer).

    splt        -- matplotlib Axes-like object to draw on
    desc        -- descriptor names, parallel to `importances`
    importances -- per-descriptor weights (may be negative)
    caption     -- title for the subplot
    """
    # BUG FIX: numpy.numarray was removed from numpy long ago; plain numpy
    # provides everything used here.
    import numpy as na
    labels = []
    weights = []
    # BUG FIX: `sort` was undefined (it leaked in from the notebook's
    # %pylab magic) -- use the builtin sorted(). Also guard the [-11]
    # index, which crashed for fewer than 11 importances.
    magnitudes = sorted(abs(w) for w in importances)
    if len(magnitudes) > 10:
        threshold = magnitudes[-11]
    else:
        threshold = magnitudes[0] - 1  # keep everything
    for name, weight in zip(desc, importances):
        if abs(weight) > threshold:
            labels.append(name)
            weights.append(weight)
    xlocations = na.array(range(len(labels))) + 0.5
    width = 0.8
    splt.bar(xlocations, weights, width=width)
    splt.set_xticks([r + 1 for r in range(len(labels))])
    splt.set_xticklabels(labels)
    splt.set_xlim(0, xlocations[-1] + width * 2)
    splt.set_title(caption)
    splt.get_xaxis().tick_bottom()
    splt.get_yaxis().tick_left()
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
# Load and filter the training set.
# http://cheminformatics.org/datasets/huuskonen/train.smi
# Keep only rows whose 4th column parses as a float; the usage below
# treats column 3 as the measured logS and the last column as SMILES --
# some rows in the file are malformed (name fused with a number).
with open('data/train.smi', 'r') as train_file:
    train_file.readline()  # skip header line
    raw = [line.split() for line in train_file.readlines()]
temp = []
for d in raw:
    try:
        float(d[3])  # keep only rows with a numeric solubility value
        temp.append(d)
    except (IndexError, ValueError):
        # BUG FIX: was a bare `except:`; report the malformed row and skip it
        print(d)
        continue
molecules = [Chem.MolFromSmiles(d[-1]) for d in temp]
observations = [float(d[3]) for d in temp]
# drop molecules RDKit could not parse
data = [(x, y) for x, y in zip(molecules, observations) if x is not None]
# (notebook output) rows skipped while parsing the training file:
# ['976', '?', '7-Chloropteridine-0.87', '?', '-0.29', 'c1c(Cl)nc2cncnc2n1'] ['979', '?', '4-Mercaptopteridine-2.77', '?', '-0.24', 'c1cnc2cnc(S)nc2n1'] ['980', '?', '7-Mercaptopteridine-2.71', '?', '-0.24', 'c1c(S)nc2cncnc2n1']
# Loading and processing the external test data set, with the same
# filtering as the training set above.
# http://cheminformatics.org/datasets/huuskonen/test1.smi
with open('data/test1.smi', 'r') as test_file:
    test_file.readline()  # skip header line
    test_raw = [line.split() for line in test_file.readlines()]
temp = []
for d in test_raw:
    try:
        float(d[3])  # keep only rows with a numeric solubility value
        temp.append(d)
    except (IndexError, ValueError):
        # BUG FIX: was a bare `except:`; report the malformed row and skip it
        print(d)
        continue
test_molecules = [Chem.MolFromSmiles(d[-1]) for d in temp]
test_observations = [float(d[3]) for d in temp]
# drop molecules RDKit could not parse
test_data = [(x, y) for x, y in zip(test_molecules, test_observations) if x is not None]
# Compute the RDKit descriptor vector for every test molecule.
# NOTE(review): switched Descriptors.descList -> Descriptors._descList so
# that the feature order here matches the training patterns, which are
# built from Descriptors._descList below -- confirm both attributes refer
# to the same list in the installed RDKit.
test_patterns = []
test_ys = []
for sample in test_data:
    pattern = []
    use = True
    for D in Descriptors._descList:
        if D[0] == 'MolecularFormula':
            continue  # the lone non-numeric "descriptor"
        try:
            pattern.append(float(D[1](sample[0])))
        except Exception:
            # BUG FIX: was a bare `except:`; a failing descriptor drops
            # the whole sample so pattern lengths stay consistent
            use = False
            print('Error computing descriptor %s for %s' % (D[0], sample[0]))
            break
    if use:
        test_patterns.append(pattern)
        test_ys.append(sample[1])
# (notebook output) row skipped while parsing the test file:
# ['978', '?', '2-Mercaptopteridine-2.36', '?', '-0.24', 'c1cnc2c(S)ncnc2n1']
# Build the training feature matrix using a descriptor calculator.
patterns = []
ys = []
descriptors = []
for D in Descriptors._descList:
    if D[0] == 'MolecularFormula':
        continue  # the lone non-numeric "descriptor"
    descriptors.append(D[0])
calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptors)
for sample in data:
    try:
        pattern = calculator.CalcDescriptors(sample[0])
    except Exception as e:
        # BUG FIX: was `except e:`, which raises a NameError instead of
        # catching anything; the old message also referenced D[0], the
        # stale loop variable from the descriptor loop above.
        print('Error computing descriptors for %s: %s' % (sample[0], e))
    else:
        patterns.append(pattern)
        ys.append(sample[1])
# Fit three regressors on the training set and plot predicted vs measured
# logS for each. NOTE(review): `pyplot` is assumed to be in scope from the
# notebook's pylab/matplotlib setup -- confirm.
figure, (plt1, plt2, plt3) = pyplot.subplots(1, 3)
figure.set_size_inches(15, 15)

# simple linear regression
from sklearn import linear_model
simple_linearreg = linear_model.LinearRegression()
simple_linearreg.fit(patterns, ys)
simple_prediction = simple_linearreg.predict(patterns)
plt1.scatter(ys, simple_prediction)
plt1.set_aspect('equal')
plt1.set_title('Simple linear regression')
plt1.set_xlabel('Measured logS')
plt1.set_ylabel('Predicted logS')
plt1.set_xlim(-12, 2)
plt1.set_ylim(-12, 2)

# simple (unconstrained) decision tree regression
# NOTE(review): compute_importances is an old scikit-learn keyword that was
# later removed (feature_importances_ is always available in modern
# versions) -- confirm against the installed sklearn.
from sklearn import tree
simple_tree = tree.DecisionTreeRegressor(compute_importances=True)
simple_tree.fit(patterns, ys)
simple_treeprediction = simple_tree.predict(patterns)
plt2.scatter(ys, simple_treeprediction)
plt2.set_aspect('equal')
plt2.set_title('Standard Decision Tree')
plt2.set_xlabel('Measured logS')
plt2.set_ylabel('Predicted logS')
plt2.set_xlim(-12, 2)
plt2.set_ylim(-12, 2)

# custom (regularized) decision tree regression
custom_tree = tree.DecisionTreeRegressor(max_depth=10, min_samples_split=50,
                                         compute_importances=True)
custom_tree.fit(patterns, ys)
custom_treeprediction = custom_tree.predict(patterns)
plt3.scatter(ys, custom_treeprediction)
plt3.set_aspect('equal')
plt3.set_title('Custom Decision Tree (regularized)')
plt3.set_xlabel('Measured logS')
plt3.set_ylabel('Predicted logS')
plt3.set_xlim(-12, 2)
plt3.set_ylim(-12, 2)

from sklearn.metrics import mean_squared_error
print('Explained variance (Simple): %0.5f' % simple_linearreg.score(patterns, ys))
print('MSE (Simple): %0.5f' % mean_squared_error(ys, simple_prediction))
# BUG FIX: the next two lines were labelled "Custom" but report the
# *simple* (unregularized) tree.
print('Explained variance (Tree): %0.5f' % simple_tree.score(patterns, ys))
print('MSE (Tree): %0.5f' % mean_squared_error(ys, simple_treeprediction))
print('Explained variance (Custom): %0.5f' % custom_tree.score(patterns, ys))
print('MSE (Custom): %0.5f' % mean_squared_error(ys, custom_treeprediction))
# (notebook output) -- note the near-perfect training fit of the
# unregularized tree (MSE 0.00007) is classic overfitting:
# Explained variance (Simple): 0.90063 MSE (Simple): 0.41525 Explained variance (Custom): 0.99998 MSE (Custom): 0.00007 Explained variance (Custom): 0.91630 MSE (Custom): 0.34978
# Visualize which descriptors drive each of the three fitted models,
# then check how strongly MolLogP alone correlates with logS.
figure, (plt1, plt2, plt3) = pyplot.subplots(3, 1)
figure.set_size_inches(15, 5)
figure.tight_layout()
imp = simple_linearreg.coef_
imp2 = simple_tree.feature_importances_   # Gini importances
imp3 = custom_tree.feature_importances_   # Gini importances
for axis, weights, title in (
        (plt1, imp, "Most important descriptors in linear model"),
        (plt2, imp2, "Most important descriptors in simple tree model"),
        (plt3, imp3, "Most important descriptors in custom tree model")):
    createImportancePlot(axis, descriptors, weights, title)

# inspect logP contribution
from scipy.stats.stats import pearsonr
ind = descriptors.index('MolLogP')
logP = [row[ind] for row in patterns]
fig, ax = pyplot.subplots(1, 1)
ax.scatter(logP, ys)
ax.set_title('rdkit:mollogP vs logS')
ax.set_xlabel('rdkit:mollogP')
ax.set_ylabel('Measured logS')
print('Pearson Correlation (MolLogP): %0.5f' % pearsonr(logP, ys)[0])
# (notebook output)
# Pearson Correlation (MolLogP): -0.84915
# Evaluate the three trained models on the external test set.
figure, (plt1, plt2, plt3) = pyplot.subplots(1, 3)
figure.set_size_inches(15, 15)

# simple linear regression (model already fitted above; the redundant
# re-import of linear_model was dropped)
simple_ext_prediction = simple_linearreg.predict(test_patterns)
plt1.scatter(test_ys, simple_ext_prediction)
plt1.set_aspect('equal')
plt1.set_title('Simple linear regression')
plt1.set_xlabel('Measured logS')
plt1.set_ylabel('Predicted logS')
plt1.set_xlim(-12, 2)
plt1.set_ylim(-12, 2)

# simple decision tree regression
simple_tree_ext_prediction = simple_tree.predict(test_patterns)
plt2.scatter(test_ys, simple_tree_ext_prediction)
plt2.set_aspect('equal')
plt2.set_title('Standard Decision Tree')
plt2.set_xlabel('Measured logS')
plt2.set_ylabel('Predicted logS')
plt2.set_xlim(-12, 2)
plt2.set_ylim(-12, 2)

# custom decision tree regression
custom_tree_ext_prediction = custom_tree.predict(test_patterns)
plt3.scatter(test_ys, custom_tree_ext_prediction)
plt3.set_aspect('equal')
plt3.set_title('Custom Decision Tree (regularized)')
plt3.set_xlabel('Measured logS')
plt3.set_ylabel('Predicted logS')
plt3.set_xlim(-12, 2)
plt3.set_ylim(-12, 2)

from sklearn.metrics import mean_squared_error
print('Explained variance (Simple): %0.5f' % simple_linearreg.score(test_patterns, test_ys))
print('MSE (Simple): %0.5f' % mean_squared_error(test_ys, simple_ext_prediction))
# BUG FIX: the next two lines were labelled "Custom" but report the
# *simple* (unregularized) tree.
print('Explained variance (Tree): %0.5f' % simple_tree.score(test_patterns, test_ys))
print('MSE (Tree): %0.5f' % mean_squared_error(test_ys, simple_tree_ext_prediction))
print('Explained variance (Custom): %0.5f' % custom_tree.score(test_patterns, test_ys))
print('MSE (Custom): %0.5f' % mean_squared_error(test_ys, custom_tree_ext_prediction))
# (notebook output)
# Explained variance (Simple): 0.89059 MSE (Simple): 0.45049 Explained variance (Custom): 0.81740 MSE (Custom): 0.75182 Explained variance (Custom): 0.86516 MSE (Custom): 0.55519
# Hyper-parameter search for the decision tree with 10-fold CV on the
# training set, scored by the estimator's default score (R^2).
# NOTE(review): sklearn.cross_validation / sklearn.grid_search and the
# KFold(n=..., k=..., indices=...) signature are pre-0.18 APIs; modern
# scikit-learn uses sklearn.model_selection -- confirm the installed version.
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
params = {'max_depth': [2, 5, 10, 20],
          'min_samples_split': [2, 8, 32, 128],
          'min_samples_leaf': [1, 2, 5, 10],
          'compute_importances': [True]}
cv = KFold(n=len(patterns), k=10, indices=False, shuffle=True)
gs = GridSearchCV(simple_tree, params, cv=cv, verbose=1, refit=True)
gs.fit(patterns, ys)
print('Best score: %0.2f' % gs.best_score_)
print('Training set performance using best parameters (%s)' % gs.best_params_)
best_treemodel = gs.best_estimator_

figure, (plt1, plt2) = pyplot.subplots(1, 2)
figure.set_size_inches(15, 15)
# training set evaluation
best_tree_int_prediction = best_treemodel.predict(patterns)
plt1.scatter(ys, best_tree_int_prediction)
plt1.set_aspect('equal')
plt1.set_title('Optimized tree on training set')
plt1.set_xlabel('Measured logS')
plt1.set_ylabel('Predicted logS')
plt1.set_xlim(-12, 2)
plt1.set_ylim(-12, 2)
# test set evaluation
best_tree_ext_prediction = best_treemodel.predict(test_patterns)
plt2.scatter(test_ys, best_tree_ext_prediction)
plt2.set_aspect('equal')
plt2.set_title('Optimized tree on test set')
plt2.set_xlabel('Measured logS')
plt2.set_ylabel('Predicted logS')
plt2.set_xlim(-12, 2)
plt2.set_ylim(-12, 2)
print('Explained variance (Internal): %0.5f' % best_treemodel.score(patterns, ys))
print('MSE (Internal): %0.5f' % mean_squared_error(ys, best_tree_int_prediction))
print('Explained variance (External): %0.5f' % best_treemodel.score(test_patterns, test_ys))
print('MSE (External): %0.5f' % mean_squared_error(test_ys, best_tree_ext_prediction))
fig, a = pyplot.subplots(1, 1)
fig.set_size_inches(15, 5)
createImportancePlot(a, descriptors, best_treemodel.feature_importances_,
                     "Most important descriptors in the optimized tree")
# (notebook output)
# [Parallel(n_jobs=1)]: Done 1 jobs | elapsed: 0.0s [Parallel(n_jobs=1)]: Done 50 jobs | elapsed: 0.9s [Parallel(n_jobs=1)]: Done 200 jobs | elapsed: 4.1s [Parallel(n_jobs=1)]: Done 450 jobs | elapsed: 13.8s
# Best score: 0.85 Training set performance using best parameters ({'compute_importances': True, 'min_samples_split': 32, 'max_depth': 20, 'min_samples_leaf': 10}) Explained variance (Internal): 0.92493 MSE (Internal): 0.31371 Explained variance (External): 0.86615 MSE (External): 0.55110
# [Parallel(n_jobs=1)]: Done 640 out of 640 | elapsed: 22.9s finished
# Repeat the hyper-parameter search, this time scored by mean squared
# error via the (old) loss_func parameter. GridSearchCV maximizes its
# score, so with a loss function best_score_ is reported negated --
# that is why the printed "Best score" is negative.
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
# BUG FIX: sklearn.metrics.metrics is a deprecated private module path;
# the public location is sklearn.metrics.
from sklearn.metrics import mean_squared_error
params = {'max_depth': [2, 5, 10, 20],
          'min_samples_split': [2, 8, 32, 128],
          'min_samples_leaf': [1, 2, 5, 10],
          'compute_importances': [True]}
cv = KFold(n=len(patterns), k=10, indices=False, shuffle=True)
gs = GridSearchCV(simple_tree, params, cv=cv, verbose=1, refit=True,
                  loss_func=mean_squared_error)
gs.fit(patterns, ys)
print('Best score: %0.2f' % gs.best_score_)
print('Training set performance using best parameters (%s)' % gs.best_params_)
best_treemodel = gs.best_estimator_

figure, (plt1, plt2) = pyplot.subplots(1, 2)
figure.set_size_inches(15, 15)
# training set evaluation
best_tree_int_prediction = best_treemodel.predict(patterns)
plt1.scatter(ys, best_tree_int_prediction)
plt1.set_aspect('equal')
plt1.set_title('Optimized tree on training set')
plt1.set_xlabel('Measured logS')
plt1.set_ylabel('Predicted logS')
plt1.set_xlim(-12, 2)
plt1.set_ylim(-12, 2)
# test set evaluation
best_tree_ext_prediction = best_treemodel.predict(test_patterns)
plt2.scatter(test_ys, best_tree_ext_prediction)
plt2.set_aspect('equal')
plt2.set_title('Optimized tree on test set')
plt2.set_xlabel('Measured logS')
plt2.set_ylabel('Predicted logS')
plt2.set_xlim(-12, 2)
plt2.set_ylim(-12, 2)
print('Explained variance (Internal): %0.5f' % best_treemodel.score(patterns, ys))
print('MSE (Internal): %0.5f' % mean_squared_error(ys, best_tree_int_prediction))
print('Explained variance (External): %0.5f' % best_treemodel.score(test_patterns, test_ys))
print('MSE (External): %0.5f' % mean_squared_error(test_ys, best_tree_ext_prediction))
fig, a = pyplot.subplots(1, 1)
fig.set_size_inches(15, 5)
createImportancePlot(a, descriptors, best_treemodel.feature_importances_,
                     "Most important descriptors in the optimized tree")
# (notebook output)
# [Parallel(n_jobs=1)]: Done 1 jobs | elapsed: 0.0s [Parallel(n_jobs=1)]: Done 50 jobs | elapsed: 0.9s [Parallel(n_jobs=1)]: Done 200 jobs | elapsed: 4.0s [Parallel(n_jobs=1)]: Done 450 jobs | elapsed: 13.7s
# Best score: -0.59 Training set performance using best parameters ({'compute_importances': True, 'min_samples_split': 2, 'max_depth': 10, 'min_samples_leaf': 10}) Explained variance (Internal): 0.93597 MSE (Internal): 0.26756 Explained variance (External): 0.85485 MSE (External): 0.59763
# [Parallel(n_jobs=1)]: Done 640 out of 640 | elapsed: 22.8s finished
# Normalize descriptors, project onto the two leading principal
# components, and fit/evaluate a linear model in PC space.
# BUG FIX: 'Scaler' was the pre-0.11 scikit-learn name and was removed;
# StandardScaler is the correct class.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(patterns)
std_patterns = scaler.transform(patterns)
std_test_patterns = scaler.transform(test_patterns)
# compute PCA, keeping the two most descriptive principal components
# (the old comment said "ten" but n_components=2)
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(std_patterns)
print('rel. explained variance per pc %s' % (pca.explained_variance_ratio_,))
pc_patterns = pca.transform(std_patterns)
pc_test = pca.transform(std_test_patterns)
fig, (plt1, plt2) = pyplot.subplots(2, 1)
fig.set_size_inches(15, 5)
createImportancePlot(plt1, descriptors, pca.components_[0],
                     "Most important descriptors in 1st principal component (Hypothesis: Size)")
createImportancePlot(plt2, descriptors, pca.components_[1],
                     "Most important descriptors in 2nd principal component (Hypothesis: Electrostatic interactions)")

# evaluate modelling performance
figure, (plt1, plt2, plt3) = pyplot.subplots(1, 3)
figure.set_size_inches(15, 5)
# training set
pc_linearreg = linear_model.LinearRegression()
pc_linearmodel = pc_linearreg.fit(pc_patterns, ys)
pc_prediction = pc_linearmodel.predict(pc_patterns)
plt1.scatter(ys, pc_prediction)
plt1.set_aspect('equal')
plt1.set_title('Full model regression')
plt1.set_xlabel('Measured logS')
plt1.set_ylabel('Predicted logS')
plt1.set_xlim(-12, 2)
plt1.set_ylim(-12, 2)
plt2.scatter(ys, [pc[0] for pc in pc_patterns])
plt2.set_title('1st PC')
plt2.set_xlabel('Measured logS')
plt2.set_ylabel('1st PC')
plt3.scatter(ys, [pc[1] for pc in pc_patterns])
plt3.set_title('2nd PC')
plt3.set_xlabel('Measured logS')
plt3.set_ylabel('2nd PC')

# test set
figure, (plt1, plt2, plt3) = pyplot.subplots(1, 3)
figure.set_size_inches(15, 5)
pc_prediction = pc_linearmodel.predict(pc_test)
plt1.scatter(test_ys, pc_prediction)
plt1.set_aspect('equal')
plt1.set_title('Full model regression')
plt1.set_xlabel('Measured logS')
plt1.set_ylabel('Predicted logS')
plt1.set_xlim(-12, 2)
plt1.set_ylim(-12, 2)
plt2.scatter(test_ys, [pc[0] for pc in pc_test])
plt2.set_title('1st PC')
plt2.set_xlabel('Measured logS')
plt2.set_ylabel('1st PC')
plt3.scatter(test_ys, [pc[1] for pc in pc_test])
plt3.set_title('2nd PC')
plt3.set_xlabel('Measured logS')
plt3.set_ylabel('2nd PC')
# (notebook output)
# rel. explained variance per pc [ 0.33812708  0.1139046 ]
# <matplotlib.text.Text at 0x105390950>