This tutorial is based on the great notebook by Nikolas Fechner and Greg Landrum [1]. In addition, there is a demonstration of a regression model trained on a ChEMBL data set. The notebook also uses code snippets from a notebook by Isidro Cortés Ciriano [2]. The interested reader is referred to these notebooks for further information.
%matplotlib inline
%pylab inline
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.metrics import classification_report, confusion_matrix, roc_curve,auc,accuracy_score
def confusion_matrix_summary(acts,preds):
from cStringIO import StringIO
file_str = StringIO()
vTab=confusion_matrix(acts,preds)
#print vTab
nResultCodes=len(vTab)
file_str.write('\n\tResults Table (experiment in rows):\n')
colCounts = numpy.sum(vTab,0)
rowCounts = numpy.sum(vTab,1)
print
for i in range(nResultCodes):
if rowCounts[i]==0: rowCounts[i]=1
row = vTab[i]
file_str.write(' ')
for j in range(nResultCodes):
entry = row[j]
file_str.write(' % 6d'%entry),
file_str.write(' | % 4.2f\n'%(100.*vTab[i,i]/rowCounts[i]))
file_str.write(' ')
for i in range(nResultCodes):
file_str.write('-------')
file_str.write('\n')
file_str.write(' '),
for i in range(nResultCodes):
if colCounts[i]==0: colCounts[i]=1
file_str.write(' % 6.2f'%(100.*vTab[i,i]/colCounts[i])),
file_str.write('\n')
return file_str.getvalue()
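As a quick illustration of the layout this helper produces, it can be called on a few hand-made labels (toy data, not part of the tutorial set; note the helper relies on the numpy name pulled in by %pylab):
example_acts  = [0, 0, 1, 1, 1, 0]
example_preds = [0, 1, 1, 1, 0, 0]
print(confusion_matrix_summary(example_acts, example_preds))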
def createROCPlot(pl,observations,probabilities,caption):
fpr, tpr, thresholds = roc_curve(observations, probabilities)
roc_auc = auc(fpr, tpr)
print "Area under the ROC curve : %f" % roc_auc
pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
pl.plot([0, 1], [0, 1], 'k--')
pl.set_xlim([0.0, 1.0])
pl.set_ylim([0.0, 1.0])
pl.set_xlabel('False Positive Rate')
pl.set_ylabel('True Positive Rate')
pl.set_title(caption)
def createPropAccPlot(pl,observations,probabilities,caption):
fpr, tpr, thresholds = roc_curve(observations, probabilities)
accuracies = []
for t in thresholds:
predictions = [1 if p >= t else 0 for p in probabilities]
acc = accuracy_score(observations,predictions)
accuracies.append(acc)
pl.plot(thresholds, accuracies, label='Prob. vs Accuracy curve')
pl.plot([0, 1], [0, 1], 'k--')
pl.set_xlim([0.0, 1.0])
pl.set_ylim([0.0, 1.0])
pl.set_xlabel('Probability threshold')
pl.set_ylabel('Accuracy')
pl.set_title(caption)
def createImportancePlot(splt,desc,importances,caption):
#plt.xkcd()
import numpy as np
labels = []
weights = []
threshold = np.sort([abs(w) for w in importances])[-11] # 11th-largest absolute weight, so the top 10 features get plotted
for d in zip(desc,importances):
if abs(d[1]) > threshold:
labels.append(d[0])
weights.append(d[1])
xlocations = np.array(range(len(labels)))+0.5
width = 0.8
splt.bar(xlocations, weights, width=width)
splt.set_xticks([r+1 for r in range(len(labels))])
splt.set_xticklabels(labels, rotation='vertical')
splt.set_xlim(0, xlocations[-1]+width*2)
splt.set_title(caption)
splt.get_xaxis().tick_bottom()
splt.get_yaxis().tick_left()
Populating the interactive namespace from numpy and matplotlib
Publicly available binary Ames test (mutagenicity) data compiled in "Benchmark Data Set for in Silico Prediction of Ames Mutagenicity", Katja Hansen et al., J. Chem. Inf. Model. 2009, 49(9), 2077-2081.
Data available from http://doc.ml.tu-berlin.de/toxbenchmark/smiles_cas_N6512.smi
from rdkit import Chem
import csv
#obtained from http://doc.ml.tu-berlin.de/toxbenchmark/smiles_cas_N6512.smi
with open('/home/chembl/ipynb_workbench/smiles_cas_N6512.smi','rU') as train_file:
r = csv.reader(train_file, delimiter='\t')
r.next() # skip the first line
molecules = []
observations = []
for row in r:
try:
temp = Chem.MolFromSmiles(row[0])
except Exception as e:
print e, row
continue
molecules.append(temp)
observations.append(int(row[2]))
#molecules = [Chem.MolFromSmiles(d[0]) for d in row]
#observations = [int(d[2]) for d in raw]
data = [(x,y) for x,y in zip(molecules,observations) if x is not None]
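An optional sanity check at this point: how many SMILES were parsed, and what the class balance looks like (the exact counts can depend on the RDKit version):
print('%d of %d molecules parsed successfully' % (len(data), len(molecules)))
print('%d of those are mutagenic (class 1)' % sum(y for _, y in data))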
from rdkit.Chem import AllChem
import random
ind = range(len(data))
random.seed(0)
random.shuffle(ind)
half = int(round(0.5*len(ind)))
train = ind[:half]
test = ind[half:]
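As an aside, scikit-learn's train_test_split does the same job as the manual shuffle above; a sketch (it shuffles differently, so the downstream numbers would change slightly):
from sklearn.cross_validation import train_test_split
train_alt, test_alt = train_test_split(range(len(data)), test_size=0.5, random_state=0)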
train_morgan_fps = []
train_rd_fps = []
train_ys = []
train_info=[]
test_info=[]
for i in train:
mol = data[i][0]
info = {}
temp = AllChem.GetMorganFingerprintAsBitVect(mol,2,2048,bitInfo=info)
train_morgan_fps.append(temp)
train_info.append(info)
train_rd_fps.append(Chem.RDKFingerprint(mol))
train_ys.append(int(data[i][1]))
test_morgan_fps = []
test_rd_fps = []
test_ys = []
for i in test:
mol = data[i][0]
info = {}
temp = AllChem.GetMorganFingerprintAsBitVect(mol,2,2048,bitInfo=info)
test_morgan_fps.append(temp)
test_info.append(info)
test_rd_fps.append(Chem.RDKFingerprint(mol))
test_ys.append(int(data[i][1]))
#import cPickle
#outf=file('fps.pkl','wb+')
#cPickle.dump((train,train_morgan_fps,train_info,train_rd_fps,train_ys),outf)
#cPickle.dump((test,test_morgan_fps,test_info,test_rd_fps,test_ys),outf)
Writing out the data can be a bit lengthy (especially on the virtual machine), hence the pre-written file is provided in myChEMBL and the code is commented out. If you uncomment it, give the script some time before continuing. If an 'EOFError' occurs below, the writing was not completed; in that case, run these two sections again.
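If you do want to regenerate fps.pkl yourself, a slightly more defensive version of the commented-out write above (a with-block so the file is properly closed, plus the binary pickle protocol) would be:
#import cPickle
#with open('fps.pkl', 'wb') as outf:
#    cPickle.dump((train, train_morgan_fps, train_info, train_rd_fps, train_ys), outf, -1)
#    cPickle.dump((test, test_morgan_fps, test_info, test_rd_fps, test_ys), outf, -1)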
import cPickle
inf=file('/home/chembl/ipynb_workbench/fps.pkl','rb+')
(train,train_morgan_fps,train_info,train_rd_fps,train_ys)=cPickle.load(inf)
(test,test_morgan_fps,test_info,test_rd_fps,test_ys)=cPickle.load(inf)
from sklearn.naive_bayes import BernoulliNB
morgan_bnb = BernoulliNB(fit_prior=False)
morgan_bnb.fit(train_morgan_fps,train_ys)
morgan_predictions = morgan_bnb.predict(test_morgan_fps)
print 'Morgan Fingerprints'
print confusion_matrix_summary(test_ys,morgan_predictions)
print classification_report(test_ys,morgan_predictions)
print '------------------------------------------------------------'
print 'RDKit Fingerprints'
rd_bnb = BernoulliNB(fit_prior=False)
rd_bnb.fit(train_rd_fps,train_ys)
rd_predictions = rd_bnb.predict(test_rd_fps)
print confusion_matrix_summary(test_ys,rd_predictions)
print classification_report(test_ys,rd_predictions)
morgan_probabilities = [p[1] for p in morgan_bnb.predict_proba(test_morgan_fps)]
rd_probabilities = [p[1] for p in rd_bnb.predict_proba(test_rd_fps)]
Morgan Fingerprints

	Results Table (experiment in rows):
    1153    332 | 77.64
     548   1219 | 68.99
  --------------
   67.78  78.59

             precision    recall  f1-score   support

          0       0.68      0.78      0.72      1485
          1       0.79      0.69      0.73      1767

avg / total       0.74      0.73      0.73      3252

------------------------------------------------------------
RDKit Fingerprints

	Results Table (experiment in rows):
    1012    473 | 68.15
     750   1017 | 57.56
  --------------
   57.43  68.26

             precision    recall  f1-score   support

          0       0.57      0.68      0.62      1485
          1       0.68      0.58      0.62      1767

avg / total       0.63      0.62      0.62      3252
figure,(plt1,plt2) = pyplot.subplots(1,2)
figure.set_size_inches(10,5)
figure.tight_layout()
createROCPlot(plt1,test_ys,morgan_probabilities,'Morgan FP')
createROCPlot(plt2,test_ys,rd_probabilities,'RDKit FP')
figure,(plt1,plt2) = pyplot.subplots(1,2)
figure.set_size_inches(10,5)
figure.tight_layout()
createPropAccPlot(plt1,test_ys,morgan_probabilities,'Morgan FP')
createPropAccPlot(plt2,test_ys,rd_probabilities,'RDKit FP')
Area under the ROC curve : 0.791429
Area under the ROC curve : 0.646198
bit_probabilities = morgan_bnb.feature_log_prob_
position = []
enrichment = []
for d in zip(range(len(bit_probabilities[1])),[(w[0]-w[1]) for w in zip(bit_probabilities[0],bit_probabilities[1])]):
if abs(d[1]) >= 1: #one log-level probablity difference between classes
position.append(d[0])
enrichment.append(d[1])
#as implemented in sklearn
check_predictions = [1 if t[1] >= t[0] else 0 for t in morgan_bnb.predict_proba(test_morgan_fps)]
print confusion_matrix_summary(test_ys,check_predictions)
#equivalent for binary classification
check_predictions = [1 if p >= 0.5 else 0 for p in [t[1] for t in morgan_bnb.predict_proba(test_morgan_fps)]]
print confusion_matrix_summary(test_ys,check_predictions)
	Results Table (experiment in rows):
    1153    332 | 77.64
     548   1219 | 68.99
  --------------
   67.78  78.59

	Results Table (experiment in rows):
    1153    332 | 77.64
     548   1219 | 68.99
  --------------
   67.78  78.59
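Under the hood, BernoulliNB scores each class with a joint log-likelihood over all 2048 bits. The check below is not part of the original tutorial; it is a minimal sketch (assuming sklearn's feature_log_prob_ and class_log_prior_ attributes and RDKit's ConvertToNumpyArray) that reconstructs this score for the first test fingerprint and compares it with the classifier's own prediction:
import numpy as np
from rdkit import DataStructs
fp = np.zeros((2048,))
DataStructs.ConvertToNumpyArray(test_morgan_fps[0], fp) # bit vector -> 0/1 numpy array
logp = morgan_bnb.feature_log_prob_ # log P(bit=1 | class), shape (2, 2048)
log1mp = np.log(1.0 - np.exp(logp)) # log P(bit=0 | class)
# joint log-likelihood per class, plus the (uniform, since fit_prior=False) class prior
jll = fp.dot((logp - log1mp).T) + log1mp.sum(axis=1) + morgan_bnb.class_log_prior_
print(jll.argmax() == morgan_bnb.predict(test_morgan_fps[:1])[0])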
Note: depending on your hardware, this can take up to 30 minutes.
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
#params = {'max_depth':[2,5,10,20],'min_samples_split':[2,8,32,128],'min_samples_leaf':[1,2,5,10],'compute_importances':[True],'n_estimators':[200]}
#the full grid search is very time-consuming (~3 hrs single-threaded); a parallelized sketch follows after the results below
params = {'min_samples_split': [8], 'max_depth': [20], 'min_samples_leaf': [1],'n_estimators':[200]}
cv = KFold(n=len(train),n_folds=10,shuffle=True)
gs = GridSearchCV(RandomForestClassifier(), params, cv=cv,verbose=1,refit=True)
gs.fit(train_morgan_fps, train_ys)
#print 'Best score: %0.2f'%gs.best_score_
#print 'Training set performance using best parameters (%s)'%gs.best_params_
best_morgan_treemodel = gs.best_estimator_
#test set evaluation
best_morgan_tree_prediction = best_morgan_treemodel.predict(test_morgan_fps)
print 'Morgan Fingerprints'
print confusion_matrix_summary(test_ys,best_morgan_tree_prediction)
print classification_report(test_ys,best_morgan_tree_prediction)
print '------------------------------------------------------------'
gs.fit(train_rd_fps, train_ys)
#print 'Best score: %0.2f'%gs.best_score_
#print 'Training set performance using best parameters (%s)'%gs.best_params_
best_rd_treemodel = gs.best_estimator_
#test set evaluation
best_rd_tree_prediction = best_rd_treemodel.predict(test_rd_fps)
print 'RDKit Fingerprints'
print confusion_matrix_summary(test_ys,best_rd_tree_prediction)
print classification_report(test_ys,best_rd_tree_prediction)
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[Parallel(n_jobs=1)]: Done 10 out of 10 | elapsed: 2.0min finished
Morgan Fingerprints

	Results Table (experiment in rows):
    1187    298 | 79.93
     456   1311 | 74.19
  --------------
   72.25  81.48

             precision    recall  f1-score   support

          0       0.72      0.80      0.76      1485
          1       0.81      0.74      0.78      1767

avg / total       0.77      0.77      0.77      3252

------------------------------------------------------------
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[Parallel(n_jobs=1)]: Done 10 out of 10 | elapsed: 1.9min finished
RDKit Fingerprints

	Results Table (experiment in rows):
    1177    308 | 79.26
     372   1395 | 78.95
  --------------
   75.98  81.91

             precision    recall  f1-score   support

          0       0.76      0.79      0.78      1485
          1       0.82      0.79      0.80      1767

avg / total       0.79      0.79      0.79      3252
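The commented-out grid above is the full search; single-threaded it takes roughly three hours. If you do want to run it, GridSearchCV can parallelize over the available cores via n_jobs. A sketch (the deprecated compute_importances entry from the comment is dropped here):
full_params = {'max_depth': [2,5,10,20], 'min_samples_split': [2,8,32,128],
               'min_samples_leaf': [1,2,5,10], 'n_estimators': [200]}
gs_full = GridSearchCV(RandomForestClassifier(), full_params, cv=cv, n_jobs=-1, verbose=1, refit=True)
#gs_full.fit(train_morgan_fps, train_ys) # uncomment to run; still takes a long time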
morgan_probabilities = [p[1] for p in best_morgan_treemodel.predict_proba(test_morgan_fps)]
rd_probabilities = [p[1] for p in best_rd_treemodel.predict_proba(test_rd_fps)]
figure,(plt1,plt2) = pyplot.subplots(1,2)
figure.set_size_inches(10,5)
figure.tight_layout()
createROCPlot(plt1,test_ys,morgan_probabilities,'Morgan FP')
createROCPlot(plt2,test_ys,rd_probabilities,'RDKit FP')
figure,(plt1,plt2) = pyplot.subplots(1,2)
figure.set_size_inches(10,5)
figure.tight_layout()
createPropAccPlot(plt1,test_ys,morgan_probabilities,'Morgan FP')
createPropAccPlot(plt2,test_ys,rd_probabilities,'RDKit FP')
Area under the ROC curve : 0.847774
Area under the ROC curve : 0.865580
Idea: compute the probability of each bit for each class, identify bits enriched in one class, and report the substructures associated with these bits. Then identify substructures that occur predominantly in molecules belonging to one class.
images = []
smiles = {}
#find all substructures associated with enriched bits
for bit in zip(position,enrichment):
i = 0
class0_prob = bit_probabilities[0][bit[0]]
class1_prob = bit_probabilities[1][bit[0]]
for info in train_info:
if info.has_key(bit[0]):
try:
m = data[train[i]][0]
atomId,radius = info[bit[0]][0]
env=Chem.FindAtomEnvironmentOfRadiusN(m,radius,atomId)
amap={}
#submol = Chem.PathToSubmol(m,env,atomMap=amap)
ats = set([atomId])
for bidx in env:
bond = m.GetBondWithIdx(bidx)
ats.add(bond.GetBeginAtomIdx())
ats.add(bond.GetEndAtomIdx())
smi = Chem.MolFragmentToSmiles(m,atomsToUse=list(ats),bondsToUse=env,rootedAtAtom=atomId)
#smi = Chem.FastFindRings(smi)
if smiles.has_key(smi):
num_molecules = smiles[smi][0]
num_positives = smiles[smi][1]
if data[train[i]][1] == 1: num_positives+=1
smiles[smi] = (num_molecules+1,num_positives,bit[0],class0_prob,class1_prob)
else:
num_positives = 0
if data[train[i]][1] == 1: num_positives=1
smiles[smi] = (1,num_positives,bit[0],class0_prob,class1_prob)
except: # skip bits whose fragment cannot be extracted or written as SMILES
continue
i+=1
results=[]
for smi in smiles.keys():
ratio = float(smiles[smi][1])/smiles[smi][0]
if smiles[smi][0] > 5 and (ratio < 0.2 or ratio > 0.8): #show only substructures occurring in more than 5 molecules, with either less than 20% or more than 80% active proportion
results.append((smiles[smi][2],smiles[smi][0],smiles[smi][1],(smiles[smi][3]),(smiles[smi][4]),smi))
results.sort(key=lambda x: x[0])
for line in results:
print 'bit %i: \t%i (%i) molecules, in-class log_probabilities %0.3f %0.3f \t SMILES: %s'%line
bit 6: 6 (6) molecules, in-class log_probabilities -6.637 -5.154 SMILES: n(C)(c(n)N)c(c)c
bit 11: 8 (0) molecules, in-class log_probabilities -4.557 -5.848 SMILES: S(C)CC
bit 16: 12 (1) molecules, in-class log_probabilities -4.622 -5.665 SMILES: C(C)(CC)(C(=C)C)C(C)C
bit 34: 6 (5) molecules, in-class log_probabilities -5.250 -4.161 SMILES: C(C)n
bit 64: 10 (10) molecules, in-class log_probabilities -5.720 -4.413 SMILES: c(cc)(cs)C(N)=O
bit 79: 24 (1) molecules, in-class log_probabilities -3.803 -5.665 SMILES: C(c)C
bit 106: 7 (1) molecules, in-class log_probabilities -4.194 -5.260 SMILES: c(c)(n)O
bit 131: 15 (14) molecules, in-class log_probabilities -5.720 -4.624 SMILES: c(c)(c)s
bit 144: 14 (14) molecules, in-class log_probabilities -5.250 -4.125 SMILES: N(C)=[N+]
bit 164: 17 (16) molecules, in-class log_probabilities -3.774 -2.695 SMILES: c(c)([N+])s
bit 199: 29 (26) molecules, in-class log_probabilities -4.765 -2.784 SMILES: c(c(c)[N+])c(c)[N+]
bit 255: 36 (36) molecules, in-class log_probabilities -5.027 -3.506 SMILES: [N+](=O)([O-])c(c)o
bit 256: 14 (14) molecules, in-class log_probabilities -5.250 -4.199 SMILES: c(C=C)(cc)oc
bit 307: 7 (0) molecules, in-class log_probabilities -4.765 -6.071 SMILES: N(C)S
bit 312: 11 (11) molecules, in-class log_probabilities -5.720 -4.567 SMILES: N(CC)=[N+]=[N-]
bit 394: 15 (1) molecules, in-class log_probabilities -4.439 -6.358 SMILES: CS
bit 505: 10 (9) molecules, in-class log_probabilities -5.943 -4.461 SMILES: C(CC)(OC)c(c)c
bit 513: 7 (0) molecules, in-class log_probabilities -5.027 -6.071 SMILES: P(=O)(Oc)(Oc)Oc
bit 587: 20 (20) molecules, in-class log_probabilities -5.384 -3.628 SMILES: c(cc)(cc)C(N)=O
bit 598: 15 (2) molecules, in-class log_probabilities -3.666 -4.818 SMILES: C(c)(C)C
bit 606: 7 (1) molecules, in-class log_probabilities -3.480 -4.624 SMILES: c(=O)(nc)n(C)c
bit 628: 15 (2) molecules, in-class log_probabilities -4.239 -5.511 SMILES: C[N+]
bit 647: 46 (39) molecules, in-class log_probabilities -4.765 -3.673 SMILES: c(N)(cc)cc
bit 684: 7 (0) molecules, in-class log_probabilities -4.691 -6.764 SMILES: C(=O)(O)CO
bit 715: 508 (423) molecules, in-class log_probabilities -2.518 -1.379 SMILES: [O-]
bit 716: 206 (181) molecules, in-class log_probabilities -3.864 -2.194 SMILES: c(c)(c)[N+]
bit 716: 8 (7) molecules, in-class log_probabilities -3.864 -2.194 SMILES: n(cc)c(c)c
bit 724: 6 (1) molecules, in-class log_probabilities -4.385 -5.665 SMILES: C(c)(c)=N
bit 725: 11 (11) molecules, in-class log_probabilities -2.657 -1.499 SMILES: c(c(C)c)c([N+])s
bit 725: 237 (211) molecules, in-class log_probabilities -2.657 -1.499 SMILES: c(cc)(cc)c(c)c
bit 740: 28 (27) molecules, in-class log_probabilities -4.845 -3.525 SMILES: [N-]=[N+]
bit 753: 486 (411) molecules, in-class log_probabilities -2.852 -1.422 SMILES: O=[N+]
bit 780: 20 (19) molecules, in-class log_probabilities -6.231 -4.818 SMILES: C(=Cc)c(c)c
bit 801: 12 (1) molecules, in-class log_probabilities -4.691 -5.848 SMILES: c(cc)(c(N)s)c(c)n
bit 808: 7 (0) molecules, in-class log_probabilities -5.027 -6.764 SMILES: C(C)(C)(Cl)Cl
bit 811: 13 (13) molecules, in-class log_probabilities -5.384 -4.366 SMILES: O(C(C)=O)N(C)O
bit 815: 20 (2) molecules, in-class log_probabilities -4.557 -3.180 SMILES: S(c)(N)(=O)=O
bit 815: 101 (88) molecules, in-class log_probabilities -4.557 -3.180 SMILES: N(N)=O
bit 838: 495 (416) molecules, in-class log_probabilities -2.948 -1.422 SMILES: [O-][N+]
bit 851: 7 (0) molecules, in-class log_probabilities -4.497 -5.665 SMILES: C(Cc)(NC)C(=O)O
bit 860: 48 (46) molecules, in-class log_probabilities -6.231 -4.322 SMILES: c(c(c)[N+])c(c)c
bit 870: 13 (13) molecules, in-class log_probabilities -5.250 -4.056 SMILES: c(C)(c(c)c)c(c)c
bit 909: 21 (17) molecules, in-class log_probabilities -4.285 -5.511 SMILES: N(C)(C)N
bit 922: 7 (7) molecules, in-class log_probabilities -5.943 -4.238 SMILES: s(c(c)c)c(c)c
bit 961: 99 (82) molecules, in-class log_probabilities -3.929 -2.903 SMILES: c(cc)c(c)n
bit 978: 7 (0) molecules, in-class log_probabilities -4.285 -5.511 SMILES: c(cc)(c(c)C)c(c)[nH]
bit 979: 133 (116) molecules, in-class log_probabilities -4.152 -2.621 SMILES: O=N
bit 991: 7 (0) molecules, in-class log_probabilities -4.622 -5.665 SMILES: c(cc)(OC)c(c)Cl
bit 994: 11 (11) molecules, in-class log_probabilities -5.943 -4.567 SMILES: C(ON)c(c)c
bit 1007: 17 (15) molecules, in-class log_probabilities -5.720 -4.461 SMILES: c(c(-c)c)(c(-c)c)c(c)c
bit 1015: 10 (10) molecules, in-class log_probabilities -5.720 -4.567 SMILES: c(cc)(cc)N=O
bit 1033: 15 (13) molecules, in-class log_probabilities -5.720 -4.567 SMILES: c(nc)(c(n)N)c(n)n
bit 1034: 6 (1) molecules, in-class log_probabilities -3.746 -4.749 SMILES: C(C)(C)(O)O
bit 1056: 30 (30) molecules, in-class log_probabilities -5.250 -3.819 SMILES: c(cc)c([N+])o
bit 1083: 20 (20) molecules, in-class log_probabilities -4.334 -3.314 SMILES: N(=O)N(C)C
bit 1098: 19 (17) molecules, in-class log_probabilities -6.637 -4.413 SMILES: N(CC)c(c)c
bit 1110: 16 (3) molecules, in-class log_probabilities -3.641 -5.260 SMILES: c(Cl)(cc)c(c)O
bit 1129: 14 (1) molecules, in-class log_probabilities -4.765 -6.071 SMILES: C(NC)C(C)O
bit 1195: 460 (395) molecules, in-class log_probabilities -3.026 -1.448 SMILES: [N+](c)(=O)[O-]
bit 1216: 31 (31) molecules, in-class log_probabilities -6.637 -3.846 SMILES: c(cc)(oc)[N+](=O)[O-]
bit 1228: 77 (62) molecules, in-class log_probabilities -5.720 -6.764 SMILES: O(C)C
bit 1256: 23 (3) molecules, in-class log_probabilities -4.497 -6.071 SMILES: C(CC)C(=O)O
bit 1263: 7 (6) molecules, in-class log_probabilities -6.231 -4.684 SMILES: C(N)(O)=O
bit 1276: 6 (0) molecules, in-class log_probabilities -5.250 -6.358 SMILES: O(C(C)=O)C(C)C
bit 1276: 8 (7) molecules, in-class log_probabilities -5.250 -6.358 SMILES: C(=CC)c(c)c
bit 1300: 16 (13) molecules, in-class log_probabilities -4.557 -3.283 SMILES: C(Cc)C(c)=O
bit 1300: 53 (48) molecules, in-class log_probabilities -4.557 -3.283 SMILES: n(c(c)c)c(c)c
bit 1326: 12 (1) molecules, in-class log_probabilities -4.622 -6.764 SMILES: C(CC)(C(C)C)C(C)C
bit 1354: 9 (0) molecules, in-class log_probabilities -5.538 -6.764 SMILES: c(Cl)(cc)c(c)C
bit 1358: 6 (1) molecules, in-class log_probabilities -5.250 -6.764 SMILES: C(C)Nc
bit 1365: 78 (13) molecules, in-class log_probabilities -3.666 -4.892 SMILES: c(C)(c)c
bit 1365: 12 (11) molecules, in-class log_probabilities -3.666 -4.892 SMILES: c(O)(c(C)c)c(c)C
bit 1366: 59 (9) molecules, in-class log_probabilities -2.841 -4.023 SMILES: C(CC)C(C)C
bit 1382: 8 (0) molecules, in-class log_probabilities -4.439 -7.457 SMILES: C(=O)(CC)Nc
bit 1386: 66 (12) molecules, in-class log_probabilities -3.322 -4.684 SMILES: C(C)(O)=O
bit 1386: 6 (1) molecules, in-class log_probabilities -3.322 -4.684 SMILES: C(CC)C(C)=C
bit 1393: 7 (0) molecules, in-class log_probabilities -5.538 -7.457 SMILES: C(CC)C(C)N
bit 1421: 15 (2) molecules, in-class log_probabilities -5.538 -6.764 SMILES: C(C)(C)=O
bit 1421: 6 (1) molecules, in-class log_probabilities -5.538 -6.764 SMILES: O(CC)P(O)(O)=O
bit 1433: 24 (23) molecules, in-class log_probabilities -6.231 -4.567 SMILES: c(cc)(c(c)c)[N+](=O)[O-]
bit 1434: 8 (1) molecules, in-class log_probabilities -5.720 -6.764 SMILES: c(cc)(cc)C(C)O
bit 1444: 242 (40) molecules, in-class log_probabilities -3.039 -4.056 SMILES: C(CC)CC
bit 1449: 28 (27) molecules, in-class log_probabilities -5.133 -4.023 SMILES: [N-]
bit 1489: 13 (2) molecules, in-class log_probabilities -5.720 -6.764 SMILES: c(cc)(cc)S(N)(=O)=O
bit 1490: 13 (12) molecules, in-class log_probabilities -5.027 -6.764 SMILES: C(=O)(NC)c(c)c
bit 1500: 26 (25) molecules, in-class log_probabilities -7.330 -5.848 SMILES: c(cc)(c(c)C)c(c)c
bit 1532: 8 (1) molecules, in-class log_probabilities -4.691 -6.358 SMILES: C(=O)(O)C=C
bit 1532: 6 (1) molecules, in-class log_probabilities -4.691 -6.358 SMILES: c(c(c)S)c(c)N
bit 1549: 8 (7) molecules, in-class log_probabilities -6.637 -5.154 SMILES: c(N=N)(c(c)O)c(c)S
bit 1553: 8 (8) molecules, in-class log_probabilities -6.637 -5.154 SMILES: c(nc)(c(c)c)c(c)n
bit 1554: 40 (7) molecules, in-class log_probabilities -4.194 -5.511 SMILES: c(Cl)(cc)c(c)Cl
bit 1562: 6 (6) molecules, in-class log_probabilities -6.231 -5.059 SMILES: c(cc)(c(c)n)c(c)n
bit 1566: 15 (14) molecules, in-class log_probabilities -6.637 -4.892 SMILES: c(cc)(c(c)c)c(c)n
bit 1574: 260 (235) molecules, in-class log_probabilities -4.034 -5.665 SMILES: c(cc)(c(c)c)c(c)c
bit 1574: 9 (0) molecules, in-class log_probabilities -4.034 -5.665 SMILES: N(CC)C(C)C
bit 1582: 23 (2) molecules, in-class log_probabilities -4.152 -5.511 SMILES: C(CC)C(C)(C)C
bit 1582: 12 (2) molecules, in-class log_probabilities -4.152 -5.511 SMILES: C(C)(C)=CC
bit 1592: 10 (1) molecules, in-class log_probabilities -4.622 -6.071 SMILES: O(CC)C(c)=O
bit 1611: 7 (6) molecules, in-class log_probabilities -5.250 -7.457 SMILES: n(c(n)N)c(c)c
bit 1615: 9 (9) molecules, in-class log_probabilities -7.330 -4.624 SMILES: c(cc)(c(c)c)c(c)[nH]
bit 1624: 25 (2) molecules, in-class log_probabilities -4.034 -5.154 SMILES: C(=O)(O)C(C)N
bit 1648: 67 (56) molecules, in-class log_probabilities -4.497 -3.314 SMILES: c(c)(c)C
bit 1655: 23 (23) molecules, in-class log_probabilities -5.943 -4.238 SMILES: o(c(C)c)c(c)[N+]
bit 1660: 12 (0) molecules, in-class log_probabilities -4.932 -6.358 SMILES: C(=O)(NC)C(C)N
bit 1660: 7 (0) molecules, in-class log_probabilities -4.932 -6.358 SMILES: c(cc)(CC)c(C)c
bit 1674: 19 (19) molecules, in-class log_probabilities -5.720 -4.684 SMILES: C1(c(c)c)OC1c
bit 1707: 44 (39) molecules, in-class log_probabilities -6.637 -5.511 SMILES: C1OC1C
bit 1718: 187 (32) molecules, in-class log_probabilities -4.152 -5.848 SMILES: C(C)C
bit 1718: 12 (0) molecules, in-class log_probabilities -4.152 -5.848 SMILES: C(CC)C(N)=O
bit 1718: 7 (1) molecules, in-class log_probabilities -4.152 -5.848 SMILES: C(C=C)C(C)C
bit 1733: 23 (2) molecules, in-class log_probabilities -3.480 -2.110 SMILES: C(=O)(CC)OC
bit 1734: 27 (1) molecules, in-class log_probabilities -3.833 -5.059 SMILES: C(CC)OC
bit 1737: 116 (22) molecules, in-class log_probabilities -3.053 -4.090 SMILES: C(C)(=O)O
bit 1746: 47 (6) molecules, in-class log_probabilities -4.765 -6.071 SMILES: C(=O)(O)CC
bit 1757: 27 (3) molecules, in-class log_probabilities -5.384 -6.764 SMILES: N(C)(C)C
bit 1778: 16 (1) molecules, in-class log_probabilities -3.864 -5.154 SMILES: c(cc)(cc)CC
bit 1792: 6 (6) molecules, in-class log_probabilities -6.637 -5.378 SMILES: C(Cl)CN
bit 1805: 21 (3) molecules, in-class log_probabilities -6.637 -5.260 SMILES: C(CC)C(C)O
bit 1805: 6 (6) molecules, in-class log_probabilities -6.637 -5.260 SMILES: c(c(c)N)c(c)[N+]
bit 1809: 416 (357) molecules, in-class log_probabilities -3.719 -2.270 SMILES: [N+](=O)([O-])c(c)c
bit 1814: 286 (252) molecules, in-class log_probabilities -4.239 -2.453 SMILES: c(cc)c(c)[N+]
bit 1838: 27 (26) molecules, in-class log_probabilities -5.133 -4.090 SMILES: [N+](=N)=[N-]
bit 1845: 34 (3) molecules, in-class log_probabilities -6.231 -5.154 SMILES: C(=O)(OC)c(c)c
bit 1847: 22 (1) molecules, in-class log_probabilities -3.962 -5.260 SMILES: C(C)(C)N
bit 1853: 26 (5) molecules, in-class log_probabilities -3.962 -5.059 SMILES: c(cc)c(C)c
bit 1856: 30 (25) molecules, in-class log_probabilities -5.384 -4.125 SMILES: c(cc)c(c)N
bit 1875: 23 (19) molecules, in-class log_probabilities -5.538 -6.764 SMILES: c(O)(cc)c(C)c
bit 1880: 36 (36) molecules, in-class log_probabilities -5.720 -6.764 SMILES: c(cc)(c(c)[N+])c(c)c
bit 1884: 6 (5) molecules, in-class log_probabilities -4.239 -5.665 SMILES: c(cc)(c(c)c)C(C)O
bit 1884: 25 (3) molecules, in-class log_probabilities -4.239 -5.665 SMILES: C(c)(C)(C)C
bit 1884: 61 (12) molecules, in-class log_probabilities -4.239 -5.665 SMILES: C(C)(C)(C)C
bit 1894: 26 (2) molecules, in-class log_probabilities -5.720 -6.764 SMILES: C(CC)=C(C)C
bit 1912: 122 (112) molecules, in-class log_probabilities -4.765 -3.223 SMILES: c(c(c)c)(c(c)c)c(c)c
bit 1914: 7 (6) molecules, in-class log_probabilities -4.932 -3.545 SMILES: C(CCl)N(C)C
bit 1914: 100 (90) molecules, in-class log_probabilities -4.932 -3.545 SMILES: c(c(c)c)c(c)c
bit 1923: 8 (1) molecules, in-class log_probabilities -4.239 -3.238 SMILES: C(C)(C)OC
bit 1925: 63 (52) molecules, in-class log_probabilities -5.027 -3.874 SMILES: C(=O)(c(c)c)c(c)c
bit 1925: 7 (7) molecules, in-class log_probabilities -5.027 -3.874 SMILES: C(C)(N)O
bit 1936: 7 (7) molecules, in-class log_probabilities -5.943 -4.413 SMILES: c(c(c)C)c(c)c
bit 1940: 186 (154) molecules, in-class log_probabilities -4.152 -2.882 SMILES: c(cc)(cc)[N+](=O)[O-]
bit 1940: 32 (6) molecules, in-class log_probabilities -4.152 -2.882 SMILES: C(CC)(C(C)C)C(C)(C)C
bit 1950: 161 (25) molecules, in-class log_probabilities -3.864 -5.378 SMILES: C(C)(C)C
bit 1963: 531 (448) molecules, in-class log_probabilities -2.887 -1.403 SMILES: [N+]
bit 1963: 6 (0) molecules, in-class log_probabilities -2.887 -1.403 SMILES: O(C(C)C)C(C)O
bit 1984: 19 (18) molecules, in-class log_probabilities -3.235 -1.805 SMILES: O(C)N
bit 1984: 316 (263) molecules, in-class log_probabilities -3.235 -1.805 SMILES: c(c)(c)c
bit 2006: 12 (1) molecules, in-class log_probabilities -5.250 -6.358 SMILES: C(c(c)c)C(C)N
bit 2016: 10 (0) molecules, in-class log_probabilities -4.845 -6.764 SMILES: N(C(C)=O)C(C)C
bit 2022: 30 (3) molecules, in-class log_probabilities -5.720 -6.764 SMILES: C(OC)(OC)C(C)O
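The fragment SMILES in the table can also be drawn to make the enriched substructures easier to inspect. The sketch below (not part of the original tutorial) parses the first dozen fragments as queries, since the environment SMILES are not guaranteed to be valid molecules on their own; depending on the RDKit version some fragments may still fail to render:
from rdkit.Chem import Draw
frag_mols = [Chem.MolFromSmarts(r[5]) for r in results[:12]]
legends = ['bit %d (%d/%d active)' % (r[0], r[2], r[1]) for r in results[:12]]
Draw.MolsToGridImage(frag_mols, molsPerRow=4, legends=legends)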
The following section demonstrates the construction of regression models using Random Forests from scikit-learn. The data set is a CSV file with precalculated descriptors for the adenosine A2A receptor (accession P29274) and consists of ~1500 compounds obtained from ChEMBL_18. The tutorial uses pandas data frames.
The data can be obtained from here : http://www.gjpvanwesten.nl/miscellaneous/A2A_Adenosine_set_mychembl.csv
import numpy as np
import os,sys
import gzip
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
import rdkit.rdBase
from rdkit.Chem.MACCSkeys import GenMACCSKeys
from rdkit import DataStructs
from rdkit.DataStructs import BitVectToText
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit.Chem import MCS as MCS
from rdkit.Chem import Descriptors as Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
import pandas as pd
from rdkit.Chem import PandasTools as PandasTools
import pylab as plt
# Definition of a function to remove near zero variance descriptors
from collections import defaultdict
def rmNearZeroVarDescs(descs,ratio):
colNb = descs.shape[1]
to_keep=[]
for j in range(0,colNb):
descs_now = descs[:,j]
d = defaultdict(int)
for i in descs_now:
d[i] += 1
freqs=sorted(d.iteritems(), key=lambda x: x[1], reverse=True)
if len(freqs) > 1: # there is more than one value..
most_frequent1 = freqs[0][1]
most_frequent2 = freqs[1][1]
if float(most_frequent1) / most_frequent2 < ratio: # use true division so the frequency ratio is not floored
to_keep.append(j)
return np.array(to_keep)
To demonstrate the use of pandas, the data is loaded and manipulated as dataframes.
frame = pd.read_csv('/home/chembl/ipynb_workbench/A2A_Adenosine_set_mychembl.csv')
The data should have 1567 rows and 497 columns.
frame.shape
(1567, 497)
frame.head()
  | ACCESSION | MOLREGNO | PCHEMBL_VALUE | HBA | HBD | PSA | RTB | MED_CHEM_FRIENDLY | NUM_ALERTS | HEAVY_ATOMS | ... | CM#FCFP_6#-835254725 | CM#FCFP_6#238890401 | CM#FCFP_6#522623800 | CM#FCFP_6#426949827 | CM#FCFP_6#930691991 | CM#FCFP_6#-692413328 | CM#FCFP_6#-703392038 | CM#FCFP_6#238434898 | adenosine_temp | diff
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | P29274 | 591791 | 7.24 | 6 | 2 | 103.02 | 5 | 1 | 0 | 25 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7.239994 | 0.000006 |
1 | P29274 | 439974 | 8.27 | 5 | 1 | 89.08 | 5 | 1 | 0 | 30 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8.269871 | 0.000129 |
2 | P29274 | 500067 | 6.48 | 6 | 3 | 106.24 | 10 | 1 | 1 | 39 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6.480154 | 0.000154 |
3 | P29274 | 1441734 | 6.91 | 6 | 2 | 106.30 | 7 | 1 | 1 | 25 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6.909724 | 0.000276 |
4 | P29274 | 1421790 | 5.96 | 3 | 2 | 70.67 | 4 | 1 | 0 | 17 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.959641 | 0.000359 |
5 rows × 497 columns
The full frame is copied to form the x variables. From this copy, the y variable and the columns not required for learning are dropped. Subsequently, the y variable is copied to a separate dataframe.
x = frame
PandasTools.AddMoleculeColumnToFrame(x,smilesCol='Smiles',molCol='mol',includeFingerprints=True)
#remove properties not required for learning
x = x.drop('Smiles',1)
x = x.drop('mol',1)
x = x.drop('ACCESSION',1)
x = x.drop('MOLREGNO',1)
x = x.drop('PCHEMBL_VALUE',1)
x = x.drop('adenosine_temp' ,1)
x = x.drop('diff' ,1)
#output variables are transferred to a frame called y
y = frame.PCHEMBL_VALUE
#use the column names as labels for later
Desc_values = list(x.columns)
Desc_values
['HBA', 'HBD', 'PSA', 'RTB', 'MED_CHEM_FRIENDLY', 'NUM_ALERTS', 'HEAVY_ATOMS', 'FULL_MWT', 'AROMATIC_RINGS', 'NUM_RO5_VIOLATIONS', 'RO3_PASS', 'Acid', 'Base', 'Unknown', 'Ki', 'Kd', 'IC50', 'EC50', 'AC50', 'Other_activity_type', 'LogP', 'Num_Atoms', 'Num_Hydrogens', 'Num_ExplicitBonds', 'Num_PositiveAtoms', 'Num_NegativeAtoms', 'Num_RingBonds', 'Num_RotatableBonds', 'Num_AromaticBonds', 'Num_BridgeBonds', 'Num_Rings', 'Num_RingAssemblies', 'Num_Chains', 'Num_ChainAssemblies', 'Num_MetalAtoms', 'Num_PiBonds', 'Num_StereoAtoms', 'Num_StereoBonds', 'Num_SingleBonds', 'Num_DoubleBonds', 'Num_TripleBonds', 'Num_AliphaticSingleBonds', 'Num_HydrogenBonds', 'Num_TerminalRotomers', 'Num_H_Acceptors', 'Num_H_Donors', 'H_Count', 'C_Count', 'N_Count', 'O_Count', 'F_Count', 'P_Count', 'S_Count', 'Cl_Count', 'Br_Count', 'I_Count', 'Molecular_Solubility', 'Molecular_Volume', 'Molecular_SASA', 'Molecular_PolarSASA_Fraction', 'Solubility', 'Total_Atoms', 'Num_AliphaticRings', 'Heavy_Atom_Bonds', 'Hydrogen_Atom_Bonds', 'SingleBonds_Frac', 'DoubleBonds_Frac', 'AromaticBonds_Frac', 'BridgeBonds_Frac', 'StereoBonds_Frac', 'RingBonds_Frac', 'Aliphatic_Ringbonds_Frac', 'RotatableBonds_Frac', 'Num_Halogens', 'Carbon_fraction', 'Hydrogen_fraction', 'Nitrogen_fraction', 'Heteroatom_fraction', 'Sulphur_fraction', 'Phosphorus_fraction', 'Halogen_fraction', 'OtherAtom_fraction', 'StereoAtom_Fraction', 'PositiveAtom_Fraction', 'NegativeAtom_Fraction', 'H_Acceptors_Fraction', 'H_Donors_fraction', 'Molecular_Weight', 'Lipinski_Pass', 'Molecular_PolarSurfaceArea', 'Molecular_PolarSurfaceArea_Fraction', 'CM#FCFP_6#-65982584', 'CM#FCFP_6#1220222990', 'CM#FCFP_6#-1982171447', 'CM#FCFP_6#-2001551693', 'CM#FCFP_6#-1771696584', 'CM#FCFP_6#1979799016', 'CM#FCFP_6#-1115205388', 'CM#FCFP_6#-65868451', 'CM#FCFP_6#-492065043', 'CM#FCFP_6#-1642107661', 'CM#FCFP_6#-189641276', 'CM#FCFP_6#1462500218', 'CM#FCFP_6#1531826645', 'CM#FCFP_6#-293671387', 'CM#FCFP_6#-1071560772', 'CM#FCFP_6#27378098', 'CM#FCFP_6#-1001873921', 'CM#FCFP_6#193706886', 'CM#FCFP_6#1164792596', 'CM#FCFP_6#-1146141487', 'CM#FCFP_6#285374308', 'CM#FCFP_6#-498663958', 'CM#FCFP_6#-1479282148', 'CM#FCFP_6#-971683173', 'CM#FCFP_6#-699707120', 'CM#FCFP_6#1253847022', 'CM#FCFP_6#391068706', 'CM#FCFP_6#1982718309', 'CM#FCFP_6#1703564034', 'CM#FCFP_6#1140530667', 'CM#FCFP_6#187601626', 'CM#FCFP_6#-1459071555', 'CM#FCFP_6#-434116010', 'CM#FCFP_6#230514996', 'CM#FCFP_6#-936120204', 'CM#FCFP_6#372254866', 'CM#FCFP_6#-917072037', 'CM#FCFP_6#1692787152', 'CM#FCFP_6#73264552', 'CM#FCFP_6#1827402357', 'CM#FCFP_6#-327407377', 'CM#FCFP_6#-67620761', 'CM#FCFP_6#-1941818284', 'CM#FCFP_6#-1286158246', 'CM#FCFP_6#-54973313', 'CM#FCFP_6#1779217362', 'CM#FCFP_6#-1526113663', 'CM#FCFP_6#-2137417524', 'CM#FCFP_6#-2074416831', 'CM#FCFP_6#-1073796186', 'CM#FCFP_6#-413791989', 'CM#FCFP_6#-1532598714', 'CM#FCFP_6#-378110045', 'CM#FCFP_6#365650923', 'CM#FCFP_6#-1059904848', 'CM#FCFP_6#-1005932975', 'CM#FCFP_6#-972801681', 'CM#FCFP_6#-969679530', 'CM#FCFP_6#1685259762', 'CM#FCFP_6#-2090379603', 'CM#FCFP_6#835252212', 'CM#FCFP_6#1618940161', 'CM#FCFP_6#-1968861350', 'CM#FCFP_6#461422072', 'CM#FCFP_6#1234403192', 'CM#FCFP_6#1430777963', 'CM#FCFP_6#1976692152', 'CM#FCFP_6#69791780', 'CM#FCFP_6#1791505287', 'CM#FCFP_6#1635208540', 'CM#FCFP_6#-1401467450', 'CM#FCFP_6#1972424121', 'CM#FCFP_6#-1670510168', 'CM#FCFP_6#254350405', 'CM#FCFP_6#-645813023', 'CM#FCFP_6#603150292', 'CM#FCFP_6#684199633', 'CM#FCFP_6#480222659', 'CM#FCFP_6#1603387139', 'CM#FCFP_6#-1637545813', 'CM#FCFP_6#-2129824562', 
'CM#FCFP_6#-330170411', 'CM#FCFP_6#1213951251', 'CM#FCFP_6#1284402681', 'CM#FCFP_6#623390154', 'CM#FCFP_6#1148911516', 'CM#FCFP_6#1135800885', 'CM#FCFP_6#362731743', 'CM#FCFP_6#-1577786275', 'CM#FCFP_6#1868119574', 'CM#FCFP_6#1068095685', 'CM#FCFP_6#94632228', 'CM#FCFP_6#929973182', 'CM#FCFP_6#-1420307139', 'CM#FCFP_6#444284352', 'CM#FCFP_6#-2080611022', 'CM#FCFP_6#592529482', 'CM#FCFP_6#-222683903', 'CM#FCFP_6#426057100', 'CM#FCFP_6#-2072891781', 'CM#FCFP_6#1330036798', 'CM#FCFP_6#-18280642', 'CM#FCFP_6#18254621', 'CM#FCFP_6#-308845632', 'CM#FCFP_6#1253553371', 'CM#FCFP_6#-1142665917', 'CM#FCFP_6#1163770894', 'CM#FCFP_6#1790016274', 'CM#FCFP_6#982725372', 'CM#FCFP_6#-1145636268', 'CM#FCFP_6#1607499308', 'CM#FCFP_6#1299628955', 'CM#FCFP_6#-1430326583', 'CM#FCFP_6#1268234302', 'CM#FCFP_6#808131372', 'CM#FCFP_6#-769112624', 'CM#FCFP_6#1934560662', 'CM#FCFP_6#1543168363', 'CM#FCFP_6#-2113922173', 'CM#FCFP_6#1161264108', 'CM#FCFP_6#-822042736', 'CM#FCFP_6#-1788355948', 'CM#FCFP_6#-617816210', 'CM#FCFP_6#-742051626', 'CM#FCFP_6#439408450', 'CM#FCFP_6#-1492117349', 'CM#FCFP_6#659537339', 'CM#FCFP_6#1894053239', 'CM#FCFP_6#-458341008', 'CM#FCFP_6#83705358', 'CM#FCFP_6#731405538', 'CM#FCFP_6#514530057', 'CM#FCFP_6#-1915268225', 'CM#FCFP_6#360781885', 'CM#FCFP_6#899878122', 'CM#FCFP_6#-1920691791', 'CM#FCFP_6#761090368', 'CM#FCFP_6#-411183202', 'CM#FCFP_6#102056907', 'CM#FCFP_6#888862702', 'CM#FCFP_6#1838609335', 'CM#FCFP_6#-990511789', 'CM#FCFP_6#460813766', 'CM#FCFP_6#-343334360', 'CM#FCFP_6#728315148', 'CM#FCFP_6#-1050194120', 'CM#FCFP_6#-1978061625', 'CM#FCFP_6#304605719', 'CM#FCFP_6#-1113036043', 'CM#FCFP_6#772808984', 'CM#FCFP_6#486646069', 'CM#FCFP_6#-1943511212', 'CM#FCFP_6#-1339251179', 'CM#FCFP_6#-1094335641', 'CM#FCFP_6#-1001309318', 'CM#FCFP_6#362069713', 'CM#FCFP_6#242046271', 'CM#FCFP_6#870604905', 'CM#FCFP_6#-1481112978', 'CM#FCFP_6#-2140071559', 'CM#FCFP_6#-1946918893', 'CM#FCFP_6#186783495', 'CM#FCFP_6#1195356364', 'CM#FCFP_6#-466948462', 'CM#FCFP_6#-123076023', 'CM#FCFP_6#-371808660', 'CM#FCFP_6#257620176', 'CM#FCFP_6#-400546712', 'CM#FCFP_6#543850689', 'CM#FCFP_6#1628996979', 'CM#FCFP_6#-1438008766', 'CM#FCFP_6#458149324', 'CM#FCFP_6#344221219', 'CM#FCFP_6#1333957356', 'CM#FCFP_6#-230926664', 'CM#FCFP_6#-1351757408', 'CM#FCFP_6#-455356268', 'CM#FCFP_6#35656213', 'CM#FCFP_6#1308153601', 'CM#FCFP_6#744772509', 'CM#FCFP_6#-682235529', 'CM#FCFP_6#1284767392', 'CM#FCFP_6#2124897395', 'CM#FCFP_6#338549743', 'CM#FCFP_6#-1960096778', 'CM#FCFP_6#622039763', 'CM#FCFP_6#860901833', 'CM#FCFP_6#1199403296', 'CM#FCFP_6#-1855248144', 'CM#FCFP_6#-570509192', 'CM#FCFP_6#975191202', 'CM#FCFP_6#-1914943219', 'CM#FCFP_6#-1435777484', 'CM#FCFP_6#1767086105', 'CM#FCFP_6#1935218400', 'CM#FCFP_6#-499162405', 'CM#FCFP_6#-650347350', 'CM#FCFP_6#1690711612', 'CM#FCFP_6#1246724734', 'CM#FCFP_6#1818492813', 'CM#FCFP_6#843746921', 'CM#FCFP_6#1977019013', 'CM#FCFP_6#-174657146', 'CM#FCFP_6#1673458031', 'CM#FCFP_6#-1803454487', 'CM#FCFP_6#663794721', 'CM#FCFP_6#-1741978002', 'CM#FCFP_6#-1291456811', 'CM#FCFP_6#-972090189', 'CM#FCFP_6#-1943506009', 'CM#FCFP_6#1853719422', 'CM#FCFP_6#-1057606618', 'CM#FCFP_6#1988809665', 'CM#FCFP_6#-315789915', 'CM#FCFP_6#2074795133', 'CM#FCFP_6#-746063837', 'CM#FCFP_6#-196821185', 'CM#FCFP_6#-883103639', 'CM#FCFP_6#-1215432213', 'CM#FCFP_6#-1263741263', 'CM#FCFP_6#767545668', 'CM#FCFP_6#362122913', 'CM#FCFP_6#556912035', 'CM#FCFP_6#1977011098', 'CM#FCFP_6#1855929186', 'CM#FCFP_6#91530927', 'CM#FCFP_6#-1838807585', 'CM#FCFP_6#1931481677', 'CM#FCFP_6#1406130289', 
'CM#FCFP_6#-1143626567', 'CM#FCFP_6#562470208', 'CM#FCFP_6#-1634335053', 'CM#FCFP_6#804569186', 'CM#FCFP_6#985033767', 'CM#FCFP_6#932957673', 'CM#FCFP_6#-1162472683', 'CM#FCFP_6#-1053061282', 'CM#FCFP_6#722376637', 'CM#FCFP_6#723745966', 'CM#FCFP_6#-401775922', 'CM#FCFP_6#-1460694974', 'CM#FCFP_6#814438839', 'CM#FCFP_6#283309451', 'CM#FCFP_6#1904223630', 'CM#FCFP_6#-249417308', 'CM#FCFP_6#-448308528', 'CM#FCFP_6#1508475473', 'CM#FCFP_6#-1374907068', 'CM#FCFP_6#-540974741', 'CM#FCFP_6#1526117259', 'CM#FCFP_6#869938685', 'CM#FCFP_6#-1957956567', 'CM#FCFP_6#737073353', 'CM#FCFP_6#-619727385', 'CM#FCFP_6#-2083973249', 'CM#FCFP_6#1156755609', 'CM#FCFP_6#-160877196', 'CM#FCFP_6#-1073063049', 'CM#FCFP_6#436975416', 'CM#FCFP_6#1117446859', 'CM#FCFP_6#1197197857', 'CM#FCFP_6#-7655301', 'CM#FCFP_6#-2123312895', 'CM#FCFP_6#-1340282797', 'CM#FCFP_6#610160119', 'CM#FCFP_6#655144241', 'CM#FCFP_6#597068753', 'CM#FCFP_6#1441491593', 'CM#FCFP_6#1734354255', 'CM#FCFP_6#1293778554', 'CM#FCFP_6#840041502', 'CM#FCFP_6#442764008', 'CM#FCFP_6#-2084694990', 'CM#FCFP_6#224707551', 'CM#FCFP_6#72243472', 'CM#FCFP_6#517496183', 'CM#FCFP_6#606312525', 'CM#FCFP_6#-965582482', 'CM#FCFP_6#1097027750', 'CM#FCFP_6#-1384732668', 'CM#FCFP_6#-1529179653', 'CM#FCFP_6#-547603237', 'CM#FCFP_6#-668700475', 'CM#FCFP_6#-1535076754', 'CM#FCFP_6#564330994', 'CM#FCFP_6#1039701897', 'CM#FCFP_6#-1760059174', 'CM#FCFP_6#364727144', 'CM#FCFP_6#873995365', 'CM#FCFP_6#1716732258', 'CM#FCFP_6#-697570237', 'CM#FCFP_6#881330265', 'CM#FCFP_6#-145818131', 'CM#FCFP_6#-1424104851', 'CM#FCFP_6#-1856161761', 'CM#FCFP_6#908827089', 'CM#FCFP_6#-1811624534', 'CM#FCFP_6#1895855546', 'CM#FCFP_6#-4881488', 'CM#FCFP_6#305671550', 'CM#FCFP_6#1676761321', 'CM#FCFP_6#1752769507', 'CM#FCFP_6#412483143', 'CM#FCFP_6#-1748509072', 'CM#FCFP_6#1930116051', 'CM#FCFP_6#1589758501', 'CM#FCFP_6#-753319155', 'CM#FCFP_6#682513269', 'CM#FCFP_6#417888861', 'CM#FCFP_6#-1615197798', 'CM#FCFP_6#2069726452', 'CM#FCFP_6#1650783316', 'CM#FCFP_6#-57892020', 'CM#FCFP_6#2019264906', 'CM#FCFP_6#1789478575', 'CM#FCFP_6#-285234767', 'CM#FCFP_6#305251661', 'CM#FCFP_6#1655769626', 'CM#FCFP_6#1773134291', 'CM#FCFP_6#-589813731', 'CM#FCFP_6#1905931260', 'CM#FCFP_6#1572590862', 'CM#FCFP_6#-682930309', 'CM#FCFP_6#-1523364508', 'CM#FCFP_6#79404425', 'CM#FCFP_6#-58569187', 'CM#FCFP_6#-16171250', 'CM#FCFP_6#1514546927', 'CM#FCFP_6#-1601936993', 'CM#FCFP_6#-1548711670', 'CM#FCFP_6#123029058', 'CM#FCFP_6#514431804', 'CM#FCFP_6#-1219848372', 'CM#FCFP_6#6866222', 'CM#FCFP_6#-2059327199', 'CM#FCFP_6#1245516777', 'CM#FCFP_6#721109147', 'CM#FCFP_6#1148739285', 'CM#FCFP_6#72082411', 'CM#FCFP_6#1664708079', 'CM#FCFP_6#-612705684', 'CM#FCFP_6#-243694182', 'CM#FCFP_6#-1261114447', 'CM#FCFP_6#839741273', 'CM#FCFP_6#1960525771', 'CM#FCFP_6#1957636292', 'CM#FCFP_6#-1781982802', 'CM#FCFP_6#-1968900341', 'CM#FCFP_6#1991572711', 'CM#FCFP_6#839265750', 'CM#FCFP_6#-1816854191', 'CM#FCFP_6#996042223', 'CM#FCFP_6#-291402986', 'CM#FCFP_6#-175209142', 'CM#FCFP_6#1880146817', 'CM#FCFP_6#-1170294589', 'CM#FCFP_6#-648965472', 'CM#FCFP_6#-926642086', 'CM#FCFP_6#-509337055', 'CM#FCFP_6#-925106659', 'CM#FCFP_6#101683474', 'CM#FCFP_6#-283882430', 'CM#FCFP_6#-415156552', 'CM#FCFP_6#-212177535', 'CM#FCFP_6#2073268805', 'CM#FCFP_6#-1815229831', 'CM#FCFP_6#-59057453', 'CM#FCFP_6#-1871596165', 'CM#FCFP_6#-1974151053', 'CM#FCFP_6#968733428', 'CM#FCFP_6#-918244187', 'CM#FCFP_6#-550198050', 'CM#FCFP_6#-636452210', 'CM#FCFP_6#261517095', 'CM#FCFP_6#-1105635900', 'CM#FCFP_6#1257570395', 'CM#FCFP_6#-1014592515', 
'CM#FCFP_6#1394363492', 'CM#FCFP_6#-1257270106', 'CM#FCFP_6#1880999728', 'CM#FCFP_6#-1784895416', 'CM#FCFP_6#-712094106', 'CM#FCFP_6#1556104795', 'CM#FCFP_6#238586068', 'CM#FCFP_6#1642539255', 'CM#FCFP_6#-446585171', 'CM#FCFP_6#862622023', 'CM#FCFP_6#1983738699', 'CM#FCFP_6#1963526533', 'CM#FCFP_6#1983298659', 'CM#FCFP_6#2019060838', 'CM#FCFP_6#604539105', 'CM#FCFP_6#-835254725', 'CM#FCFP_6#238890401', 'CM#FCFP_6#522623800', 'CM#FCFP_6#426949827', 'CM#FCFP_6#930691991', 'CM#FCFP_6#-692413328', 'CM#FCFP_6#-703392038', 'CM#FCFP_6#238434898']
#set to floating point type
x = x.astype('float64')
# optional : visualize x.dtypes
#Optional : check for missing values
#pd.isnull(x).any()
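The rmNearZeroVarDescs helper defined earlier is not applied anywhere in this tutorial, but this is where it would fit. A usage sketch, commented out because running it changes the column count and hence the numbers reported below (the ratio of 20 is an arbitrary illustrative choice):
#kept_cols = rmNearZeroVarDescs(x.values, 20)
#x = x.iloc[:, kept_cols]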
from sklearn import tree, grid_search,cross_validation
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
from sklearn import preprocessing
from sklearn.grid_search import GridSearchCV
# Divide the original dataset into training (80%) and test (20%) set
X_train2, X_test2, y_train2, y_test2 = cross_validation.train_test_split(x, y, test_size=0.2, random_state=23)
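A quick optional check that the 80-20 split came out as intended:
print(X_train2.shape)
print(X_test2.shape)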
#import and Learn the model
from sklearn import tree
from sklearn import ensemble
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
custom_forest = ensemble.RandomForestRegressor()
params = {'oob_score':[True],'n_estimators':[250]}
cv = KFold(n=len(X_train2),n_folds=10,shuffle=True)
cv_stratified = StratifiedKFold(y_train2, n_folds=5)
gs = GridSearchCV(custom_forest, params, cv=cv_stratified,verbose=1,refit=True)
gs.fit(X_train2,y_train2)
ztest2 = gs.predict(X_test2)
ztest_train2 = gs.predict(X_train2)
best_forest2 = gs.best_estimator_
print '80% - 20% completed'
print 'Best score: %0.2f'%gs.best_score_
print 'Training set performance using best parameters (%s)'%gs.best_params_
print 'Explained variance (Internal): %0.5f'%(best_forest2.score(X_train2,y_train2))
print 'MSE (Internal): %0.5f'%(mean_squared_error(y_train2,ztest_train2))
print 'Explained variance (External): %0.5f'%(best_forest2.score(X_test2,y_test2))
print 'MSE (External): %0.5f'%(mean_squared_error(y_test2,ztest2))
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 39.7s finished
80% - 20% completed
Best score: 0.83
Training set performance using best parameters ({'n_estimators': 250, 'oob_score': True})
Explained variance (Internal): 0.98161
MSE (Internal): 0.01817
Explained variance (External): 0.85877
MSE (External): 0.13041
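Since the forest was fit with oob_score=True, the refit estimator also carries an out-of-bag R^2 estimate that the cell above does not print (the exact value varies from run to run):
print('OOB score: %0.5f' % best_forest2.oob_score_)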
figure,(plt5,plt6) = pylab.subplots(1,2)
figure.set_size_inches(15,5)
plt5.plot(x-1,x,c='red', linewidth=1,zorder=1) # reference line y = x + 1 (drawn from the descriptor values)
plt5.plot(x+1,x,c='red', linewidth=1,zorder=1) # reference line y = x - 1
plt5.scatter(y_test2,ztest2,c='green',label='test',zorder=2)
minn = np.min(np.concatenate([y_test2,ztest2],axis=0))
maxx = np.max(np.concatenate([y_test2,ztest2],axis=0))
plt5.set_ylim([minn,maxx])
plt5.set_xlim([minn,maxx])
plt5.set_title("Prediction test set 20%")
plt6.plot(x-1,x,c='red', linewidth=1,zorder=1)
plt6.plot(x+1,x,c='red', linewidth=1,zorder=1)
plt6.scatter(y_train2,ztest_train2,c='green',label='test',zorder=2)
plt6.set_title("Prediction training set 80%")
minn = np.min(np.concatenate([y_train2,ztest_train2],axis=0))
maxx = np.max(np.concatenate([y_train2,ztest_train2],axis=0))
plt6.set_ylim([minn,maxx])
plt6.set_xlim([minn,maxx])
(4.7300000000000004, 9.5199999999999996)
Now we will use the best predictor to try to understand the data. We do so by directly interpreting the random forest (these values can, of course, also be written to a text file for later interpretation; a sketch of that follows below the plot).
feat_imp = best_forest2.feature_importances_
print len(feat_imp)
fig,a = pyplot.subplots(1,1)
fig.set_size_inches(15,5)
createImportancePlot(a,list(x.columns),feat_imp,"Most important descriptors in the optimized tree")
491
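As mentioned above, the importances can also be written to a text file for later interpretation. A minimal sketch ('importances.txt' is an arbitrary file name):
ranked = sorted(zip(Desc_values, feat_imp), key=lambda t: t[1], reverse=True)
with open('importances.txt', 'w') as out:
    for name, imp in ranked:
        out.write('%s\t%0.6f\n' % (name, imp))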
print "Cross validated parameters"
Cross_val_MSE_80 = mean_squared_error(y_train2, ztest_train2)
print "Cross_val_MSE: ", Cross_val_MSE_80
Cross_val_Exp_var_80 = explained_variance_score(y_train2, ztest_train2)
print "Cross_val_Explained_variance: " , Cross_val_Exp_var_80
Cross_val_R2_80 = r2_score(y_train2, ztest_train2)
print "Cross_val_R2_score: " , Cross_val_R2_80
print ""
print 'Externally validated parameters'
Ext_val_MSE_80 = extvmetrics2 = mean_squared_error(y_test2 , ztest2)
print "Ext_val_MSE: ", Ext_val_MSE_80
Ext_val_Exp_var_80 =explained_variance_score(y_test2 , ztest2)
print "Ext_val_Explained_variance: " , Ext_val_Exp_var_80
Ext_val_R2_80 = r2_score(y_test2 , ztest2)
print "Ext_val_R2_score: " , Ext_val_R2_80
Cross validated parameters
Cross_val_MSE:  0.0181688251179
Cross_val_Explained_variance:  0.981619146069
Cross_val_R2_score:  0.981607064373

Externally validated parameters
Ext_val_MSE:  0.130414625752
Ext_val_Explained_variance:  0.858772996164
Ext_val_R2_score:  0.858765598653
Validation2 = pd.DataFrame.from_items([ ('Parameter', ['Mean Squared Error' , 'Explained variance' , 'R2 Score']), ('Cross Validation', [Cross_val_MSE_80, Cross_val_Exp_var_80, Cross_val_R2_80 ]), ('External Validation', [Ext_val_MSE_80, Ext_val_Exp_var_80, Ext_val_R2_80 ]), ('Difference', [Ext_val_MSE_80 - Cross_val_MSE_80, Ext_val_Exp_var_80 - Cross_val_Exp_var_80, Ext_val_R2_80 - Cross_val_R2_80])])
Validation2
  | Parameter | Cross Validation | External Validation | Difference
---|---|---|---|---
0 | Mean Squared Error | 0.018169 | 0.130415 | 0.112246 |
1 | Explained variance | 0.981619 | 0.858773 | -0.122846 |
2 | R2 Score | 0.981607 | 0.858766 | -0.122841 |
It can be beneficial for a better understanding of the data, and of the performance gained by the machine learning algorithm, to create learning curves. The concept is to create several splits of the data into training and test sets; multiple models are then trained, and model performance is plotted against training set size. As this can be computationally expensive, it is included here as the last section. Again, depending on the hardware, this can take some time.
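As an aside, scikit-learn also ships a helper that automates this procedure. A sketch using the older module path that matches the sklearn version in this notebook (the small forest size is just to keep the run time down):
from sklearn.learning_curve import learning_curve
sizes, tr_scores, te_scores = learning_curve(
    ensemble.RandomForestRegressor(n_estimators=50), x.values, y.values,
    train_sizes=[0.2, 0.5, 0.8, 1.0], cv=5)
print(sizes)
print(te_scores.mean(axis=1))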
# Divide the original dataset into 5-95, 20-80, 50-50 and 95-5 training-test splits
X_traina, X_testa, y_traina, y_testa = cross_validation.train_test_split(x, y, test_size=0.95, random_state=23)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(x, y, test_size=0.8, random_state=23)
X_train1, X_test1, y_train1, y_test1 = cross_validation.train_test_split(x, y, test_size=0.5, random_state=23)
X_trainb, X_testb, y_trainb, y_testb = cross_validation.train_test_split(x, y, test_size=0.05, random_state=23)
params = {'min_samples_split': [8], 'max_depth': [20], 'min_samples_leaf': [1],'n_estimators':[200]}
cv = KFold(n=len(X_traina),n_folds=10,shuffle=True)
cv_stratified = StratifiedKFold(y_traina)
gs = GridSearchCV(custom_forest, params, cv=cv_stratified,verbose=1,refit=True)
gs.fit(X_traina,y_traina)
ztesta = gs.predict(X_testa)
ztest_traina = gs.predict(X_traina)
best_foresta = gs.best_estimator_
#print intermediate results to keep an eye on progress
print '5% - 95% completed'
print 'Best score: %0.2f'%gs.best_score_
print 'Training set performance using best parameters (%s)'%gs.best_params_
best_treemodel = gs.best_estimator_
print 'Explained variance (Internal): %0.5f'%(best_foresta.score(X_traina,y_traina))
print 'MSE (Internal): %0.5f'%(mean_squared_error(y_traina,ztest_traina))
print 'Explained variance (External): %0.5f'%(best_foresta.score(X_testa,y_testa))
print 'MSE (External): %0.5f'%(mean_squared_error(y_testa,ztesta))
cv = KFold(n=len(X_train),n_folds=10,shuffle=True)
cv_stratified = StratifiedKFold(y_train)
gs = GridSearchCV(custom_forest, params, cv=cv_stratified,verbose=1,refit=True)
gs.fit(X_train,y_train)
ztest = gs.predict(X_test)
ztest_train = gs.predict(X_train)
best_forest = gs.best_estimator_
#print intermediate results to keep an eye on progress
print '20% - 80% completed'
print 'Best score: %0.2f'%gs.best_score_
print 'Training set performance using best parameters (%s)'%gs.best_params_
best_treemodel = gs.best_estimator_
print 'Explained variance (Internal): %0.5f'%(best_forest.score(X_train,y_train))
print 'MSE (Internal): %0.5f'%(mean_squared_error(y_train,ztest_train))
print 'Explained variance (External): %0.5f'%(best_forest.score(X_test,y_test))
print 'MSE (External): %0.5f'%(mean_squared_error(y_test,ztest))
cv = KFold(n=len(X_train1),n_folds=10,shuffle=True)
cv_stratified = StratifiedKFold(y_train1)
gs = GridSearchCV(custom_forest, params, cv=cv_stratified,verbose=1,refit=True)
gs.fit(X_train1,y_train1)
ztest1 = gs.predict(X_test1)
ztest_train1 = gs.predict(X_train1)
best_forest1 = gs.best_estimator_
#print intermediate results to keep an eye on progress
print '50% - 50% completed'
print 'Best score: %0.2f'%gs.best_score_
print 'Training set performance using best parameters (%s)'%gs.best_params_
print 'Explained variance (Internal): %0.5f'%(best_forest1.score(X_train1,y_train1))
print 'MSE (Internal): %0.5f'%(mean_squared_error(y_train1,ztest_train1))
print 'Explained variance (External): %0.5f'%(best_forest1.score(X_test1,y_test1))
print 'MSE (External): %0.5f'%(mean_squared_error(y_test1,ztest1))
cv = KFold(n=len(X_trainb),n_folds=10,shuffle=True)
cv_stratified = StratifiedKFold(y_trainb)
gs = GridSearchCV(custom_forest, params, cv=cv_stratified,verbose=1,refit=True)
gs.fit(X_trainb,y_trainb)
ztestb = gs.predict(X_testb)
ztest_trainb = gs.predict(X_trainb)
best_forestb = gs.best_estimator_
#print intermediate results to keep an eye on progress
print '95% - 5% completed'
print 'Best score: %0.2f'%gs.best_score_
print 'Training set performance using best parameters (%s)'%gs.best_params_
print 'Explained variance (Internal): %0.5f'%(best_forestb.score(X_trainb,y_trainb))
print 'MSE (Internal): %0.5f'%(mean_squared_error(y_trainb,ztest_trainb))
print 'Explained variance (External): %0.5f'%(best_forestb.score(X_testb,y_testb))
print 'MSE (External): %0.5f'%(mean_squared_error(y_testb,ztestb))
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[Parallel(n_jobs=1)]: Done 3 out of 3 | elapsed: 1.1s finished
5% - 95% completed
Best score: -0.02
Training set performance using best parameters ({'min_samples_split': 8, 'n_estimators': 200, 'max_depth': 20, 'min_samples_leaf': 1})
Explained variance (Internal): 0.88566
MSE (Internal): 0.11882
Explained variance (External): 0.45760
MSE (External): 0.52709
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[Parallel(n_jobs=1)]: Done 3 out of 3 | elapsed: 2.9s finished
20% - 80% completed
Best score: 0.66
Training set performance using best parameters ({'min_samples_split': 8, 'n_estimators': 200, 'max_depth': 20, 'min_samples_leaf': 1})
Explained variance (Internal): 0.94140
MSE (Internal): 0.06155
Explained variance (External): 0.68555
MSE (External): 0.30023
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[Parallel(n_jobs=1)]: Done 3 out of 3 | elapsed: 7.1s finished
50% - 50% completed
Best score: 0.76
Training set performance using best parameters ({'min_samples_split': 8, 'n_estimators': 200, 'max_depth': 20, 'min_samples_leaf': 1})
Explained variance (Internal): 0.95444
MSE (Internal): 0.04562
Explained variance (External): 0.79528
MSE (External): 0.19427
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[Parallel(n_jobs=1)]: Done 3 out of 3 | elapsed: 13.9s finished
95% - 5% completed
Best score: 0.81
Training set performance using best parameters ({'min_samples_split': 8, 'n_estimators': 200, 'max_depth': 20, 'min_samples_leaf': 1})
Explained variance (Internal): 0.96426
MSE (Internal): 0.03544
Explained variance (External): 0.75189
MSE (External): 0.16341
#create measured vs predicted plots
#plt.xkcd()
figure,(plt1,plt2) = pylab.subplots(1,2)
figure.set_size_inches(15,5)
plt1.plot(x-1,x,c='red',linewidth=1,zorder=1)
plt1.plot(x+1,x,c='red',linewidth=1,zorder=1)
plt1.scatter(y_test,ztest,c='green',label='test',zorder=2)
minn = np.min(np.concatenate([y_test,ztest],axis=0))
maxx = np.max(np.concatenate([y_test,ztest],axis=0))
plt1.set_ylim([minn,maxx])
plt1.set_xlim([minn,maxx])
plt1.set_title("Prediction test set 80%")
plt2.plot(x-1,x,c='red',linewidth=1,zorder=1)
plt2.plot(x+1,x,c='red',linewidth=1,zorder=1)
plt2.scatter(y_train,ztest_train,c='green',label='test',zorder=2)
plt2.set_title("Prediction training set 20%")
minn = np.min(np.concatenate([y_train,ztest_train],axis=0))
maxx = np.max(np.concatenate([y_train,ztest_train],axis=0))
plt2.set_ylim([minn,maxx])
plt2.set_xlim([minn,maxx])
figure,(plt3,plt4) = pylab.subplots(1,2)
figure.set_size_inches(15,5)
plt3.plot(x-1,x,c='red', linewidth=1,zorder=1)
plt3.plot(x+1,x,c='red', linewidth=1,zorder=1)
plt3.scatter(y_test1,ztest1,c='green',label='test',zorder=2)
minn = np.min(np.concatenate([y_test,ztest],axis=0))
maxx = np.max(np.concatenate([y_test,ztest],axis=0))
plt3.set_ylim([minn,maxx])
plt3.set_xlim([minn,maxx])
plt3.set_title("Prediction test set 50%")
plt4.plot(x-1,x,c='red', linewidth=1,zorder=1)
plt4.plot(x+1,x,c='red', linewidth=1,zorder=1)
plt4.scatter(y_train1,ztest_train1,c='green',label='test',zorder=2)
plt4.set_title("Prediction training set 50%")
minn = np.min(np.concatenate([y_train,ztest_train],axis=0))
maxx = np.max(np.concatenate([y_train,ztest_train],axis=0))
plt4.set_ylim([minn,maxx])
plt4.set_xlim([minn,maxx])
figure,(plt5,plt6) = pylab.subplots(1,2)
figure.set_size_inches(15,5)
plt5.plot(x-1,x,c='red', linewidth=1,zorder=1)
plt5.plot(x+1,x,c='red', linewidth=1,zorder=1)
plt5.scatter(y_test2,ztest2,c='green',label='test',zorder=2)
minn = np.min(np.concatenate([y_test,ztest],axis=0))
maxx = np.max(np.concatenate([y_test,ztest],axis=0))
plt5.set_ylim([minn,maxx])
plt5.set_xlim([minn,maxx])
plt5.set_title("Prediction test set 20%")
plt6.plot(x-1,x,c='red', linewidth=1,zorder=1)
plt6.plot(x+1,x,c='red', linewidth=1,zorder=1)
plt6.scatter(y_train2,ztest_train2,c='green',label='test',zorder=2)
plt6.set_title("Prediction training set 80%")
minn = np.min(np.concatenate([y_train,ztest_train],axis=0))
maxx = np.max(np.concatenate([y_train,ztest_train],axis=0))
plt6.set_ylim([minn,maxx])
plt6.set_xlim([minn,maxx])
figure,(plt7,plt8) = pylab.subplots(1,2)
figure.set_size_inches(15,5)
plt7.plot(x-1,x,c='red', linewidth=1,zorder=1)
plt7.plot(x+1,x,c='red', linewidth=1,zorder=1)
plt7.scatter(y_testb,ztestb,c='green',label='test',zorder=2)
minn = np.min(np.concatenate([y_test,ztest],axis=0))
maxx = np.max(np.concatenate([y_test,ztest],axis=0))
plt7.set_ylim([minn,maxx])
plt7.set_xlim([minn,maxx])
plt7.set_title("Prediction test set 5%")
plt8.plot(x-1,x,c='red', linewidth=1,zorder=1)
plt8.plot(x+1,x,c='red', linewidth=1,zorder=1)
plt8.scatter(y_trainb,ztest_trainb,c='green',label='test',zorder=2)
plt8.set_title("Prediction training set 95%")
minn = np.min(np.concatenate([y_train,ztest_train],axis=0))
maxx = np.max(np.concatenate([y_train,ztest_train],axis=0))
plt8.set_ylim([minn,maxx])
plt8.set_xlim([minn,maxx])
(4.7300000000000004, 9.5199999999999996)
print '5%-95%'
print "Cross validated parameters"
Cross_val_MSE_05 = mean_squared_error(y_traina, ztest_traina)
print "Cross_val_MSE: ", Cross_val_MSE_05
Cross_val_R2_05 = r2_score(y_traina, ztest_traina)
print "Cross_val_R2_score: " , Cross_val_R2_05
print ""
print 'Externally validated parameters'
Ext_val_MSE_05 = extvmetrics2 = mean_squared_error(y_testa , ztesta)
print "Ext_val_MSE: ", Ext_val_MSE_05
Ext_val_R2_05 = r2_score(y_testa , ztesta)
print "Ext_val_R2_score: " , Ext_val_R2_05
print ''
print ''
print '20%-80%'
Cross_val_MSE_20 = mean_squared_error(y_train, ztest_train)
print "Cross_val_MSE: ", Cross_val_MSE_20
Cross_val_Exp_var_20 = explained_variance_score(y_train, ztest_train)
print "Cross_val_Explained_variance: " , Cross_val_Exp_var_20
Cross_val_R2_20 = r2_score(y_train, ztest_train)
print "Cross_val_R2_score: " , Cross_val_R2_20
print ''
Ext_val_MSE_20 = mean_squared_error(y_test , ztest)
print "Ext_val_MSE: ", Ext_val_MSE_20
Ext_val_Exp_var_20 = explained_variance_score(y_test , ztest)
print "Ext_val_Explained_variance: " , Ext_val_Exp_var_20
Ext_val_R2_20 = r2_score(y_test , ztest)
print "Ext_val_R2_score: " , Ext_val_R2_20
print ''
print ''
print '50%-50%'
Cross_val_MSE_50 = mean_squared_error(y_train1, ztest_train1)
print "Cross_val_MSE: ", Cross_val_MSE_50
Cross_val_Exp_var_50 = explained_variance_score(y_train1, ztest_train1)
print "Cross_val_Explained_variance: " , Cross_val_Exp_var_50
Cross_val_R2_50 = r2_score(y_train1, ztest_train1)
print "Cross_val_R2_score: " , Cross_val_R2_50
print ''
Ext_val_MSE_50 = mean_squared_error(y_test1 , ztest1)
print "Ext_val_MSE: ", Ext_val_MSE_20
Ext_val_Exp_var_50 = explained_variance_score(y_test1 , ztest1)
print "Ext_val_Explained_variance: " ,Ext_val_Exp_var_50
Ext_val_R2_50 = r2_score(y_test1 , ztest1)
print "Ext_val_R2_score: " , Ext_val_R2_50
print ''
print ''
print '95%-5%'
print "Cross validated parameters"
Cross_val_MSE_95 = mean_squared_error(y_trainb, ztest_trainb)
print "Cross_val_MSE: ", Cross_val_MSE_95
Cross_val_R2_95 = r2_score(y_trainb, ztest_trainb)
print "Cross_val_R2_score: " , Cross_val_R2_95
print ""
print 'Externally validated parameters'
Ext_val_MSE_95 = extvmetrics2 = mean_squared_error(y_testb , ztestb)
print "Ext_val_MSE: ", Ext_val_MSE_95
Ext_val_R2_95 = r2_score(y_testb , ztestb)
print "Ext_val_R2_score: " , Ext_val_R2_95
5%-95%
Cross validated parameters
Cross_val_MSE:  0.118817397541
Cross_val_R2_score:  0.885664691266

Externally validated parameters
Ext_val_MSE:  0.527086143766
Ext_val_R2_score:  0.457598777352


20%-80%
Cross_val_MSE:  0.0615506846579
Cross_val_Explained_variance:  0.9413985508
Cross_val_R2_score:  0.941398548178

Ext_val_MSE:  0.300225679856
Ext_val_Explained_variance:  0.690357044108
Ext_val_R2_score:  0.685551290939


50%-50%
Cross_val_MSE:  0.045621674341
Cross_val_Explained_variance:  0.954451647962
Cross_val_R2_score:  0.954443212232

Ext_val_MSE:  0.300225679856
Ext_val_Explained_variance:  0.798075529688
Ext_val_R2_score:  0.795279712506


95%-5%
Cross validated parameters
Cross_val_MSE:  0.0354393897648
Cross_val_R2_score:  0.964263720486

Externally validated parameters
Ext_val_MSE:  0.16341375919
Ext_val_R2_score:  0.751889460253
#generation of the arrays
learning_x = np.array([20, 50, 80, 95])
plot_ext_rmse = np.array([Ext_val_MSE_20 , Ext_val_MSE_50 , Ext_val_MSE_80 , Ext_val_MSE_95])
plot_cross_rmse = np.array([Cross_val_MSE_20 , Cross_val_MSE_50 , Cross_val_MSE_80 , Cross_val_MSE_95])
plot_ext_r2 = np.array([Ext_val_R2_20 , Ext_val_R2_50 , Ext_val_R2_80 , Ext_val_R2_95])
plot_cross_r2 = np.array([Cross_val_R2_20 , Cross_val_R2_50 , Cross_val_R2_80 , Cross_val_R2_95])
#the actual plot
figure,(plt1,plt2) = pylab.subplots(1,2)
figure.set_size_inches(15,5)
plt1.set_ylim([0, 1])
plt1.set_xlim([0, 100])
plt1.set_title('Mean Squared Error')
plt1.plot(learning_x,plot_ext_rmse,c='black',linewidth=1,zorder=1)
plt1.plot(learning_x,plot_cross_rmse,c='red',linewidth=1,zorder=1)
plt1.set_xlabel('Training Set Size (%)')
plt1.set_ylabel('MSE')
plt2.set_ylim([0, 1])
plt2.set_xlim([0, 100])
plt2.set_title('R2 Score')
plt2.plot(learning_x,plot_ext_r2,c='black',linewidth=1,zorder=1, label='validation')
plt2.plot(learning_x,plot_cross_r2,c='red',linewidth=1,zorder=1, label='training')
plt2.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt2.set_xlabel('Training Set Size (%)')
plt2.set_ylabel('R2 Score')
plt.show()
Interestingly, the performance appears to go down for the 95% training - 5% validation split. This is, however, caused by the much smaller test sample: the influence of outliers on the scores is relatively large. The effect would be less pronounced with a data set larger than the ~1500 data points used here.