def createImportancePlot(splt, desc, importances, caption):
    # bar plot of the ten descriptors with the largest absolute weight
    import numpy as np
    labels = []
    weights = []
    # the 11th-largest |weight|; everything strictly above it (the top ten) is plotted
    threshold = np.sort([abs(w) for w in importances])[-11]
    for d in zip(desc, importances):
        if abs(d[1]) > threshold:
            labels.append(d[0])
            weights.append(d[1])
    xlocations = np.array(range(len(labels))) + 0.5
    width = 0.8
    splt.bar(xlocations, weights, width=width)
    splt.set_xticks([r + 1 for r in range(len(labels))])
    splt.set_xticklabels(labels)
    splt.set_xlim(0, xlocations[-1] + width * 2)
    splt.set_title(caption)
    splt.get_xaxis().tick_bottom()
    splt.get_yaxis().tick_left()

from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from matplotlib import pyplot

# training data: http://cheminformatics.org/datasets/huuskonen/train.smi
with open('data/train.smi', 'r') as train_file:
    train_file.readline()  # skip header
    raw = [line.split() for line in train_file.readlines()]
temp = []
for d in raw:
    try:
        t = float(d[3])  # keep only rows with a numeric logS value
        temp.append(d)
    except ValueError:
        print d
        continue
molecules = [Chem.MolFromSmiles(d[-1]) for d in temp]
observations = [float(d[3]) for d in temp]
# MolFromSmiles returns None for unparseable SMILES; drop those molecules
data = [(x, y) for x, y in zip(molecules, observations) if x is not None]

# loading and processing external test data set
# http://cheminformatics.org/datasets/huuskonen/test1.smi
with open('data/test1.smi', 'r') as test_file:
    test_file.readline()  # skip header
    test_raw = [line.split() for line in test_file.readlines()]
temp = []
for d in test_raw:
    try:
        t = float(d[3])
        temp.append(d)
    except ValueError:
        print d
        continue
test_molecules = [Chem.MolFromSmiles(d[-1]) for d in temp]
test_observations = [float(d[3]) for d in temp]
test_data = [(x, y) for x, y in zip(test_molecules, test_observations) if x is not None]

# compute the RDKit descriptors for the test molecules
# (Descriptors._descList is used here and below for consistency)
test_patterns = []
test_ys = []
for sample in test_data:
    pattern = []
    use = True
    for D in Descriptors._descList:
        if D[0] == 'MolecularFormula':
            continue
        try:
            d = float(D[1](sample[0]))
            pattern.append(d)
        except Exception:
            use = False
            print 'Error computing descriptor %s for %s' % (D[0], sample[0])
            break
    if use:
        test_patterns.append(pattern)
        test_ys.append(sample[1])

# compute the same descriptors for the training molecules, this time
# via a MolecularDescriptorCalculator
patterns = []
ys = []
descriptors = []
for D in Descriptors._descList:
    if D[0] == 'MolecularFormula':
        continue
    descriptors.append(D[0])
calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptors)
for sample in data:
    use = True
    try:
        pattern = calculator.CalcDescriptors(sample[0])
    except Exception:
        use = False
        print 'Error computing descriptors for %s' % sample[0]
    if use:
        patterns.append(pattern)
        ys.append(sample[1])

figure, (plt1, plt2, plt3) = pyplot.subplots(1, 3)
figure.set_size_inches(15, 15)

# simple linear regression
from sklearn import linear_model
simple_linearreg = linear_model.LinearRegression()
simple_linearreg.fit(patterns, ys)
simple_prediction = simple_linearreg.predict(patterns)
plt1.scatter(ys, simple_prediction)
plt1.set_aspect('equal')
plt1.set_title('Simple linear regression')
plt1.set_xlabel('Measured logS')
plt1.set_ylabel('Predicted logS')
plt1.set_xlim(-12, 2)
plt1.set_ylim(-12, 2)

# simple decision tree regression
# (compute_importances is needed in this scikit-learn vintage; newer
# versions always expose feature_importances_)
from sklearn import tree
simple_tree = tree.DecisionTreeRegressor(compute_importances=True)
simple_tree.fit(patterns, ys)
simple_treeprediction = simple_tree.predict(patterns)
plt2.scatter(ys, simple_treeprediction)
plt2.set_aspect('equal')
plt2.set_title('Standard Decision Tree')
plt2.set_xlabel('Measured logS')
plt2.set_ylabel('Predicted logS')
plt2.set_xlim(-12, 2)
plt2.set_ylim(-12, 2)

# custom decision tree regression, regularized by depth and split size
custom_tree = tree.DecisionTreeRegressor(max_depth=10, min_samples_split=50,
                                         compute_importances=True)
custom_tree.fit(patterns, ys)
custom_treeprediction = custom_tree.predict(patterns)
plt3.scatter(ys, custom_treeprediction)
plt3.set_aspect('equal')
plt3.set_title('Custom Decision Tree (regularized)')
plt3.set_xlabel('Measured logS')
plt3.set_ylabel('Predicted logS')
plt3.set_xlim(-12, 2)
plt3.set_ylim(-12, 2)
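# The measured-vs-predicted scatter above takes the same eight calls per model,
# and the pattern repeats for every model below. A hypothetical helper (not
# part of the original notebook) would collapse that repetition; the remaining
# cells keep the explicit calls to stay close to the original:
def plotPredictions(splt, measured, predicted, title):
    # scatter measured vs. predicted logS on fixed, square axes
    splt.scatter(measured, predicted)
    splt.set_aspect('equal')
    splt.set_title(title)
    splt.set_xlabel('Measured logS')
    splt.set_ylabel('Predicted logS')
    splt.set_xlim(-12, 2)
    splt.set_ylim(-12, 2)
# usage: plotPredictions(plt1, ys, simple_prediction, 'Simple linear regression')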
# training set performance of all three models
from sklearn.metrics import mean_squared_error
print 'Explained variance (Simple): %0.5f' % simple_linearreg.score(patterns, ys)
print 'MSE (Simple): %0.5f' % mean_squared_error(ys, simple_prediction)
print 'Explained variance (Tree): %0.5f' % simple_tree.score(patterns, ys)
print 'MSE (Tree): %0.5f' % mean_squared_error(ys, simple_treeprediction)
print 'Explained variance (Custom): %0.5f' % custom_tree.score(patterns, ys)
print 'MSE (Custom): %0.5f' % mean_squared_error(ys, custom_treeprediction)

figure, (plt1, plt2, plt3) = pyplot.subplots(3, 1)
figure.set_size_inches(15, 5)
figure.tight_layout()
imp = simple_linearreg.coef_
createImportancePlot(plt1, descriptors, imp,
                     "Most important descriptors in linear model")
imp2 = simple_tree.feature_importances_  # Gini importances
createImportancePlot(plt2, descriptors, imp2,
                     "Most important descriptors in simple tree model")
imp3 = custom_tree.feature_importances_  # Gini importances
createImportancePlot(plt3, descriptors, imp3,
                     "Most important descriptors in custom tree model")

# inspect the logP contribution
from scipy.stats import pearsonr
ind = descriptors.index('MolLogP')
logP = [d[ind] for d in patterns]
fig, ax = pyplot.subplots(1, 1)
ax.scatter(logP, ys)
ax.set_title('rdkit:MolLogP vs logS')
ax.set_xlabel('rdkit:MolLogP')
ax.set_ylabel('Measured logS')
print 'Pearson Correlation (MolLogP): %0.5f' % pearsonr(logP, ys)[0]

# evaluate all three models on the external test set
figure, (plt1, plt2, plt3) = pyplot.subplots(1, 3)
figure.set_size_inches(15, 15)

# simple linear regression
simple_ext_prediction = simple_linearreg.predict(test_patterns)
plt1.scatter(test_ys, simple_ext_prediction)
plt1.set_aspect('equal')
plt1.set_title('Simple linear regression')
plt1.set_xlabel('Measured logS')
plt1.set_ylabel('Predicted logS')
plt1.set_xlim(-12, 2)
plt1.set_ylim(-12, 2)

# simple decision tree regression
simple_tree_ext_prediction = simple_tree.predict(test_patterns)
plt2.scatter(test_ys, simple_tree_ext_prediction)
plt2.set_aspect('equal')
plt2.set_title('Standard Decision Tree')
plt2.set_xlabel('Measured logS')
plt2.set_ylabel('Predicted logS')
plt2.set_xlim(-12, 2)
plt2.set_ylim(-12, 2)

# custom decision tree regression
custom_tree_ext_prediction = custom_tree.predict(test_patterns)
plt3.scatter(test_ys, custom_tree_ext_prediction)
plt3.set_aspect('equal')
plt3.set_title('Custom Decision Tree (regularized)')
plt3.set_xlabel('Measured logS')
plt3.set_ylabel('Predicted logS')
plt3.set_xlim(-12, 2)
plt3.set_ylim(-12, 2)

print 'Explained variance (Simple): %0.5f' % simple_linearreg.score(test_patterns, test_ys)
print 'MSE (Simple): %0.5f' % mean_squared_error(test_ys, simple_ext_prediction)
print 'Explained variance (Tree): %0.5f' % simple_tree.score(test_patterns, test_ys)
print 'MSE (Tree): %0.5f' % mean_squared_error(test_ys, simple_tree_ext_prediction)
print 'Explained variance (Custom): %0.5f' % custom_tree.score(test_patterns, test_ys)
print 'MSE (Custom): %0.5f' % mean_squared_error(test_ys, custom_tree_ext_prediction)
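# Before tuning the tree, a quick generalisation estimate that leaves the
# external test set untouched: plain 10-fold cross-validation. A minimal
# sketch using cross_val_score, which lives in the same sklearn.cross_validation
# module this notebook already relies on (sklearn.model_selection in current
# releases) and returns one R^2 value per fold for a regressor:
from sklearn.cross_validation import cross_val_score
cv_scores = cross_val_score(tree.DecisionTreeRegressor(), patterns, ys, cv=10)
print 'Mean 10-fold CV R^2 (unconstrained tree): %0.5f' % cv_scores.mean()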
# hyperparameter search: 10-fold cross-validation over a small grid
# (sklearn.cross_validation and sklearn.grid_search are the module names of
# this scikit-learn vintage)
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
params = {'max_depth': [2, 5, 10, 20],
          'min_samples_split': [2, 8, 32, 128],
          'min_samples_leaf': [1, 2, 5, 10],
          'compute_importances': [True]}
cv = KFold(n=len(patterns), k=10, indices=False, shuffle=True)
gs = GridSearchCV(simple_tree, params, cv=cv, verbose=1, refit=True)
gs.fit(patterns, ys)
print 'Best score: %0.2f' % gs.best_score_
print 'Training set performance using best parameters (%s)' % gs.best_params_
best_treemodel = gs.best_estimator_

figure, (plt1, plt2) = pyplot.subplots(1, 2)
figure.set_size_inches(15, 15)

# training set evaluation
best_tree_int_prediction = best_treemodel.predict(patterns)
plt1.scatter(ys, best_tree_int_prediction)
plt1.set_aspect('equal')
plt1.set_title('Optimized tree on training set')
plt1.set_xlabel('Measured logS')
plt1.set_ylabel('Predicted logS')
plt1.set_xlim(-12, 2)
plt1.set_ylim(-12, 2)

# test set evaluation
best_tree_ext_prediction = best_treemodel.predict(test_patterns)
plt2.scatter(test_ys, best_tree_ext_prediction)
plt2.set_aspect('equal')
plt2.set_title('Optimized tree on test set')
plt2.set_xlabel('Measured logS')
plt2.set_ylabel('Predicted logS')
plt2.set_xlim(-12, 2)
plt2.set_ylim(-12, 2)

print 'Explained variance (Internal): %0.5f' % best_treemodel.score(patterns, ys)
print 'MSE (Internal): %0.5f' % mean_squared_error(ys, best_tree_int_prediction)
print 'Explained variance (External): %0.5f' % best_treemodel.score(test_patterns, test_ys)
print 'MSE (External): %0.5f' % mean_squared_error(test_ys, best_tree_ext_prediction)

fig, a = pyplot.subplots(1, 1)
fig.set_size_inches(15, 5)
createImportancePlot(a, descriptors, best_treemodel.feature_importances_,
                     "Most important descriptors in the optimized tree")

# repeat the search, but let mean squared error (instead of the default
# R^2 score) drive the parameter selection
from sklearn.metrics import mean_squared_error
params = {'max_depth': [2, 5, 10, 20],
          'min_samples_split': [2, 8, 32, 128],
          'min_samples_leaf': [1, 2, 5, 10],
          'compute_importances': [True]}
cv = KFold(n=len(patterns), k=10, indices=False, shuffle=True)
gs = GridSearchCV(simple_tree, params, cv=cv, verbose=1, refit=True,
                  loss_func=mean_squared_error)
gs.fit(patterns, ys)
print 'Best score: %0.2f' % gs.best_score_
print 'Training set performance using best parameters (%s)' % gs.best_params_
best_treemodel = gs.best_estimator_

figure, (plt1, plt2) = pyplot.subplots(1, 2)
figure.set_size_inches(15, 15)

# training set evaluation
best_tree_int_prediction = best_treemodel.predict(patterns)
plt1.scatter(ys, best_tree_int_prediction)
plt1.set_aspect('equal')
plt1.set_title('Optimized tree on training set')
plt1.set_xlabel('Measured logS')
plt1.set_ylabel('Predicted logS')
plt1.set_xlim(-12, 2)
plt1.set_ylim(-12, 2)

# test set evaluation
best_tree_ext_prediction = best_treemodel.predict(test_patterns)
plt2.scatter(test_ys, best_tree_ext_prediction)
plt2.set_aspect('equal')
plt2.set_title('Optimized tree on test set')
plt2.set_xlabel('Measured logS')
plt2.set_ylabel('Predicted logS')
plt2.set_xlim(-12, 2)
plt2.set_ylim(-12, 2)

print 'Explained variance (Internal): %0.5f' % best_treemodel.score(patterns, ys)
print 'MSE (Internal): %0.5f' % mean_squared_error(ys, best_tree_int_prediction)
print 'Explained variance (External): %0.5f' % best_treemodel.score(test_patterns, test_ys)
print 'MSE (External): %0.5f' % mean_squared_error(test_ys, best_tree_ext_prediction)

fig, a = pyplot.subplots(1, 1)
fig.set_size_inches(15, 5)
createImportancePlot(a, descriptors, best_treemodel.feature_importances_,
                     "Most important descriptors in the optimized tree")
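# Note on API drift: sklearn.cross_validation, sklearn.grid_search, the
# compute_importances parameter and the loss_func argument are all gone in
# current scikit-learn. A rough sketch of the same MSE-driven search under
# the modern (>= 0.18) module layout, kept commented out so the listing still
# runs against the old library -- shown only to map old names onto new ones:
#
#   from sklearn.model_selection import KFold, GridSearchCV
#   params = {'max_depth': [2, 5, 10, 20],
#             'min_samples_split': [2, 8, 32, 128],
#             'min_samples_leaf': [1, 2, 5, 10]}  # importances are always available now
#   cv = KFold(n_splits=10, shuffle=True)
#   gs = GridSearchCV(tree.DecisionTreeRegressor(), params, cv=cv,
#                     scoring='neg_mean_squared_error', refit=True, verbose=1)
#   gs.fit(patterns, ys)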
# normalize descriptors (zero mean, unit variance per descriptor)
from sklearn.preprocessing import Scaler  # renamed StandardScaler in later releases
scaler = Scaler().fit(patterns)
std_patterns = scaler.transform(patterns)
std_test_patterns = scaler.transform(test_patterns)

# compute PCA; keep the two most descriptive principal components
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(std_patterns)
print 'rel. explained variance per pc', pca.explained_variance_ratio_
pc_patterns = pca.transform(std_patterns)
pc_test = pca.transform(std_test_patterns)

fig, (plt1, plt2) = pyplot.subplots(2, 1)
fig.set_size_inches(15, 5)
createImportancePlot(plt1, descriptors, pca.components_[0],
                     "Most important descriptors in 1st principal component (Hypothesis: Size)")
createImportancePlot(plt2, descriptors, pca.components_[1],
                     "Most important descriptors in 2nd principal component (Hypothesis: Electrostatic interactions)")

# evaluate modelling performance on the two principal components
figure, (plt1, plt2, plt3) = pyplot.subplots(1, 3)
figure.set_size_inches(15, 5)

# training set
pc_linearreg = linear_model.LinearRegression()
pc_linearmodel = pc_linearreg.fit(pc_patterns, ys)
pc_prediction = pc_linearmodel.predict(pc_patterns)
plt1.scatter(ys, pc_prediction)
plt1.set_aspect('equal')
plt1.set_title('Full model regression')
plt1.set_xlabel('Measured logS')
plt1.set_ylabel('Predicted logS')
plt1.set_xlim(-12, 2)
plt1.set_ylim(-12, 2)
plt2.scatter(ys, [pc[0] for pc in pc_patterns])
plt2.set_title('1st PC')
plt2.set_xlabel('Measured logS')
plt2.set_ylabel('1st PC')
plt3.scatter(ys, [pc[1] for pc in pc_patterns])
plt3.set_title('2nd PC')
plt3.set_xlabel('Measured logS')
plt3.set_ylabel('2nd PC')

# test set
figure, (plt1, plt2, plt3) = pyplot.subplots(1, 3)
figure.set_size_inches(15, 5)
pc_prediction = pc_linearmodel.predict(pc_test)
plt1.scatter(test_ys, pc_prediction)
plt1.set_aspect('equal')
plt1.set_title('Full model regression')
plt1.set_xlabel('Measured logS')
plt1.set_ylabel('Predicted logS')
plt1.set_xlim(-12, 2)
plt1.set_ylim(-12, 2)
plt2.scatter(test_ys, [pc[0] for pc in pc_test])
plt2.set_title('1st PC')
plt2.set_xlabel('Measured logS')
plt2.set_ylabel('1st PC')
plt3.scatter(test_ys, [pc[1] for pc in pc_test])
plt3.set_title('2nd PC')
plt3.set_xlabel('Measured logS')
plt3.set_ylabel('2nd PC')
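# To put numbers on what the two-component model gives up, the explained
# variance / MSE printouts from earlier can be repeated for it. A sketch that
# also chains scaling, projection and regression into a single estimator via
# sklearn.pipeline (reusing this vintage's Scaler class; not part of the
# original notebook):
from sklearn.pipeline import Pipeline
pc_pipeline = Pipeline([('scale', Scaler()),
                        ('pca', PCA(n_components=2)),
                        ('regress', linear_model.LinearRegression())])
pc_pipeline.fit(patterns, ys)
print 'Explained variance (PC model, internal): %0.5f' % pc_pipeline.score(patterns, ys)
print 'Explained variance (PC model, external): %0.5f' % pc_pipeline.score(test_patterns, test_ys)
print 'MSE (PC model, external): %0.5f' % mean_squared_error(test_ys, pc_pipeline.predict(test_patterns))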