%matplotlib inline
import matplotlib.pyplot as plt

# some nice default configuration for plots
plt.rcParams['figure.figsize'] = 10, 7.5
plt.rcParams['axes.grid'] = True
plt.gray()

import numpy as np
import pandas as pd

# training (transformed)
trainX = pd.read_csv("../datasets/solubility/solTrainXtrans.csv").drop("Unnamed: 0", axis=1)
trainY = pd.read_csv("../datasets/solubility/solTrainY.csv").drop("Unnamed: 0", axis=1)

# test (transformed)
testX = pd.read_csv("../datasets/solubility/solTestXtrans.csv").drop("Unnamed: 0", axis=1)
testY = pd.read_csv("../datasets/solubility/solTestY.csv").drop("Unnamed: 0", axis=1)

# calculate the SSE for the continuum of splits
def find_splits(values):
    '''Find all split points for a set of values.'''
    uni_values = np.unique(values)
    split_points = []
    for i in range(len(uni_values) - 1):
        # place each split point midway between two adjacent unique values
        split_points.append((uni_values[i] + uni_values[i+1]) / 2.0)
    return split_points

def sse_split(X, y, split_point):
    '''Calculate SSE for a given split point.'''
    y = np.asarray(y).ravel()  # accept either a pd.DataFrame or an array
    s1 = np.where(X <= split_point)[0]
    s2 = np.where(X > split_point)[0]
    # SSE within each partition, measured around the partition mean
    return np.sum((y[s1] - np.mean(y[s1]))**2) + np.sum((y[s2] - np.mean(y[s2]))**2)

# calculate SSE for the continuum of splits on 'NumCarbon'
split_points = find_splits(trainX['NumCarbon'])
sse = [sse_split(trainX['NumCarbon'], trainY, p) for p in split_points]

# plot the SSE for the continuum of splits
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
# number of carbon atoms vs. solubility
ax1.scatter(trainX['NumCarbon'], trainY, alpha=0.5)
ax1.set_ylabel('Solubility')
# SSE for each candidate split
ax2.plot(split_points, sse, 'o-')
ax2.set_xlabel('Number of Carbon Atoms (transformed)')
ax2.set_ylabel('SSE')

# compare solubility on the two sides of the split at NumCarbon = 3.7768
s1 = np.where(trainX['NumCarbon'] <= 3.7768)[0]
s2 = np.where(trainX['NumCarbon'] > 3.7768)[0]
y = [trainY.values[s1, 0], trainY.values[s2, 0]]
plt.boxplot(y)
plt.ylabel('Solubility')
plt.xticks([1, 2], ['<= 3.7768', '> 3.7768'])

# tune a regression tree over 'min_samples_split' with resampled CV
from sklearn.tree import DecisionTreeRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import ShuffleSplit

cv = ShuffleSplit(trainX.shape[0], n_iter=10, random_state=3)  # 10 random train/test splits
treg = DecisionTreeRegressor()
gs_param = {
    'min_samples_split': range(2, 100),
}
gs_treg = GridSearchCV(treg, gs_param, cv=cv, scoring="mean_squared_error", n_jobs=-1)
gs_treg.fit(trainX.values, trainY.values[:, 0])

# calculate RMSE for all candidates
def gs_rmse(scores):
    '''Calculate RMSE for a GridSearchCV grid_scores_ object.'''
    rmse = []
    for v in scores:
        # each entry is (parameters, mean validation score, CV scores);
        # the 'mean_squared_error' scorer returns negative MSE, hence the sign flip
        rmse.append([v[0]['min_samples_split'], np.sqrt(-v[1]), np.std(np.sqrt(-v[2]))])
    return np.array(rmse)

gs_treg_rmse = gs_rmse(gs_treg.grid_scores_)
plt.plot(gs_treg_rmse[:, 0], gs_treg_rmse[:, 1], c='b')
plt.xlabel("'min_samples_split'")
plt.ylabel('RMSE')

# use the one-standard-error rule: take the largest 'min_samples_split'
# (i.e. the simplest tree) whose RMSE is within one SE of the minimum
thres_rmse = np.min(gs_treg_rmse[:, 1]) + gs_treg_rmse[np.argmin(gs_treg_rmse[:, 1]), 2]
for idx, i in reversed(list(enumerate(gs_treg_rmse))):
    if i[1] < thres_rmse:
        best_param = i[0]
        break
print "The best parameter: 'min_samples_split' = {0}.".format(best_param)

# importance values for the 16 most important predictors,
# using the 'min_samples_split' selected above
treg_best = DecisionTreeRegressor(min_samples_split=53)
treg_best.fit(trainX.values, trainY.values[:, 0])

def viz_importance(scores, num_predictors=16, predictor_names=trainX.columns):
    '''Visualize the relative importance of predictors.'''
    idx = np.argsort(scores)[-num_predictors:]
    names = predictor_names[idx]
    importances = scores[idx]
    y_pos = np.arange(num_predictors)
    plt.barh(y_pos, importances, align='center', alpha=0.4)
    plt.yticks(y_pos, names)
    plt.xlabel('Importance')

viz_importance(treg_best.feature_importances_)
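# A quick check not performed above: the held-out test set loaded at the top
# (testX/testY) can gauge how well the tuned tree generalizes. A minimal
# sketch, assuming the transformed test columns match the training columns:
tree_test_pred = treg_best.predict(testX.values)
print "Single-tree test RMSE: {0:.3f}".format(
    np.sqrt(np.mean((testY.values[:, 0] - tree_test_pred)**2)))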
# use two datasets to illustrate the performance of bagging
solu_trainX = pd.read_csv('../datasets/solubility/solTrainXtrans.csv').values[:, 1:]
solu_trainY = pd.read_csv('../datasets/solubility/solTrainY.csv').values[:, 1]
solu_testX = pd.read_csv('../datasets/solubility/solTestXtrans.csv').values[:, 1:]
solu_testY = pd.read_csv('../datasets/solubility/solTestY.csv').values[:, 1]

concre = pd.read_csv('../datasets/concrete/concrete.csv').drop('Unnamed: 0', 1)
concreX = concre.drop('CompressiveStrength', 1)
concreY = concre['CompressiveStrength']

from sklearn.cross_validation import train_test_split
concre_trainX, concre_testX, concre_trainY, concre_testY = \
    train_test_split(concreX.values, concreY.values, test_size=0.2)

# some basic setup
def bootstrap_sample(X, Y):
    '''Generate a bootstrap sample of the original data.'''
    bsample = np.random.choice(range(len(Y)), len(Y), replace=True)
    return X[bsample], Y[bsample]

def test_pred(trainX, trainY, testX, model):
    '''Fit the given model and generate a test set prediction.'''
    model.fit(trainX, trainY)
    return model.predict(testX)

def test_rmse(Y, pred_Y):
    '''Calculate RMSE based on the test set prediction.'''
    return np.sqrt(np.mean((Y - pred_Y)**2))

# bagging: tree vs. linear regression vs. MARS
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from pyearth import Earth

np.random.seed(3)
tree_solu_rmse, tree_concre_rmse = [], []
lm_solu_rmse, lm_concre_rmse = [], []
mars_solu_rmse, mars_concre_rmse = [], []
for i in range(50):  # 50 bagging iterations, tracking RMSE as models accumulate
    # generate a bootstrap sample
    bs_solu_trainX, bs_solu_trainY = bootstrap_sample(solu_trainX, solu_trainY)
    bs_concre_trainX, bs_concre_trainY = bootstrap_sample(concre_trainX, concre_trainY)
    # train one model of each type on the bootstrap sample
    tree_solu = test_pred(bs_solu_trainX, bs_solu_trainY, solu_testX, DecisionTreeRegressor())
    lm_solu = test_pred(bs_solu_trainX, bs_solu_trainY, solu_testX, LinearRegression())
    mars_solu = test_pred(bs_solu_trainX, bs_solu_trainY, solu_testX, Earth())
    tree_concre = test_pred(bs_concre_trainX, bs_concre_trainY, concre_testX, DecisionTreeRegressor())
    lm_concre = test_pred(bs_concre_trainX, bs_concre_trainY, concre_testX, LinearRegression())
    mars_concre = test_pred(bs_concre_trainX, bs_concre_trainY, concre_testX, Earth())
    # aggregate predictions across bagging iterations
    if i != 0:
        tree_solu_agg = np.vstack([tree_solu_agg, tree_solu])
        lm_solu_agg = np.vstack([lm_solu_agg, lm_solu])
        mars_solu_agg = np.vstack([mars_solu_agg, mars_solu])
        tree_concre_agg = np.vstack([tree_concre_agg, tree_concre])
        lm_concre_agg = np.vstack([lm_concre_agg, lm_concre])
        mars_concre_agg = np.vstack([mars_concre_agg, mars_concre])
    else:  # first iteration: nothing to stack yet
        tree_solu_agg, lm_solu_agg, mars_solu_agg = tree_solu, lm_solu, mars_solu
        tree_concre_agg, lm_concre_agg, mars_concre_agg = tree_concre, lm_concre, mars_concre
    # RMSE of the bagged (averaged) prediction after i+1 iterations
    tree_solu_rmse.append(test_rmse(solu_testY, np.mean(tree_solu_agg, 0)))
    lm_solu_rmse.append(test_rmse(solu_testY, np.mean(lm_solu_agg, 0)))
    mars_solu_rmse.append(test_rmse(solu_testY, np.mean(mars_solu_agg, 0)))
    tree_concre_rmse.append(test_rmse(concre_testY, np.mean(tree_concre_agg, 0)))
    lm_concre_rmse.append(test_rmse(concre_testY, np.mean(lm_concre_agg, 0)))
    mars_concre_rmse.append(test_rmse(concre_testY, np.mean(mars_concre_agg, 0)))

fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)
iters = range(1, 51)
# solubility data
l1, l2, l3 = ax1.plot(iters[1:], tree_solu_rmse[1:], '-o',
                      iters[1:], lm_solu_rmse[1:], '-x',
                      iters[1:], mars_solu_rmse[1:], '-^')
ax1.set_title('Solubility')
# concrete data
ax2.plot(iters[1:], tree_concre_rmse[1:], '-o',
         iters[1:], lm_concre_rmse[1:], '-x',
         iters[1:], mars_concre_rmse[1:], '-^')
ax2.set_title('Concrete')
fig.legend((l1, l2, l3), ('Tree', 'Linear Regression', 'MARS'),
           loc='upper center', ncol=3, frameon=False)
fig.text(0.5, 0.06, 'Number of Bagging Iterations', ha='center', va='center')
fig.text(0.06, 0.5, 'RMSE', va='center', rotation='vertical')
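# scikit-learn also ships this procedure as a ready-made ensemble. A minimal
# sketch of the same tree-bagging idea with sklearn.ensemble.BaggingRegressor
# (n_estimators=50 mirrors the hand-rolled loop above; the exact RMSE will
# differ because the bootstrap draws differ):
from sklearn.ensemble import BaggingRegressor
bag = BaggingRegressor(DecisionTreeRegressor(), n_estimators=50, random_state=3)
bag.fit(solu_trainX, solu_trainY)
print "BaggingRegressor test RMSE: {0:.3f}".format(
    test_rmse(solu_testY, bag.predict(solu_testX)))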
# simulate 20 noisy sin waves
np.random.seed(3)
X = np.zeros([20, 100])
Y = np.zeros([20, 100])
for i in range(20):
    X[i, :] = np.sort(np.random.uniform(2, 10, 100))
    Y[i, :] = np.sin(X[i, :]) + np.random.normal(0, 0.2, 100)

def model_pred(x, y, model):
    '''Fit a model and predict on the training points.'''
    model.fit(x, y)
    return model.predict(x)

def bagg_model_pred(x, y, model, niter=50):
    '''Fit a bagged model with `niter` bootstrap iterations and predict on x.'''
    pred_agg = []
    for i in range(niter):
        bs_x, bs_y = bootstrap_sample(x, y)
        model.fit(bs_x, bs_y)
        # predict on the original points so the predictions can be averaged
        pred_agg.append(model.predict(x))
    return np.mean(pred_agg, 0)

# build single and bagged models for each sin wave and make predictions
tree_pred, mars_pred = [], []
bagg_tree_pred, bagg_mars_pred = [], []
for i in range(20):
    x = X[i, :][:, np.newaxis]
    y = Y[i, :]
    tree_pred.append(model_pred(x, y, DecisionTreeRegressor(max_depth=3)))
    mars_pred.append(model_pred(x, y, Earth()))
    bagg_tree_pred.append(bagg_model_pred(x, y, DecisionTreeRegressor(max_depth=3), 50))
    bagg_mars_pred.append(bagg_model_pred(x, y, Earth(), 50))

fig, axarr = plt.subplots(2, 2)
x_test = np.arange(2.0, 10.0, 0.01)
# tree
axarr[0, 0].plot(x_test, np.sin(x_test), 'r')
for i in range(20):
    axarr[0, 0].plot(X[i, :], tree_pred[i], 'black', alpha=0.3)
axarr[0, 0].set_title('Tree')
axarr[0, 0].set_ylim([-1.5, 1.5])
# bagged tree
axarr[1, 0].plot(x_test, np.sin(x_test), 'r')
for i in range(20):
    axarr[1, 0].plot(X[i, :], bagg_tree_pred[i], 'black', alpha=0.3)
axarr[1, 0].set_title('Bagged Tree')
axarr[1, 0].set_ylim([-1.5, 1.5])
# MARS
axarr[0, 1].plot(x_test, np.sin(x_test), 'r')
for i in range(20):
    axarr[0, 1].plot(X[i, :], mars_pred[i], 'black', alpha=0.25)
axarr[0, 1].set_title('MARS')
axarr[0, 1].set_ylim([-1.5, 1.5])
# bagged MARS
axarr[1, 1].plot(x_test, np.sin(x_test), 'r')
for i in range(20):
    axarr[1, 1].plot(X[i, :], bagg_mars_pred[i], 'black', alpha=0.25)
axarr[1, 1].set_title('Bagged MARS')
axarr[1, 1].set_ylim([-1.5, 1.5])

# display cross-validated predictive performance (RMSE) for a varying
# number of bootstrapped samples
def bagg_rmse(trainX, trainY, testX, testY, num_bagged, model):
    '''Calculate test set RMSE for a bagged model.'''
    pred_agg = []
    for i in range(num_bagged):
        bs_x, bs_y = bootstrap_sample(trainX, trainY)
        model.fit(bs_x, bs_y)
        pred_agg.append(model.predict(testX))
    pred_avg = np.mean(pred_agg, 0)
    return np.sqrt(np.mean((pred_avg - testY)**2))

from sklearn.cross_validation import ShuffleSplit

cv = ShuffleSplit(solu_trainX.shape[0], n_iter=10, random_state=3)
num_bags = np.logspace(0, 2, num=7).astype(int)  # 1, 2, 4, 10, 21, 46, 100
cv_rmse = []
for num_bagged in num_bags:
    temp_rmse = []
    for train_indices, test_indices in cv:
        trainX = solu_trainX[train_indices]
        trainY = solu_trainY[train_indices]
        testX = solu_trainX[test_indices]
        testY = solu_trainY[test_indices]
        temp_rmse.append(bagg_rmse(trainX, trainY, testX, testY,
                                   num_bagged, DecisionTreeRegressor(max_depth=5)))
    cv_rmse.append(temp_rmse)

plt.errorbar(num_bags, np.mean(cv_rmse, 1), yerr=np.std(cv_rmse, 1))
plt.xlim(-2, 102)
plt.show()
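# The curve above flattens because averaging B bootstrap models divides the
# variance of the averaged prediction by roughly B when the models are
# independent (less in practice, since bootstrap models are correlated).
# A hypothetical numeric illustration with simulated i.i.d. "predictions":
sim_preds = np.random.normal(0, 1, size=(1000, 100))  # 1000 models x 100 points
for B in [1, 10, 100, 1000]:
    # variance of the B-model average across the 100 points, roughly 1/B
    print B, np.var(np.mean(sim_preds[:B], axis=0))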
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# compare the out-of-bag RMSE estimate with the cross-validated RMSE
# across values of max_features (m_try)
cv = ShuffleSplit(solu_trainX.shape[0], n_iter=10, random_state=3)
cv_rmse = []
oob_rmse = []
for m_try in range(10, 228, 22):
    # out-of-bag score
    rf = RandomForestRegressor(n_estimators=1000, max_features=m_try, oob_score=True)
    rf.fit(solu_trainX, solu_trainY)
    oob_rmse.append(np.sqrt(mean_squared_error(solu_trainY, rf.oob_prediction_)))
    # cross-validation score
    cv_temp_rmse = []
    for train_indices, test_indices in cv:
        trainX = solu_trainX[train_indices]
        trainY = solu_trainY[train_indices]
        testX = solu_trainX[test_indices]
        testY = solu_trainY[test_indices]
        rf = RandomForestRegressor(n_estimators=1000, max_features=m_try)
        rf.fit(trainX, trainY)
        cv_temp_rmse.append(np.sqrt(mean_squared_error(testY, rf.predict(testX))))
    cv_rmse.append(cv_temp_rmse)

plt.plot(range(10, 228, 22), np.mean(cv_rmse, 1), 'o-', label='CV score')
plt.plot(range(10, 228, 22), oob_rmse, 'x-', label='OOB score')
plt.legend()
plt.xlim(0, 228)
plt.show()

rf = RandomForestRegressor(n_estimators=1000, max_features=120)
rf.fit(solu_trainX, solu_trainY)
viz_importance(rf.feature_importances_)

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import ShuffleSplit

# tune a stochastic gradient boosting model over the learning rate,
# tree depth, and number of trees
cv = ShuffleSplit(solu_trainX.shape[0], n_iter=10, random_state=3)
gs_param = {
    'learning_rate': [0.01, 0.1],
    'max_depth': range(1, 8, 2),
    'n_estimators': range(100, 1001, 50),
    'subsample': [0.5],
}
gbr = GradientBoostingRegressor()
gs_gbr = GridSearchCV(gbr, gs_param, cv=cv, scoring='mean_squared_error', n_jobs=-1)
gs_gbr.fit(solu_trainX, solu_trainY)

# reshape the grid scores into (n_estimators x max_depth) RMSE matrices,
# one per learning rate
scores1 = np.zeros([len(range(100, 1001, 50)), len(range(1, 8, 2))])
scores2 = np.zeros([len(range(100, 1001, 50)), len(range(1, 8, 2))])
for score in gs_gbr.grid_scores_:
    idx = (score[0]['n_estimators'] - 100) // 50
    jdx = score[0]['max_depth'] // 2
    if score[0]['learning_rate'] == 0.01:
        scores1[idx, jdx] = np.sqrt(-score[1])
    else:
        scores2[idx, jdx] = np.sqrt(-score[1])

fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True)
for i in range(4):
    ax1.plot(range(100, 1001, 50), scores1[:, i], 'x-')
ax1.set_title('shrinkage: 0.01')
for i in range(4):
    ax2.plot(range(100, 1001, 50), scores2[:, i], 'x-', label='tree depth: ' + str(2*i + 1))
ax2.set_title('shrinkage: 0.10')
ax2.legend(loc='upper right')
fig.text(0.5, 0.06, '# Trees', ha='center', va='center')
fig.text(0.08, 0.5, 'RMSE (Cross-Validation)', ha='center', va='center', rotation=90)

gbr = GradientBoostingRegressor(learning_rate=0.01, max_depth=5, n_estimators=500, subsample=0.5)
gbr.fit(solu_trainX, solu_trainY)
viz_importance(gbr.feature_importances_)
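# One property of boosting worth noting: GradientBoostingRegressor exposes
# staged_predict, which yields the prediction after each successive tree, so
# the test RMSE can be traced across boosting iterations without refitting.
# A minimal sketch using the final model and the helpers defined above:
stage_rmse = [test_rmse(solu_testY, pred)
              for pred in gbr.staged_predict(solu_testX)]
plt.plot(range(1, len(stage_rmse) + 1), stage_rmse)
plt.xlabel('# Trees')
plt.ylabel('Test RMSE')
plt.show()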