%matplotlib inline
import matplotlib.pyplot as plt

# Some nice default configuration for plots
plt.rcParams['figure.figsize'] = 10, 7.5
plt.rcParams['axes.grid'] = True
plt.gray()

import numpy as np
import pandas as pd

# solubility data
# training (transformed)
trainX = pd.read_csv("../datasets/solubility/solTrainXtrans.csv").drop("Unnamed: 0", axis=1)
trainY = pd.read_csv("../datasets/solubility/solTrainY.csv").drop("Unnamed: 0", axis=1)

# test (transformed)
testX = pd.read_csv("../datasets/solubility/solTestXtrans.csv").drop("Unnamed: 0", axis=1)
testY = pd.read_csv("../datasets/solubility/solTestY.csv").drop("Unnamed: 0", axis=1)

# multivariate adaptive regression splines (MARS)
from pyearth import Earth

mars = Earth()
mars.fit(trainX.values, trainY.values)
print mars.summary()

# look up the name of the predictor referenced in the MARS summary
trainX.columns[208]

# MARS fit on a single predictor
mars = Earth()
mars.fit(trainX['MolWeight'].values, trainY.values)

c_trainX = np.arange(np.min(trainX['MolWeight'].values), np.max(trainX['MolWeight'].values), 0.1)
mars_predict = mars.predict(c_trainX)

plt.scatter(trainX['MolWeight'].values, trainY.values, alpha=0.5)
plt.plot(c_trainX, mars_predict, 'r', linewidth=2)
plt.xlabel('Molecular Weight (transformed)')
plt.ylabel('Log Solubility')

np.random.seed(3)

# toy example: linear trend
x_sim = np.random.uniform(-2.5, 2.5, 100)
y_sim = 1 + 4*x_sim + np.random.normal(0, 1, 100)

# arbitrarily set an outlier
xmin_idx = np.argmin(x_sim)
y_sim[xmin_idx] = 10

# simple linear regression
from sklearn.linear_model import LinearRegression

ols = LinearRegression()
ols.fit(x_sim[:, np.newaxis], y_sim)
ols_pred = ols.predict(x_sim[:, np.newaxis])
print "y = {0} + {1} x".format(ols.intercept_, ols.coef_[0])

# support vector machine regression
from sklearn.svm import SVR

eps = 0.1
svr = SVR('linear', epsilon=eps)
svr.fit(x_sim[:, np.newaxis], y_sim)
svr_pred = svr.predict(x_sim[:, np.newaxis])
print "y = {0} + {1} x".format(svr.intercept_[0], -svr.coef_[0][0])

plt.scatter(x_sim, y_sim, alpha=0.5, s=26)
plt_ols, = plt.plot(x_sim, ols_pred, 'g')
plt_svr, = plt.plot(x_sim, svr_pred, color='r')
plt.xlabel("Predictor")
plt.ylabel("Outcome")
plt.ylim(-11, 11)
plt.legend([plt_ols, plt_svr], ['Least Squares', 'SVM'], loc=4)

# residuals of the SVM fit (the outlier is dropped for plotting)
svr_residuals = np.delete(svr_pred - y_sim, xmin_idx, 0)

plt.scatter(np.delete(x_sim, xmin_idx, 0), svr_residuals, alpha=0.5, s=26)
plt.xlim(-3, 3)
plt.plot(plt.xlim(), (eps, eps), 'g--', linewidth=2)
plt.plot(plt.xlim(), (-eps, -eps), 'g--', linewidth=2)
plt.xlabel('Predicted Value')
plt.ylabel('Residual')

# points with residuals on or outside the epsilon-insensitive band are support vectors
print "Out of 100 data points, {0} of these were support vectors.".format(np.sum(np.abs(svr_pred - y_sim) >= eps))

np.random.seed(3)

# toy example: sin wave
x_sim = np.random.uniform(2, 10, 145)
y_sim = np.sin(x_sim) + np.random.normal(0, 0.4, 145)

# arbitrarily set outliers
x_outliers = np.arange(2.5, 5, 0.5)
y_outliers = -5*np.ones(5)
x_sim_idx = np.argsort(np.concatenate([x_sim, x_outliers]))
x_sim = np.concatenate([x_sim, x_outliers])[x_sim_idx]
y_sim = np.concatenate([y_sim, y_outliers])[x_sim_idx]

# simple linear regression on sin(x)
from sklearn.linear_model import LinearRegression

ols = LinearRegression()
ols.fit(np.sin(x_sim[:, np.newaxis]), y_sim)
ols_pred = ols.predict(np.sin(x_sim[:, np.newaxis]))
print "y = {0} + {1} sin(x)".format(ols.intercept_, ols.coef_[0])

# support vector machine regression with a radial basis kernel
from sklearn.svm import SVR

eps = 0.1
svr = SVR('rbf', epsilon=eps)
svr.fit(x_sim[:, np.newaxis], y_sim)
svr_pred = svr.predict(x_sim[:, np.newaxis])

plt.scatter(x_sim, y_sim, alpha=0.5, s=26)
plt_ols, = plt.plot(x_sim, ols_pred, 'g')
plt_svr, = plt.plot(x_sim, svr_pred, color='r')
plt.xlabel("Predictor")
plt.ylabel("Outcome")
plt.ylim(-5.2, 2.2)
plt.legend([plt_ols, plt_svr], ['Least Squares', 'SVM'], loc=4)
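# Side note (a sketch, not part of the original analysis): the epsilon parameter sets
# the width of the insensitive band, so widening it should leave fewer points outside
# the band and hence fewer support vectors. Refitting the sin-wave toy example with a
# few epsilon values illustrates this; `support_` is scikit-learn's attribute holding
# the indices of the support vectors.
for e in [0.1, 0.25, 0.5, 1.0]:
    svr_e = SVR('rbf', epsilon=e)
    svr_e.fit(x_sim[:, np.newaxis], y_sim)
    print "epsilon = {0}: {1} support vectors out of {2} points".format(e, len(svr_e.support_), len(x_sim))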
plt.ylabel("Outcome") plt.ylim(-5.2, 2.2) plt.legend([plt_ols, plt_svr], ['Least Squares', 'SVM'], loc = 4) from pprint import pprint # radial basis kernel opt_sigma = 0.0039 svr = SVR(kernel='rbf', gamma=opt_sigma, epsilon=0.1) svr_params = { 'C': np.logspace(-2, 11, num=14, base=2), } pprint(svr_params) from sklearn.grid_search import GridSearchCV from sklearn.cross_validation import ShuffleSplit cv = ShuffleSplit(trainX.shape[0], n_iter=10, random_state=3) gs_svr = GridSearchCV(svr, svr_params, cv=cv, scoring="mean_squared_error", n_jobs=-1) gs_svr.fit(trainX.values, trainY.values[:,0]) gs_grid_rmse = [np.sqrt(-d[1]) for d in gs_svr.grid_scores_] plt.plot(np.logspace(-2, 11, num=14, base=2), gs_grid_rmse, '-x') plt.xscale('log', basex=2) plt.xlim(2**-2.5, 2**11.5) plt.xlabel('Cost') plt.ylabel('RMSE (Cross-Validation)') print "Best cost value associated with the smallest RMSE was {0}".format(gs_svr.best_params_) # polynomial kernel svr_poly = SVR(kernel='poly', epsilon=0.1) svr_poly_params = { 'C': np.logspace(-2, 5, num=8, base=2), 'gamma': [0.001, 0.005, 0.01], 'degree': [1, 2] } pprint(svr_poly_params) cv = ShuffleSplit(trainX.shape[0], n_iter=10, random_state=3) gs_svr_poly = GridSearchCV(svr_poly, svr_poly_params, cv=cv, scoring="mean_squared_error", n_jobs=-1) gs_svr_poly.fit(trainX.values, trainY.values[:,0]) def split_rmse_scores(grid_scores, scale, degree): '''get the grid scores for each combination of scale and degree''' rmse_scores = [np.sqrt(-d[1]) for d in grid_scores if (d[0]['degree'] == degree and d[0]['gamma'] == scale)] return rmse_scores fig, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True) line1, = ax1.plot(np.logspace(-2, 5, num=8, base=2), split_rmse_scores(gs_svr_poly.grid_scores_, 0.001, 1), '-x') line2, = ax1.plot(np.logspace(-2, 5, num=8, base=2), split_rmse_scores(gs_svr_poly.grid_scores_, 0.001, 2), '-o') ax1.set_xscale('log', basex=2) ax1.set_xlim(2**-2.5, 2**5.5) ax1.set_title('scale: 0.001') ax2.plot(np.logspace(-2, 5, num=8, base=2), split_rmse_scores(gs_svr_poly.grid_scores_, 0.005, 1), '-x') ax2.plot(np.logspace(-2, 5, num=8, base=2), split_rmse_scores(gs_svr_poly.grid_scores_, 0.005, 2), '-o') ax2.set_xscale('log', basex=2) ax2.set_xlim(2**-2.5, 2**5.5) ax2.set_title('scale: 0.005') ax3.plot(np.logspace(-2, 5, num=8, base=2), split_rmse_scores(gs_svr_poly.grid_scores_, 0.01, 1), '-x') ax3.plot(np.logspace(-2, 5, num=8, base=2), split_rmse_scores(gs_svr_poly.grid_scores_, 0.01, 2), '-o') ax3.set_xscale('log', basex=2) ax3.set_xlim(2**-2.5, 2**5.5) ax3.set_title('scale: 0.01') fig.legend([line1, line2], ('Degree 1', 'Degree 2'), loc='upper center', ncol=2, frameon=False) fig.text(0.08, 0.5, 'RMSE (Cross-Validation)', ha='center', va='center', rotation=90) fig.text(0.5, 0.07, 'Cost', ha='center', va='center') from sklearn.neighbors import KNeighborsRegressor knnreg = KNeighborsRegressor() knn_params = { 'n_neighbors': np.arange(1, 21, 1) } pprint(knn_params) cv = ShuffleSplit(trainX.shape[0], n_iter=10, random_state=3) gs_knnreg = GridSearchCV(knnreg, knn_params, cv=cv, scoring='mean_squared_error', n_jobs=-1) gs_knnreg.fit(trainX.values, trainY.values) gs_knnreg_rmse = [np.sqrt(-d[1]) for d in gs_knnreg.grid_scores_] plt.plot(np.arange(1, 21, 1), gs_knnreg_rmse, '-x') plt.xlim(None, 21) plt.xlabel('#Neighbors') plt.ylabel('RMSE (Cross-Validation)')