%matplotlib inline
import matplotlib.pyplot as plt

# Some nice default configuration for plots
plt.rcParams['figure.figsize'] = 10, 7.5
plt.rcParams['axes.grid'] = True
plt.gray()

import numpy as np
import pandas as pd

!ls -l ../datasets/solubility/

# training
trainX = pd.read_csv("../datasets/solubility/solTrainX.csv").drop("Unnamed: 0", axis=1)
trainY = pd.read_csv("../datasets/solubility/solTrainY.csv").drop("Unnamed: 0", axis=1)

# test
testX = pd.read_csv("../datasets/solubility/solTestX.csv").drop("Unnamed: 0", axis=1)
testY = pd.read_csv("../datasets/solubility/solTestY.csv").drop("Unnamed: 0", axis=1)

trainX.head(5)

trainY.head(5)

print "training set size: {0} \ntest set size: {1}".format(trainX.shape, testX.shape)

print "The range of the outcome is {0} with an average of {1}.".format(
    (round(np.min(trainY)[0], 2), round(np.max(trainY)[0], 2)),
    round(np.mean(trainY)[0], 2))

fig, (ax1, ax2) = plt.subplots(1, 2)

# solubility versus molecular weight
ax1.scatter(trainX['MolWeight'], trainY)
ax1.set_xlabel("Molecular Weight")
ax1.set_ylabel("Solubility (log)")

# solubility with and without the substructure encoded by fingerprint FP003
trainY_strPresent = trainY.iloc[np.where(trainX['FP003'] == 1)[0]]
trainY_strAbsent = trainY.iloc[np.where(trainX['FP003'] == 0)[0]]
trainY_str = [trainY_strPresent, trainY_strAbsent]
ax2.boxplot(trainY_str)
ax2.set_xticklabels(['structure present', 'structure absent'])
ax2.set_ylabel("Solubility (log)")

# evaluate the continuous predictors for skewness
trainX_c = trainX[trainX.columns[-20:]]  # continuous predictors

from scipy.stats import skew

trainX_skewness = trainX_c.apply(skew)
print "Summary of skewness: range {0}; mean {1}".format(
    (round(np.min(trainX_skewness), 2), round(np.max(trainX_skewness), 2)),
    round(np.mean(trainX_skewness), 2))

# Box-Cox transformed predictors
trainX_trans = pd.read_csv("../datasets/solubility/solTrainXtrans.csv").drop("Unnamed: 0", axis=1)
testX_trans = pd.read_csv("../datasets/solubility/solTestXtrans.csv").drop("Unnamed: 0", axis=1)

trainX_trans_c = trainX_trans[trainX_trans.columns[-20:]]  # 20 continuous predictors
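# The transformed predictors above are loaded precomputed from the dataset. As a minimal
# sketch (an assumption, not how the provided files were generated), a Box-Cox
# transformation could also be applied directly with scipy. scipy.stats.boxcox requires
# strictly positive values, so only such columns are transformed here, and the estimated
# lambda is kept for reference.
from scipy.stats import boxcox

def boxcox_transform(df):
    '''return a Box-Cox-transformed copy of df (strictly positive columns only) and the lambdas'''
    out = df.copy()
    lambdas = {}
    for col in df.columns:
        if (df[col] > 0).all():
            out[col], lambdas[col] = boxcox(df[col].values)
    return out, lambdas

# example on the raw continuous predictors
trainX_c_bc, bc_lambdas = boxcox_transform(trainX_c)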
# lowess (local smoother)
import statsmodels.api as sm
lowess = sm.nonparametric.lowess

def plotContinuousPredictors(X, y, nrows=4, ncols=5):
    '''plot the continuous predictors (default 20) versus the outcome'''
    fig, axarr = plt.subplots(nrows, ncols)
    fig.tight_layout()
    for r in range(nrows):
        for c in range(ncols):
            plotOnePredictor(axarr[r, c], X.iloc[:, r * ncols + c], y)

def plotOnePredictor(ax, X, y):
    '''plot one predictor versus the outcome'''
    z = lowess(np.squeeze(y), np.squeeze(X), 0.88)  # returns sorted X and the fitted y
    ax.scatter(X, y, alpha=0.4)
    ax.plot(z[:, 0], z[:, 1], 'r', linewidth=2)
    ax.set_title(X.name)

plotContinuousPredictors(trainX_trans_c, trainY)

# PCA scree plot
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

# scale before PCA
trainX_trans_scale = scale(trainX_trans)

pca = PCA()
pca.fit(trainX_trans_scale)

# generate scree plot
plt.plot(pca.explained_variance_ratio_)
plt.xlim((-5, None))
plt.ylim((-0.005, None))
plt.xlabel('Component')
plt.ylabel('Percent of Total Variance')

# correlation matrix for continuous predictors
corr_dataframe_c = trainX_trans_c.corr()

# hierarchically cluster the rows and columns of the correlation matrix and plot a heatmap
def corr_heatmap(corr_dataframe):
    import scipy.cluster.hierarchy as sch
    corr_matrix = np.array(corr_dataframe)
    col_names = corr_dataframe.columns
    Y = sch.linkage(corr_matrix, 'single', 'correlation')
    Z = sch.dendrogram(Y, color_threshold=0, no_plot=True)['leaves']
    # reorder rows and columns by the dendrogram leaves
    corr_matrix = corr_matrix[Z, :]
    corr_matrix = corr_matrix[:, Z]
    col_names = col_names[Z]
    im = plt.imshow(corr_matrix, interpolation='nearest', aspect='auto', cmap='bwr')
    plt.colorbar()
    plt.xticks(range(corr_matrix.shape[0]), col_names, rotation='vertical', fontsize=12)
    plt.yticks(range(corr_matrix.shape[0]), col_names, fontsize=12)

# plot
corr_heatmap(corr_dataframe_c)

corr_dataframe = trainX_trans.corr()

def remove_high_corr(corr_dataframe, thresh=0.9):
    '''remove predictors with high pairwise correlation'''
    abs_corr = np.abs(corr_dataframe).as_matrix()  # absolute correlation matrix
    col_names = list(corr_dataframe.columns)
    # set the diagonal to 0
    np.fill_diagonal(abs_corr, 0)
    print "Removed predictors (in order): \n"
    while np.max(abs_corr) >= thresh:
        i, j = np.unravel_index(abs_corr.argmax(), abs_corr.shape)  # find the maximum element
        rdx = which_to_remove(i, j, abs_corr)
        # remove the corresponding predictor
        print col_names.pop(rdx)
        abs_corr = np.delete(abs_corr, rdx, 0)
        abs_corr = np.delete(abs_corr, rdx, 1)
    return col_names

def which_to_remove(i, j, abs_corr):
    '''compare two predictors and remove the one with the higher average absolute
    correlation with the other predictors (the zeroed diagonal is excluded)'''
    i_absmean = np.mean(abs_corr[i, np.where(abs_corr[i, :] != 0)])
    j_absmean = np.mean(abs_corr[j, np.where(abs_corr[j, :] != 0)])
    return i if i_absmean > j_absmean else j

# remaining predictors
col_remained = remove_high_corr(corr_dataframe)
trainX_trans_filtered = trainX_trans[col_remained]
testX_trans_filtered = testX_trans[col_remained]

from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score, ShuffleSplit

cv = ShuffleSplit(trainX_trans_filtered.shape[0], n_iter=10, random_state=0)
lm = LinearRegression()

train_scores_mse = cross_val_score(lm, trainX_trans_filtered, trainY.values, cv=cv,
                                   scoring='mean_squared_error')
train_scores_rmse = np.sqrt(-1.0 * train_scores_mse)
train_scores_r2 = cross_val_score(lm, trainX_trans_filtered, trainY.values, cv=cv, scoring='r2')

print "CV estimated RMSE: {0} \nCV estimated R2: {1}".format(
    np.mean(train_scores_rmse), np.mean(train_scores_r2))

# apply to the test set
lm.fit(trainX_trans_filtered, trainY.values)
testY_pred = lm.predict(testX_trans_filtered)

from sklearn.metrics import r2_score, mean_squared_error

test_score_r2 = r2_score(testY.values, testY_pred)
test_score_rmse = np.sqrt(mean_squared_error(testY.values, testY_pred))

print "Test set RMSE: {0} \nTest set R2: {1}".format(test_score_rmse, test_score_r2)

# regression diagnostic
fig, (ax1, ax2) = plt.subplots(1, 2)

# observed versus predicted
ax1.scatter(testY_pred, testY.values)
ax1.set(xlim=(-11, 2), ylim=(-11, 2))
ax1.plot(ax1.get_xlim(), ax1.get_ylim(), ls="--", c=".3", linewidth=2)
ax1.set_xlabel("Predicted")
ax1.set_ylabel("Observed")

# residuals versus predicted
ax2.scatter(testY_pred, testY.values - testY_pred)
ax2.set(xlim=(-11, 2))
ax2.plot(ax2.get_xlim(), (0, 0), ls="--", c=".3", linewidth=2)
ax2.set_xlabel("Predicted")
ax2.set_ylabel("Residual")

# RMSE examined for up to 50 components
pcr_rmse = np.zeros([50, 2])
plsr_rmse = np.zeros([50, 2])

# 10 random train/test splits (Monte Carlo cross-validation)
cv = ShuffleSplit(trainX_trans.shape[0], n_iter=10)

def optimal_with_osr(rmse):
    '''Return the number of components selected by the "one-standard-error" rule
    and its CV RMSE. Column 0 of rmse holds the mean CV RMSE per model size,
    column 1 its standard deviation; the smallest model whose mean RMSE is within
    one standard deviation of the overall minimum is selected.'''
    n_min = np.argmin(rmse[:, 0])
    threshold = rmse[n_min, 0] + rmse[n_min, 1]
    n_best = np.min(np.where(rmse[:, 0] <= threshold))
    return (n_best + 1), rmse[n_best, 0]
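# A quick check of optimal_with_osr on hypothetical values (not results from this data):
# the minimum mean RMSE is 0.85 at 4 components, so the threshold is 0.85 + 0.05 = 0.90
# and the smallest model within it has 2 components.
toy_rmse = np.array([[1.20, 0.05],
                     [0.90, 0.04],
                     [0.86, 0.04],
                     [0.85, 0.05]])
print optimal_with_osr(toy_rmse)  # -> 2 components, CV RMSE 0.90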
# PCR
from sklearn.decomposition import PCA

for n_compo in range(len(pcr_rmse)):
    pca = PCA(n_components=n_compo + 1)
    pca_compo = pca.fit_transform(trainX_trans_scale)
    train_scores_mse = cross_val_score(LinearRegression(), pca_compo, trainY.values, cv=cv,
                                       scoring='mean_squared_error')
    pcr_rmse[n_compo, 0] = np.mean(np.sqrt(-1.0 * train_scores_mse))
    pcr_rmse[n_compo, 1] = np.std(np.sqrt(-1.0 * train_scores_mse))

n_best, rmse_best = optimal_with_osr(pcr_rmse)
print "With the one-standard-error rule, the selected PCR model has {1} components (CV RMSE {0})".format(rmse_best, n_best)

# PLSR
from sklearn.cross_decomposition import PLSRegression

for n_compo in range(len(plsr_rmse)):
    plsr = PLSRegression(n_components=n_compo + 1, scale=False)
    train_scores_mse = cross_val_score(plsr, trainX_trans_scale, trainY.values, cv=cv,
                                       scoring='mean_squared_error')
    plsr_rmse[n_compo, 0] = np.mean(np.sqrt(-1.0 * train_scores_mse))
    plsr_rmse[n_compo, 1] = np.std(np.sqrt(-1.0 * train_scores_mse))

n_best, rmse_best = optimal_with_osr(plsr_rmse)
print "With the one-standard-error rule, the selected PLSR model has {1} components (CV RMSE {0})".format(rmse_best, n_best)

plt_pcr, = plt.plot(pcr_rmse[:, 0], '-x')
plt_plsr, = plt.plot(plsr_rmse[:, 0], '-o')
plt.xlim((-1, None))
plt.xlabel("# Components")
plt.ylabel("RMSE (Cross-Validation)")
plt.legend([plt_pcr, plt_plsr], ['PCR', 'PLSR'])

from scipy.stats import pearsonr

# compare the first two components
pca2 = PCA(n_components=2)
pca2.fit(trainX_trans_scale)

plsr2 = PLSRegression(n_components=2, scale=False)
plsr2.fit(trainX_trans_scale, trainY.values)

# visualization
fig, axarr = plt.subplots(2, 2, sharex=True, sharey=True)
# fig.tight_layout()

axarr[0, 0].scatter(pca2.transform(trainX_trans_scale)[:, 0], trainY.values)
axarr[0, 0].set_title("PCA Component 1")
corr = pearsonr(pca2.transform(trainX_trans_scale)[:, 0], trainY.values[:, 0])[0]
axarr[0, 0].text(10, -12, 'corr:' + str(round(corr, 2)))

axarr[0, 1].scatter(pca2.transform(trainX_trans_scale)[:, 1], trainY.values)
axarr[0, 1].set_title("PCA Component 2")
corr = pearsonr(pca2.transform(trainX_trans_scale)[:, 1], trainY.values[:, 0])[0]
axarr[0, 1].text(10, -12, 'corr:' + str(round(corr, 2)))

axarr[1, 0].scatter(plsr2.transform(trainX_trans_scale)[:, 0], trainY.values)
axarr[1, 0].set_title("PLS Component 1")
corr = pearsonr(plsr2.transform(trainX_trans_scale)[:, 0], trainY.values[:, 0])[0]
axarr[1, 0].text(10, -12, 'corr:' + str(round(corr, 2)))

axarr[1, 1].scatter(plsr2.transform(trainX_trans_scale)[:, 1], trainY.values)
axarr[1, 1].set_title("PLS Component 2")
corr = pearsonr(plsr2.transform(trainX_trans_scale)[:, 1], trainY.values[:, 0])[0]
axarr[1, 1].text(10, -12, 'corr:' + str(round(corr, 2)))

fig.text(0.5, 0.06, 'Component Score', ha='center', va='center')
fig.text(0.06, 0.5, 'Solubility', va='center', rotation='vertical')

from sklearn.preprocessing import scale

testX_trans_scale = scale(testX_trans)  # note: scaled with the test set's own mean and std

# PCR
pca = PCA(n_components=34)  # number of components selected above
pca.fit(trainX_trans_scale)
trainX_pca = pca.transform(trainX_trans_scale)
testX_pca = pca.transform(testX_trans_scale)

lm = LinearRegression()
lm.fit(trainX_pca, trainY.values)
testY_pca_pred = lm.predict(testX_pca)

# PLSR
plsr = PLSRegression(n_components=7, scale=False)  # number of components selected above
plsr.fit(trainX_trans_scale, trainY.values)
testY_plsr_pred = plsr.predict(testX_trans_scale)
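# For comparison with the ordinary least squares fit above, a quick look at the test-set
# performance of the PCR and PLS models (a sketch added here; it reuses r2_score and
# mean_squared_error imported earlier).
print "PCR  test RMSE: {0}, R2: {1}".format(
    np.sqrt(mean_squared_error(testY.values, testY_pca_pred)),
    r2_score(testY.values, testY_pca_pred))
print "PLSR test RMSE: {0}, R2: {1}".format(
    np.sqrt(mean_squared_error(testY.values, testY_plsr_pred)),
    r2_score(testY.values, testY_plsr_pred))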
# regression diagnostic
fig, axarr = plt.subplots(2, 2)

# PCR: observed versus predicted
axarr[0, 0].scatter(testY_pca_pred, testY)
axarr[0, 0].set(xlim=(-11, 2), ylim=(-11, 2))
axarr[0, 0].plot(axarr[0, 0].get_xlim(), axarr[0, 0].get_ylim(), ls="--", c=".3", linewidth=2)
axarr[0, 0].set_xlabel("Predicted")
axarr[0, 0].set_ylabel("Observed")

# PCR: residuals versus predicted
axarr[0, 1].scatter(testY_pca_pred, testY - testY_pca_pred)
axarr[0, 1].set(xlim=(-11, 2))
axarr[0, 1].plot(axarr[0, 1].get_xlim(), (0, 0), ls="--", c=".3", linewidth=2)
axarr[0, 1].set_xlabel("Predicted")
axarr[0, 1].set_ylabel("Residual")

# PLSR: observed versus predicted
axarr[1, 0].scatter(testY_plsr_pred, testY)
axarr[1, 0].set(xlim=(-11, 2), ylim=(-11, 2))
axarr[1, 0].plot(axarr[1, 0].get_xlim(), axarr[1, 0].get_ylim(), ls="--", c=".3", linewidth=2)
axarr[1, 0].set_xlabel("Predicted")
axarr[1, 0].set_ylabel("Observed")

# PLSR: residuals versus predicted
axarr[1, 1].scatter(testY_plsr_pred, testY - testY_plsr_pred)
axarr[1, 1].set(xlim=(-11, 2))
axarr[1, 1].plot(axarr[1, 1].get_xlim(), (0, 0), ls="--", c=".3", linewidth=2)
axarr[1, 1].set_xlabel("Predicted")
axarr[1, 1].set_ylabel("Residual")

# path of the ridge regression coefficients
from sklearn.linear_model import Ridge

lambda_range = np.linspace(0.0, 0.01, 600)
ridgeModel = Ridge(normalize=True)

coeff_path = []
for a in lambda_range:
    ridgeModel.set_params(alpha=a)
    ridgeModel.fit(trainX_trans, trainY)
    coeff_path.append(ridgeModel.coef_[0])

ax = plt.gca()
ax.plot(lambda_range, coeff_path)
ax.set_xlim((-0.0005, 0.0105))
plt.xlabel("Penalty")
plt.ylabel("Standardized Coefficient")

# penalty value versus RMSE
lambda_range = np.linspace(0.0, 0.1, 15)
score_rmse = np.zeros(15)
ridgeModel = Ridge(normalize=True)

for adx, a in enumerate(lambda_range):
    temp_scores = cross_val_score(ridgeModel.set_params(alpha=a), trainX_trans, trainY.values,
                                  cv=10, scoring="mean_squared_error")
    temp_scores = np.sqrt(-1.0 * temp_scores)
    score_rmse[adx] = np.mean(temp_scores)

plt.plot(lambda_range, score_rmse, '-o')
plt.xlabel("Penalty")
plt.ylabel("RMSE (Cross-Validation)")
plt.xlim((-0.005, 0.105))

# path of the lasso regression coefficients
from sklearn.linear_model import lars_path

# compute the regularization path using LARS
lambdas, _, coefs = lars_path(trainX_trans.values, trainY.values[:, 0], method='lasso', verbose=True)

xx = np.sum(np.abs(coefs.T), axis=1)
xx /= xx[-1]

plt.plot(xx, coefs.T)
plt.xlabel('Fraction of Full Solution')
plt.ylabel('Standardized Coefficient')
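# The ridge and lasso cells above only examine coefficient paths and CV error. As a
# minimal sketch of actually selecting a penalty and checking test-set error, sklearn's
# built-in CV estimators can be used; the alpha grid below is an assumption, not a tuned
# value from this notebook.
from sklearn.linear_model import RidgeCV, LassoCV

ridge_cv = RidgeCV(alphas=np.linspace(0.001, 0.1, 50), normalize=True)
ridge_cv.fit(trainX_trans, trainY.values[:, 0])
print "Ridge: alpha = {0}, test RMSE = {1}".format(
    ridge_cv.alpha_,
    np.sqrt(mean_squared_error(testY.values[:, 0], ridge_cv.predict(testX_trans))))

lasso_cv = LassoCV(cv=10)
lasso_cv.fit(trainX_trans.values, trainY.values[:, 0])
print "Lasso: alpha = {0}, test RMSE = {1}".format(
    lasso_cv.alpha_,
    np.sqrt(mean_squared_error(testY.values[:, 0], lasso_cv.predict(testX_trans.values))))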