%matplotlib inline
import matplotlib.pyplot as plt

# Some nice default configuration for plots
plt.rcParams['figure.figsize'] = 10, 7.5
plt.rcParams['axes.grid'] = True
plt.gray()

import numpy as np
import pandas as pd

!ls -l ../datasets/solubility/

# training
trainX = pd.read_csv("../datasets/solubility/solTrainX.csv").drop("Unnamed: 0", axis=1)
trainY = pd.read_csv("../datasets/solubility/solTrainY.csv").drop("Unnamed: 0", axis=1)

# test
testX = pd.read_csv("../datasets/solubility/solTestX.csv").drop("Unnamed: 0", axis=1)
testY = pd.read_csv("../datasets/solubility/solTestY.csv").drop("Unnamed: 0", axis=1)

trainX.head(5)

trainY.head(5)

print "training set size: {0} \ntest set size: {1}".format(trainX.shape, testX.shape)

print "The range of the outcome is {0} with an average of {1}.".format(
    (round(np.min(trainY)[0], 2), round(np.max(trainY)[0], 2)),
    round(np.mean(trainY)[0], 2))

fig, (ax1, ax2) = plt.subplots(1, 2)

# solubility versus molecular weight
ax1.scatter(trainX['MolWeight'], trainY)
ax1.set_xlabel("Molecular Weight")
ax1.set_ylabel("Solubility (log)")

# solubility with and without the substructure encoded by fingerprint FP003
trainY_strPresent = trainY.iloc[np.where(trainX['FP003'] == 1)[0]]
trainY_strAbsent = trainY.iloc[np.where(trainX['FP003'] == 0)[0]]
trainY_str = [trainY_strPresent, trainY_strAbsent]
ax2.boxplot(trainY_str)
ax2.set_xticklabels(['structure present', 'structure absent'])
ax2.set_ylabel("Solubility (log)")

# evaluate the continuous predictors for skewness
trainX_c = trainX[trainX.columns[-20:]]  # continuous predictors

from scipy.stats import skew

trainX_skewness = trainX_c.apply(skew)
print "Summary of skewness: range {0}; mean {1}".format(
    (round(np.min(trainX_skewness), 2), round(np.max(trainX_skewness), 2)),
    round(np.mean(trainX_skewness), 2))

# Box-Cox transformed predictors
trainX_trans = pd.read_csv("../datasets/solubility/solTrainXtrans.csv").drop("Unnamed: 0", axis=1)
testX_trans = pd.read_csv("../datasets/solubility/solTestXtrans.csv").drop("Unnamed: 0", axis=1)

trainX_trans_c = trainX_trans[trainX_trans.columns[-20:]]  # 20 continuous predictors
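# The transformed predictors above are loaded precomputed from the dataset. As a minimal
# sketch (an assumption, not how the provided files were generated), a Box-Cox
# transformation could also be applied directly with scipy. scipy.stats.boxcox requires
# strictly positive values, so only such columns are transformed here, and the estimated
# lambda is kept for reference.
from scipy.stats import boxcox

def boxcox_transform(df):
    '''return a Box-Cox-transformed copy of df (strictly positive columns only) and the lambdas'''
    out = df.copy()
    lambdas = {}
    for col in df.columns:
        if (df[col] > 0).all():
            out[col], lambdas[col] = boxcox(df[col].values)
    return out, lambdas

# example on the raw continuous predictors
trainX_c_bc, bc_lambdas = boxcox_transform(trainX_c)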
# lowess (local smoother)
import statsmodels.api as sm
lowess = sm.nonparametric.lowess

def plotContinuousPredictors(X, y, nrows=4, ncols=5):
    '''plot the continuous predictors (default 20) versus the outcome'''
    fig, axarr = plt.subplots(nrows, ncols)
    fig.tight_layout()
    for r in range(nrows):
        for c in range(ncols):
            plotOnePredictor(axarr[r, c], X.iloc[:, r * ncols + c], y)

def plotOnePredictor(ax, X, y):
    '''plot one predictor versus the outcome'''
    z = lowess(np.squeeze(y), np.squeeze(X), 0.88)  # returns sorted X and the fitted y
    ax.scatter(X, y, alpha=0.4)
    ax.plot(z[:, 0], z[:, 1], 'r', linewidth=2)
    ax.set_title(X.name)

plotContinuousPredictors(trainX_trans_c, trainY)

# PCA scree plot
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

# scale before PCA
trainX_trans_scale = scale(trainX_trans)

pca = PCA()
pca.fit(trainX_trans_scale)

# generate scree plot
plt.plot(pca.explained_variance_ratio_)
plt.xlim((-5, None))
plt.ylim((-0.005, None))
plt.xlabel('Component')
plt.ylabel('Percent of Total Variance')

# correlation matrix for continuous predictors
corr_dataframe_c = trainX_trans_c.corr()

# hierarchically cluster the rows and columns of the correlation matrix and plot a heatmap
def corr_heatmap(corr_dataframe):
    import scipy.cluster.hierarchy as sch
    corr_matrix = np.array(corr_dataframe)
    col_names = corr_dataframe.columns
    Y = sch.linkage(corr_matrix, 'single', 'correlation')
    Z = sch.dendrogram(Y, color_threshold=0, no_plot=True)['leaves']
    # reorder rows and columns by the dendrogram leaves
    corr_matrix = corr_matrix[Z, :]
    corr_matrix = corr_matrix[:, Z]
    col_names = col_names[Z]
    im = plt.imshow(corr_matrix, interpolation='nearest', aspect='auto', cmap='bwr')
    plt.colorbar()
    plt.xticks(range(corr_matrix.shape[0]), col_names, rotation='vertical', fontsize=12)
    plt.yticks(range(corr_matrix.shape[0]), col_names, fontsize=12)

# plot
corr_heatmap(corr_dataframe_c)

corr_dataframe = trainX_trans.corr()

def remove_high_corr(corr_dataframe, thresh=0.9):
    '''remove predictors with high pairwise correlation'''
    abs_corr = np.abs(corr_dataframe).as_matrix()  # absolute correlation matrix
    col_names = list(corr_dataframe.columns)
    # set the diagonal to 0
    np.fill_diagonal(abs_corr, 0)
    print "Removed predictors (in order): \n"
    while np.max(abs_corr) >= thresh:
        i, j = np.unravel_index(abs_corr.argmax(), abs_corr.shape)  # find the maximum element
        rdx = which_to_remove(i, j, abs_corr)
        # remove the corresponding predictor
        print col_names.pop(rdx)
        abs_corr = np.delete(abs_corr, rdx, 0)
        abs_corr = np.delete(abs_corr, rdx, 1)
    return col_names

def which_to_remove(i, j, abs_corr):
    '''compare two predictors and remove the one with the higher average absolute
    correlation with the other predictors (the zeroed diagonal is excluded)'''
    i_absmean = np.mean(abs_corr[i, np.where(abs_corr[i, :] != 0)])
    j_absmean = np.mean(abs_corr[j, np.where(abs_corr[j, :] != 0)])
    return i if i_absmean > j_absmean else j

# remaining predictors
col_remained = remove_high_corr(corr_dataframe)
trainX_trans_filtered = trainX_trans[col_remained]
testX_trans_filtered = testX_trans[col_remained]

from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score, ShuffleSplit

cv = ShuffleSplit(trainX_trans_filtered.shape[0], n_iter=10, random_state=0)
lm = LinearRegression()

train_scores_mse = cross_val_score(lm, trainX_trans_filtered, trainY.values, cv=cv,
                                   scoring='mean_squared_error')
train_scores_rmse = np.sqrt(-1.0 * train_scores_mse)
train_scores_r2 = cross_val_score(lm, trainX_trans_filtered, trainY.values, cv=cv, scoring='r2')

print "CV estimated RMSE: {0} \nCV estimated R2: {1}".format(
    np.mean(train_scores_rmse), np.mean(train_scores_r2))

# apply to the test set
lm.fit(trainX_trans_filtered, trainY.values)
testY_pred = lm.predict(testX_trans_filtered)

from sklearn.metrics import r2_score, mean_squared_error

test_score_r2 = r2_score(testY.values, testY_pred)
test_score_rmse = np.sqrt(mean_squared_error(testY.values, testY_pred))

print "Test set RMSE: {0} \nTest set R2: {1}".format(test_score_rmse, test_score_r2)

# regression diagnostic
fig, (ax1, ax2) = plt.subplots(1, 2)

# observed versus predicted
ax1.scatter(testY_pred, testY.values)
ax1.set(xlim=(-11, 2), ylim=(-11, 2))
ax1.plot(ax1.get_xlim(), ax1.get_ylim(), ls="--", c=".3", linewidth=2)
ax1.set_xlabel("Predicted")
ax1.set_ylabel("Observed")

# residuals versus predicted
ax2.scatter(testY_pred, testY.values - testY_pred)
ax2.set(xlim=(-11, 2))
ax2.plot(ax2.get_xlim(), (0, 0), ls="--", c=".3", linewidth=2)
ax2.set_xlabel("Predicted")
ax2.set_ylabel("Residual")

# RMSE examined for up to 50 components
pcr_rmse = np.zeros([50, 2])
plsr_rmse = np.zeros([50, 2])

# 10 random train/test splits (Monte Carlo cross-validation)
cv = ShuffleSplit(trainX_trans.shape[0], n_iter=10)

def optimal_with_osr(rmse):
    '''Return the number of components selected by the "one-standard-error" rule
    and its CV RMSE. Column 0 of rmse holds the mean CV RMSE per model size,
    column 1 its standard deviation; the smallest model whose mean RMSE is within
    one standard deviation of the overall minimum is selected.'''
    n_min = np.argmin(rmse[:, 0])
    threshold = rmse[n_min, 0] + rmse[n_min, 1]
    n_best = np.min(np.where(rmse[:, 0] <= threshold))
    return (n_best + 1), rmse[n_best, 0]
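# A quick check of optimal_with_osr on hypothetical values (not results from this data):
# the minimum mean RMSE is 0.85 at 4 components, so the threshold is 0.85 + 0.05 = 0.90
# and the smallest model within it has 2 components.
toy_rmse = np.array([[1.20, 0.05],
                     [0.90, 0.04],
                     [0.86, 0.04],
                     [0.85, 0.05]])
print optimal_with_osr(toy_rmse)  # -> 2 components, CV RMSE 0.90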
# PCR
from sklearn.decomposition import PCA

for n_compo in range(len(pcr_rmse)):
    pca = PCA(n_components=n_compo + 1)
    pca_compo = pca.fit_transform(trainX_trans_scale)
    train_scores_mse = cross_val_score(LinearRegression(), pca_compo, trainY.values, cv=cv,
                                       scoring='mean_squared_error')
    pcr_rmse[n_compo, 0] = np.mean(np.sqrt(-1.0 * train_scores_mse))
    pcr_rmse[n_compo, 1] = np.std(np.sqrt(-1.0 * train_scores_mse))

n_best, rmse_best = optimal_with_osr(pcr_rmse)
print "With the one-standard-error rule, the selected PCR model has {1} components (CV RMSE {0})".format(rmse_best, n_best)

# PLSR
from sklearn.cross_decomposition import PLSRegression

for n_compo in range(len(plsr_rmse)):
    plsr = PLSRegression(n_components=n_compo + 1, scale=False)
    train_scores_mse = cross_val_score(plsr, trainX_trans_scale, trainY.values, cv=cv,
                                       scoring='mean_squared_error')
    plsr_rmse[n_compo, 0] = np.mean(np.sqrt(-1.0 * train_scores_mse))
    plsr_rmse[n_compo, 1] = np.std(np.sqrt(-1.0 * train_scores_mse))

n_best, rmse_best = optimal_with_osr(plsr_rmse)
print "With the one-standard-error rule, the selected PLSR model has {1} components (CV RMSE {0})".format(rmse_best, n_best)

plt_pcr, = plt.plot(pcr_rmse[:, 0], '-x')
plt_plsr, = plt.plot(plsr_rmse[:, 0], '-o')
plt.xlim((-1, None))
plt.xlabel("# Components")
plt.ylabel("RMSE (Cross-Validation)")
plt.legend([plt_pcr, plt_plsr], ['PCR', 'PLSR'])

from scipy.stats import pearsonr

# compare the first two components
pca2 = PCA(n_components=2)
pca2.fit(trainX_trans_scale)

plsr2 = PLSRegression(n_components=2, scale=False)
plsr2.fit(trainX_trans_scale, trainY.values)

# visualization
fig, axarr = plt.subplots(2, 2, sharex=True, sharey=True)
# fig.tight_layout()

axarr[0, 0].scatter(pca2.transform(trainX_trans_scale)[:, 0], trainY.values)
axarr[0, 0].set_title("PCA Component 1")
corr = pearsonr(pca2.transform(trainX_trans_scale)[:, 0], trainY.values[:, 0])[0]
axarr[0, 0].text(10, -12, 'corr:' + str(round(corr, 2)))

axarr[0, 1].scatter(pca2.transform(trainX_trans_scale)[:, 1], trainY.values)
axarr[0, 1].set_title("PCA Component 2")
corr = pearsonr(pca2.transform(trainX_trans_scale)[:, 1], trainY.values[:, 0])[0]
axarr[0, 1].text(10, -12, 'corr:' + str(round(corr, 2)))

axarr[1, 0].scatter(plsr2.transform(trainX_trans_scale)[:, 0], trainY.values)
axarr[1, 0].set_title("PLS Component 1")
corr = pearsonr(plsr2.transform(trainX_trans_scale)[:, 0], trainY.values[:, 0])[0]
axarr[1, 0].text(10, -12, 'corr:' + str(round(corr, 2)))

axarr[1, 1].scatter(plsr2.transform(trainX_trans_scale)[:, 1], trainY.values)
axarr[1, 1].set_title("PLS Component 2")
corr = pearsonr(plsr2.transform(trainX_trans_scale)[:, 1], trainY.values[:, 0])[0]
axarr[1, 1].text(10, -12, 'corr:' + str(round(corr, 2)))

fig.text(0.5, 0.06, 'Component Score', ha='center', va='center')
fig.text(0.06, 0.5, 'Solubility', va='center', rotation='vertical')

from sklearn.preprocessing import scale

testX_trans_scale = scale(testX_trans)  # note: scaled with the test set's own mean and std

# PCR
pca = PCA(n_components=34)  # number of components selected above
pca.fit(trainX_trans_scale)
trainX_pca = pca.transform(trainX_trans_scale)
testX_pca = pca.transform(testX_trans_scale)

lm = LinearRegression()
lm.fit(trainX_pca, trainY.values)
testY_pca_pred = lm.predict(testX_pca)

# PLSR
plsr = PLSRegression(n_components=7, scale=False)  # number of components selected above
plsr.fit(trainX_trans_scale, trainY.values)
testY_plsr_pred = plsr.predict(testX_trans_scale)
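# For comparison with the ordinary least squares fit above, a quick look at the test-set
# performance of the PCR and PLS models (a sketch added here; it reuses r2_score and
# mean_squared_error imported earlier).
print "PCR  test RMSE: {0}, R2: {1}".format(
    np.sqrt(mean_squared_error(testY.values, testY_pca_pred)),
    r2_score(testY.values, testY_pca_pred))
print "PLSR test RMSE: {0}, R2: {1}".format(
    np.sqrt(mean_squared_error(testY.values, testY_plsr_pred)),
    r2_score(testY.values, testY_plsr_pred))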
# regression diagnostic
fig, axarr = plt.subplots(2, 2)

# PCR: observed versus predicted
axarr[0, 0].scatter(testY_pca_pred, testY)
axarr[0, 0].set(xlim=(-11, 2), ylim=(-11, 2))
axarr[0, 0].plot(axarr[0, 0].get_xlim(), axarr[0, 0].get_ylim(), ls="--", c=".3", linewidth=2)
axarr[0, 0].set_xlabel("Predicted")
axarr[0, 0].set_ylabel("Observed")

# PCR: residuals versus predicted
axarr[0, 1].scatter(testY_pca_pred, testY - testY_pca_pred)
axarr[0, 1].set(xlim=(-11, 2))
axarr[0, 1].plot(axarr[0, 1].get_xlim(), (0, 0), ls="--", c=".3", linewidth=2)
axarr[0, 1].set_xlabel("Predicted")
axarr[0, 1].set_ylabel("Residual")

# PLSR: observed versus predicted
axarr[1, 0].scatter(testY_plsr_pred, testY)
axarr[1, 0].set(xlim=(-11, 2), ylim=(-11, 2))
axarr[1, 0].plot(axarr[1, 0].get_xlim(), axarr[1, 0].get_ylim(), ls="--", c=".3", linewidth=2)
axarr[1, 0].set_xlabel("Predicted")
axarr[1, 0].set_ylabel("Observed")

# PLSR: residuals versus predicted
axarr[1, 1].scatter(testY_plsr_pred, testY - testY_plsr_pred)
axarr[1, 1].set(xlim=(-11, 2))
axarr[1, 1].plot(axarr[1, 1].get_xlim(), (0, 0), ls="--", c=".3", linewidth=2)
axarr[1, 1].set_xlabel("Predicted")
axarr[1, 1].set_ylabel("Residual")

# path of the ridge regression coefficients
from sklearn.linear_model import Ridge

lambda_range = np.linspace(0.0, 0.01, 600)
ridgeModel = Ridge(normalize=True)

coeff_path = []
for a in lambda_range:
    ridgeModel.set_params(alpha=a)
    ridgeModel.fit(trainX_trans, trainY)
    coeff_path.append(ridgeModel.coef_[0])

ax = plt.gca()
ax.plot(lambda_range, coeff_path)
ax.set_xlim((-0.0005, 0.0105))
plt.xlabel("Penalty")
plt.ylabel("Standardized Coefficient")

# penalty value versus RMSE
lambda_range = np.linspace(0.0, 0.1, 15)
score_rmse = np.zeros(15)
ridgeModel = Ridge(normalize=True)

for adx, a in enumerate(lambda_range):
    temp_scores = cross_val_score(ridgeModel.set_params(alpha=a), trainX_trans, trainY.values,
                                  cv=10, scoring="mean_squared_error")
    temp_scores = np.sqrt(-1.0 * temp_scores)
    score_rmse[adx] = np.mean(temp_scores)

plt.plot(lambda_range, score_rmse, '-o')
plt.xlabel("Penalty")
plt.ylabel("RMSE (Cross-Validation)")
plt.xlim((-0.005, 0.105))

# path of the lasso regression coefficients
from sklearn.linear_model import lars_path

# compute the regularization path using LARS
lambdas, _, coefs = lars_path(trainX_trans.values, trainY.values[:, 0], method='lasso', verbose=True)

xx = np.sum(np.abs(coefs.T), axis=1)
xx /= xx[-1]

plt.plot(xx, coefs.T)
plt.xlabel('Fraction of Full Solution')
plt.ylabel('Standardized Coefficient')
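# The ridge and lasso cells above only examine coefficient paths and CV error. As a
# minimal sketch of actually selecting a penalty and checking test-set error, sklearn's
# built-in CV estimators can be used; the alpha grid below is an assumption, not a tuned
# value from this notebook.
from sklearn.linear_model import RidgeCV, LassoCV

ridge_cv = RidgeCV(alphas=np.linspace(0.001, 0.1, 50), normalize=True)
ridge_cv.fit(trainX_trans, trainY.values[:, 0])
print "Ridge: alpha = {0}, test RMSE = {1}".format(
    ridge_cv.alpha_,
    np.sqrt(mean_squared_error(testY.values[:, 0], ridge_cv.predict(testX_trans))))

lasso_cv = LassoCV(cv=10)
lasso_cv.fit(trainX_trans.values, trainY.values[:, 0])
print "Lasso: alpha = {0}, test RMSE = {1}".format(
    lasso_cv.alpha_,
    np.sqrt(mean_squared_error(testY.values[:, 0], lasso_cv.predict(testX_trans.values))))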