Homepage: luispedro.org
Email: luis@luispedro.org
@luispedrocoelho
github/luispedro
import numpy as np
from matplotlib import pyplot as plt
from IPython.display import display
%matplotlib inline

# Load the Boston house-price data and hold out 20% for testing
from sklearn.datasets import load_boston
boston = load_boston()

from sklearn.model_selection import train_test_split
train_data, test_data, train_target, test_target = \
        train_test_split(boston.data, boston.target, train_size=.8)
print(train_data.shape)

# Distribution of the target (house prices)
_ = plt.hist(train_target, bins=100)

# House price as a function of the number of rooms
RoomNr_Index = 5
fig, ax = plt.subplots()
ax.scatter(train_data[:, RoomNr_Index], train_target)
ax.set_xlabel("Number of rooms")
ax.set_ylabel("House price")
pass

# Interactively browse each feature against the target
from ipywidgets import interact
from scipy import stats

@interact(index=(0, train_data.shape[1] - 1))
def plot_scatter(index):
    fig, ax = plt.subplots()
    x, y = train_data[:, index], train_target
    ax.scatter(x, y)
    ax.set_xlabel(boston.feature_names[index])
    ax.set_ylabel("House price")
    print("Correlation: {0[0]:.1} (p-value: {0[1]:.1})".format(stats.pearsonr(x, y)))
    return fig

# Squared error grows quadratically with the error E
E = np.linspace(0, 2, 100)
squared_error_figure, ax = plt.subplots(figsize=(6, 6))
ax.plot(E, E**2)
ax.plot([0, 1], [1, 1], 'k-')
ax.plot([1, 1], [0, 1], 'k-')
ax.set_xlim(0, 4.)
ax.set_xlabel('$E$')
ax.set_ylabel('$E^2$')
pass

display(squared_error_figure)

# Ordinary least squares on a single feature (number of rooms)
from sklearn import linear_model
linreg = linear_model.LinearRegression()
_ = linreg.fit(train_data[:, RoomNr_Index:RoomNr_Index + 1], train_target)

fig, ax = plt.subplots()
ax.scatter(train_data[:, RoomNr_Index], train_target)
ax.plot([4, 9], linreg.predict([[4], [9]]), 'k-')
ax.set_xlabel("Number of rooms")
_ = ax.set_ylabel("House price")

# Refit on all features and predict on the held-out test set
linreg.fit(train_data, train_target)
prediction = linreg.predict(test_data)

linreg.fit(train_data, train_target)
fig, ax = plt.subplots()
ax.scatter(test_target, linreg.predict(test_data))
ax.plot([0, 50], [0, 50], 'k-')
ax.set_xlabel('Target (test)')
ax.set_ylabel('Predicted (test)')
pass

# Error metrics: mean squared error, its square root, and the
# coefficient of determination (R^2)
from sklearn import metrics
mse = metrics.mean_squared_error(test_target, linreg.predict(test_data))
print("MSE is {}".format(mse))
rmse = np.sqrt(mse)
print("RMSE is {}".format(rmse))
cod = metrics.r2_score(test_target, linreg.predict(test_data))
print("COD is {}".format(cod))

# .score() returns the same R^2
print(linreg.score(test_data, test_target))

# Compare R^2 on training vs. testing data
linreg.fit(train_data, train_target)
r2_train = metrics.r2_score(train_target, linreg.predict(train_data))
r2_test = metrics.r2_score(test_target, linreg.predict(test_data))
print("R2 on training: {:.1%}".format(r2_train))
print("R2 on testing: {:.1%}".format(r2_test))

# Lasso: L1-penalized linear regression
lasso = linear_model.Lasso()
lasso.fit(train_data, train_target)
pass

lasso = linear_model.Lasso(alpha=0.1)
lasso.fit(train_data, train_target)
pass

# The features live on very different scales...
fig, ax = plt.subplots()
ax.plot(train_data.ptp(axis=0))
ax.set_xticks(np.arange(train_data.shape[1]))
ax.set_xticklabels(boston.feature_names)
ax.set_ylabel('Point to point (max-min)')
pass

# ...so normalize before penalizing
lasso = linear_model.Lasso(normalize=True, alpha=.1)
lasso.fit(train_data, train_target)
pass

# OLS vs. Lasso on training and testing data
linreg.fit(train_data, train_target)
r2_ols_train = linreg.score(train_data, train_target)
r2_ols = linreg.score(test_data, test_target)
lasso.fit(train_data, train_target)
r2_lasso_train = lasso.score(train_data, train_target)
r2_lasso = lasso.score(test_data, test_target)

results = """\
      | TRAINING | TESTING
------+----------+---------
OLS   |  {:.2%}  |  {:.2%}
------+----------+---------
Lasso |  {:.2%}  |  {:.2%}
---------------------------
""".format(r2_ols_train, r2_ols, r2_lasso_train, r2_lasso)
print(results)
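(Side check, not in the original notebook: the R² values being compared above, whether obtained from metrics.r2_score or from an estimator's .score method, are the coefficient of determination, 1 - SS_res / SS_tot. The short sketch below recomputes it by hand with numpy using the fitted linreg and the test split from above; the names predicted, ss_res, ss_tot and r2_by_hand are introduced only for this illustration.)

# Hedged sketch: recompute R^2 by hand and compare with metrics.r2_score
predicted = linreg.predict(test_data)
ss_res = np.sum((test_target - predicted) ** 2)           # residual sum of squares
ss_tot = np.sum((test_target - test_target.mean()) ** 2)  # total sum of squares
r2_by_hand = 1 - ss_res / ss_tot
print("R2 by hand: {:.4}".format(r2_by_hand))
print("r2_score:   {:.4}".format(metrics.r2_score(test_target, predicted)))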
results = """\ | TRAINING | TESTING ------+----------+--------- OLS | {:.2%} | {:.2%} ------+----------+--------- Lasso | {:.2%} | {:.2%} ------+----------+--------- Ridge | {:.2%} | {:.2%} --------------------------- """.format(r2_ols_train, r2_ols, r2_lasso_train, r2_lasso, r2_ridge_train, r2_ridge) print(results) print(lasso.coef_) lasso.alpha = .2 lasso.fit(train_data, train_target) print(lasso.coef_) lasso.alpha = 1. lasso.fit(train_data, train_target) print(lasso.coef_) alphas = np.linspace(.01, 1000., 1000) alphas, coefs, _= lasso.path(train_data, train_target, alphas=alphas) fig,ax = plt.subplots() ax.plot(alphas, coefs.T) ax.set_xscale('log') ax.set_xlim(alphas.max(), alphas.min()) ax.set_xlabel(r'$\alpha$') ax.set_ylabel('Coefficient value') pass fig,ax = plt.subplots() ax.plot(alphas, np.sum(coefs != 0.0, axis=0)) ax.set_xscale('log') ax.set_xlim(alphas.max(), alphas.min()) ax.set_ylim(-.1, 13.1) ax.set_xlabel(r'$\alpha$') ax.set_ylabel('Nr of non-zero coefficients') pass ridge.fit(train_data, train_target) print(ridge.coef_) lasso.alpha = 0.1 lasso.fit(train_data, train_target) ridge.alpha = 0.1 ridge.fit(train_data, train_target) fig,ax = plt.subplots() ax.plot(linreg.coef_, label='OLS') ax.plot(lasso.coef_, label=r'Lasso ($\alpha$=0.1)') ax.plot(ridge.coef_, label=r'Ridge ($\alpha$=0.1)') ax.set_xticklabels(boston.feature_names) ax.set_xticks(np.arange(train_data.shape[1])) ax.legend(loc='best') pass en = linear_model.ElasticNet(normalize=True) en = linear_model.ElasticNet(normalize=True, alpha=0.1, l1_ratio=.5) lasso = linear_model.ElasticNet(normalize=True, alpha=0.1, l1_ratio=1.) ridge = linear_model.ElasticNet(normalize=True, alpha=0.1, l1_ratio=0.) half_way =linear_model.ElasticNet(normalize=True, alpha=0.1, l1_ratio=.5) almost_lasso = linear_model.ElasticNet(normalize=True, alpha=0.1, l1_ratio=.95) almost_lasso.fit(train_data, train_target) lasso.fit(train_data, train_target) print(almost_lasso.coef_) print(lasso.coef_) source_location = 'http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.train.bz2' from os import path if not path.exists('E2006.train.bz2'): import urllib urllib.urlretrieve(source_location, 'E2600.train.bz2') from sklearn.datasets import load_svmlight_file E2006_data, E2006_target = load_svmlight_file('E2006.train.bz2') print(E2006_data.shape) print(type(E2006_data)) E2_train_data, E2_test_data, E2_train_target, E2_test_target = \ train_test_split(E2006_data, E2006_target, train_size=.8) linreg.fit(E2_train_data, E2_train_target) r2_ols_train = linreg.score(E2_train_data, E2_train_target) r2_ols = linreg.score(E2_test_data, E2_test_target) print("R2 Training (OLS): {:.1%}".format(r2_ols_train)) print("R2 Testing (OLS): {:.1%}".format(r2_ols)) lasso = linear_model.Lasso(normalize=True, alpha=.01) lasso.fit(E2_train_data, E2_train_target) r2_lasso_train = lasso.score(E2_train_data, E2_train_target) r2_lasso = lasso.score(E2_test_data, E2_test_target) ridge = linear_model.Ridge(normalize=True, alpha=.01) ridge.fit(E2_train_data, E2_train_target) r2_ridge_train = ridge.score(E2_train_data, E2_train_target) r2_ridge = ridge.score(E2_test_data, E2_test_target) results_p_gt_n = """\ | TRAINING | TESTING ------+----------+--------- OLS | {:.2%} | {:.2%} ------+----------+--------- Lasso | {:.2%} | {:.2%} ------+----------+--------- Ridge | {:.2%} | {:.2%} --------------------------- """.format(r2_ols_train, r2_ols, r2_lasso_train, r2_lasso, r2_ridge_train, r2_ridge) print(results_p_gt_n) enCV = linear_model.ElasticNetCV(normalize=True, 
l1_ratio=[.1, .5, .7, .9, .95, .99, 1], ) enCV = linear_model.ElasticNetCV(normalize=True, l1_ratio=[.1, .5, .7, .9, .95, .99, 1], n_jobs=-1, ) from sklearn import linear_model enCV = linear_model.ElasticNetCV(normalize=True, l1_ratio=[.1, .5, .7, .9, .95, .99, 1], n_jobs=-1, ) enCV = linear_model.ElasticNetCV(normalize=True, l1_ratio=[.1, .5, .9, .99], alphas=[.001, .01, .05, .1, .25, .5, 1.], n_jobs=-1, ) enCV.fit(E2_train_data, E2_train_target) r2_enCV_train = enCV.score(E2_train_data, E2_train_target) r2_enCV = enCV.score(E2_test_data, E2_test_target) result_en_cv = """\ | TRAINING | TESTING ------+----------+--------- OLS | {:.2%} | {:.2%} ------+----------+--------- Lasso | {:.2%} | {:.2%} ------+----------+--------- Ridge | {:.2%} | {:.2%} ------+----------+--------- EN-CV | {:.2%} | {:.2%} --------------------------- """.format(r2_ols_train, r2_ols, r2_lasso_train, r2_lasso, r2_ridge_train, r2_ridge, r2_enCV_train, r2_enCV) print(result_en_cv) print(enCV)
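(A possible follow-up, not in the original notebook: after fitting, ElasticNetCV stores the hyperparameters selected by its internal cross-validation as the fitted attributes alpha_ and l1_ratio_, which can be printed directly; the sketch below uses the enCV object fitted above.)

# Hedged sketch: inspect the hyperparameters chosen by the cross-validation
print("Selected alpha:    {}".format(enCV.alpha_))
print("Selected l1_ratio: {}".format(enCV.l1_ratio_))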