This section illustrates bias, variance, and over-/underfitting on small regression problems.

```python
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
```

Start with a tiny dataset: two training points and two test inputs.

```python
X = np.c_[.5, 1].T
y = [.5, 1]
X_test = np.c_[0, 2].T

print(X)
print(y)
print(X_test)
```

Fit ordinary least squares and plot its prediction over the test inputs:

```python
from sklearn import linear_model

regr = linear_model.LinearRegression()
regr.fit(X, y)
plt.plot(X, y, 'o')
plt.plot(X_test, regr.predict(X_test))
```

With this little data, the fit is extremely sensitive to noise in the observations. Refitting on six noisy copies of `X` shows how much the prediction line swings:

```python
np.random.seed(0)
for _ in range(6):
    noise = np.random.normal(loc=0, scale=.1, size=X.shape)
    noisy_X = X + noise
    plt.plot(noisy_X, y, 'o')
    regr.fit(noisy_X, y)
    plt.plot(X_test, regr.predict(X_test))
```

Ridge regression penalizes large coefficients, which stabilizes the fit across the same noisy resamples:

```python
regr = linear_model.Ridge(alpha=.1)
np.random.seed(0)
for _ in range(6):
    noise = np.random.normal(loc=0, scale=.1, size=X.shape)
    noisy_X = X + noise
    plt.plot(noisy_X, y, 'o')
    regr.fit(noisy_X, y)
    plt.plot(X_test, regr.predict(X_test))
```

The `figures` module bundled with this tutorial illustrates the same bias–variance tradeoff for polynomial fits:

```python
from figures import plot_bias_variance
plot_bias_variance(8, random_seed=42)
```

To study the tradeoff quantitatively, define a noisy target function and an RMS-error helper:

```python
def test_func(x, err=0.5):
    return np.random.normal(10 - 1. / (x + 0.1), err)

def compute_error(x, y, p):
    yfit = np.polyval(p, x)
    return np.sqrt(np.mean((y - yfit) ** 2))
```

Sample 200 points and split them into training and validation sets. (`train_test_split` lives in `sklearn.model_selection`; the `sklearn.cross_validation` module it used to live in was deprecated in 0.18 and later removed.)

```python
from sklearn.model_selection import train_test_split

N = 200
test_size = 0.4
error = 1.0

# randomly sample the data
np.random.seed(1)
x = np.random.random(N)
y = test_func(x, error)

# split into training and validation sets
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=test_size)

# show the training (red) and validation (blue) sets
plt.scatter(xtrain, ytrain, color='red')
plt.scatter(xtest, ytest, color='blue')
```

Now fit polynomials of increasing degree and track the error on both sets:

```python
# suppress the RankWarnings np.polyfit raises at high degrees
import warnings
warnings.filterwarnings('ignore', message='Polyfit*')

degrees = np.arange(21)
train_err = np.zeros(len(degrees))
validation_err = np.zeros(len(degrees))

for i, d in enumerate(degrees):
    p = np.polyfit(xtrain, ytrain, d)
    train_err[i] = compute_error(xtrain, ytrain, p)
    validation_err[i] = compute_error(xtest, ytest, p)

fig, ax = plt.subplots()
ax.plot(degrees, validation_err, lw=2, label='cross-validation error')
ax.plot(degrees, train_err, lw=2, label='training error')
ax.plot([0, 20], [error, error], '--k', label='intrinsic error')
ax.legend(loc=0)
ax.set_xlabel('degree of fit')
ax.set_ylabel('rms error')
```

Training error keeps falling as the degree grows, but validation error bottoms out near the intrinsic noise level and then climbs again: past that point the polynomial is fitting noise rather than signal.

A learning curve shows the same two errors as a function of training-set size, at fixed model complexity:

```python
def plot_learning_curve(d, N=200):
    n_sizes = 50
    n_runs = 10
    sizes = np.linspace(2, N, n_sizes).astype(int)
    train_err = np.zeros((n_runs, n_sizes))
    validation_err = np.zeros((n_runs, n_sizes))

    for i in range(n_runs):
        for j, size in enumerate(sizes):
            xtrain, xtest, ytrain, ytest = train_test_split(
                x, y, test_size=test_size, random_state=i)

            # Train on only the first `size` points; a `size` beyond
            # len(xtrain) silently uses the full training set
            p = np.polyfit(xtrain[:size], ytrain[:size], d)

            # Validation error is on the *entire* validation set
            validation_err[i, j] = compute_error(xtest, ytest, p)

            # Training error is on only the points used for training
            train_err[i, j] = compute_error(xtrain[:size], ytrain[:size], p)

    fig, ax = plt.subplots()
    ax.plot(sizes, validation_err.mean(axis=0), lw=2,
            label='mean validation error')
    ax.plot(sizes, train_err.mean(axis=0), lw=2,
            label='mean training error')
    ax.plot([0, N], [error, error], '--k', label='intrinsic error')
    ax.set_xlabel('training set size')
    ax.set_ylabel('rms error')
    ax.legend(loc=0)
    ax.set_xlim(0, N - 1)
    ax.set_title('d = %i' % d)

plot_learning_curve(d=1)
```

With `d=1` the model underfits: training and validation error converge quickly, but to a value well above the intrinsic error.

```python
plot_learning_curve(d=20, N=100)
plt.ylim(0, 15)
```

With `d=20` the model overfits: training error stays far below validation error, and the gap closes only slowly as more data is added.
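For reference, the `Ridge` estimator used above fits the same linear model as `LinearRegression` but adds an $\ell_2$ penalty on the coefficient vector $w$:

$$\min_w \; \lVert X w - y \rVert_2^2 + \alpha \, \lVert w \rVert_2^2$$

Larger `alpha` shrinks the coefficients more aggressively, which is why the ridge fits varied less across the noisy resamples than the ordinary least-squares fits did.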
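As a cross-check (not part of the original tutorial code), modern scikit-learn can automate the degree sweep with `sklearn.model_selection.validation_curve`. The sketch below assumes the `x` and `y` arrays generated earlier; `poly_model` is a name introduced here for a pipeline that expands the input into polynomial features before fitting least squares.

```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import validation_curve

# Hypothetical pipeline: polynomial feature expansion + least squares.
poly_model = make_pipeline(PolynomialFeatures(), LinearRegression())

degrees = np.arange(1, 21)
# validation_curve cross-validates once per degree; with
# 'neg_mean_squared_error' scoring, flip the sign and take the
# square root to recover RMS error.
train_scores, valid_scores = validation_curve(
    poly_model, x[:, np.newaxis], y,
    param_name='polynomialfeatures__degree', param_range=degrees,
    cv=5, scoring='neg_mean_squared_error')

plt.plot(degrees, np.sqrt(-valid_scores.mean(axis=1)), lw=2,
         label='cross-validation error')
plt.plot(degrees, np.sqrt(-train_scores.mean(axis=1)), lw=2,
         label='training error')
plt.xlabel('degree of fit')
plt.ylabel('rms error')
plt.legend(loc=0)
```

The curves should show the same qualitative U-shape as the manual sweep, though the numbers differ because `validation_curve` averages over five cross-validation folds rather than using one fixed split.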
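Similarly, `sklearn.model_selection.learning_curve` can stand in for the hand-rolled `plot_learning_curve`: it refits the estimator on increasingly large subsets of each training fold and scores every fit on the held-out fold. A minimal sketch, again assuming the `x`, `y` data from above:

```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import learning_curve

# A degree-1 model, for comparison with plot_learning_curve(d=1).
model = make_pipeline(PolynomialFeatures(degree=1), LinearRegression())

# train_sizes are fractions of each training fold; learning_curve
# returns the absolute sizes it actually used.
sizes, train_scores, valid_scores = learning_curve(
    model, x[:, np.newaxis], y,
    train_sizes=np.linspace(0.1, 1.0, 20),
    cv=5, scoring='neg_mean_squared_error')

plt.plot(sizes, np.sqrt(-valid_scores.mean(axis=1)), lw=2,
         label='mean validation error')
plt.plot(sizes, np.sqrt(-train_scores.mean(axis=1)), lw=2,
         label='mean training error')
plt.xlabel('training set size')
plt.ylabel('rms error')
plt.legend(loc=0)
```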