#!/usr/bin/env python
# coding: utf-8

# # Chapter 5 - Resampling Methods

# - [Load dataset](#Load-dataset)
# - [Cross-Validation](#5.1-Cross-Validation)

# In[1]:

# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn.linear_model as skl_lm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures

get_ipython().run_line_magic('matplotlib', 'inline')
plt.style.use('seaborn-white')

# ### Load dataset
# Dataset available on http://www-bcf.usc.edu/~gareth/ISL/data.html

# In[2]:

df1 = pd.read_csv('Data/Auto.csv', na_values='?').dropna()
df1.info()

# ## 5.1 Cross-Validation

# ### Figure 5.2 - Validation Set Approach
# Using polynomial feature generation in scikit-learn:
# http://scikit-learn.org/dev/modules/preprocessing.html#generating-polynomial-features
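# As a quick illustration (this cell is not part of the original notebook), the
# transform below shows what PolynomialFeatures produces for a single column:
# degree d expands each value x into [1, x, x^2, ..., x^d], and these expanded
# columns are what the linear regressions in this section are fit on.

demo = PolynomialFeatures(3).fit_transform(np.array([[2.0], [3.0]]))
print(demo)
# [[ 1.  2.  4.  8.]
#  [ 1.  3.  9. 27.]]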
# In[3]:

t_prop = 0.5
p_order = np.arange(1, 11)
r_state = np.arange(0, 10)

X, Y = np.meshgrid(p_order, r_state, indexing='ij')
Z = np.zeros((p_order.size, r_state.size))

regr = skl_lm.LinearRegression()

# Loop over polynomial degrees 1-10 and 10 random train/test splits of the dataset
for (i, j), v in np.ndenumerate(Z):
    poly = PolynomialFeatures(int(X[i, j]))
    X_poly = poly.fit_transform(df1.horsepower.values.reshape(-1, 1))
    X_train, X_test, y_train, y_test = train_test_split(X_poly, df1.mpg.ravel(),
                                                        test_size=t_prop,
                                                        random_state=Y[i, j])
    regr.fit(X_train, y_train)
    pred = regr.predict(X_test)
    Z[i, j] = mean_squared_error(y_test, pred)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

# Left plot (first split)
ax1.plot(X.T[0], Z.T[0], '-o')
ax1.set_title('Random split of the data set')

# Right plot (all splits)
ax2.plot(X, Z)
ax2.set_title('10 random splits of the data set')

for ax in fig.axes:
    ax.set_ylabel('Mean Squared Error')
    ax.set_ylim(15, 30)
    ax.set_xlabel('Degree of Polynomial')
    ax.set_xlim(0.5, 10.5)
    ax.set_xticks(range(2, 11, 2));

# ### Figure 5.4

# In[4]:

p_order = np.arange(1, 11)
r_state = np.arange(0, 10)

# LeaveOneOut CV
regr = skl_lm.LinearRegression()
loo = LeaveOneOut()
loo.get_n_splits(df1)
scores = list()

for i in p_order:
    poly = PolynomialFeatures(i)
    X_poly = poly.fit_transform(df1.horsepower.values.reshape(-1, 1))
    score = cross_val_score(regr, X_poly, df1.mpg, cv=loo,
                            scoring='neg_mean_squared_error').mean()
    scores.append(score)

# In[5]:

# k-fold CV
folds = 10
elements = len(df1.index)

X, Y = np.meshgrid(p_order, r_state, indexing='ij')
Z = np.zeros((p_order.size, r_state.size))

regr = skl_lm.LinearRegression()

for (i, j), v in np.ndenumerate(Z):
    poly = PolynomialFeatures(int(X[i, j]))
    X_poly = poly.fit_transform(df1.horsepower.values.reshape(-1, 1))
    # shuffle=True is required for random_state to have an effect in KFold
    kf_10 = KFold(n_splits=folds, shuffle=True, random_state=Y[i, j])
    Z[i, j] = cross_val_score(regr, X_poly, df1.mpg, cv=kf_10,
                              scoring='neg_mean_squared_error').mean()

# In[6]:

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

# Note: cross_val_score() returns negative values for error-based scorers such as
# 'neg_mean_squared_error', so the sign is flipped before plotting.
# https://github.com/scikit-learn/scikit-learn/issues/2439

# Left plot
ax1.plot(p_order, np.array(scores)*-1, '-o')
ax1.set_title('LOOCV')

# Right plot
ax2.plot(X, Z*-1)
ax2.set_title('10-fold CV')

for ax in fig.axes:
    ax.set_ylabel('Mean Squared Error')
    ax.set_ylim(15, 30)
    ax.set_xlabel('Degree of Polynomial')
    ax.set_xlim(0.5, 10.5)
    ax.set_xticks(range(2, 11, 2));
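# As a sanity check (this cell is not part of the original notebook), LOOCV for a
# least-squares fit can also be computed without refitting n times, using the
# closed-form identity CV_n = (1/n) * sum_i ((y_i - yhat_i) / (1 - h_i))^2, where
# h_i is the i-th leverage value from the full fit. For the degree-1 fit of mpg on
# horsepower this should match -scores[0] from the LeaveOneOut loop above.

X_lin = PolynomialFeatures(1).fit_transform(df1.horsepower.values.reshape(-1, 1))
y = df1.mpg.values

beta = np.linalg.lstsq(X_lin, y, rcond=None)[0]          # OLS fit on the full data
resid = y - X_lin @ beta                                 # full-fit residuals
hat = X_lin @ np.linalg.inv(X_lin.T @ X_lin) @ X_lin.T   # hat (projection) matrix
leverage = np.diag(hat)

loocv_mse = np.mean((resid / (1 - leverage)) ** 2)
print(loocv_mse)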