!ls -l ../datasets/FuelEconomy/

from __future__ import division
import numpy as np
import pandas as pd

# Load the 2010 and 2011 model-year fuel economy data
cars10 = pd.read_csv("../datasets/FuelEconomy/cars2010.csv")
cars11 = pd.read_csv("../datasets/FuelEconomy/cars2011.csv")

cars10.head(5)
cars10.count()

print cars10.shape
print cars11.shape

# Single predictor (engine displacement) and target (fuel efficiency)
cars10_feature = cars10.get(['EngDispl'])
cars10_target = cars10.get(['FE'])
cars11_feature = cars11.get(['EngDispl'])
cars11_target = cars11.get(['FE'])

cars10_feature.head(5)
cars10_target.head(5)

%matplotlib inline
import matplotlib.pyplot as plt

# Some nice default configuration for plots
plt.rcParams['figure.figsize'] = 10, 7.5
plt.rcParams['axes.grid'] = True
plt.gray()

# Scatter plots of fuel efficiency vs. engine displacement for both model years
fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
ax1.scatter(cars10_feature, cars10_target)
ax1.set_title('2010 Model Year')
ax2.scatter(cars11_feature, cars11_target)
ax2.set_title('2011 Model Year')
fig.text(0.5, 0.04, 'Engine Displacement', ha='center', va='center')
fig.text(0.06, 0.5, 'Fuel Efficiency (MPG)', ha='center', va='center', rotation='vertical')

# Define the evaluation metric: root mean squared error (RMSE)
from sklearn.metrics import mean_squared_error

def rmse(y_actual, y_predicted):
    '''calculate Root Mean Squared Error'''
    return np.sqrt(mean_squared_error(y_actual, y_predicted))

# Simple linear model
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(cars10_feature, cars10_target)
print "Least squares estimate: intercept = {0}, coefficient = {1}".format(reg.intercept_, reg.coef_[0])

# Fitted line and observed-vs-predicted plot for the linear model
X = np.linspace(np.min(cars10_feature)[0], np.max(cars10_feature)[0])[:, np.newaxis]
y = reg.predict(X)
cars10_target_pred = reg.predict(cars10_feature)
y_range = np.linspace(np.min(cars10_target)[0], np.max(cars10_target)[0])[:, np.newaxis]

fig, (ax1, ax2) = plt.subplots(1, 2)
ax1.scatter(cars10_feature, cars10_target)
ax1.plot(X, y, 'r')
ax1.set_title('2010 Model Year')
ax1.set_xlabel('Engine Displacement')
ax1.set_ylabel('Fuel Efficiency (MPG)')
ax2.scatter(cars10_target, cars10_target_pred)
ax2.plot(y_range, y_range, 'r--')
ax2.set_xlabel('Observed')
ax2.set_ylabel('Predicted')

# Estimate the root mean squared error (RMSE) by 10-fold cross-validation
from sklearn.cross_validation import cross_val_score

scores = np.sqrt(np.abs(cross_val_score(reg, cars10_feature, cars10_target,
                                        cv=10, scoring='mean_squared_error')))
print "RMSE: {0}".format(np.mean(scores))

# Quadratic model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

quad = make_pipeline(PolynomialFeatures(2), LinearRegression())
quad.fit(cars10_feature, cars10_target)
scores = np.sqrt(np.abs(cross_val_score(quad, cars10_feature, cars10_target,
                                        cv=10, scoring='mean_squared_error')))
print "RMSE: {0}".format(np.mean(scores))

# Fitted curve and observed-vs-predicted plot for the quadratic model
X = np.linspace(np.min(cars10_feature)[0], np.max(cars10_feature)[0])[:, np.newaxis]
y = quad.predict(X)
cars10_target_pred = quad.predict(cars10_feature)
y_range = np.linspace(np.min(cars10_target)[0], np.max(cars10_target)[0])[:, np.newaxis]

fig, (ax1, ax2) = plt.subplots(1, 2)
ax1.scatter(cars10_feature, cars10_target)
ax1.plot(X, y, 'r')
ax1.set_title('2010 Model Year')
ax1.set_xlabel('Engine Displacement')
ax1.set_ylabel('Fuel Efficiency (MPG)')
ax2.scatter(cars10_target, cars10_target_pred)
ax2.plot(y_range, y_range, 'r--')
ax2.set_xlabel('Observed')
ax2.set_ylabel('Predicted')

# MARS (multivariate adaptive regression splines)
from pyearth import Earth

mars = Earth()
mars.fit(cars10_feature, cars10_target)
scores = np.sqrt(np.abs(cross_val_score(mars, cars10_feature, cars10_target,
                                        cv=10, scoring='mean_squared_error')))
print "RMSE: {0}".format(np.mean(scores))
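# Optional sanity check (a sketch, not part of the original analysis): py-earth's
# Earth object exposes a summary() of the selected hinge basis functions, which
# shows where MARS placed its knots on engine displacement. This assumes the
# installed py-earth version provides Earth.summary(); skip this cell if it does not.
print mars.summary()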
# Fitted MARS curve and observed-vs-predicted plot
X = np.linspace(np.min(cars10_feature)[0], np.max(cars10_feature)[0])[:, np.newaxis]
y = mars.predict(X)
cars10_target_pred = mars.predict(cars10_feature)
y_range = np.linspace(np.min(cars10_target)[0], np.max(cars10_target)[0])[:, np.newaxis]

fig, (ax1, ax2) = plt.subplots(1, 2)
ax1.scatter(cars10_feature, cars10_target)
ax1.plot(X, y, 'r')
ax1.set_title('2010 Model Year')
ax1.set_xlabel('Engine Displacement')
ax1.set_ylabel('Fuel Efficiency (MPG)')
ax2.scatter(cars10_target, cars10_target_pred)
ax2.plot(y_range, y_range, 'r--')
ax2.set_xlabel('Observed')
ax2.set_ylabel('Predicted')

# Overlay the 2010-trained quadratic and MARS fits on the 2011 data
X = np.linspace(np.min(cars11_feature)[0], np.max(cars11_feature)[0])[:, np.newaxis]

fig, (ax1, ax2) = plt.subplots(1, 2)
ax1.scatter(cars11_feature, cars11_target)
ax1.plot(X, quad.predict(X), 'r')
ax1.set_xlabel('Engine Displacement')
ax1.set_ylabel('Fuel Efficiency (MPG)')
ax1.set_title('Quadratic model')
ax2.scatter(cars11_feature, cars11_target)
ax2.plot(X, mars.predict(X), 'r')
ax2.set_xlabel('Engine Displacement')
ax2.set_ylabel('Fuel Efficiency (MPG)')
ax2.set_title('MARS')

# Cross-validated RMSE of both models on the 2011 data
quad_scores = np.sqrt(np.abs(cross_val_score(quad, cars11_feature, cars11_target,
                                             cv=10, scoring='mean_squared_error')))
mars_scores = np.sqrt(np.abs(cross_val_score(mars, cars11_feature, cars11_target,
                                             cv=10, scoring='mean_squared_error')))
print "Quadratic model RMSE: {0} and MARS RMSE: {1}".format(np.mean(quad_scores), np.mean(mars_scores))
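# A possible follow-up (a sketch, not in the original notebook): the cell above
# re-runs cross-validation on the 2011 data, which refits each model on 2011 folds.
# An alternative check is to treat 2011 as a held-out test set and score the models
# exactly as they were trained on the 2010 data, reusing the rmse() helper defined
# earlier. The variable names below are new and only for illustration.
quad_test_rmse = rmse(cars11_target, quad.predict(cars11_feature))
mars_test_rmse = rmse(cars11_target, mars.predict(cars11_feature))
print "2011 test-set RMSE -- quadratic: {0}, MARS: {1}".format(quad_test_rmse, mars_test_rmse)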