#!/usr/bin/env python
# coding: utf-8

# # Chapter 7 - Moving Beyond Linearity
# - [Lab: 7.8.1 Polynomial Regression and Step Functions](#7.8.1-Polynomial-Regression-and-Step-Functions)
# - [Lab: 7.8.2 Splines](#7.8.2-Splines)

# In[22]:

# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import PolynomialFeatures
import statsmodels.api as sm
import statsmodels.formula.api as smf
from patsy import dmatrix

get_ipython().run_line_magic('matplotlib', 'inline')

# The 'seaborn-white' style was renamed 'seaborn-v0_8-white' in matplotlib 3.6
# and the old name now raises OSError; fall back so the lab runs on both.
try:
    plt.style.use('seaborn-white')
except OSError:
    plt.style.use('seaborn-v0_8-white')

# ### Load dataset
# Using write.csv in R, I exported the dataset from package 'ISLR' to a csv file.

# In[2]:

df = pd.read_csv('Data/Wage.csv')
df.head(3)

# In[3]:

df.info()

# ## Lab
# ### 7.8.1 Polynomial Regression and Step Functions

# Create polynomials for 'age'. These correspond to those in R, when using
# raw=TRUE in the poly() function.

# In[8]:

# Raw (non-orthogonal) polynomial bases of degree 1..5. PolynomialFeatures
# prepends the intercept column, so no add_constant is needed below.
# Hoist the column reshape instead of recomputing it five times.
age = df.age.values.reshape(-1, 1)
X1 = PolynomialFeatures(1).fit_transform(age)
X2 = PolynomialFeatures(2).fit_transform(age)
X3 = PolynomialFeatures(3).fit_transform(age)
X4 = PolynomialFeatures(4).fit_transform(age)
X5 = PolynomialFeatures(5).fit_transform(age)

# Binary response used later for logistic regression: wage above $250k.
y = (df.wage > 250).map({False: 0, True: 1}).values

print('X4:\n', X4[:5])
print('y:\n', y[:5])

# #### Linear regression model. (Degree 4)

# In[9]:

fit2 = sm.GLS(df.wage, X4).fit()
fit2.summary().tables[1]

# Selecting a suitable degree for the polynomial of age.

# In[10]:

# Nested models of increasing degree, compared with a sequential (type 1) ANOVA.
fit_1 = sm.GLS(df.wage, X1).fit()
fit_2 = sm.GLS(df.wage, X2).fit()
fit_3 = sm.GLS(df.wage, X3).fit()
fit_4 = sm.GLS(df.wage, X4).fit()
fit_5 = sm.GLS(df.wage, X5).fit()

sm.stats.anova_lm(fit_1, fit_2, fit_3, fit_4, fit_5, typ=1)

# The polynomial degree 4 seems best.

# In[11]:

X = X4

# Scikit-learn implements a regularized logistic regression model particularly
# suitable for high dimensional data.
# Since we just have one feature (age) we use the GLM model from statsmodels.

# In[12]:

# Logistic regression of I(wage>250) on the degree-4 polynomial of age.
# Link objects must be passed as instances: the lowercase class alias
# sm.families.links.logit is deprecated/removed in current statsmodels.
clf = sm.GLM(y, X, family=sm.families.Binomial(sm.families.links.Logit()))
res = clf.fit()

# Create array of test data. Transform to polynomial degree 4 and run prediction.

# In[13]:

age_grid = np.arange(df.age.min(), df.age.max()).reshape(-1, 1)

# In[14]:

X_test = PolynomialFeatures(4).fit_transform(age_grid)
pred = res.predict(X_test)

# ### Figure 7.1

# In[15]:

# creating plots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
fig.suptitle('Degree-4 Polynomial', fontsize=14)

# Scatter plot with polynomial regression line.
# seaborn >= 0.12 requires x/y as keyword arguments (positional was removed).
ax1.scatter(df.age, df.wage, facecolor='None', edgecolor='k', alpha=0.3)
sns.regplot(x=df.age, y=df.wage, order=4, truncate=True, scatter=False, ax=ax1)
# set_ylim's 'ymin' kwarg was removed from matplotlib; 'bottom' is the
# supported name.
ax1.set_ylim(bottom=0)

# Logistic regression showing Pr(wage>250) for the age range.
ax2.plot(age_grid, pred, color='b')

# Rug plot showing the distribution of wage>250 in the training data.
# 'True' on the top, 'False' on the bottom.
ax2.scatter(df.age, y / 5, s=30, c='grey', marker='|', alpha=0.7)
ax2.set_ylim(-0.01, 0.21)
ax2.set_xlabel('age')
ax2.set_ylabel('Pr(wage>250|age)');

# #### Step function

# In[16]:

# Cut age into 4 equal-width bins; keep the bin edges for reuse on test data.
df_cut, bins = pd.cut(df.age, 4, retbins=True, right=True)
df_cut.value_counts(sort=False)

# In[17]:

df_steps = pd.concat([df.age, df_cut, df.wage],
                     keys=['age', 'age_cuts', 'wage'], axis=1)
df_steps.head(5)

# In[18]:

# Create dummy variables for the age groups
df_steps_dummies = pd.get_dummies(df_steps['age_cuts'])

# Statsmodels requires explicit adding of a constant (intercept)
df_steps_dummies = sm.add_constant(df_steps_dummies)

df_steps_dummies.head(5)

# In[23]:

# Using statsmodels because it has a more complete output for coefficients.
# Drop the first bin's dummy (columns[1]; columns[0] is the constant) so the
# remaining coefficients are offsets from that baseline bin.
fit3 = sm.GLM(df_steps.wage,
              df_steps_dummies.drop(df_steps_dummies.columns[1], axis=1)).fit()
fit3.summary().tables[1]

# In[24]:

# Put the test data in the same bins as the training data.
# Map each grid age onto the training bin index (1-based) using the edges
# returned by pd.cut above.
bin_mapping = np.digitize(age_grid.ravel(), bins)
bin_mapping

# In[25]:

# Get dummies, drop first dummy category, add constant — mirroring the
# design matrix used to fit the training model.
X_test2 = sm.add_constant(pd.get_dummies(bin_mapping).drop(1, axis=1))
X_test2.head()

# #### Linear Regression

# In[26]:

pred2 = fit3.predict(X_test2)

# #### Logistic Regression

# In[27]:

# Link objects must be instances in current statsmodels (Logit(), not the
# deprecated lowercase class alias).
clf2 = sm.GLM(y,
              df_steps_dummies.drop(df_steps_dummies.columns[1], axis=1),
              family=sm.families.Binomial(sm.families.links.Logit()))
res2 = clf2.fit()
pred3 = res2.predict(X_test2)

# ### Figure 7.2

# In[28]:

# creating plots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
fig.suptitle('Piecewise Constant', fontsize=14)

# Scatter plot with the piecewise-constant fit.
ax1.scatter(df.age, df.wage, facecolor='None', edgecolor='k', alpha=0.3)
ax1.plot(age_grid, pred2, c='b')
ax1.set_xlabel('age')
ax1.set_ylabel('wage')
# 'ymin' was removed from set_ylim; 'bottom' is the supported kwarg.
ax1.set_ylim(bottom=0)

# Logistic regression showing Pr(wage>250) for the age range.
# Reuse age_grid instead of rebuilding the identical np.arange array.
ax2.plot(age_grid, pred3, color='b')

# Rug plot showing the distribution of wage>250 in the training data.
# 'True' on the top, 'False' on the bottom.
ax2.scatter(df.age, y / 5, s=30, c='grey', marker='|', alpha=0.7)
ax2.set_ylim(-0.01, 0.21)
ax2.set_xlabel('age')
ax2.set_ylabel('Pr(wage>250|age)');

# ### 7.8.2 Splines

# Using patsy to create non-linear transformations of the input data.
# See http://patsy.readthedocs.org/en/latest/
# I have not found functions to create smoothing splines or GAMs or do local regression.

# #### Cubic splines

# In[29]:

# Specifying 3 knots: a cubic B-spline basis built by patsy, then an
# ordinary least-squares fit via GLM (Gaussian family is the default).
transformed_x = dmatrix("bs(df.age, knots=(25,40,60), degree=3, include_intercept=False)",
                        {"df.age": df.age}, return_type='dataframe')
fit4 = sm.GLM(df.wage, transformed_x).fit()
pred4 = fit4.predict(dmatrix("bs(age_grid, knots=(25,40,60), degree=3, include_intercept=False)",
                             {"age_grid": age_grid}, return_type='dataframe'))
fit4.params

# In[30]:

# Specifying 6 degrees of freedom (patsy chooses the knot locations).
transformed_x2 = dmatrix("bs(df.age, df=6, degree=3, include_intercept=False)",
                         {"df.age": df.age}, return_type='dataframe')
fit5 = sm.GLM(df.wage, transformed_x2).fit()
pred5 = fit5.predict(dmatrix("bs(age_grid, df=6, degree=3, include_intercept=False)",
                             {"age_grid": age_grid}, return_type='dataframe'))
fit5.params

# #### Natural splines

# In[31]:

# Specifying 4 degrees of freedom with a natural cubic regression spline.
transformed_x3 = dmatrix("cr(df.age, df=4)",
                         {"df.age": df.age}, return_type='dataframe')
fit6 = sm.GLM(df.wage, transformed_x3).fit()
pred6 = fit6.predict(dmatrix("cr(age_grid, df=4)",
                             {"age_grid": age_grid}, return_type='dataframe'))
fit6.params

# In[32]:

plt.scatter(df.age, df.wage, facecolor='None', edgecolor='k', alpha=0.3)
plt.plot(age_grid, pred4, color='b', label='Specifying three knots')
plt.plot(age_grid, pred5, color='r', label='Specifying df=6')
plt.plot(age_grid, pred6, color='g', label='Natural spline df=4')
# Mark the three knot locations. plt.vlines accepts a sequence of x
# positions, so one call replaces the list comprehension that was being
# used purely for its side effects.
plt.vlines([25, 40, 60], 0, 350, linestyles='dashed', lw=2, colors='b')
plt.legend(bbox_to_anchor=(1.5, 1.0))
plt.xlim(15, 85)
plt.ylim(0, 350)
plt.xlabel('age')
plt.ylabel('wage');