#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')


# In[2]:

from __future__ import division


# In[3]:

from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd
import seaborn as sns
import scipy as sp
from scipy import linalg  # ensure sp.linalg is actually importable
from statsmodels import api as sm


# In[4]:

from rpy2.robjects import pandas2ri, r

pandas2ri.activate()


# We are trying to follow [Wood, _Generalized Additive Models_](http://www.amazon.com/gp/product/1584884746/ref=pd_lpo_sbs_dp_ss_1?pf_rd_p=1944687702&pf_rd_s=lpo-top-stripe-1&pf_rd_t=201&pf_rd_i=0412343908&pf_rd_m=ATVPDKIKX0DER&pf_rd_r=005A9YGQJ97J6464F10E), Chapter 3, in Python, to build our own additive model.

# # Engine Wear

# In[5]:

import warnings

warnings.filterwarnings('ignore')

get_ipython().system('mkdir -p /tmp/Rpackages')
r('install.packages("gamair", lib="/tmp/Rpackages/", repos="http://cran.us.r-project.org")');

warnings.filterwarnings('default')


# In[6]:

r('library(gamair, lib="/tmp/Rpackages/")');
r.data('engine');
engine_df = pandas2ri.ri2py(r['engine'])


# In[7]:

def scale(x):
    """Rescale x linearly onto [0, 1]."""
    return (x - x.min()) / np.ptp(x)


# In[8]:

engine_df['scaled_size'] = scale(engine_df['size'])


# In[9]:

engine_df.head()


# In[10]:

fig, ax = plt.subplots(figsize=(8, 6))

blue = sns.color_palette()[0]

ax.scatter(engine_df['scaled_size'], engine_df.wear, color=blue, alpha=0.5);

ax.set_xlim(-0.05, 1.05);
ax.set_ylim(1.5, 5);


# In[11]:

n = engine_df.shape[0]


# In[12]:

def R(x, z):
    # Cubic spline kernel R(x, z) on [0, 1] from Wood, Chapter 3.
    return (((z - 0.5)**2 - 1 / 12) * ((x - 0.5)**2 - 1 / 12) / 4
            - ((np.abs(x - z) - 0.5)**4 - 0.5 * (np.abs(x - z) - 0.5)**2 + 7 / 240) / 24)

R = np.frompyfunc(R, 2, 1)


# In[13]:

def univariate_design_matrix(x, knots, intercept=True):
    # Columns: optional intercept, the covariate itself, and one R(x, knot)
    # column per knot.
    n = x.shape[0]

    if intercept:
        q = knots.size + 2
    else:
        q = knots.size + 1

    X = np.empty((n, q))

    if intercept:
        X[:, 0] = 1
        X[:, 1] = x
        X[:, 2:] = R.outer(x, knots)
    else:
        X[:, 0] = x
        X[:, 1:] = R.outer(x, knots)

    return X


# In[14]:

def design_matrix(x, knots):
    # Stack one spline basis per covariate; only the first block carries an
    # intercept, to avoid duplicating the constant column.
    if x.ndim == 1:
        x = np.row_stack(x)
        knots = np.row_stack(knots)

    return np.hstack([univariate_design_matrix(x[:, i], knots[:, i], intercept=(i == 0))
                      for i in range(x.shape[1])])


# In[15]:

def univariate_penalty_matrix(knots, intercept=True):
    # Wiggliness penalty for one smooth; the unpenalized columns stay zero.
    if intercept:
        q = knots.size + 2
    else:
        q = knots.size + 1

    S = np.zeros((q, q))

    if intercept:
        S[2:, 2:] = R.outer(knots, knots)
    else:
        S[1:, 1:] = R.outer(knots, knots)

    return S


# In[16]:

def penalty_matrix(knots, lambda_=None):
    # Block-diagonal penalty, one smoothing parameter per smooth.
    if knots.ndim == 1:
        knots = np.row_stack(knots)

    if lambda_ is None:
        lambda_ = np.ones(knots.shape[1])

    return sp.linalg.block_diag(*[lambda_[i] * univariate_penalty_matrix(knots[:, i], intercept=(i == 0))
                                  for i in range(knots.shape[1])])


# In[17]:

def matrix_sqrt(m):
    e, V = np.linalg.eig(m)
    e = np.maximum(e, 0)

    return np.dot(V, np.dot(np.diag(np.sqrt(e)), V.T))


# In[18]:

def fit(y, x, knots, lambda_=None):
    # Penalized least squares via data augmentation: append B = S^{1/2} to the
    # design matrix and zeros to the response, then run ordinary least squares.
    if lambda_ is None:
        lambda_ = np.ones(knots.shape[1])
    else:
        lambda_ = np.atleast_1d(lambda_)

    X = design_matrix(x, knots)
    S = penalty_matrix(knots, lambda_=lambda_)

    y_ = np.hstack((y, np.zeros(S.shape[0]))).T
    B = matrix_sqrt(S)
    X_ = np.vstack((X, B))

    return sm.OLS(y_, X_).fit()


# In[19]:

def predict(x, knots, model, link=None):
    X = design_matrix(x, knots)

    if link is None:
        return model.predict(X)
    else:
        return link(model.predict(X))


# In[20]:

lambdas = [10**-2, 10**-4, 10**-6]


# In[21]:

q = 10
qs = np.linspace(0, 1, q - 2)


# In[22]:

engine_y = engine_df.wear.values
engine_x = engine_df.scaled_size.values
engine_knots = engine_df.scaled_size.quantile(qs).values
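# The penalized fit above is ordinary least squares on augmented data:
# minimizing ||y - X b||^2 + b' S b is the same as regressing (y, 0) on the
# stacked matrix (X; B) with B = S^{1/2}.  As a quick, purely illustrative
# sanity check (the helper name `penalized_beta_direct` is ours, not from
# Wood), the closed-form solution (X'X + S)^{-1} X'y should agree with the
# coefficients returned by `fit`:

def penalized_beta_direct(y, x, knots, lambda_):
    X = design_matrix(x, knots)
    S = penalty_matrix(knots, lambda_=np.atleast_1d(lambda_))

    return np.linalg.solve(X.T.dot(X) + S, X.T.dot(y))

np.allclose(penalized_beta_direct(engine_y, engine_x, engine_knots, 1e-4),
            fit(engine_y, engine_x, engine_knots, lambda_=1e-4).params)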
# In[23]:

fig, axes = plt.subplots(ncols=3, sharex=True, sharey=True, figsize=(24, 6))

plot_sizes = np.linspace(0, 1, 1000)

for lambda_, ax in zip(lambdas, axes):
    model = fit(engine_y, engine_x, engine_knots, lambda_=lambda_)

    ax.scatter(engine_x, engine_y, color=blue, alpha=0.5);
    ax.plot(plot_sizes, predict(plot_sizes, engine_knots, model));

    ax.set_xlim(-0.05, 1.05);
    ax.set_xlabel('Scaled capacity');

    ax.set_ylim(1.5, 5);
    ax.set_ylabel('Wear');

    ax.set_title(r'$\lambda = {}$'.format(lambda_));


# In[24]:

def gcv_score(y, x, knots, lambda_=None):
    # Generalized cross-validation score V = n ||y - y_hat||^2 / (n - tr(A))^2,
    # where A is the influence (hat) matrix of the penalized fit.
    if lambda_ is None:
        lambda_ = np.ones(knots.shape[1])
    else:
        lambda_ = np.atleast_1d(lambda_)

    model = fit(y, x, knots, lambda_=lambda_)
    y_hat = predict(x, knots, model)

    n = x.shape[0]
    hat_matrix_trace = model.get_influence().hat_matrix_diag[:n].sum()

    return n * np.power(y - y_hat, 2).sum() / np.power(n - hat_matrix_trace, 2)


# In[25]:

gcv_lambdas = np.power(1.5, np.linspace(0, 60, 100)) * 10**-8


# In[26]:

gcv_scores = np.array([gcv_score(engine_y, engine_x, engine_knots, lambda_=lambda_)
                       for lambda_ in gcv_lambdas])


# In[27]:

lambda_best = gcv_lambdas[gcv_scores.argmin()]


# In[28]:

lambda_best


# In[29]:

fig, (gcv_ax, plot_ax) = plt.subplots(ncols=2, figsize=(16, 6))

gcv_ax.plot(gcv_lambdas, gcv_scores);
gcv_ax.set_xscale('log');

gcv_ax.set_xlabel(r'$\lambda$');
gcv_ax.set_ylabel('$V$');

gcv_ax.set_title('GCV score');

model = fit(engine_y, engine_x, engine_knots, lambda_=lambda_best)

plot_ax.scatter(engine_x, engine_y, color=blue, alpha=0.5);
plot_ax.plot(plot_sizes, predict(plot_sizes, engine_knots, model));

plot_ax.set_xlim(-0.05, 1.05);
plot_ax.set_xlabel('Scaled capacity');

plot_ax.set_ylim(1.5, 5);
plot_ax.set_ylabel('Wear');

plot_ax.set_title('GCV optimal fit');


# # Volume

# ## Additive model

# In[30]:

tree_df = pd.read_csv('https://vincentarelbundock.github.io/Rdatasets/csv/datasets/trees.csv',
                      usecols=['Girth', 'Height', 'Volume'])


# In[31]:

tree_df['scaled_girth'] = scale(tree_df.Girth)
tree_df['scaled_height'] = scale(tree_df.Height)


# In[32]:

tree_df.head()


# In[33]:

tree_x = tree_df[['scaled_girth', 'scaled_height']].values
tree_knots = tree_df[['scaled_girth', 'scaled_height']].quantile(qs, axis=0).values


# In[34]:

girth_lambdas = np.power(2., np.arange(30)) * 1e-5
height_lambdas = np.power(2., np.arange(30)) * 1e-5


# In[35]:

tree_gcv_scores = np.zeros((30, 30))


# In[36]:

tree_y = tree_df.Volume.values


# In[37]:

for i in range(30):
    for j in range(30):
        lambda_ = np.array([girth_lambdas[i], height_lambdas[j]])
        tree_gcv_scores[i, j] = gcv_score(tree_y, tree_x, tree_knots, lambda_=lambda_)


# In[38]:

tree_gcv_row_min_col_ix = tree_gcv_scores.argmin(axis=1)
girth_lambda_ix = tree_gcv_scores[np.arange(30), tree_gcv_row_min_col_ix].argmin()
height_lambda_ix = tree_gcv_row_min_col_ix[girth_lambda_ix]

tree_lambda = np.array([girth_lambdas[girth_lambda_ix], height_lambdas[height_lambda_ix]])


# In[39]:

tree_lambda


# In[40]:

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')

xv, yv = np.meshgrid(np.arange(30), np.arange(30))

ax.plot_wireframe(xv, yv, tree_gcv_scores, color='k', lw=0.5);

ax.set_xticklabels([r'$2^{' + str(int(tick)) + r'}$' for tick in ax.get_xticks()]);
ax.set_xlabel(r'$\lambda_2 \times 10^{-5}$');

ax.set_yticklabels([r'$2^{' + str(int(tick)) + r'}$' for tick in ax.get_yticks()]);
ax.set_ylabel(r'$\lambda_1 \times 10^{-5}$');

ax.set_zlabel('GCV score');


# In[41]:

model = fit(tree_y, tree_x, tree_knots, lambda_=tree_lambda)
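# For the two-covariate model the design matrix stacks an intercept plus the
# girth basis (q = 10 columns) with the height basis (q - 1 = 9 columns,
# no second intercept).  A small shape check, purely for orientation; the
# trees data has 31 rows, so we expect (31, 19):

design_matrix(tree_x, tree_knots).shape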
# In[42]:

fig, ax = plt.subplots(figsize=(8, 6))

ax.scatter(tree_y, predict(tree_x, tree_knots, model), color=blue, alpha=0.5);

v_min, v_max = tree_df.Volume.min(), tree_df.Volume.max()
MARGIN = 5

ax.plot((v_min - MARGIN, v_max + MARGIN), (v_min - MARGIN, v_max + MARGIN),
        '--', c='k', label='Perfect agreement');

ax.set_xlim(v_min - MARGIN, v_max + MARGIN);
ax.set_xlabel('Actual volume');

ax.set_ylim(v_min - MARGIN, v_max + MARGIN);
ax.set_ylabel('Predicted volume');

ax.legend(loc=2);


# In[43]:

def predict_component_function(x, icol, knots, model):
    # Evaluate the icol-th component smooth by zeroing out every design-matrix
    # column that does not belong to that covariate's basis block.
    X = design_matrix(x, knots)

    q = knots.shape[0] + 2
    first_col = min(1, icol) * q + max(0, icol - 1) * (q - 1)
    last_col = first_col + q - min(1, icol)

    X_ = np.zeros_like(X)
    X_[:, first_col:last_col] = X[:, first_col:last_col]

    return model.predict(X_)


# In[44]:

fig, ax = plt.subplots(figsize=(8, 6))

tree_plot_x = np.column_stack((np.linspace(0, 1, 100), np.linspace(0, 1, 100)))

ax.plot(tree_plot_x[:, 0], predict_component_function(tree_plot_x, 0, tree_knots, model),
        '--', c='k', alpha=0.5, lw=0.5);
ax.scatter(tree_x[:, 0], predict_component_function(tree_x, 0, tree_knots, model),
           color=blue, alpha=0.5);

ax.set_xlim(-0.05, 1.05);
ax.set_xlabel('Scaled girth');

ax.set_ylim(bottom=0);
ax.set_ylabel('$f_1$');


# In[45]:

fig, ax = plt.subplots(figsize=(8, 6))

tree_plot_x = np.column_stack((np.linspace(0, 1, 100), np.linspace(0, 1, 100)))

ax.plot(tree_plot_x[:, 1], predict_component_function(tree_plot_x, 1, tree_knots, model),
        '--', c='k', alpha=0.5, lw=0.5);
ax.scatter(tree_x[:, 1], predict_component_function(tree_x, 1, tree_knots, model),
           color=blue, alpha=0.5);

ax.set_xlim(-0.05, 1.05);
ax.set_xlabel('Scaled height');

ax.set_ylim(bottom=-0.25);
ax.set_ylabel('$f_2$');


# ## Generalized additive model
#
# The model is
#
# $$\log(\mathbb{E}(\textrm{Volume})) = f_1(\textrm{Girth}) + f_2(\textrm{Height}),$$
#
# where Volume is gamma distributed and the smooths are fit by penalized iteratively reweighted least squares (IRLS).

# In[46]:

def penalized_irls_step(y, X, B, knots, model=None):
    # One step of penalized IRLS for a gamma model with log link.  The working
    # response is z = (y - mu) / mu + eta with eta = log(mu); for this
    # family/link combination the IRLS weights are constant, so an unweighted
    # penalized regression of z on X is the full step.
    n = y.size

    # model is None for the first step
    if model is None:
        eta = np.log(y)
    else:
        eta = model.predict(X)

    mu = np.exp(eta)

    X_ = np.vstack((X, B))

    z = (y - mu) / mu + eta
    z_ = np.hstack((z, np.zeros(B.shape[0]))).T

    new_model = sm.OLS(z_, X_).fit()

    rss = np.power(z - new_model.predict(X), 2).sum()
    gcv_score = rss / np.power(n - new_model.get_influence().hat_matrix_diag[:n].sum(), 2)

    return rss, gcv_score, new_model


# In[47]:

def fit_gam(y, x, knots, lambda_=None, rtol=1e-4):
    # Iterate penalized IRLS steps until the working-response RSS stabilizes.
    X = design_matrix(x, knots)
    S = penalty_matrix(knots, lambda_=lambda_)
    B = matrix_sqrt(S)

    model = None
    old_rss, rss = 0, 1

    while np.abs(rss - old_rss) > rtol * old_rss:
        old_rss = rss
        rss, gcv_score, model = penalized_irls_step(y, X, B, knots, model=model)

    return gcv_score, model


# In[48]:

tree_gam_gcv_scores = np.empty((30, 30))


# In[49]:

for i in range(30):
    for j in range(30):
        if j == 0 and i % 5 == 0:
            print('i = {}'.format(i))

        lambda_ = np.array([girth_lambdas[i], height_lambdas[j]])
        tree_gam_gcv_scores[i, j], _ = fit_gam(tree_y, tree_x, tree_knots, lambda_=lambda_)


# In[50]:

tree_gam_gcv_row_min_col_ix = tree_gam_gcv_scores.argmin(axis=1)
gam_girth_lambda_ix = tree_gam_gcv_scores[np.arange(30), tree_gam_gcv_row_min_col_ix].argmin()
gam_height_lambda_ix = tree_gam_gcv_row_min_col_ix[gam_girth_lambda_ix]

tree_gam_lambda = np.array([girth_lambdas[gam_girth_lambda_ix], height_lambdas[gam_height_lambda_ix]])


# In[51]:

tree_gam_lambda
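# Locating the grid minimum via per-row argmins works, but np.unravel_index
# gives the same answer more directly; shown here only as a cross-check of
# the indices picked above:

i_best, j_best = np.unravel_index(tree_gam_gcv_scores.argmin(), tree_gam_gcv_scores.shape)
np.array([girth_lambdas[i_best], height_lambdas[j_best]])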
# In[52]:

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')

xv, yv = np.meshgrid(np.arange(30), np.arange(30))

ax.plot_wireframe(xv, yv, tree_gam_gcv_scores, color='k', lw=0.5);

ax.set_xticklabels([r'$2^{' + str(int(tick)) + r'}$' for tick in ax.get_xticks()]);
ax.set_xlabel(r'$\lambda_2 \times 10^{-5}$');

ax.set_yticklabels([r'$2^{' + str(int(tick)) + r'}$' for tick in ax.get_yticks()]);
ax.set_ylabel(r'$\lambda_1 \times 10^{-5}$');

ax.set_zlabel('GCV score');


# In[53]:

_, model = fit_gam(tree_y, tree_x, tree_knots, lambda_=tree_gam_lambda)


# In[54]:

fig, ax = plt.subplots(figsize=(8, 6))

ax.scatter(tree_y, predict(tree_x, tree_knots, model, link=np.exp), color=blue, alpha=0.5);

v_min, v_max = tree_df.Volume.min(), tree_df.Volume.max()
MARGIN = 5

ax.plot((v_min - MARGIN, v_max + MARGIN), (v_min - MARGIN, v_max + MARGIN),
        '--', c='k', label='Perfect agreement');

ax.set_xlim(v_min - MARGIN, v_max + MARGIN);
ax.set_xlabel('Actual volume');

ax.set_ylim(v_min - MARGIN, v_max + MARGIN);
ax.set_ylabel('Predicted volume');

ax.legend(loc=2);


# In[55]:

fig, ax = plt.subplots(figsize=(8, 6))

tree_plot_x = np.column_stack((np.linspace(0, 1, 100), np.linspace(0, 1, 100)))

ax.plot(tree_plot_x[:, 0], predict_component_function(tree_plot_x, 0, tree_knots, model),
        '--', c='k', alpha=0.5, lw=0.5);
ax.scatter(tree_x[:, 0], predict_component_function(tree_x, 0, tree_knots, model),
           color=blue, alpha=0.5);

ax.set_xlim(-0.05, 1.05);
ax.set_xlabel('Scaled girth');

ax.set_ylim(2, 4.05);
ax.set_ylabel('$f_1$');


# In[56]:

fig, ax = plt.subplots(figsize=(8, 6))

tree_plot_x = np.column_stack((np.linspace(0, 1, 100), np.linspace(0, 1, 100)))

ax.plot(tree_plot_x[:, 1], predict_component_function(tree_plot_x, 1, tree_knots, model),
        '--', c='k', alpha=0.5, lw=0.5);
ax.scatter(tree_x[:, 1], predict_component_function(tree_x, 1, tree_knots, model),
           color=blue, alpha=0.5);

ax.set_xlim(-0.05, 1.05);
ax.set_xlabel('Scaled height');

ax.set_ylim(-0.01, 0.4);
ax.set_ylabel('$f_2$');
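# A standard summary of how much smoothing the GCV-optimal GAM does is its
# effective degrees of freedom: the trace of the influence matrix restricted
# to the real observations, the same quantity used inside gcv_score.  A quick
# look, for reference:

model.get_influence().hat_matrix_diag[:tree_y.size].sum()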