%matplotlib inline
from __future__ import division
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd
import seaborn as sns
import scipy as sp
from statsmodels import api as sm
from rpy2.robjects import pandas2ri, r
pandas2ri.activate()
We are trying to follow Wood, Generalized Additive Models, Chapter 3 in Python to build our own additive model.
import warnings
warnings.filterwarnings('ignore')
!mkdir -p /tmp/Rpackages
r('install.packages("gamair", lib="/tmp/Rpackages/", repos="http://cran.us.r-project.org")');
warnings.filterwarnings('default')
The downloaded source packages are in ‘/tmp/RtmpO2Xdeg/downloaded_packages’
r('library(gamair, lib="/tmp/Rpackages/")');
r.data('engine');
engine_df = pandas2ri.ri2py(r['engine'])
def scale(x):
return (x - x.min()) / np.ptp(x)
engine_df['scaled_size'] = scale(engine_df['size'])
engine_df.head()
size | wear | scaled_size | |
---|---|---|---|
0 | 2.78 | 3.0 | 0.871795 |
1 | 2.43 | 3.1 | 0.647436 |
2 | 2.32 | 4.8 | 0.576923 |
3 | 2.43 | 3.3 | 0.647436 |
4 | 2.98 | 2.8 | 1.000000 |
fig, ax = plt.subplots(figsize=(8, 6))
blue = sns.color_palette()[0]
ax.scatter(engine_df['scaled_size'], engine_df.wear, color=blue, alpha=0.5);
ax.set_xlim(-0.05, 1.05);
ax.set_ylim(1.5, 5);
n = engine_df.shape[0]
def R(x, z):
return ((z - 0.5)**2 - 1 / 12) * ((x - 0.5)**2 - 1 / 12) / 4 - ((np.abs(x - z) - 0.5)**4 - 0.5 * (np.abs(x - z) - 0.5)**2 + 7 / 240) / 24
R = np.frompyfunc(R, 2, 1)
def univariate_design_matrix(x, knots, intercept=True):
n = x.shape[0]
if intercept:
q = knots.size + 2
else:
q = knots.size + 1
X = np.empty((n, q))
if intercept:
X[:, 0] = 1
X[:, 1] = x
X[:, 2:] = R.outer(x, knots)
else:
X[:, 0] = x
X[:, 1:] = R.outer(x, knots)
return X
def design_matrix(x, knots):
if x.ndim == 1:
x = np.row_stack(x)
knots = np.row_stack(knots)
return np.hstack(univariate_design_matrix(x[:, i], knots[:, i], intercept=(i == 0)) for i in xrange(x.shape[1]))
def univariate_penalty_matrix(knots, intercept=True):
if intercept:
q = knots.size + 2
else:
q = knots.size + 1
S = np.zeros((q, q))
if intercept:
S[2:, 2:] = R.outer(knots, knots)
else:
S[1:, 1:] = R.outer(knots, knots)
return S
def penalty_matrix(knots, lambda_=None):
if knots.ndim == 1:
knots = np.row_stack(knots)
if lambda_ is None:
lambda_ = np.ones(knots.shape[1])
return sp.linalg.block_diag(*[lambda_[i] * univariate_penalty_matrix(knots[:, i], intercept=(i == 0)) for i in xrange(knots.shape[1])])
def matrix_sqrt(m):
e, V = np.linalg.eig(m)
e = np.maximum(e, 0)
return np.dot(V, np.dot(np.diag(np.sqrt(e)), V.T))
def fit(y, x, knots, lambda_=None):
if lambda_ is None:
lambda_ = np.ones(knots.shape[1])
else:
lambda_ = np.atleast_1d(lambda_)
X = design_matrix(x, knots)
S = penalty_matrix(knots, lambda_=lambda_)
y_ = np.hstack((y, np.zeros(S.shape[0]))).T
B = matrix_sqrt(S)
X_ = np.vstack((X, B))
return sm.OLS(y_, X_).fit()
def predict(x, knots, model, link=None):
X = design_matrix(x, knots)
if link is None:
return model.predict(X)
else:
return link(model.predict(X))
lambdas = [10**-2, 10**-4, 10**-6]
q = 10
qs = np.linspace(0, 1, q - 2)
engine_y = engine_df.wear.values
engine_x = engine_df.scaled_size.values
engine_knots = engine_df.scaled_size.quantile(qs).values
fig, axes = plt.subplots(ncols=3, sharex=True, sharey=True, figsize=(24, 6))
plot_sizes = np.linspace(0, 1, 1000)
for lambda_, ax in zip(lambdas, axes):
model = fit(engine_y, engine_x, engine_knots, lambda_=lambda_)
ax.scatter(engine_x, engine_y, color=blue, alpha=0.5);
ax.plot(plot_sizes, predict(plot_sizes, engine_knots, model));
ax.set_xlim(-0.05, 1.05);
ax.set_xlabel('Scaled capacity');
ax.set_ylim(1.5, 5);
ax.set_ylabel('Wear');
ax.set_title('$\lambda = {}$'.format(lambda_));
def gcv_score(y, x, knots, lambda_=None):
if lambda_ is None:
lambda_ = np.ones(knots.shape[1])
else:
lambda_ = np.atleast_1d(lambda_)
model = fit(y, x, knots, lambda_=lambda_)
y_hat = predict(x, knots, model)
n = x.shape[0]
X = design_matrix(x, knots)
hat_matrix_trace = model.get_influence().hat_matrix_diag[:n].sum()
return n * np.power(y - y_hat, 2).sum() / np.power(n - hat_matrix_trace, 2)
gcv_lambdas = np.power(1.5, np.linspace(0, 60, 100)) * 10**-8
gcv_scores = np.array([gcv_score(engine_y, engine_x, engine_knots, lambda_=lambda_) for lambda_ in gcv_lambdas])
lambda_best = gcv_lambdas[gcv_scores.argmin()]
lambda_best
0.0021681971002683468
fig, (gcv_ax, plot_ax) = plt.subplots(ncols=2, figsize=(16, 6))
gcv_ax.plot(gcv_lambdas, gcv_scores);
gcv_ax.set_xscale('log');
gcv_ax.set_xlabel('$\lambda$');
gcv_ax.set_ylabel('$V$');
gcv_ax.set_title('GCV score');
model = fit(engine_y, engine_x, engine_knots, lambda_=lambda_best)
plot_ax.scatter(engine_x, engine_y, color=blue, alpha=0.5);
plot_ax.plot(plot_sizes, predict(plot_sizes, engine_knots, model));
plot_ax.set_xlim(-0.05, 1.05);
plot_ax.set_xlabel('Scaled capacity');
plot_ax.set_ylim(1.5, 5);
plot_ax.set_ylabel('Wear');
plot_ax.set_title('GCV optimal fit');
tree_df = pd.read_csv('https://vincentarelbundock.github.io/Rdatasets/csv/datasets/trees.csv',
usecols=['Girth', 'Height', 'Volume'])
tree_df['scaled_girth'] = scale(tree_df.Girth)
tree_df['scaled_height'] = scale(tree_df.Height)
tree_df.head()
Girth | Height | Volume | scaled_girth | scaled_height | |
---|---|---|---|---|---|
0 | 8.3 | 70 | 10.3 | 0.000000 | 0.291667 |
1 | 8.6 | 65 | 10.3 | 0.024390 | 0.083333 |
2 | 8.8 | 63 | 10.2 | 0.040650 | 0.000000 |
3 | 10.5 | 72 | 16.4 | 0.178862 | 0.375000 |
4 | 10.7 | 81 | 18.8 | 0.195122 | 0.750000 |
tree_x = tree_df[['scaled_girth', 'scaled_height']].values
tree_knots = tree_df[['scaled_girth', 'scaled_height']].quantile(qs, axis=0).values
girth_lambdas = np.power(2., np.arange(30)) * 1e-5
height_lambdas = np.power(2., np.arange(30)) * 1e-5
tree_gcv_scores = np.zeros((30, 30))
tree_y = tree_df.Volume.values
for i in xrange(30):
for j in xrange(30):
lambda_ = np.array([girth_lambdas[i], height_lambdas[j]])
tree_gcv_scores[i, j] = gcv_score(tree_y, tree_x, tree_knots, lambda_=lambda_)
tree_gcv_row_min_col_ix = tree_gcv_scores.argmin(axis=1)
girth_lambda_ix = tree_gcv_scores[np.arange(30), tree_gcv_row_min_col_ix].argmin()
height_lambda_ix = tree_gcv_row_min_col_ix[girth_lambda_ix]
tree_lambda = np.array([girth_lambdas[girth_lambda_ix], height_lambdas[height_lambda_ix]])
tree_lambda
array([ 1.02400000e-02, 2.68435456e+03])
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
xv, yv = np.meshgrid(np.arange(30), np.arange(30))
ax.plot_wireframe(xv, yv, tree_gcv_scores, color='k', lw=0.5);
ax.set_xticklabels([r'$2^{' + str(int(tick)) + r'}$' for tick in ax.get_xticks()]);
ax.set_xlabel(r'$\lambda_2 \times 1e-5$');
ax.set_yticklabels([r'$2^{' + str(int(tick)) + r'}$' for tick in ax.get_yticks()]);
ax.set_ylabel(r'$\lambda_1 \times 1e-5$');
ax.set_zlabel('GCV score');
model = fit(tree_y, tree_x, tree_knots, lambda_=tree_lambda)
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(tree_y, predict(tree_x, tree_knots, model), color=blue, alpha=0.5);
v_min, v_max = tree_df.Volume.min(), tree_df.Volume.max()
MARGIN = 5
ax.plot((v_min - MARGIN, v_max + MARGIN), (v_min - MARGIN, v_max + MARGIN), '--',
c='k', label='Perfect agreement');
ax.set_xlim(v_min - MARGIN, v_max + MARGIN);
ax.set_xlabel('Actual volume');
ax.set_ylim(v_min - MARGIN, v_max + MARGIN);
ax.set_ylabel('Predicted volume');
ax.legend(loc=2);
def predict_component_function(x, icol, knots, model):
X = design_matrix(x, knots)
q = knots.shape[0] + 2
first_col = min(1, icol) * q + max(0, icol - 1) * (q - 1)
last_col = first_col + q - min(1, icol)
X_ = np.zeros_like(X)
X_[:, first_col:last_col] = X[:, first_col:last_col]
return model.predict(X_)
fig, ax = plt.subplots(figsize=(8, 6))
tree_plot_x = np.column_stack((np.linspace(0, 1, 100), np.linspace(0, 1, 100)))
ax.plot(tree_plot_x[:, 0], predict_component_function(tree_plot_x, 0, tree_knots, model),
'--', c='k', alpha=0.5, lw=0.5);
ax.scatter(tree_x[:, 0], predict_component_function(tree_x, 0, tree_knots, model),
color=blue, alpha=0.5);
v_min, v_max = tree_df.Volume.min(), tree_df.Volume.max()
ax.set_xlim(-0.05, 1.05);
ax.set_xlabel('Scaled girth');
ax.set_ylim(bottom=0);
ax.set_ylabel('$f_1$');
fig, ax = plt.subplots(figsize=(8, 6))
tree_plot_x = np.column_stack((np.linspace(0, 1, 100), np.linspace(0, 1, 100)))
ax.plot(tree_plot_x[:, 1], predict_component_function(tree_plot_x, 1, tree_knots, model),
'--', c='k', alpha=0.5, lw=0.5);
ax.scatter(tree_x[:, 1], predict_component_function(tree_x, 1, tree_knots, model),
color=blue, alpha=0.5);
v_min, v_max = tree_df.Volume.min(), tree_df.Volume.max()
ax.set_xlim(-0.05, 1.05);
ax.set_xlabel('Scaled girth');
ax.set_ylim(bottom=-0.25);
ax.set_ylabel('$f_1$');
The model is
$$\log(\mathbb{E}(\textrm{Volume})) = f_1(\textrm{Girth}) + f_2(\textrm{Height}),$$where volume is gamma distributed
def penalized_irls_step(y, X, B, knots, model=None):
n = y.size
# model is None for the first step
if model is None:
eta = np.log(y)
else:
eta = model.predict(X)
mu = np.exp(eta)
X_ = np.vstack((X, B))
z = (y - mu) / mu + eta
z_ = np.hstack((z, np.zeros(B.shape[0]))).T
new_model = sm.OLS(z_, X_).fit()
rss = np.power(z - new_model.predict(X), 2).sum()
gcv_score = rss / np.power(n - new_model.get_influence().hat_matrix_diag[:n].sum(), 2)
return rss, gcv_score, new_model
def fit_gam(y, x, knots, lambda_=None, rtol=1e-4):
X = design_matrix(x, knots)
S = penalty_matrix(knots, lambda_=lambda_)
B = matrix_sqrt(S)
model = None
old_rss, rss = 0, 1
while np.abs(rss - old_rss) > rtol * old_rss:
old_rss = rss
rss, gcv_score, model = penalized_irls_step(y, X, B, knots, model=model)
return gcv_score, model
tree_gam_gcv_scores = np.empty((30, 30))
for i in xrange(30):
for j in xrange(30):
if j == 0 and i % 5 == 0:
print 'i = {}'.format(i)
lambda_ = np.array([girth_lambdas[i], height_lambdas[j]])
tree_gam_gcv_scores[i, j], _ = fit_gam(tree_y, tree_x, tree_knots, lambda_=lambda_)
i = 0 i = 5 i = 10 i = 15 i = 20 i = 25
tree_gam_gcv_row_min_col_ix = tree_gam_gcv_scores.argmin(axis=1)
gam_girth_lambda_ix = tree_gam_gcv_scores[np.arange(30), tree_gam_gcv_row_min_col_ix].argmin()
gam_height_lambda_ix = tree_gam_gcv_row_min_col_ix[gam_girth_lambda_ix]
tree_gam_lambda = np.array([girth_lambdas[gam_girth_lambda_ix], height_lambdas[gam_height_lambda_ix]])
tree_gam_lambda
array([ 1.02400000e-02, 2.68435456e+03])
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
xv, yv = np.meshgrid(np.arange(30), np.arange(30))
ax.plot_wireframe(xv, yv, tree_gam_gcv_scores, color='k', lw=0.5);
ax.set_xticklabels([r'$2^{' + str(int(tick)) + r'}$' for tick in ax.get_xticks()]);
ax.set_xlabel(r'$\lambda_2 \times 1e-5$');
ax.set_yticklabels([r'$2^{' + str(int(tick)) + r'}$' for tick in ax.get_yticks()]);
ax.set_ylabel(r'$\lambda_1 \times 1e-5$');
ax.set_zlabel('GCV score');
_, model = fit_gam(tree_y, tree_x, tree_knots, lambda_=tree_gam_lambda)
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(tree_y, predict(tree_x, tree_knots, model, link=np.exp), color=blue, alpha=0.5);
v_min, v_max = tree_df.Volume.min(), tree_df.Volume.max()
MARGIN = 5
ax.plot((v_min - MARGIN, v_max + MARGIN), (v_min - MARGIN, v_max + MARGIN), '--',
c='k', label='Perfect agreement');
ax.set_xlim(v_min - MARGIN, v_max + MARGIN);
ax.set_xlabel('Actual volume');
ax.set_ylim(v_min - MARGIN, v_max + MARGIN);
ax.set_ylabel('Predicted volume');
ax.legend(loc=2);
fig, ax = plt.subplots(figsize=(8, 6))
tree_plot_x = np.column_stack((np.linspace(0, 1, 100), np.linspace(0, 1, 100)))
ax.plot(tree_plot_x[:, 0], predict_component_function(tree_plot_x, 0, tree_knots, model),
'--', c='k', alpha=0.5, lw=0.5);
ax.scatter(tree_x[:, 0], predict_component_function(tree_x, 0, tree_knots, model),
color=blue, alpha=0.5);
v_min, v_max = tree_df.Volume.min(), tree_df.Volume.max()
ax.set_xlim(-0.05, 1.05);
ax.set_xlabel('Scaled girth');
ax.set_ylim(2, 4.05);
ax.set_ylabel('$f_1$');
fig, ax = plt.subplots(figsize=(8, 6))
tree_plot_x = np.column_stack((np.linspace(0, 1, 100), np.linspace(0, 1, 100)))
ax.plot(tree_plot_x[:, 1], predict_component_function(tree_plot_x, 1, tree_knots, model),
'--', c='k', alpha=0.5, lw=0.5);
ax.scatter(tree_x[:, 1], predict_component_function(tree_x, 1, tree_knots, model),
color=blue, alpha=0.5);
v_min, v_max = tree_df.Volume.min(), tree_df.Volume.max()
ax.set_xlim(-0.05, 1.05);
ax.set_xlabel('Scaled girth');
ax.set_ylim(-0.01, 0.4);
ax.set_ylabel('$f_1$');