import numpy as np
import scipy as sp
import scipy.linalg as lin
import matplotlib.pyplot as plt
from __future__ import division, print_function
# function to compute the R2 from Y and X
# from scipy import linalg as lin
def projR(to_proj, X_):
"""
projects the 2d array `to_proj` onto the residual space of `_X`
The svd of _X is computed to have a orthonormal basis of the
space of the columns of _X
"""
u_, _, _ = lin.svd(X_, full_matrices=False)
return (to_proj - u_.dot(u_.T.dot(to_proj)))
## try this function with the mean:
# ones_n1 = np.ones((n,1))
# y = np.random.normal(3,1,size=(n,1))
# print("this should be close to 3: ", y.mean())
# print("this should be close to 0: ", projR(y, ones_n1).mean())
def R2(Y, X, rm_intercept=False):
"""
Compute the coefficient of determination R2
Y: numpy array shape (n,1) : the data
X: numpy array shape (n,p) : the design matrix
"""
(n,p) = X.shape
SST = (Y**2).sum() # total sum of square
C = (Y.sum())**2 / n # sum of square of the mean
ones_n1 = np.ones((n,1))
Xdemeaned = projR(X, ones_n1)
# compute the svd to get an orthonormal basis
u_, _, _ = lin.svd(Xdemeaned, full_matrices=False)
# compute SSReg noticing that Y^t u u^t Y = \sum (u^t Y)^2
SSReg = ((u_.T.dot(Y))**2).sum()
if rm_intercept:
return SSReg / (SST - C)
else:
return SSReg / SST
def SSR(Y, X):
"compute the sum of square of the regression on X"
(n_, p_) = X.shape
if n_ < p_:
raise ValueError("n must be >= to p")
u_, _, _ = lin.svd(X, full_matrices=False)
return (((u_.T * Y.T).sum(axis=1))**2).sum()
def Proj(X):
""" This function returns the projector on the
space of the columns of `X`
"""
(n_, p_) = X.shape
if n_ < p_:
raise ValueError("n must be >= to p")
u_, _, _ = lin.svd(X, full_matrices=False)
return u_.dot(u_.T)
# define a true model
def quadratic_model(x, param=[1,-8,16]):
return param[0]*x**2 + param[1]*x + param[2]
model = quadratic_model
n = 16 # number of points
noise_level = 1./8
x = np.arange(n)
true_y = model(x)
noiseSD = (true_y.max() - true_y.min())*noise_level
y = true_y + np.random.normal(0, noiseSD, size=(n,))
plt.plot(x,y,x,true_y)
[<matplotlib.lines.Line2D at 0x69ed8d0>, <matplotlib.lines.Line2D at 0x69edb10>]
Now, we would like to know if the data are linear or quadratic with x.
# fit a linear model y = XB + E
X = np.hstack((np.ones((n,1)),x.reshape((n,1))))
print(X)
[[ 1. 0.] [ 1. 1.] [ 1. 2.] [ 1. 3.] [ 1. 4.] [ 1. 5.] [ 1. 6.] [ 1. 7.] [ 1. 8.] [ 1. 9.] [ 1. 10.] [ 1. 11.] [ 1. 12.] [ 1. 13.] [ 1. 14.] [ 1. 15.]]
# fit the data
beta_h = lin.pinv(X).dot(y)
resid = y - X.dot(beta_h)
plt.plot(resid)
print(R2(y, X))
0.789569189143
X2 = np.hstack((X, (x*x).reshape((n,1))))
X2
array([[ 1., 0., 0.], [ 1., 1., 1.], [ 1., 2., 4.], [ 1., 3., 9.], [ 1., 4., 16.], [ 1., 5., 25.], [ 1., 6., 36.], [ 1., 7., 49.], [ 1., 8., 64.], [ 1., 9., 81.], [ 1., 10., 100.], [ 1., 11., 121.], [ 1., 12., 144.], [ 1., 13., 169.], [ 1., 14., 196.], [ 1., 15., 225.]])
# fit the data
beta_h = lin.pinv(X2).dot(y)
resid = y - X2.dot(beta_h)
plt.plot(resid)
print(R2(y, X2))
0.895987535281
def make_test_train(y, lmodels, scheme='leave_one_out'):
"""
Parameters
----------
y : numpy array
the data
lmodels : list of numpy arrays
each element is a design matrix
Returns
-------
"""
if scheme=='leave_one_out':
for i, y_test in enumerate(y):
y_train = np.hstack((y[:i],y[i+1:]))
m_loo = []
for mX in lmodels:
m_loo.append(np.vstack((mX[:i,:],mX[i+1:,:])))
yield i, y_test, y_train, m_loo
n = len(y)
x = np.arange(n)
X1 = np.hstack((np.ones((n,1)),x.reshape((n,1))))
X2 = np.hstack((X1, (x*x).reshape((n,1))))
models_list = [X1, X2]
for k in range(2,8):
models_list.append(np.hstack((X2, np.random.normal(size=(n,k)))))
y_split = make_test_train(y, models_list)
predicted_error = np.zeros(len(models_list),dtype='float')
for i, y_test, y_train, models_train in y_split:
for midx, md in enumerate(models_train):
# fit the model on train
bh = lin.pinv(md).dot(y_train)
# compute the difference between y_test and predicted
predicted_error[midx] += (y_test - models_list[midx][i,:].dot(bh))**2
print(predicted_error/n)
plt.plot(predicted_error/n)
[ 1216.81644631 419.52053563 520.91121718 551.83232086 851.68596693 800.49557814 843.35431674 1096.09708688]
[<matplotlib.lines.Line2D at 0x6c531d0>]
Cross validation: http://en.wikipedia.org/wiki/Cross-validation_%28statistics%29
Coefficient of determination: http://en.wikipedia.org/wiki/Coefficient_of_determination
Testing
Model checking / specification in fMRI: