%matplotlib inline
import numpy as np
import numpy.linalg as npl
import matplotlib.pyplot as plt
from scipy.stats import t as t_dist, f as f_dist
np.set_printoptions(precision=3)
rng = np.random.RandomState(1965)
N = 12
e = rng.normal(0, 5, size=N)
print(e)
[ 1.416 -5.486 2.204 4.835 -2.584 -4.437 6.343 2.02 -0.81 -4.098 6.721 6.324]
berkeley = np.array([1] * 4 + [0] * 8)
stanford = np.array([0] * 4 + [1] * 4 + [0] * 4)
mit = np.array([0] * 8 + [1] * 4)
X = np.column_stack((berkeley, stanford, mit))
beta = [10, 11, 16]
clammy = np.dot(X, beta) + rng.normal(0, 2, size=N)
# Adjust clammy to start around zero
clammy = clammy - min(clammy) + 0.2
clammy
array([ 0.389, 0.2 , 0.241, 0.463, 4.585, 1.097, 1.642, 4.972, 7.957, 5.585, 5.527, 6.964])
y = X.dot(beta) + e
y
array([ 11.416, 4.514, 12.204, 14.835, 8.416, 6.563, 17.343, 13.02 , 15.19 , 11.902, 22.721, 22.324])
plt.plot(y)
[<matplotlib.lines.Line2D at 0x106097810>]
plt.plot(clammy, y, '+')
[<matplotlib.lines.Line2D at 0x1060c82d0>]
npl.pinv(X).dot(y)
array([ 10.742, 11.336, 18.034])
X1 = np.column_stack((clammy, np.ones(N)))
npl.pinv(X1).dot(y)
array([ 0.999, 10.071])
def fit_ols(Y, X):
""" betas, fitted data, and residuals from OLS linear fit.
This is OLS estimation; we assume the errors to have independent
and identical normal distributions around zero for each $i$ in
$\Epsilon_i$ (i.i.d)
"""
Y = np.asarray(Y)
X = np.asarray(X)
# Get the estimated betas
betah = npl.pinv(X).dot(Y)
fitted = X.dot(betah)
resid = Y - fitted
return betah, fitted, resid
def t_stat(Y, X, C):
""" betas, t statistic and significance test given data, design matrix, contrast
Ordinary least squares estimation - see `fit_ols` function.
"""
Y = np.asarray(Y)
X = np.asarray(X)
C = np.atleast_2d(C)
# Calculate the parameters
B, fitted, resid = fit_ols(Y, X)
# Residual sum of squares
RSS = (resid**2).sum(axis=0)
# Degrees of freedom - number of observations - number of fitted parameters
df = X.shape[0] - npl.matrix_rank(X)
# Mean residual sum of squares
MRSS = RSS / df
# Standard error of contrast estimate C.dot(B)
SE = np.sqrt(MRSS * C.dot(npl.pinv(X.T.dot(X)).dot(C.T)))
t = C.dot(B)/SE
ltp = t_dist(df).cdf(t) # lower tail p
p = 1 - ltp # upper tail p
return B, t, df, p
t_stat(y, X1, [1, 0])
(array([ 0.999, 10.071]), array([[ 1.915]]), 10, array([[ 0.042]]))
t_stat(y, X, [-1, -1, 2])
(array([ 10.742, 11.336, 18.034]), array([[ 2.34]]), 9, array([[ 0.022]]))
np.set_printoptions(precision=1)
age = rng.normal(23, 2, size=12)
age
array([ 22.5, 25.3, 24.6, 21.4, 20.7, 23.3, 23.8, 21.7, 21.3, 25.2, 24.6, 21.8])