In [1]:

%matplotlib inline
import numpy as np
import numpy.linalg as npl
import matplotlib.pyplot as plt

In [2]:

from scipy.stats import t as t_dist, f as f_dist

In [3]:

np.set_printoptions(precision=3)

In [4]:

rng = np.random.RandomState(1965)

In [5]:

N = 12
e = rng.normal(0, 5, size=N)
print(e)

[ 1.416 -5.486  2.204  4.835 -2.584 -4.437  6.343  2.02  -0.81  -4.098
  6.721  6.324]

In [6]:

berkeley = np.array([1] * 4 + [0] * 8)
stanford = np.array([0] * 4 + [1] * 4 + [0] * 4)
mit = np.array([0] * 8 + [1] * 4)

In [7]:

X = np.column_stack((berkeley, stanford, mit))
beta = [10, 11, 16]
clammy = np.dot(X, beta) + rng.normal(0, 2, size=N)
# Adjust clammy to start around zero
clammy = clammy - min(clammy) + 0.2
clammy

Out[7]:

array([ 0.389,  0.2  ,  0.241,  0.463,  4.585,  1.097,  1.642,  4.972,
        7.957,  5.585,  5.527,  6.964])

In [8]:

y = X.dot(beta) + e
y

Out[8]:

array([ 11.416,   4.514,  12.204,  14.835,   8.416,   6.563,  17.343,
        13.02 ,  15.19 ,  11.902,  22.721,  22.324])

In [9]:

plt.plot(y)

Out[9]:

[<matplotlib.lines.Line2D at 0x106097810>]

In [10]:

plt.plot(clammy, y, '+')

Out[10]:

[<matplotlib.lines.Line2D at 0x1060c82d0>]

In [11]:

npl.pinv(X).dot(y)

Out[11]:

array([ 10.742,  11.336,  18.034])

In [12]:

X1 = np.column_stack((clammy, np.ones(N)))
npl.pinv(X1).dot(y)

Out[12]:

array([  0.999,  10.071])

In [13]:

def fit_ols(Y, X):
    """ betas, fitted data, and residuals from OLS linear fit.
    
    This is OLS estimation; we assume the errors to have independent
    and identical normal distributions around zero for each $i$ in 
    $\Epsilon_i$ (i.i.d)
    """
    Y = np.asarray(Y)
    X = np.asarray(X)
    # Get the estimated betas
    betah   =  npl.pinv(X).dot(Y)
    fitted =  X.dot(betah)
    resid   =  Y - fitted
    return betah, fitted, resid

In [14]:

def t_stat(Y, X, C):
    """ betas, t statistic and significance test given data, design matrix, contrast
    
    Ordinary least squares estimation - see `fit_ols` function.
    """
    Y = np.asarray(Y)
    X = np.asarray(X)
    C = np.atleast_2d(C)
    # Calculate the parameters
    B, fitted, resid = fit_ols(Y, X)
    # Residual sum of squares
    RSS   = (resid**2).sum(axis=0)
    # Degrees of freedom - number of observations - number of fitted parameters
    df =  X.shape[0] - npl.matrix_rank(X)
    # Mean residual sum of squares
    MRSS  = RSS / df
    # Standard error of contrast estimate C.dot(B)
    SE    = np.sqrt(MRSS * C.dot(npl.pinv(X.T.dot(X)).dot(C.T)))
    t     = C.dot(B)/SE
    ltp   = t_dist(df).cdf(t) # lower tail p
    p = 1 - ltp # upper tail p
    return B, t, df, p

In [15]:

t_stat(y, X1, [1, 0])

Out[15]:

(array([  0.999,  10.071]), array([[ 1.915]]), 10, array([[ 0.042]]))

In [16]:

t_stat(y, X, [-1, -1, 2])

Out[16]:

(array([ 10.742,  11.336,  18.034]), array([[ 2.34]]), 9, array([[ 0.022]]))

In [17]:

np.set_printoptions(precision=1)
age = rng.normal(23, 2, size=12)
age

Out[17]:

array([ 22.5,  25.3,  24.6,  21.4,  20.7,  23.3,  23.8,  21.7,  21.3,
        25.2,  24.6,  21.8])