A notebook for thoughts about Cyril Pernet's article on GLM misconceptions

In [1]:

%pylab inline

Populating the interactive namespace from numpy and matplotlib

In [2]:

import numpy as np
import numpy.linalg as npl
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

Import statistical distributions from scipy:

In [3]:

from scipy.stats import t as t_dist, f as f_dist, gamma

A routine to scale the design matrix for display:

In [4]:

def scale_design_mtx(X):
    """utility to scale the design matrix for display

    This scales the columns to their own range so we can see the variations 
    across the column for all the columns, regardless of the scaling of the 
    column.
    """
    mi, ma = X.min(axis=0), X.max(axis=0)
    col_neq = (ma - mi) > 1.e-8
    Xs = np.ones_like(X)
    mi = mi[col_neq]
    ma = ma[col_neq]
    Xs[:,col_neq] = (X[:,col_neq] - mi)/(ma - mi)
    return Xs

Display the design matrix nicely:

In [5]:

def show_design(X, design_title, **kwargs):
    """ Show the design matrix nicely """
    plt.figure()
    plt.gray() # Gray colormap
    plt.imshow(scale_design_mtx(X), interpolation='nearest', **kwargs)
    plt.title(design_title)

Very simple t statistic from contrast and Ordinary Least Squares fit:

In [6]:

def fit_ols(Y, X):
    """ betas, fitted data, and residuals from OLS linear fit.
    
    This is OLS estimation; we assume the errors to have independent
    and identical normal distributions around zero for each $i$ in 
    $\Epsilon_i$ (i.i.d)
    """
    Y = np.asarray(Y)
    X = np.asarray(X)
    # Get the estimated betas
    betah   =  npl.pinv(X).dot(Y)
    fitted =  X.dot(betah)
    resid   =  Y - fitted
    return betah, fitted, resid

In [7]:

def t_stat(Y, X, C):
    """ betas, t statistic and significance test given data, design matrix, contrast
    
    Ordinary least squares estimation - see `fit_ols` function.
    """
    Y = np.asarray(Y)
    X = np.asarray(X)
    C = np.atleast_2d(C)
    # Calculate the parameters
    B, fitted, resid = fit_ols(Y, X)
    # Residual sum of squares
    RSS   = (resid**2).sum(axis=0)
    # Degrees of freedom - number of observations - number of fitted parameters
    df =  X.shape[0] - npl.matrix_rank(X)
    # Mean residual sum of squares
    MRSS  = RSS / df
    # Standard error of contrast estimate C.dot(B)
    SE    = np.sqrt(MRSS * C.dot(npl.pinv(X.T.dot(X)).dot(C.T)))
    t     = C.dot(B)/SE
    ltp   = t_dist(df).cdf(t) # lower tail p
    p = 1 - ltp # upper tail p
    return B, t, df, p

Make the simulated data from C.P.s figure 2 (was figure 1):

In [8]:

a_block = np.array([-0.1, 0, 0.1])
baseline = 10
activation = 11
on_off = np.hstack((a_block + baseline, a_block + activation))
n_on_off = 3
e = np.tile(a_block, (2 * n_on_off,))
y = np.tile(on_off, (n_on_off,))
plt.plot(y)

Out[8]:

[<matplotlib.lines.Line2D at 0x4287c50>]

In [9]:

x_on = np.hstack((np.zeros(len(a_block)), np.ones(len(a_block))))
x_off = 1 - x_on
X_over_part = np.column_stack((x_on, x_off, np.ones_like(x_on)))
X_over = np.tile(X_over_part, (n_on_off, 1))
show_design(X_over, 'over parametrized')

In [10]:

X_well_part = np.column_stack((x_on, np.ones_like(x_on)))
X_well = np.tile(X_well_part, (n_on_off, 1))
show_design(X_well, 'well parametrized')

In [11]:

X_times2 = X_well.copy()
X_times2[:, 0] *= 2
show_design(X_times2, 'times 2')

In [12]:

B_over = npl.pinv(X_over).dot(y)
B_over

Out[12]:

array([ 4.,  3.,  7.])

In [13]:

# Print betas, t statistic, df, p value for different designs, contrasts
print(t_stat(y, X_over, [1, 0, 0]))
print(t_stat(y, X_well, [1, 0]))
print(t_stat(y, X_times2, [1, 0]))

(array([ 4.,  3.,  7.]), array([[ 185.90320062]]), 16, array([[ 0.]]))
(array([  1.,  10.]), array([[ 24.49489743]]), 16, array([[  2.05391260e-14]]))
(array([  0.5,  10. ]), array([[ 24.49489743]]), 16, array([[  2.05391260e-14]]))

The t statistic on the over-parametrized design does not relflect the estimability of the contrast:

In [14]:

def is_estimable(X, C):
    """ Is a contrast C estimable on a design X
    
    To be estimable, the contrast needs to be orthogonal to the null-space of X.
    The null space of X is all the vectors $k$ such that Xk = 0.
    If $k$ is a vector in the null-space of X, then:
    
    Y = X B + e
    
    and
    
    Y = X (B + k) + e
    
    give the same fit (X B == X (B + k)).
    
    The null space of X is the orthogonal complement of the row-space of X.  So C
    has to be in the row space of X.
    
    A less mathematical way of seeing this is that the information we will need to form
    our betas is made of linear combinations of the rows of X, because the betas are
    constructed from Xt Y. 
    """
    C = np.atleast_2d(C)
    rankX = npl.matrix_rank(X)
    return rankX == npl.matrix_rank(np.vstack((C, X)))

In [15]:

is_estimable(X_over, [1, 0, 0])

Out[15]:

False

In [16]:

is_estimable(X_over, [1, -1, 0])

Out[16]:

True

In our case, the null space of $\mathbf{X}$ is:

In [17]:

import sympy
X = sympy.Matrix(X_over)
sympy.pretty_print(X.nullspace())

⎡⎡-1.0⎤⎤
⎢⎢    ⎥⎥
⎢⎢-1.0⎥⎥
⎢⎢    ⎥⎥
⎣⎣ 1  ⎦⎦

Any contrast vector not orthogonal to this null space is not estimable.

Another way of seeing this is to get a minimal set of vectors that "span" the rows of X. These can be found using the svd :

In [18]:

[u, s, vt] = npl.svd(X_over, full_matrices=False)
tol = s.max() * max(X_over.shape) * np.finfo(s.dtype).eps
nz = np.where(s > tol)[0]
print nz
vt[np.abs(vt) < tol] = 0
print vt.T[:,nz]
print "\nNormalize the columns \n"
print vt.T[:,nz].dot(np.diagflat([1./vt[0,0], 1./vt[1,1]]))

[0 1]
[[-0.40824829 -0.70710678]
 [-0.40824829  0.70710678]
 [-0.81649658  0.        ]]

Normalize the columns 

[[ 1. -1.]
 [ 1.  1.]
 [ 2.  0.]]

So, any contrast that is made of these two is estimable. In particular, what do you think the first column would test for?

Compare to an F test:

In [19]:

def f_stat(Y, X, col):
    """ betas, F statistic and significance test given data, design and column to test
    
    We do the F statistic long hand, by fitting the model with and without the column to test.
    
    This is OLS estimation; we assume the errors to have independent
    and identical normal distributions around zero for each $i$ in 
    $\Epsilon_i$ (i.i.d).
    """
    Y = np.asarray(Y)
    full_X = np.asarray(X)
    # Delete column to make reduced design
    reduced_X = np.delete(full_X, col, 1)
    # fit both designs
    _, _, full_resid = fit_ols(Y, full_X)
    _, _, reduced_resid = fit_ols(Y, reduced_X)
    n_obs = Y.shape[0]
    # Number of parameters used by each design
    n_p_full = npl.matrix_rank(full_X)
    n_p_reduced = npl.matrix_rank(reduced_X)
    # Extra sum of squares
    full_SS = (full_resid ** 2).sum()
    reduced_SS = (reduced_resid ** 2).sum()
    extra_SS = (reduced_SS - full_SS) / (n_p_full - n_p_reduced)
    # F statistic
    F = extra_SS / (full_SS / (n_obs - n_p_full))
    ltp   = f_dist((n_p_full - n_p_reduced), n_obs - n_p_full).cdf(F) # lower tail p
    p = 1 - ltp # upper tail p
    return F, p

In [20]:

f_stat(y, X_well, 0)

Out[20]:

(600.00000000000409, 4.1189274213593308e-14)

What happens with the over-parametrized design?

In [21]:

f_stat(y, X_over, 0)

-c:24: RuntimeWarning: divide by zero encountered in double_scalars

Out[21]:

(inf, nan)

Was that what you were expecting?

Correlated regressors¶

In [22]:

# Remind ourselves about the t statistic on the over-parametrized design
print(t_stat(y, X_over, [0, 0, 1]))

(array([ 4.,  3.,  7.]), array([[ 514.39284598]]), 16, array([[ 0.]]))

In [23]:

# Add a tiny bit of noise to the over-parametrized design
X_over_tweaked = X_over.copy()
X_over_tweaked[0, 2] = 1e-14
print(t_stat(y, X_over_tweaked, [0, 0, 1]))

(array([ 10.8875,   9.9   ,   0.1125]), array([[ 1.2456822]]), 15, array([[ 0.11599406]]))

The general case of correlated regressors

In [24]:

# first a quick function to yield two correlated regressor with correlation c
def correlated(c, n=20, seed=None):
    np.random.seed(seed) # To get predictable random numbers
    # generate two uncorrelated regressors
    Y = np.random.normal(0,1,(n,2))
    #make them correlated with mixing matrix M:
    #we want a correlation c between Y[0] and Y[1], with unit variance
    #this is the mixing matrix needed:
    c1 = np.sqrt( (1 + np.sqrt(1-c*c))/2 ) 
    c2 = .5 * c / c1
    M = np.asarray([[c1, c2],[c2, c1]])
    Yc = Y.dot(M)     # Make Yc = MY
    return Yc

In [25]:

n = 20
Xc = correlated(.8, n)
x1c = Xc[:,0]; x2c = Xc[:,1]; 
print np.corrcoef(Xc.T)[0,1]

y_corr = x1c + 1.5*np.random.normal(size=n) + 10
#x1c = np.random.normal(size=n)
#x2c = np.random.normal(size=n)
X_corr = np.column_stack((x1c, x2c, np.ones_like(x2c)))

C = np.array([1, 0, 0]) # The contrast
t_stat(y_corr, X_corr, C)

0.830592959914

Out[25]:

(array([ 1.01047766,  0.14863412,  9.42780916]),
 array([[ 1.9797293]]),
 17,
 array([[ 0.03207994]]))

In [26]:

XtX = X_corr.T.dot(X_corr)
XtX

Out[26]:

array([[ 18.34278453,  14.98641369,  -2.36167903],
       [ 14.98641369,  17.77134663,  -4.45422709],
       [ -2.36167903,  -4.45422709,  20.        ]])

In [27]:

npl.pinv(XtX)

Out[27]:

array([[ 0.17851102, -0.15384086, -0.01318282],
       [-0.15384086,  0.19217721,  0.02463391],
       [-0.01318282,  0.02463391,  0.05392957]])

In [28]:

C.dot(npl.pinv(XtX)).dot(C)

Out[28]:

0.17851102422647533

In [28]:

In [29]:

#Xc = correlated(.0, n, seed=42)
#y_corr = Xc[:,0] + 1.*np.random.normal(size=n) + 10


n=20
x1c = np.random.normal(size=n)
x2c = np.random.normal(size=n)
y_corr = x1c + 1.*np.random.normal(size=n) + 10
# if you make the data exactly the same, for the 
# experiment to work, you would have to make the correlated regressor 
# different (use the correlated function with the seed not fixed). 
# Otherwise, you are not going to change your t since 
# the amount of the first regressor that is in the data is just going
# to be scaled, and the scaling doesnt change the t-value. In other
# words you are not changing the amount of data variance that can be 
# explained by the part of x1 that is orthogonal to x2, since x1 and 
# x2 directions remains exactly the same.

    
x2c_corr = np.linspace(0.1, 0.999, 15)

dtype = np.dtype(dict(names=['t', 'mss', 'b0', 'b1', 'b2', 'xvar', 'df'],
                      formats=['f8'] * 7))
res = np.zeros(len(x2c_corr), dtype=dtype)

for i, x2c_corr in enumerate(x2c_corr):
    new_x1c = (1 - x2c_corr) * x1c + x2c_corr * x2c
    #Xc = correlated(x2c_corr, n, seed=42)
    #new_x1c = Xc[:,0]; x2c = Xc[:,1];
    #print np.corrcoef(new_x1c,x2c)[0,1]
    X_corr = np.column_stack((new_x1c, x2c, np.ones_like(x2c)))
    y_corr = new_x1c + 1.*np.random.normal(size=n) + 10
    
    betas, res[i]['t'], res[i]['df'], p = t_stat(y_corr, X_corr, C)
    res[i]['b0'], res[i]['b1'], res[i]['b2'] = betas[:]
    fitted = X_corr.dot(betas)
    res[i]['mss'] = ((y_corr - fitted)**2).sum() / res[i]['df']
    res[i]['xvar'] = C.dot(npl.pinv(X_corr.T.dot(X_corr)).dot(C))
print(mlab.rec2txt(res))

        t     mss        b0         b1       b2        xvar       df
    4.824   1.360     1.144     -0.018    9.782       0.041   17.000
    5.043   0.683     0.913      0.149   10.290       0.048   17.000
    4.343   0.901     0.978     -0.043   10.037       0.056   17.000
    4.319   1.373     1.309      0.067    9.772       0.067   17.000
    4.767   0.896     1.283     -0.565   10.077       0.081   17.000
    4.142   0.828     1.192     -0.370    9.882       0.100   17.000
    4.805   0.449     1.144     -0.208   10.143       0.126   17.000
    2.846   1.016     1.165     -0.427    9.295       0.165   17.000
    4.372   0.777     1.826     -0.910   10.194       0.224   17.000
    0.404   1.138     0.245      0.992   10.018       0.323   17.000
    0.674   1.115     0.505      0.055    9.971       0.504   17.000
    0.133   1.243     0.141      0.791   10.225       0.893   17.000
    1.310   0.863     1.720     -0.847   10.129       1.999   17.000
   -0.884   0.946    -2.411      3.074    9.777       7.873   17.000
    0.773   1.399   167.393   -165.998   10.192   33482.838   17.000

HRF and temporal derivatives¶

Now consider a design convolved with an HRF:

In [ ]:

def spm_hrf(t):
    """ Return SPM hrf sampled at times `t`
    """
    # gamma.pdf only defined for t > 0
    hrf = np.zeros_like(t, dtype=np.float)
    hrf[t > 0] = gamma.pdf(t[t > 0], 6, 0, 1) - gamma.pdf(t[t > 0], 16, 0, 1) / 6.
    return hrf / np.sum(hrf)

def spm_hrf_d(t):
    """ Return temporal derivative of SPM HRF sampled at times `t`
    """
    # This is what spm does!
    return spm_hrf(t) - spm_hrf(t - 1)

In [ ]:

t = np.arange(24)
plt.plot(t, spm_hrf(t), label='HRF')
plt.plot(t, spm_hrf_d(t), label='TD')
plt.legend()

The design without HRF convolution

In [ ]:

block_length = 24
on_off = np.hstack((np.zeros(block_length), np.ones(block_length)))
off_on = 1 - on_off
X_hrf_over = np.column_stack((on_off, off_on, np.ones_like(on_off)))
X_hrf_over = np.tile(X_hrf_over, (10, 1))
show_design(X_hrf_over, 'before convolution', aspect=0.01)
S = npl.svd(X_hrf_over, compute_uv=False)
print('Smallest singular value (if close to 0 matrix may be rank deficient)')
print(S.min())
# Is -1 -1 1 still in the null space? Yes.
np.allclose(X_hrf_over.dot([-1, -1, 1]), 0)

Design with convolution:

In [ ]:

hrf = spm_hrf(t)
x0 = np.convolve(hrf, X_hrf_over[:, 0], mode='same')
x1 = np.convolve(hrf, X_hrf_over[:, 1], mode='same')
X_conv_over = np.column_stack((x0, x1, np.ones_like(x0)))
show_design(X_conv_over, 'Convolved design', aspect=0.01)
S = npl.svd(X_conv_over, compute_uv=False)
print('Smallest singular value (if close to 0 matrix may be rank deficient)')
print(S.min())

Notice that the matrix with convolved columns is no longer rank deficient, and the vector:

$$ \left[\begin{matrix}-1\\-1\\1\end{matrix}\right] $$

is no longer in the null space (because there is no null space for the convolved matrix).

In [ ]:

# Is -1 -1 1 still in the null space?
np.allclose(X_conv_over.dot([-1, -1, 1]), 0)

OK, let's make some data based on the design:

In [ ]:

x0_hrf = X_conv_over[:, 0]
X_conv_well = np.column_stack((x0_hrf, np.ones_like(x0)))
y = (x0_hrf - x0_hrf.mean()) * 1 + 10

In [ ]:

npl.pinv(X_conv_well).dot(y)

We don't reconstruct 10 correctly because the x0 regressor does not have mean 0:

In [ ]:

x0_hrf.mean()

In [ ]:

X_conv_orth = np.column_stack((x0_hrf - x0_hrf.mean(), np.ones_like(x0)))
npl.pinv(X_conv_orth).dot(y)

Make a temporal derivative:

In [ ]:

dhrf = spm_hrf_d(np.arange(24))
x0_dhrf = np.convolve(dhrf, x0, mode='same')

The derivative is not orthogonal to the HRF, after convolution:

In [ ]:

# Basis functions, before convolution with block - nearly orthogonal
hrf.dot(dhrf)

In [ ]:

# After convolution with block - not orthogonal
x0_hrf.dot(x0_dhrf)

We can orthogonalize using the same mechamism as we use for the fit:

In [ ]:

def orth_x_wrt_y(x, y):
    """ Orthonalize vector `x` with respect to matrix `y`
    """
    y_in_x_beta = npl.pinv(y).dot(x)
    y_in_x = y.dot(y_in_x_beta)
    return x - y_in_x

In [ ]:

x0_dhrf_orth = orth_x_wrt_y(x0_dhrf, X_conv_orth)
x0_dhrf_orth.T.dot(X_conv_orth)

In [ ]:

x0_hrf.dot(x0_dhrf_orth)

As we know, adding an orthogonal regressor to the whole the rest of the design cannot affect the fit for other regressors:

In [ ]:

X_dconv_orth = np.column_stack((X_conv_orth[:, 0], x0_dhrf_orth, np.ones_like(x0)))
npl.pinv(X_dconv_orth).dot(y)

With or without noise:

In [ ]:

ye = y + np.random.normal(size=(y.shape))
print(npl.pinv(X_conv_orth).dot(ye))
print(npl.pinv(X_dconv_orth).dot(ye))

But - if the new regressor is not orthogonal to the whole of the rest of the design, the fit can change, even for regressors that are orthogonal to the new regressor:

In [ ]:

# Design in which hrf regressor not orthogonal to constant
partial_orthed = orth_x_wrt_y(x0_dhrf, x0_hrf[:, None])
X_colinear = np.column_stack((x0_hrf, np.ones_like(x0_hrf)))
X_mixed = np.column_stack((x0_hrf, partial_orthed, np.ones_like(x0_hrf)))
X_mixed.T.dot(X_mixed)

So the second temporal derivative regressor is orthogonal to the first HRF regressor, but not orthogonal to the constant. Estimating:

In [ ]:

print(npl.pinv(X_colinear).dot(ye))
print(npl.pinv(X_mixed).dot(ye))

Estimation in the presence of noise caused the parameter for the first HRF regressor to change, even though the added regressor is orthogonal to the first HRF regressor.