%pylab inline

import numpy as np
import numpy.linalg as npl
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

from scipy.stats import t as t_dist, f as f_dist, gamma

def scale_design_mtx(X):
    """utility to scale the design matrix for display

    This scales the columns to their own range so we can see the variations 
    across the column for all the columns, regardless of the scaling of the 
    column.
    """
    mi, ma = X.min(axis=0), X.max(axis=0)
    col_neq = (ma - mi) > 1.e-8
    Xs = np.ones_like(X)
    mi = mi[col_neq]
    ma = ma[col_neq]
    Xs[:,col_neq] = (X[:,col_neq] - mi)/(ma - mi)
    return Xs

def show_design(X, design_title, **kwargs):
    """ Show the design matrix nicely """
    plt.figure()
    plt.gray() # Gray colormap
    plt.imshow(scale_design_mtx(X), interpolation='nearest', **kwargs)
    plt.title(design_title)

def fit_ols(Y, X):
    """ betas, fitted data, and residuals from OLS linear fit.
    
    This is OLS estimation; we assume the errors to have independent
    and identical normal distributions around zero for each $i$ in 
    $\Epsilon_i$ (i.i.d)
    """
    Y = np.asarray(Y)
    X = np.asarray(X)
    # Get the estimated betas
    betah   =  npl.pinv(X).dot(Y)
    fitted =  X.dot(betah)
    resid   =  Y - fitted
    return betah, fitted, resid

def t_stat(Y, X, C):
    """ betas, t statistic and significance test given data, design matrix, contrast
    
    Ordinary least squares estimation - see `fit_ols` function.
    """
    Y = np.asarray(Y)
    X = np.asarray(X)
    C = np.atleast_2d(C)
    # Calculate the parameters
    B, fitted, resid = fit_ols(Y, X)
    # Residual sum of squares
    RSS   = (resid**2).sum(axis=0)
    # Degrees of freedom - number of observations - number of fitted parameters
    df =  X.shape[0] - npl.matrix_rank(X)
    # Mean residual sum of squares
    MRSS  = RSS / df
    # Standard error of contrast estimate C.dot(B)
    SE    = np.sqrt(MRSS * C.dot(npl.pinv(X.T.dot(X)).dot(C.T)))
    t     = C.dot(B)/SE
    ltp   = t_dist(df).cdf(t) # lower tail p
    p = 1 - ltp # upper tail p
    return B, t, df, p

a_block = np.array([-0.1, 0, 0.1])
baseline = 10
activation = 11
on_off = np.hstack((a_block + baseline, a_block + activation))
n_on_off = 3
e = np.tile(a_block, (2 * n_on_off,))
y = np.tile(on_off, (n_on_off,))
plt.plot(y)

x_on = np.hstack((np.zeros(len(a_block)), np.ones(len(a_block))))
x_off = 1 - x_on
X_over_part = np.column_stack((x_on, x_off, np.ones_like(x_on)))
X_over = np.tile(X_over_part, (n_on_off, 1))
show_design(X_over, 'over parametrized')

X_well_part = np.column_stack((x_on, np.ones_like(x_on)))
X_well = np.tile(X_well_part, (n_on_off, 1))
show_design(X_well, 'well parametrized')

X_times2 = X_well.copy()
X_times2[:, 0] *= 2
show_design(X_times2, 'times 2')

B_over = npl.pinv(X_over).dot(y)
B_over

# Print betas, t statistic, df, p value for different designs, contrasts
print(t_stat(y, X_over, [1, 0, 0]))
print(t_stat(y, X_well, [1, 0]))
print(t_stat(y, X_times2, [1, 0]))

def is_estimable(X, C):
    """ Is a contrast C estimable on a design X
    
    To be estimable, the contrast needs to be orthogonal to the null-space of X.
    The null space of X is all the vectors $k$ such that Xk = 0.
    If $k$ is a vector in the null-space of X, then:
    
    Y = X B + e
    
    and
    
    Y = X (B + k) + e
    
    give the same fit (X B == X (B + k)).
    
    The null space of X is the orthogonal complement of the row-space of X.  So C
    has to be in the row space of X.
    
    A less mathematical way of seeing this is that the information we will need to form
    our betas is made of linear combinations of the rows of X, because the betas are
    constructed from Xt Y. 
    """
    C = np.atleast_2d(C)
    rankX = npl.matrix_rank(X)
    return rankX == npl.matrix_rank(np.vstack((C, X)))

is_estimable(X_over, [1, 0, 0])

is_estimable(X_over, [1, -1, 0])

import sympy
X = sympy.Matrix(X_over)
sympy.pretty_print(X.nullspace())

[u, s, vt] = npl.svd(X_over, full_matrices=False)
tol = s.max() * max(X_over.shape) * np.finfo(s.dtype).eps
nz = np.where(s > tol)[0]
print nz
vt[np.abs(vt) < tol] = 0
print vt.T[:,nz]
print "\nNormalize the columns \n"
print vt.T[:,nz].dot(np.diagflat([1./vt[0,0], 1./vt[1,1]]))

def f_stat(Y, X, col):
    """ betas, F statistic and significance test given data, design and column to test
    
    We do the F statistic long hand, by fitting the model with and without the column to test.
    
    This is OLS estimation; we assume the errors to have independent
    and identical normal distributions around zero for each $i$ in 
    $\Epsilon_i$ (i.i.d).
    """
    Y = np.asarray(Y)
    full_X = np.asarray(X)
    # Delete column to make reduced design
    reduced_X = np.delete(full_X, col, 1)
    # fit both designs
    _, _, full_resid = fit_ols(Y, full_X)
    _, _, reduced_resid = fit_ols(Y, reduced_X)
    n_obs = Y.shape[0]
    # Number of parameters used by each design
    n_p_full = npl.matrix_rank(full_X)
    n_p_reduced = npl.matrix_rank(reduced_X)
    # Extra sum of squares
    full_SS = (full_resid ** 2).sum()
    reduced_SS = (reduced_resid ** 2).sum()
    extra_SS = (reduced_SS - full_SS) / (n_p_full - n_p_reduced)
    # F statistic
    F = extra_SS / (full_SS / (n_obs - n_p_full))
    ltp   = f_dist((n_p_full - n_p_reduced), n_obs - n_p_full).cdf(F) # lower tail p
    p = 1 - ltp # upper tail p
    return F, p

f_stat(y, X_well, 0)

f_stat(y, X_over, 0)

# Remind ourselves about the t statistic on the over-parametrized design
print(t_stat(y, X_over, [0, 0, 1]))

# Add a tiny bit of noise to the over-parametrized design
X_over_tweaked = X_over.copy()
X_over_tweaked[0, 2] = 1e-14
print(t_stat(y, X_over_tweaked, [0, 0, 1]))

# first a quick function to yield two correlated regressor with correlation c
def correlated(c, n=20, seed=None):
    np.random.seed(seed) # To get predictable random numbers
    # generate two uncorrelated regressors
    Y = np.random.normal(0,1,(n,2))
    #make them correlated with mixing matrix M:
    #we want a correlation c between Y[0] and Y[1], with unit variance
    #this is the mixing matrix needed:
    c1 = np.sqrt( (1 + np.sqrt(1-c*c))/2 ) 
    c2 = .5 * c / c1
    M = np.asarray([[c1, c2],[c2, c1]])
    Yc = Y.dot(M)     # Make Yc = MY
    return Yc

n = 20
Xc = correlated(.8, n)
x1c = Xc[:,0]; x2c = Xc[:,1]; 
print np.corrcoef(Xc.T)[0,1]

y_corr = x1c + 1.5*np.random.normal(size=n) + 10
#x1c = np.random.normal(size=n)
#x2c = np.random.normal(size=n)
X_corr = np.column_stack((x1c, x2c, np.ones_like(x2c)))

C = np.array([1, 0, 0]) # The contrast
t_stat(y_corr, X_corr, C)

XtX = X_corr.T.dot(X_corr)
XtX

npl.pinv(XtX)

C.dot(npl.pinv(XtX)).dot(C)


#Xc = correlated(.0, n, seed=42)
#y_corr = Xc[:,0] + 1.*np.random.normal(size=n) + 10


n=20
x1c = np.random.normal(size=n)
x2c = np.random.normal(size=n)
y_corr = x1c + 1.*np.random.normal(size=n) + 10
# if you make the data exactly the same, for the 
# experiment to work, you would have to make the correlated regressor 
# different (use the correlated function with the seed not fixed). 
# Otherwise, you are not going to change your t since 
# the amount of the first regressor that is in the data is just going
# to be scaled, and the scaling doesnt change the t-value. In other
# words you are not changing the amount of data variance that can be 
# explained by the part of x1 that is orthogonal to x2, since x1 and 
# x2 directions remains exactly the same.

    
x2c_corr = np.linspace(0.1, 0.999, 15)

dtype = np.dtype(dict(names=['t', 'mss', 'b0', 'b1', 'b2', 'xvar', 'df'],
                      formats=['f8'] * 7))
res = np.zeros(len(x2c_corr), dtype=dtype)

for i, x2c_corr in enumerate(x2c_corr):
    new_x1c = (1 - x2c_corr) * x1c + x2c_corr * x2c
    #Xc = correlated(x2c_corr, n, seed=42)
    #new_x1c = Xc[:,0]; x2c = Xc[:,1];
    #print np.corrcoef(new_x1c,x2c)[0,1]
    X_corr = np.column_stack((new_x1c, x2c, np.ones_like(x2c)))
    y_corr = new_x1c + 1.*np.random.normal(size=n) + 10
    
    betas, res[i]['t'], res[i]['df'], p = t_stat(y_corr, X_corr, C)
    res[i]['b0'], res[i]['b1'], res[i]['b2'] = betas[:]
    fitted = X_corr.dot(betas)
    res[i]['mss'] = ((y_corr - fitted)**2).sum() / res[i]['df']
    res[i]['xvar'] = C.dot(npl.pinv(X_corr.T.dot(X_corr)).dot(C))
print(mlab.rec2txt(res))

def spm_hrf(t):
    """ Return SPM hrf sampled at times `t`
    """
    # gamma.pdf only defined for t > 0
    hrf = np.zeros_like(t, dtype=np.float)
    hrf[t > 0] = gamma.pdf(t[t > 0], 6, 0, 1) - gamma.pdf(t[t > 0], 16, 0, 1) / 6.
    return hrf / np.sum(hrf)

def spm_hrf_d(t):
    """ Return temporal derivative of SPM HRF sampled at times `t`
    """
    # This is what spm does!
    return spm_hrf(t) - spm_hrf(t - 1)

t = np.arange(24)
plt.plot(t, spm_hrf(t), label='HRF')
plt.plot(t, spm_hrf_d(t), label='TD')
plt.legend()

block_length = 24
on_off = np.hstack((np.zeros(block_length), np.ones(block_length)))
off_on = 1 - on_off
X_hrf_over = np.column_stack((on_off, off_on, np.ones_like(on_off)))
X_hrf_over = np.tile(X_hrf_over, (10, 1))
show_design(X_hrf_over, 'before convolution', aspect=0.01)
S = npl.svd(X_hrf_over, compute_uv=False)
print('Smallest singular value (if close to 0 matrix may be rank deficient)')
print(S.min())
# Is -1 -1 1 still in the null space? Yes.
np.allclose(X_hrf_over.dot([-1, -1, 1]), 0)

hrf = spm_hrf(t)
x0 = np.convolve(hrf, X_hrf_over[:, 0], mode='same')
x1 = np.convolve(hrf, X_hrf_over[:, 1], mode='same')
X_conv_over = np.column_stack((x0, x1, np.ones_like(x0)))
show_design(X_conv_over, 'Convolved design', aspect=0.01)
S = npl.svd(X_conv_over, compute_uv=False)
print('Smallest singular value (if close to 0 matrix may be rank deficient)')
print(S.min())

# Is -1 -1 1 still in the null space?
np.allclose(X_conv_over.dot([-1, -1, 1]), 0)

x0_hrf = X_conv_over[:, 0]
X_conv_well = np.column_stack((x0_hrf, np.ones_like(x0)))
y = (x0_hrf - x0_hrf.mean()) * 1 + 10

npl.pinv(X_conv_well).dot(y)

x0_hrf.mean()

X_conv_orth = np.column_stack((x0_hrf - x0_hrf.mean(), np.ones_like(x0)))
npl.pinv(X_conv_orth).dot(y)

dhrf = spm_hrf_d(np.arange(24))
x0_dhrf = np.convolve(dhrf, x0, mode='same')

# Basis functions, before convolution with block - nearly orthogonal
hrf.dot(dhrf)

# After convolution with block - not orthogonal
x0_hrf.dot(x0_dhrf)

def orth_x_wrt_y(x, y):
    """ Orthonalize vector `x` with respect to matrix `y`
    """
    y_in_x_beta = npl.pinv(y).dot(x)
    y_in_x = y.dot(y_in_x_beta)
    return x - y_in_x

x0_dhrf_orth = orth_x_wrt_y(x0_dhrf, X_conv_orth)
x0_dhrf_orth.T.dot(X_conv_orth)

x0_hrf.dot(x0_dhrf_orth)

X_dconv_orth = np.column_stack((X_conv_orth[:, 0], x0_dhrf_orth, np.ones_like(x0)))
npl.pinv(X_dconv_orth).dot(y)

ye = y + np.random.normal(size=(y.shape))
print(npl.pinv(X_conv_orth).dot(ye))
print(npl.pinv(X_dconv_orth).dot(ye))

# Design in which hrf regressor not orthogonal to constant
partial_orthed = orth_x_wrt_y(x0_dhrf, x0_hrf[:, None])
X_colinear = np.column_stack((x0_hrf, np.ones_like(x0_hrf)))
X_mixed = np.column_stack((x0_hrf, partial_orthed, np.ones_like(x0_hrf)))
X_mixed.T.dot(X_mixed)

print(npl.pinv(X_colinear).dot(ye))
print(npl.pinv(X_mixed).dot(ye))