This small notebook is based on: K. T. D'Alonzo, "The Johnson-Neyman Procedure as an Alternative to ANCOVA", 2004. This follows the advice of Consuala.
import scipy.linalg as lin
import scipy.stats as sst
import numpy as np
#np.set_printoptions(formatter={'float':lambda x: '% -4.2f' % x})
#print t_test(X, y, [1,-1,0, 0])
def ff(frmt="% -3.2f", padd=6):
    """Build a float formatter for numpy printing.

    Returns a function that renders a float with the %-style format
    `frmt` and left-pads the result with spaces up to width `padd`.
    """
    def fformat(x, f=frmt, p=padd):
        # rjust pads only when the rendered string is narrower than p,
        # matching the original explicit-padding logic exactly.
        return (f % x).rjust(p)
    return fformat
# use the compact fixed-width float formatter for all numpy array printing
np.set_printoptions(formatter={'float':ff(frmt="%-3.1f",padd=6)})
Define the size of the two groups - make a model
def glm(X, Y):
    """Ordinary least-squares fit of Y on the design matrix X.

    Uses the Moore-Penrose pseudo-inverse, so rank-deficient designs
    are handled. Returns (betah, Yfitted, resid).
    """
    estimates = lin.pinv(X).dot(Y)
    fitted = X.dot(estimates)
    return estimates, fitted, Y - fitted
def t_test(X, y, c, clean=True):
    """One-sided t-test of the contrast c in the GLM y = X.beta + e.

    Parameters
    ----------
    X : (n, p) design matrix
    y : (n, 1) data vector
    c : contrast; passed through clean_c (reshaped to a (p, 1) column)
        unless clean=False
    clean : when True, normalize c with clean_c first

    Returns
    -------
    (tval, df, pval) : t statistic, residual degrees of freedom, and the
    upper-tail (one-sided) p-value P(T > tval).
    """
    if clean:
        c = clean_c(c)
    n, p = X.shape
    pXTX = np.linalg.pinv(X.T.dot(X))
    betah, yh, rh = glm(X, y)  # fit the model
    # degrees of freedom: n minus the rank of the design
    # (was bare `linalg.matrix_rank`, an undefined name -> NameError)
    df = (n - np.linalg.matrix_rank(X))
    MRSS = sum((y - yh)**2)/df          # mean residual sum of squares
    t_num = c.T.dot(betah)              # estimated contrast value
    SE = np.sqrt(MRSS * c.T.dot(pXTX).dot(c))
    tval = t_num / SE
    pval = 1.0 - sst.t.cdf(tval, df)    # one-sided p-value
    return tval, df, pval
def clean_c(c):
    """Normalize a contrast to a (p, d) array with the long axis first.

    A 1-D sequence becomes a (p, 1) column vector; a 2-D input is
    reshaped so that max(shape) comes first.
    """
    c = np.asarray(c)
    if len(c.shape) == 1:
        # was bare `newaxis` (undefined name -> NameError)
        c = c[:, np.newaxis]
    if len(c.shape) == 2:
        p = np.max(c.shape)
        d = np.min(c.shape)
        c = c.reshape((p, d))
    return c
def f_test(X, Y, L, verbose=False, clean=True):
    """F-test of the contrast matrix L in the GLM Y = X.B + E.

    Returns (fval, (v1, v2), pval) with v1 = rank(L),
    v2 = n - rank(X), and pval the upper tail of F(v1, v2).
    """
    if clean:
        L = clean_c(L)
    B = lin.pinv(X).dot(Y)
    # Numerator: (L'B)' [L'(X'X)^+ L]^+ (L'B)
    piXtX = lin.pinv(X.T.dot(X))
    piLtpiXtXL = lin.pinv(L.T.dot(piXtX).dot(L))
    LtB = L.T.dot(B)
    Fnum = LtB.T.dot(piLtpiXtXL).dot(LtB)
    # Denominator: residual sum of squares of the full model
    SSRX = Y.T.dot(Y) - B.T.dot(X.T.dot(X)).dot(B)
    # was bare `matrix_rank` (undefined name -> NameError)
    v1 = np.linalg.matrix_rank(L)
    v2 = X.shape[0] - np.linalg.matrix_rank(X)
    if verbose:
        # float() collapses the (1, 1) arrays for %-formatting
        print('Fnum: %f, df1=%d, Fdenom: %f, df2=%d, \n'
              % (float(Fnum), v1, float(SSRX), v2))
    fval = (Fnum/v1)/(SSRX/v2)
    pval = 1.0 - sst.f.cdf(fval, v1, v2)
    return (fval, (v1, v2), pval)
def proj(X, Y=None):
    """Orthogonal projection onto the column space of X.

    With Y None, return the (n, n) projector P = U U' built from the
    above-tolerance left singular vectors of X; otherwise return P.dot(Y)
    without forming P explicitly.
    """
    u, s, vt = lin.svd(X, full_matrices=False)
    # rank tolerance, same criterion as numpy.linalg.matrix_rank
    # (was bare `finfo`, an undefined name -> NameError)
    tol = s.max() * max(X.shape) * np.finfo(s.dtype).eps
    nz = np.where(s > tol)[0]
    if Y is None:
        return u[:, nz].dot(u[:, nz].T)
    else:
        return u[:, nz].dot(u[:, nz].T.dot(Y))
def R_proj(X, Y=None):
    """Residual-forming projector: I - P_X, or its application Y - P_X.Y."""
    if Y is not None:
        return Y - proj(X, Y)
    return np.eye(X.shape[0]) - proj(X)
# module-level group sizes, also read by simulate() and the plotting cells
n1,n2=(16,16)
n = n1+n2
def make_model(n=(16, 16), offset=0):
    """Build a two-group ANCOVA design with one covariate per group.

    Columns are [g1, g2, cg1, cg2]: group indicator columns followed by
    the random integer covariate split by group; an intercept column is
    appended when `offset` is truthy.
    """
    n1, n2 = n[0], n[1]
    ntot = n1 + n2
    # per-group covariate draws (integers in [0, 20))
    cov1 = np.random.randint(0, 20, size=(n1, 1)) + 0
    cov2 = np.random.randint(0, 20, size=(n2, 1)) + 0
    # centering is disabled (mean multiplied by zero) but the machinery
    # is kept so it can be switched back on for experiments
    mean_cg = np.vstack((cov1, cov2)).mean() * 0
    col_cg1 = np.vstack((cov1 - mean_cg, np.zeros((n2, 1))))
    col_cg2 = np.vstack((np.zeros((n1, 1)), cov2 - mean_cg))
    g1 = np.vstack((np.ones((n1, 1)), np.zeros((n2, 1))))
    g2 = np.vstack((np.zeros((n1, 1)), np.ones((n2, 1))))
    columns = [g1, g2, col_cg1, col_cg2]
    if offset:
        columns.append(np.ones((ntot, 1)))  # intercept
    return np.hstack(columns)
# a function to generate data:
def make_data(X, w, sig=1.0):
    """Simulate data y = X.w + gaussian noise with standard deviation sig."""
    weights = np.array(w).reshape((X.shape[1], 1))
    noise = sig * np.random.normal(size=(X.shape[0], 1))
    return X.dot(weights) + noise
# build and inspect the default two-group design
X = make_model()
print X
print X.mean(axis=0)
[[ 1.0 0.0 17.0 0.0] [ 1.0 0.0 4.0 0.0] [ 1.0 0.0 18.0 0.0] [ 1.0 0.0 5.0 0.0] [ 1.0 0.0 19.0 0.0] [ 1.0 0.0 6.0 0.0] [ 1.0 0.0 8.0 0.0] [ 1.0 0.0 12.0 0.0] [ 1.0 0.0 15.0 0.0] [ 1.0 0.0 6.0 0.0] [ 1.0 0.0 14.0 0.0] [ 1.0 0.0 1.0 0.0] [ 1.0 0.0 10.0 0.0] [ 1.0 0.0 4.0 0.0] [ 1.0 0.0 3.0 0.0] [ 1.0 0.0 2.0 0.0] [ 0.0 1.0 0.0 3.0] [ 0.0 1.0 0.0 13.0] [ 0.0 1.0 0.0 10.0] [ 0.0 1.0 0.0 0.0] [ 0.0 1.0 0.0 8.0] [ 0.0 1.0 0.0 15.0] [ 0.0 1.0 0.0 18.0] [ 0.0 1.0 0.0 3.0] [ 0.0 1.0 0.0 5.0] [ 0.0 1.0 0.0 5.0] [ 0.0 1.0 0.0 14.0] [ 0.0 1.0 0.0 8.0] [ 0.0 1.0 0.0 14.0] [ 0.0 1.0 0.0 15.0] [ 0.0 1.0 0.0 9.0] [ 0.0 1.0 0.0 11.0]] [ 0.5 0.5 4.5 4.7]
# one dataset: group effects 10 and 15, covariate slope 3 in both groups
y = make_data(X, [10, 15, 3, 3], 1.0)
print y.shape
(32, 1)
def simulate(X, gen_with, contrasts, N, Xa=None, sig=1.):
    """Run N simulations: generate data from X with weights gen_with, then
    t-test each contrast on the analysis design Xa (defaults to X).

    Returns a dict mapping 'p_0', 'p_1', ... to the per-simulation
    p-value arrays, plus 'm1'/'m2' with the per-simulation group means.

    NOTE(review): reads the module-level globals n1 and n for the group
    means -- confirm they match X's group sizes.
    """
    res = {}
    pkeys = ['p_' + str(i) for i in range(len(contrasts))]
    print(pkeys)
    for k in pkeys:
        res[k] = -np.ones((N,))
    res['m1'] = -np.ones((N,))
    res['m2'] = -np.ones((N,))
    # was `Xa == None`: on an ndarray that comparison is elementwise and
    # cannot be used in a boolean context; identity test is correct
    if Xa is None:
        Xa = X
    for i in range(N):
        y = make_data(X, gen_with, sig)
        for k, c in zip(pkeys, contrasts):
            t, df, p = t_test(Xa, y, c)
            res[k][i] = p[0][0]   # p is a (1, 1) array
        res['m1'][i] = y[0:n1].mean()
        res['m2'][i] = y[n1:n].mean()
    return res
def plot_res(res):
    """Plot log10 p-values for each 'p_*' key of res and return, per
    contrast, the count of p-values at or below 0.05.

    NOTE(review): relies on the module-level names `plt` (matplotlib
    import not visible in this chunk) and `N`; `N` must equal the length
    of the p-value arrays -- confirm against the caller.
    """
    pkeys = [k for k in res.keys() if k[:2] == 'p_']
    nplot = len(pkeys)
    fig, ax = plt.subplots(1,nplot)
    fig.set_figwidth(12)
    nobs = -np.ones((nplot,))
    for ip in range(nplot):
        ax[ip].plot(np.log10(res[pkeys[ip]]))
        # horizontal reference line at the 0.05 threshold
        ax[ip].plot(np.log10(.05)*np.ones((N,)))
        nobs[ip] = (res[pkeys[ip]] <= 0.05).sum()
    return nobs
# simulate N datasets with no group-mean difference and a small slope
# difference (0.2) on the second covariate, then count rejections at 5%
N = 500
gen_with = [1.0, 1.0, 0., 0.2]
c1 = [-1,1,0,0]
c2 = [0,0,-1,1]
contrasts = [c1, c2]
res = simulate(X, gen_with, contrasts, N)
nobs = plot_res(res)
alpha = 0.05
print nobs, N*alpha
print nobs/(N*1.0), alpha
['p_0', 'p_1'] [ 21.0 459.0] 25.0 [ 0.0 0.9] 0.05
What is the chance of obtaining such a value if p = 0.05?
The rejection count follows a binomial distribution; we want to know whether N*0.05 is a likely outcome:
# under H0 the rejection count is Binomial(N, alpha); compute the
# upper-tail probability of each observed count
binom = sst.binom(N, alpha)
print [1. - binom.cdf(obs) for obs in nobs]
# print "t = ",t ,"df = ", df , "p = ", p
# f,df,p = f_test(X, y, [1,-1,0, 0])
# print "f = ",f ,"df = ", df , "p = ", p
[0.75905461584108314, 1.1102230246251565e-16]
# plot one simulated dataset against its group covariate
y = make_data(X, gen_with)
print gen_with
fig, ax = plt.subplots(1)
ax.plot(X[0:n1,2], y[0:n1], 'b*', X[n1:n,3], y[n1:n] ,'ro')
ax.set_xlabel('covariate')
[1.0, 1.0, 0.0, 0.2]
<matplotlib.text.Text at 0x88774d0>
# average difference between the two group means across simulations
print (res['m1'] - res['m2']).mean()
-1.88122620438
# redo the simulation, analysing with Xa: the covariates orthogonalized
# against the group indicators via the residual projector R_proj
N = 500
#gen_with = [1.0, 1.0, 0., 0.]
#c1 = [-1,1,0,0]
#c2 = [0,0,-1,1]
#contrasts = [c1, c2]
X1 = X[:,:2]
Xo = R_proj(X1, X[:,2:])
Xa = np.hstack((X1,Xo))
res = simulate(X, gen_with, contrasts, N, Xa)
nobs = plot_res(res)
alpha = 0.05
print nobs, N*alpha
print nobs/(N*1.0), alpha
['p_0', 'p_1'] [ 500.0 448.0] 25.0 [ 1.0 0.9] 0.05
# plot another dataset generated from the same weights
y = make_data(X, gen_with)
print gen_with
fig, ax = plt.subplots(1)
ax.plot(X[0:n1,2], y[0:n1], 'b*', X[n1:n,3], y[n1:n] ,'ro')
ax.set_xlabel('covariate')
[1.0, 1.0, 0.0, 0.2]
<matplotlib.text.Text at 0x9505350>
# build a design Xm with per-group mean-centered covariates
mc1 = X[0:n1,[2]] - X[0:n1,2].mean()
print mc1.shape
# NOTE(review): this row slice uses n2:n; it only selects group 2
# because n1 == n2 here -- confirm intent for unequal group sizes
mc2 = X[n2:n, [3]] - (X[n2:n,3]).mean()
print mc2.shape
# NOTE(review): mc1 (group-1 covariate) is stacked BELOW the zeros and
# mc2 (group-2 covariate) ABOVE them, i.e. opposite to the row layout of
# cg1/cg2 in X -- looks swapped; verify against make_model
mc1 = np.vstack((np.zeros((n1,1)), mc1))
mc2 = np.vstack((mc2, np.zeros((n2,1))))
print mc1.shape, mc2.shape
g1 = np.vstack((np.ones((n1,1)), np.zeros((n2,1))))
g2 = np.vstack((np.zeros((n1,1)), np.ones((n2,1))))
Xm = np.hstack((g1, g2, mc1, mc2))
#print Xm
print np.hstack((y, Xm))
#print len(ff(123.56))
Xm.mean(axis=0)
print t_test(X, y, [1, -1, 0, 0])