This small notebook is based on: K. T. D'Alonzo, "The Johnson-Neyman Procedure as an Alternative to ANCOVA", 2004. This follows the advice of Consuala.
import scipy.linalg as lin
import scipy.stats as sst
import numpy as np
#np.set_printoptions(formatter={'float':lambda x: '% -4.2f' % x})
#print t_test(X, y, [1,-1,0, 0])
def ff(frmt="% -3.2f", padd=6):
    """Build a float formatter for numpy printing.

    Returns a function that renders a float with the %-style format
    `frmt` and left-pads the result with spaces up to width `padd`.
    """
    def fformat(x, f=frmt, p=padd):
        # rjust pads only when the rendered string is narrower than p,
        # matching the original explicit-padding logic exactly.
        return (f % x).rjust(p)
    return fformat
# use the compact fixed-width float formatter for all numpy array printing
np.set_printoptions(formatter={'float':ff(frmt="%-3.1f",padd=6)})
Define the size of the two groups - make a model
def glm(X, Y):
    """Ordinary least-squares fit of Y on the design matrix X.

    Uses the Moore-Penrose pseudo-inverse, so rank-deficient designs
    are handled. Returns (betah, Yfitted, resid).
    """
    estimates = lin.pinv(X).dot(Y)
    fitted = X.dot(estimates)
    return estimates, fitted, Y - fitted
def t_test(X, y, c, clean=True):
    """One-sided t-test of the contrast c in the GLM y = X.beta + e.

    Parameters
    ----------
    X : (n, p) design matrix
    y : (n, 1) data vector
    c : contrast; passed through clean_c (reshaped to a (p, 1) column)
        unless clean=False
    clean : when True, normalize c with clean_c first

    Returns
    -------
    (tval, df, pval) : t statistic, residual degrees of freedom, and the
    upper-tail (one-sided) p-value P(T > tval).
    """
    if clean:
        c = clean_c(c)
    n, p = X.shape
    pXTX = np.linalg.pinv(X.T.dot(X))
    betah, yh, rh = glm(X, y)  # fit the model
    # degrees of freedom: n minus the rank of the design
    # (was bare `linalg.matrix_rank`, an undefined name -> NameError)
    df = (n - np.linalg.matrix_rank(X))
    MRSS = sum((y - yh)**2)/df          # mean residual sum of squares
    t_num = c.T.dot(betah)              # estimated contrast value
    SE = np.sqrt(MRSS * c.T.dot(pXTX).dot(c))
    tval = t_num / SE
    pval = 1.0 - sst.t.cdf(tval, df)    # one-sided p-value
    return tval, df, pval
def clean_c(c):
    """Normalize a contrast to a (p, d) array with the long axis first.

    A 1-D sequence becomes a (p, 1) column vector; a 2-D input is
    reshaped so that max(shape) comes first.
    """
    c = np.asarray(c)
    if len(c.shape) == 1:
        # was bare `newaxis` (undefined name -> NameError)
        c = c[:, np.newaxis]
    if len(c.shape) == 2:
        p = np.max(c.shape)
        d = np.min(c.shape)
        c = c.reshape((p, d))
    return c
def f_test(X, Y, L, verbose=False, clean=True):
    """F-test of the contrast matrix L in the GLM Y = X.B + E.

    Returns (fval, (v1, v2), pval) with v1 = rank(L),
    v2 = n - rank(X), and pval the upper tail of F(v1, v2).
    """
    if clean:
        L = clean_c(L)
    B = lin.pinv(X).dot(Y)
    # Numerator: (L'B)' [L'(X'X)^+ L]^+ (L'B)
    piXtX = lin.pinv(X.T.dot(X))
    piLtpiXtXL = lin.pinv(L.T.dot(piXtX).dot(L))
    LtB = L.T.dot(B)
    Fnum = LtB.T.dot(piLtpiXtXL).dot(LtB)
    # Denominator: residual sum of squares of the full model
    SSRX = Y.T.dot(Y) - B.T.dot(X.T.dot(X)).dot(B)
    # was bare `matrix_rank` (undefined name -> NameError)
    v1 = np.linalg.matrix_rank(L)
    v2 = X.shape[0] - np.linalg.matrix_rank(X)
    if verbose:
        # float() collapses the (1, 1) arrays for %-formatting
        print('Fnum: %f, df1=%d, Fdenom: %f, df2=%d, \n'
              % (float(Fnum), v1, float(SSRX), v2))
    fval = (Fnum/v1)/(SSRX/v2)
    pval = 1.0 - sst.f.cdf(fval, v1, v2)
    return (fval, (v1, v2), pval)
def proj(X, Y=None):
    """Orthogonal projection onto the column space of X.

    With Y None, return the (n, n) projector P = U U' built from the
    above-tolerance left singular vectors of X; otherwise return P.dot(Y)
    without forming P explicitly.
    """
    u, s, vt = lin.svd(X, full_matrices=False)
    # rank tolerance, same criterion as numpy.linalg.matrix_rank
    # (was bare `finfo`, an undefined name -> NameError)
    tol = s.max() * max(X.shape) * np.finfo(s.dtype).eps
    nz = np.where(s > tol)[0]
    if Y is None:
        return u[:, nz].dot(u[:, nz].T)
    else:
        return u[:, nz].dot(u[:, nz].T.dot(Y))
def R_proj(X, Y=None):
    """Residual-forming projector: I - P_X, or its application Y - P_X.Y."""
    if Y is not None:
        return Y - proj(X, Y)
    return np.eye(X.shape[0]) - proj(X)
# module-level group sizes, also read by simulate() and the plotting cells
n1,n2=(16,16)
n = n1+n2
def make_model(n=(16, 16), offset=0):
    """Build a two-group ANCOVA design with one covariate per group.

    Columns are [g1, g2, cg1, cg2]: group indicator columns followed by
    the random integer covariate split by group; an intercept column is
    appended when `offset` is truthy.
    """
    n1, n2 = n[0], n[1]
    ntot = n1 + n2
    # per-group covariate draws (integers in [0, 20))
    cov1 = np.random.randint(0, 20, size=(n1, 1)) + 0
    cov2 = np.random.randint(0, 20, size=(n2, 1)) + 0
    # centering is disabled (mean multiplied by zero) but the machinery
    # is kept so it can be switched back on for experiments
    mean_cg = np.vstack((cov1, cov2)).mean() * 0
    col_cg1 = np.vstack((cov1 - mean_cg, np.zeros((n2, 1))))
    col_cg2 = np.vstack((np.zeros((n1, 1)), cov2 - mean_cg))
    g1 = np.vstack((np.ones((n1, 1)), np.zeros((n2, 1))))
    g2 = np.vstack((np.zeros((n1, 1)), np.ones((n2, 1))))
    columns = [g1, g2, col_cg1, col_cg2]
    if offset:
        columns.append(np.ones((ntot, 1)))  # intercept
    return np.hstack(columns)
# a function to generate data:
def make_data(X, w, sig=1.0):
    """Simulate data y = X.w + gaussian noise with standard deviation sig."""
    weights = np.array(w).reshape((X.shape[1], 1))
    noise = sig * np.random.normal(size=(X.shape[0], 1))
    return X.dot(weights) + noise
# build and inspect the default two-group design
X = make_model()
print X
print X.mean(axis=0)
[[ 1.0 0.0 17.0 0.0] [ 1.0 0.0 4.0 0.0] [ 1.0 0.0 18.0 0.0] [ 1.0 0.0 5.0 0.0] [ 1.0 0.0 19.0 0.0] [ 1.0 0.0 6.0 0.0] [ 1.0 0.0 8.0 0.0] [ 1.0 0.0 12.0 0.0] [ 1.0 0.0 15.0 0.0] [ 1.0 0.0 6.0 0.0] [ 1.0 0.0 14.0 0.0] [ 1.0 0.0 1.0 0.0] [ 1.0 0.0 10.0 0.0] [ 1.0 0.0 4.0 0.0] [ 1.0 0.0 3.0 0.0] [ 1.0 0.0 2.0 0.0] [ 0.0 1.0 0.0 3.0] [ 0.0 1.0 0.0 13.0] [ 0.0 1.0 0.0 10.0] [ 0.0 1.0 0.0 0.0] [ 0.0 1.0 0.0 8.0] [ 0.0 1.0 0.0 15.0] [ 0.0 1.0 0.0 18.0] [ 0.0 1.0 0.0 3.0] [ 0.0 1.0 0.0 5.0] [ 0.0 1.0 0.0 5.0] [ 0.0 1.0 0.0 14.0] [ 0.0 1.0 0.0 8.0] [ 0.0 1.0 0.0 14.0] [ 0.0 1.0 0.0 15.0] [ 0.0 1.0 0.0 9.0] [ 0.0 1.0 0.0 11.0]] [ 0.5 0.5 4.5 4.7]
# one dataset: group effects 10 and 15, covariate slope 3 in both groups
y = make_data(X, [10, 15, 3, 3], 1.0)
print y.shape
(32, 1)
def simulate(X, gen_with, contrasts, N, Xa=None, sig=1.):
    """Run N simulations: generate data from X with weights gen_with, then
    t-test each contrast on the analysis design Xa (defaults to X).

    Returns a dict mapping 'p_0', 'p_1', ... to the per-simulation
    p-value arrays, plus 'm1'/'m2' with the per-simulation group means.

    NOTE(review): reads the module-level globals n1 and n for the group
    means -- confirm they match X's group sizes.
    """
    res = {}
    pkeys = ['p_' + str(i) for i in range(len(contrasts))]
    print(pkeys)
    for k in pkeys:
        res[k] = -np.ones((N,))
    res['m1'] = -np.ones((N,))
    res['m2'] = -np.ones((N,))
    # was `Xa == None`: on an ndarray that comparison is elementwise and
    # cannot be used in a boolean context; identity test is correct
    if Xa is None:
        Xa = X
    for i in range(N):
        y = make_data(X, gen_with, sig)
        for k, c in zip(pkeys, contrasts):
            t, df, p = t_test(Xa, y, c)
            res[k][i] = p[0][0]   # p is a (1, 1) array
        res['m1'][i] = y[0:n1].mean()
        res['m2'][i] = y[n1:n].mean()
    return res
def plot_res(res):
    """Plot log10 p-values for each 'p_*' key of res and return, per
    contrast, the count of p-values at or below 0.05.

    NOTE(review): relies on the module-level names `plt` (matplotlib
    import not visible in this chunk) and `N`; `N` must equal the length
    of the p-value arrays -- confirm against the caller.
    """
    pkeys = [k for k in res.keys() if k[:2] == 'p_']
    nplot = len(pkeys)
    fig, ax = plt.subplots(1,nplot)
    fig.set_figwidth(12)
    nobs = -np.ones((nplot,))
    for ip in range(nplot):
        ax[ip].plot(np.log10(res[pkeys[ip]]))
        # horizontal reference line at the 0.05 threshold
        ax[ip].plot(np.log10(.05)*np.ones((N,)))
        nobs[ip] = (res[pkeys[ip]] <= 0.05).sum()
    return nobs
# simulate N datasets with no group-mean difference and a small slope
# difference (0.2) on the second covariate, then count rejections at 5%
N = 500
gen_with = [1.0, 1.0, 0., 0.2]
c1 = [-1,1,0,0]
c2 = [0,0,-1,1]
contrasts = [c1, c2]
res = simulate(X, gen_with, contrasts, N)
nobs = plot_res(res)
alpha = 0.05
print nobs, N*alpha
print nobs/(N*1.0), alpha
['p_0', 'p_1'] [ 21.0 459.0] 25.0 [ 0.0 0.9] 0.05
What is the chance of obtaining such a value if p = 0.05?
The rejection count follows a binomial distribution; we want to know whether N*0.05 is a likely outcome:
# under H0 the rejection count is Binomial(N, alpha); compute the
# upper-tail probability of each observed count
binom = sst.binom(N, alpha)
print [1. - binom.cdf(obs) for obs in nobs]
# print "t = ",t ,"df = ", df , "p = ", p
# f,df,p = f_test(X, y, [1,-1,0, 0])
# print "f = ",f ,"df = ", df , "p = ", p
[0.75905461584108314, 1.1102230246251565e-16]
# plot one simulated dataset against its group covariate
y = make_data(X, gen_with)
print gen_with
fig, ax = plt.subplots(1)
ax.plot(X[0:n1,2], y[0:n1], 'b*', X[n1:n,3], y[n1:n] ,'ro')
ax.set_xlabel('covariate')
[1.0, 1.0, 0.0, 0.2]
<matplotlib.text.Text at 0x88774d0>
# average difference between the two group means across simulations
print (res['m1'] - res['m2']).mean()
-1.88122620438
# redo the simulation, analysing with Xa: the covariates orthogonalized
# against the group indicators via the residual projector R_proj
N = 500
#gen_with = [1.0, 1.0, 0., 0.]
#c1 = [-1,1,0,0]
#c2 = [0,0,-1,1]
#contrasts = [c1, c2]
X1 = X[:,:2]
Xo = R_proj(X1, X[:,2:])
Xa = np.hstack((X1,Xo))
res = simulate(X, gen_with, contrasts, N, Xa)
nobs = plot_res(res)
alpha = 0.05
print nobs, N*alpha
print nobs/(N*1.0), alpha
['p_0', 'p_1'] [ 500.0 448.0] 25.0 [ 1.0 0.9] 0.05
# plot another dataset generated from the same weights
y = make_data(X, gen_with)
print gen_with
fig, ax = plt.subplots(1)
ax.plot(X[0:n1,2], y[0:n1], 'b*', X[n1:n,3], y[n1:n] ,'ro')
ax.set_xlabel('covariate')
[1.0, 1.0, 0.0, 0.2]
<matplotlib.text.Text at 0x9505350>
# build a design Xm with per-group mean-centered covariates
mc1 = X[0:n1,[2]] - X[0:n1,2].mean()
print mc1.shape
# NOTE(review): this row slice uses n2:n; it only selects group 2
# because n1 == n2 here -- confirm intent for unequal group sizes
mc2 = X[n2:n, [3]] - (X[n2:n,3]).mean()
print mc2.shape
# NOTE(review): mc1 (group-1 covariate) is stacked BELOW the zeros and
# mc2 (group-2 covariate) ABOVE them, i.e. opposite to the row layout of
# cg1/cg2 in X -- looks swapped; verify against make_model
mc1 = np.vstack((np.zeros((n1,1)), mc1))
mc2 = np.vstack((mc2, np.zeros((n2,1))))
print mc1.shape, mc2.shape
g1 = np.vstack((np.ones((n1,1)), np.zeros((n2,1))))
g2 = np.vstack((np.zeros((n1,1)), np.ones((n2,1))))
Xm = np.hstack((g1, g2, mc1, mc2))
#print Xm
print np.hstack((y, Xm))
#print len(ff(123.56))
Xm.mean(axis=0)
print t_test(X, y, [1, -1, 0, 0])