# Gaussian process regression.

Marcus Frean 2013, for COMP421

Gaussian process regression. Reads a datafile consisting of a matrix in which each row is a training item. The final column is the target and the preceding columns are the inputs.

Hyperparameters theta control a squared exponential covariance function, and are chosen from Gaussian distributions. These theta are exponentiated to give the length scales in each input dimension, the overall vertical scale of the covariance function (i.e. the max), and the amount of noise assumed to be corrupting the observed target values. That is: length scale in 1st dim of input = exp(theta[0]); length scale in 2nd dim of input = exp(theta[1]); ...; length scale in D-th dim of input = exp(theta[D-1]); vertical scale of covariance fn = exp(theta[D]); assumed variance of target noise = exp(theta[D+1]). NOTE THE EXPONENTIATION OF THETA, which allows us to put simple Gaussian priors on it.

The blue plot shows predictions and error bars (actually variance...) for the initial randomly chosen hyperparameters theta. The red shows those after optimizing theta.

In [15]:
import sys

import numpy.random as rng
import pylab
from numpy import *
from scipy.optimize import fmin_cg

In [16]:
def calcCovariance(X1, X2, theta):
    """
    Build the squared-exponential covariance between two sets of patterns.

    This works either for making a square covariance matrix from training
    data via calcCovariance(X, X, theta), OR for making the cross
    covariances of some test data (against train) via
    calcCovariance(X, Xtest, theta), where Xtest is a matrix of test data
    with the same form as X (i.e. one pattern = one row).

    theta[0..D-1] are log inverse squared length scales (one per input
    dimension) and theta[D] is the log vertical scale; theta[D+1] (log
    noise variance) is not used here.
    """
    # 2.0 gives a squared-exponential covariance function.
    # NB: so far the analytic gradient elsewhere only works for 2.
    the_exponent = 2.0
    (n1, D) = shape(X1)
    if len(shape(X2)) != 2:
        # Tolerate a 1-D X2 by promoting it to a single-row matrix.
        print('warning: shape(X2) is ', shape(X2))
        X2 = X2.reshape((1, D))
    (n2, D2) = shape(X2)
    if D2 != D:
        print('X1 and X2 should have same number of cols ', D, D2)
        print('X1: ', X1)
        print('X2: ', X2)
        sys.exit("oops!")
    # S accumulates the length-scale-weighted squared distances, summed
    # over input dimensions.
    S = zeros((n1, n2))
    for d in range(D):
        A1 = transpose([X1[:, d]]) * ones((n1, n2))
        A2 = [X2[:, d]] * ones((n1, n2))
        S = S + pow(abs(A1 - A2), the_exponent) * exp(theta[d])
    verticalScale = exp(theta[D])
    K = verticalScale * exp(-0.5 * S)
    return K

In [17]:
def setAndSampleHyperprior(D):
    """
    Define Gaussian hyperpriors (mean, variance) over every hyperparameter
    and draw an initial theta from them.

    Returns (theta, hyperprior) where theta is a list of D+2 entries:
    theta[0..D-1] log inverse length scales, theta[D] log vertical scale,
    theta[D+1] log noise variance.  hyperprior is the triple of
    (mean, variance) pairs the draws came from.
    """
    # SET HYPERPRIORS FOR ALL PARAMETERS
    hyperLogInvLengths = (log(0.5), 1.0)  # prior for **inverse** length scales
    hyperLogVertical = (log(1.0), 1.0)    # prior for vertical scale
    hyperLogNoiseVar = (log(0.05), 1.0)   # prior for noise variance
    # collect them under one name
    hyperprior = (hyperLogInvLengths, hyperLogVertical, hyperLogNoiseVar)

    # INITIAL HYPERPARAMETERS, CHOSEN FROM THE HYPERPRIOR
    # (the original never drew logInvLengths, which was a NameError)
    logInvLengths = rng.normal(hyperLogInvLengths[0], hyperLogInvLengths[1], D)
    logVertical = rng.normal(hyperLogVertical[0], hyperLogVertical[1])
    logNoiseVar = rng.normal(hyperLogNoiseVar[0], hyperLogNoiseVar[1])
    # collect them as one vector
    theta = list(logInvLengths)  # indexed from 0..D-1
    theta.append(logVertical)    # indexed at D
    theta.append(logNoiseVar)    # indexed at D+1
    return (theta, hyperprior)

In [18]:
def doCholesky(theta, args):
    """
    Shared preamble for predictions, the log posterior, and its gradients:
    build the noisy covariance matrix, Cholesky-factorise it, and solve
    for alpha = inv(Q) y.
    """
    (X, y, hyperprior) = args
    (n, D) = shape(X)
    noiseVar = exp(theta[D + 1])
    K = calcCovariance(X, X, theta)
    # Add the observation noise onto the diagonal here.
    Q = K + noiseVar * eye(n)
    # Cholesky factorization kicks ass.
    L = linalg.cholesky(Q)
    # Two triangular solves give alpha = inv(Q) y without forming inv(Q).
    alpha = linalg.solve(transpose(L), linalg.solve(L, y))
    return (n, D, K, Q, L, alpha)

In [19]:
def calcGPPrediction(theta, args, Xtest):
    """
    Make a prediction about the output given input Xtest (can be several).

    Returns (meanPred, varPred): the GP posterior mean and the pointwise
    posterior variance of the noise-free function (observation noise is
    NOT added here — demo1Dplot adds it afterwards).
    """
    (n, D, K, Q, L, alpha) = doCholesky(theta, args)

    (X, y, hyperprior) = args
    kstar = calcCovariance(X, Xtest, theta)
    meanPred = dot(transpose(kstar), alpha)
    V = linalg.solve(L, kstar)
    verticalScale = exp(theta[D])
    # k(x*,x*) = verticalScale for this stationary covariance.  The
    # column sums of V*V equal diag(V^T V) — same result as the original
    # diag(dot(V.T, V)) without forming the full m-by-m matrix.
    varPred = ravel(verticalScale - sum(V * V, axis=0))
    return (meanPred, varPred)

In [20]:
def calcNegLogPosterior(theta, args):
    """
    Negative log posterior of theta:
    -(log marginal likelihood + log prior prob of theta).
    """
    (n, D, K, Q, L, alpha) = doCholesky(theta, args)
    (X, y, hyperprior) = args
    # Log marginal likelihood via the Cholesky factor L of Q:
    # log|Q| = 2 * sum(log(diag(L))).
    logL = -0.5 * dot(transpose(y), alpha) - sum(log(diag(L))) - 0.5 * n * log(2 * pi)

    # Include the log prob of the hyperparameters under the hyperprior.
    # (The original left logProbTheta undefined — a NameError.)
    logProbTheta = calcLogProbTheta(theta, hyperprior)

    return -(logL + sum(logProbTheta))  # minus since fmin_cg looks for a MINIMUM

In [21]:
def calcLogProbTheta(theta, hyperprior):
    """
    Return the log prob of each component of theta under the (Gaussian)
    hyperprior, as an array aligned with theta.

    The original never returned a value, only filled indices D and D+1
    (repeatedly, inside the loop), and read hyperprior tuples that were
    local to setAndSampleHyperprior — all fixed here by unpacking the
    hyperprior argument and scoring every component once.
    """
    (hyperLogInvLengths, hyperLogVertical, hyperLogNoiseVar) = hyperprior
    logP = zeros(shape(theta))
    D = len(theta) - 2
    # Gaussian log density for each log-inverse-length-scale component.
    for d in range(D):
        logP[d] = -pow(hyperLogInvLengths[0] - theta[d], 2.0) / (2 * hyperLogInvLengths[1])
        logP[d] -= 0.5 * log(2 * pi * hyperLogInvLengths[1])
    logP[D] = -pow(hyperLogVertical[0] - theta[D], 2.0) / (2 * hyperLogVertical[1])
    logP[D] -= 0.5 * log(2 * pi * hyperLogVertical[1])
    logP[D + 1] = -pow(hyperLogNoiseVar[0] - theta[D + 1], 2.0) / (2 * hyperLogNoiseVar[1])
    logP[D + 1] -= 0.5 * log(2 * pi * hyperLogNoiseVar[1])
    return logP

In [22]:
def calcNegGradLogPosterior(theta, args):
    """
    Calc the negative gradient of the log posterior w.r.t. theta:
    -(d logL / d theta + d log p(theta) / d theta).
    """
    (n, D, K, Q, L, alpha) = doCholesky(theta, args)
    (X, y, hyperprior) = args
    invQ = linalg.solve(transpose(L), linalg.solve(L, eye(n)))
    # yep, I've checked and this seems to be inv(Q). Tick.

    dLogL = zeros(shape(theta))
    invQt = dot(invQ, y)
    # Need matrix of derivatives dK/dtheta_d, which depends on the cov fn
    # form: dK/dtheta_d = -0.5 * exp(theta[d]) * |x - x'|^2 * K.
    for d in range(D):
        lengthScale = exp(theta[d])
        B = [X[:, d]] * ones((n, n))
        V = pow(B - transpose(B), 2) * K
        term1 = dot(transpose(invQt), dot(V, invQt))
        term2 = sum(sum(invQ * V))
        dLogL[d] = -lengthScale * 0.25 * (term1 - term2)
    dLogL[D] = -0.5 * (sum(sum(invQ * K)) - dot(transpose(invQt), dot(K, invQt)))
    dLogL[D + 1] = -0.5 * exp(theta[D + 1]) * (trace(invQ) - dot(transpose(invQt), invQt))

    # Gradient of the log prior of theta (the original left this
    # undefined — a NameError).  For a Gaussian prior with (mean, var),
    # d/dtheta log p(theta) = -(theta - mean) / var.
    (hyperLogInvLengths, hyperLogVertical, hyperLogNoiseVar) = hyperprior
    gradLogProbTheta = zeros(shape(theta))
    for d in range(D):
        gradLogProbTheta[d] = -(theta[d] - hyperLogInvLengths[0]) / hyperLogInvLengths[1]
    gradLogProbTheta[D] = -(theta[D] - hyperLogVertical[0]) / hyperLogVertical[1]
    gradLogProbTheta[D + 1] = -(theta[D + 1] - hyperLogNoiseVar[0]) / hyperLogNoiseVar[1]

    return -(dLogL + gradLogProbTheta)  # minus because fmin_cg finds MINIMUM

In [23]:
def BROKEN_calcNegGradLogL(theta, args):
    """
    calc the negative gradient of the log likelihood of the data.

    NOTE(review): this routine is broken and appears kept only for
    reference — see calcNegGradLogPosterior for the working gradient.
    `X` is read in the loop but never unpacked from `args`, so calling
    this raises a NameError; `Sd` and `noiseVar` are computed but never
    used, and the returned dfX is never filled in.
    """
    (n,D,K,Q,L,alpha) = doCholesky(theta,args)
    invQ = linalg.solve(transpose(L),  linalg.solve(L,eye(n)) )
    # yep, I've checked and this seems to be inv(Q). Tick.

    # Precompute the matrix alpha*alpha^T - inv(Q).
    A = dot(alpha,transpose(alpha)) - invQ
    # NOT SURE I TRUST THIS.... not one of the grads is right, which
    # suggests the problem is in A.
    dfX = zeros(shape(theta))
    for d in range(D):
        B = [X[:,d]] * ones((n,n))  # BUG: X is undefined in this scope
        lengthScale = exp(theta[d])
        Sd = pow(B-transpose(B),2) * lengthScale  # computed but unused
        noiseVar = exp(theta[D+1])                # computed but unused
    # BUT THIS MAY STILL BE BUG-RIDDEN AND BOGUS.
    return -dfX

In [24]:
def demo1Dplot(theta, args, outfile, colour=array([0, 0, 1.0])):
    """
    Plot the data, the GP predictive mean, and a pictorial display of the
    covariance hyperparameters, then save the figure to `outfile`.

    Only sensible for 1-D inputs.  NOTE(review): `colour` is accepted but
    never used in the body — kept for interface compatibility.
    """
    (X, y, hyperprior) = args
    (n, D) = shape(X)

    # Test inputs cover the data range plus a third on either side.
    # (renamed from `xrange`, which shadowed the Python 2 builtin)
    x_span = X.max() - X.min()
    Xtest = arange(X.min() - x_span / 3, X.max() + x_span / 3, (X.max() - X.min()) / 100)
    Xtest.shape = (len(Xtest), 1)

    mu, sig2 = calcGPPrediction(theta, args, Xtest)
    Xtest.shape = (len(Xtest),)
    noiseVar = exp(theta[D + 1])  # it's the last hyperparameter in the list.
    sig2 = sig2 + noiseVar        # predictive variance of noisy observations

    fig = pylab.figure()

    pylab.plot(X, y, 'sk', markersize=5, alpha=0.75)  # show data points
    vertical_range = ravel(y).max() - ravel(y).min()
    minY, maxY = ravel(y).min() - vertical_range / 2, ravel(y).max() + vertical_range / 2
    pylab.axis([X.min(), X.max(), minY, maxY])
    pylab.plot(Xtest, mu, '--', color=[1, .3, .3])
    ymax = max(sqrt(sig2))

    if (True):  # if True, this will indicate hyperparameter settings pictorially!
        # NB: indexing theta[0]/theta[1] directly is only correct for D == 1.
        lengthScale = 1.0 / sqrt(exp(theta[0]))
        dummy = 4 * lengthScale
        XcovShow = arange(-dummy, dummy, dummy / 50)
        covariance = exp(theta[1]) * exp(-exp(theta[0]) * pow(XcovShow, 2))
        offset = Xtest.min() + dummy
        print('lengthScale is ', lengthScale)
        print('offset is ', offset)
        pylab.fill_between(offset + XcovShow, minY, minY + covariance, color='blue', alpha=.3)

        # Also show the noiseVar, to be added to the diagonal of Cov matrix
        blobHeight = exp(theta[1]) + exp(theta[-1])
        pylab.plot([offset], [minY + blobHeight], 'o', color='blue', alpha=.3)
        pylab.plot([offset, offset], [minY, minY + blobHeight], '-', color='blue', alpha=.9)
        pylab.text(offset, minY + .3 * blobHeight, 'Cov', color='black')

    pylab.draw()
    pylab.savefig(outfile, bbox_inches='tight')
    print('Wrote %s ' % (outfile))

In [25]:
# Load the data: one training item per row, the last column is the target.
infile = 'testdata.txt'
# (the original never read the file, so `data` was undefined — a NameError)
data = loadtxt(infile)
X = data[:, 0:-1]  # everything except the last column
y = data[:, -1]    # just the last column
pylab.plot(X, y, 'sk', markersize=5, alpha=0.75)  # show data points
vertical_range = ravel(y).max() - ravel(y).min()
pylab.axis([X.min(), X.max(), ravel(y).min() - vertical_range / 2, ravel(y).max() + vertical_range / 2])

Out[25]:
[-9.9499999999999993, 7.6399999999999997, -4.8900000000000006, 6.75]

In [26]:
# Draw initial hyperparameters and bundle the training set for fmin_cg.
(n, D) = shape(X)
(theta, hyperprior) = setAndSampleHyperprior(D)
args = (X, y, hyperprior)  # Done because fmin_cg needs these all in one box...

print('initial log likelihood: ', -calcNegLogPosterior(theta, args))
print('initial exp(theta): ', exp(theta))

# check that I haven't busted anything lately...!

initial log likelihood:  -20.6670831763
initial exp(theta):  [ 0.38015962  2.15017397  0.06075123]



Let's see some predictions with the initial hyperparameters.

In [27]:
# Show predictions under the initial (random) hyperparameters, 1-D only.
if (D == 1):
    demo1Dplot(theta, args, infile + '_GaussProc.png', array([0.7, 0, 0]))

lengthScale is  1.62187361177
offset is  -9.32583888624
Wrote testdata.txt_GaussProc.png



Update to "optimal" (meaning MAP) hyperparameters, and show the new predictions.

In [28]:
# Update to better (MAP) hyperparameters, using the conjugate gradients
# algorithm ("fmin_cg" from scipy.optimize — the original never imported it).
newTheta = fmin_cg(calcNegLogPosterior, theta, calcNegGradLogPosterior, [args], gtol=1e-4, disp=0)
print('final log likelihood: ', -calcNegLogPosterior(newTheta, args))
print('final exp(theta): ', exp(newTheta))
if (D == 1):
    demo1Dplot(newTheta, args, infile + '_GaussProc.png', array([0.7, 0, 0]))

final log likelihood:  -15.3969492179
final exp(theta):  [ 0.14064185  2.9931069   0.01782606]
lengthScale is  2.6665068782
offset is  -5.14730582052
Wrote testdata.txt_GaussProc.png