%matplotlib inline

# import all shogun classes
from modshogun import *

# import all required libraries
import scipy
import scipy.io
import numpy as np
from math import exp,sqrt
import time
import matplotlib.pyplot as plt

def Gaussian_points(Sigma,mu,xmin,xmax,ymin,ymax,delta=0.01):
    """
    This function is used to evaluate the likelihood of a Gaussian distribution on a mesh.

    Parameters:
        Sigma - covariance of the Gaussian
        mu - mean of the Gaussian
        xmin, xmax, ymin, ymax, delta - define the mesh

    Returns:
        X, Y, Z, where Z = log(p(X,Y)) and p is a bivariate Gaussian (mu, Sigma) evaluated at a mesh X,Y
    """
    xlist = np.arange(xmin, xmax, delta)
    ylist = np.arange(ymin, ymax, delta)
    X, Y = np.meshgrid(xlist, ylist)
    model = GaussianDistribution(mu, Sigma)
    Z = np.zeros(len(X)*len(Y))
    idx = 0
    for items in zip(X,Y):
        for sample in zip(items[0],items[1]):
            sample = np.asarray(sample)
            Z[idx] = model.log_pdf(sample)
            idx += 1
    Z = np.asarray(Z).reshape(len(X),len(Y))
    return (X,Y,Z)

def likelihood_points(X,Y,labels,likelihood):
    """
    This function is used to evaluate a given likelihood function on a given mesh of points

    Parameters:
        X, Y - coordinates of the mesh
        labels - labels for the likelihood
        likelihood - likelihood function

    Returns:
        Z - log(p(X,Y,labels)), where p comes from the likelihood
    """
    Z = np.zeros(len(X)*len(Y))
    idx = 0
    for items in zip(X,Y):
        for sample in zip(items[0],items[1]):
            sample = np.asarray(sample)
            lpdf = likelihood.get_log_probability_f(labels, sample).sum()
            Z[idx] = lpdf
            idx += 1
    Z = np.asarray(Z).reshape(len(X),len(Y))
    return Z

def approx_posterior_plot(methods, kernel_func, features, mean_func, labels, likelihoods,
                          kernel_log_scale, xmin, xmax, ymin, ymax, delta, plots):
    """
    This function is used to generate points drawn from an approximated posterior and plot them

    Parameters:
        methods - a list of inference methods used to approximate the posterior
        kernel_func - a covariance function for the GP
        features - X
        mean_func - a mean function for the GP
        labels - Y
        likelihoods - data likelihoods used to model the labels
        kernel_log_scale - a hyper-parameter of the covariance function
        xmin, ymin, xmax, ymax, delta - passed to the arange/meshgrid calls: maximum and minimum values
            used to generate points from an approximated posterior
        plots - plot objects

    Returns:
        Nothing
    """
    (rows, cols) = plots.shape
    methods = np.asarray(methods).reshape(rows, cols)
    likelihoods = np.asarray(likelihoods).reshape(rows, cols)
    for r in xrange(rows):
        for c in xrange(cols):
            inference = methods[r][c]
            likelihood = likelihoods[r][c]
            inf = inference(kernel_func, features, mean_func, labels, likelihood())
            inf.set_scale(exp(kernel_log_scale))
            #get the approximated Gaussian distribution
            mu = inf.get_posterior_mean()
            Sigma = inf.get_posterior_covariance()
            #normalized approximated posterior
            (X,Y,Z) = Gaussian_points(Sigma, mu, xmin, xmax, ymin, ymax, delta)
            plots[r][c].contour(X, Y, np.exp(Z))
            plots[r][c].set_title('posterior via %s'%inf.get_name())
            plots[r][c].axis('equal')

#a toy 2D example (data)
x=np.asarray([sqrt(2),-sqrt(2)]).reshape(1,2)
y=np.asarray([1,-1])

features = RealFeatures(x)
labels = BinaryLabels(y)

kernel_log_sigma = 1.0
kernel_log_scale = 1.5

#a mean function and a covariance function for the GP
mean_func = ConstMean()
#using log_sigma as a hyper-parameter of the GP instead of sigma
kernel_sigma = 2*exp(2*kernel_log_sigma)
kernel_func = GaussianKernel(10, kernel_sigma)
kernel_func.init(features, features)

#a prior distribution derived from the GP by applying the mean function and the covariance function to the data
Sigma = kernel_func.get_kernel_matrix()
Sigma = Sigma * exp(2.0*kernel_log_scale)
mu = mean_func.get_mean_vector(features)
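For orientation before the plots: assuming the standard Bernoulli-logit form implemented by LogitLikelihood, and the usual RBF parameterisation of GaussianKernel in which the width $\tau$ enters as $k(\mathbf{x},\mathbf{x}') = \exp(-\lVert \mathbf{x}-\mathbf{x}'\rVert^2/\tau)$ (so that $\tau = 2e^{2\,\mathrm{kernel\_log\_sigma}} = 2\sigma^2$ recovers the familiar bandwidth $\sigma$), the toy model being contoured is

$$
p(\mathbf{f}) = \mathcal{N}\!\left(\mathbf{f};\ \boldsymbol{\mu},\ e^{2\,\mathrm{kernel\_log\_scale}}\,K\right),
\qquad
p(y_i \mid f_i) = \frac{1}{1 + e^{-y_i f_i}},
\qquad
p(\mathbf{f} \mid \mathbf{y}) \ \propto\ p(\mathbf{f})\,\prod_{i} p(y_i \mid f_i).
$$

In the figures below, Z1, Z2 and Z3 = Z1 + Z2 are therefore the log prior, the log likelihood and the log of the unnormalized (non-Gaussian) true posterior, while each approximate inference method replaces that posterior with a Gaussian whose mean and covariance are read off via get_posterior_mean and get_posterior_covariance.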
delta = 0.1
xmin = -4
xmax = 6
ymin = -6
ymax = 4

#a prior (Gaussian) derived from the GP
(X,Y,Z1) = Gaussian_points(Sigma, mu, xmin, xmax, ymin, ymax, delta)

col_size = 6
f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(col_size*3,col_size))
ax1.contour(X, Y, np.exp(Z1))
ax1.set_title('Prior')
ax1.axis('equal')

#likelihood (inverse logit, A.K.A. Bernoulli-logistic)
#likelihood classes for the inference methods (see the table above)
likelihoods=[ LogitLikelihood, LogitLikelihood,
              LogitVGLikelihood, LogitVGLikelihood,
              LogitVGLikelihood, LogitDVGLikelihood ]

Z2 = likelihood_points(X,Y,labels,LogitLikelihood())
ax2.contour(X, Y, np.exp(Z2))
ax2.set_title('Likelihood')
ax2.axis('equal')

#an unnormalized true posterior (non-Gaussian)
Z3 = Z1+Z2
ax3.contour(X, Y, np.exp(Z3))
ax3.set_title('True posterior')
ax3.axis('equal')

f, plots = plt.subplots(2, 3, figsize=(col_size*3,col_size*2))
#inference methods used to approximate a posterior (see the table below)
methods=[ SingleLaplacianInferenceMethod, SingleLaplacianInferenceMethodWithLBFGS,
          KLApproxDiagonalInferenceMethod, KLCovarianceInferenceMethod,
          KLCholeskyInferenceMethod, KLDualInferenceMethod ]

approx_posterior_plot(methods, kernel_func, features, mean_func, labels, likelihoods,
                      kernel_log_scale, xmin, xmax, ymin, ymax, delta, plots)
plt.show()

def learning_example1(inference, linesearch, likelihood, x_train, y_train, x_test, y_test,
                      kernel_log_sigma, kernel_log_scale):
    """
    This function is used to train a GP classifier given an inference method

    Parameters:
        inference - an inference method used to train the classifier
        linesearch - a linesearch method used in the inference method (see the table below)
        likelihood - a likelihood function to model the data
        x_train, x_test - X for training and testing
        y_train, y_test - Y for training and testing
        kernel_log_sigma, kernel_log_scale - hyper-parameters of the covariance function

    Returns:
        predictive result for the testing data set
    """
    mean_func = ZeroMean()
    kernel_sigma = 2*exp(2*kernel_log_sigma)
    kernel_func = GaussianKernel(10, kernel_sigma)

    #Y is a sample-by-1 vector
    labels_train = BinaryLabels(y_train)
    labels_test = BinaryLabels(y_test)
    #X is a feature-by-sample matrix
    features_train=RealFeatures(x_train)
    features_test=RealFeatures(x_test)

    inf = inference(kernel_func, features_train, mean_func, labels_train, likelihood)
    inf.set_scale(exp(kernel_log_scale))
    try:
        #setting lbfgs parameters
        inf.set_lbfgs_parameters(100,2000,int(linesearch),2000)
        #used to make sure the kernel matrix is positive definite
        inf.set_noise_factor(1e-6)
        inf.set_min_coeff_kernel(1e-5)
        inf.set_max_attempt(10)
    except:
        pass

    gp = GaussianProcessClassification(inf)
    gp.train()
    prob=gp.get_probabilities(features_test)
    return prob, inf.get_name()
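As a quick sanity check of learning_example1, it can be pointed at the 2-point toy data defined earlier. The particular choices below (Laplace inference with LBFGS, the logit likelihood, linesearch type 3, training and testing on the same two points) are illustrative assumptions only and not part of the experiments that follow.

#a minimal sketch: train on the toy data from above and predict on the same points
prob, name = learning_example1(SingleLaplacianInferenceMethodWithLBFGS, 3, LogitLikelihood(),
                               x, y, x, y, kernel_log_sigma, kernel_log_scale)
print "%s predicts P(y=+1): %s" % (name, prob)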
import random

labels={
    "m":1,
    "r":-1,
}

from math import log

def extract(inf, train_size):
    """
    This function is used to process the raw data

    Parameters:
        inf - the path of the raw data
        train_size - number of data points used for training

    Returns:
        x_train, x_test - X for training and testing
        y_train, y_test - Y for training and testing
        B - for computing the information bit
    """
    random.seed(1)
    x=[]
    y=[]
    for line in open(inf):
        line=line.strip()
        info=line.split(',')
        label=labels[info[-1].lower()]
        x.append(map(float, info[:-1]))
        y.append(label)
    #train_size should be less than the size of all available data points
    assert train_size < len(x)
    idx=range(len(y))
    random.shuffle(idx)
    train_idx=set(idx[:train_size])
    test_idx=set(idx[train_size:])

    x_train = np.asarray([value for (idx, value) in enumerate(x) if idx in train_idx]).T
    y_train = np.asarray([label for (idx, label) in enumerate(y) if idx in train_idx])
    x_test = np.asarray([value for (idx, value) in enumerate(x) if idx in test_idx]).T
    y_test = np.asarray([label for (idx, label) in enumerate(y) if idx in test_idx])

    y_train_positive=(y_train==1).sum()
    y_train_negative=(y_train==-1).sum()
    y_test_positive=(y_test==1).sum()
    y_test_negative=(y_test==-1).sum()

    prb_positive=float(y_train_positive)/len(y_train)
    B=float(y_test_positive)*log(prb_positive,2)+float(y_test_negative)*log(1.0-prb_positive,2)
    B=-B/len(y_test)

    return x_train, y_train, x_test, y_test, B

def approx_bit_plot(methods, linesearchs, likelihoods, x_train, y_train, x_test, y_test, plots, lScale, lSigma, B):
    """
    This function is used to plot information bit figures

    Parameters:
        methods - a list of inference methods used to train the classifiers
        linesearchs - a list of linesearch methods used in the inference methods (see the table below)
        likelihoods - a list of likelihood functions to model the data
        x_train, x_test - X for training and testing
        y_train, y_test - Y for training and testing
        plots - plot objects
        lScale, lSigma - grids of hyper-parameter values for the covariance function
        B - for computing the information bits

    Returns:
        Nothing
    """
    #Note that the information bit is used to measure the effectiveness of an inference method
    if len(plots.shape)==1:
        rows=1
        cols=plots.shape[0]
    else:
        (rows, cols) = plots.shape
    methods=np.asarray(methods).reshape(rows, cols)
    linesearchs=np.asarray(linesearchs).reshape(rows, cols)
    likelihoods=np.asarray(likelihoods).reshape(rows, cols)
    for r in xrange(rows):
        for c in xrange(cols):
            inference = methods[r][c]
            linesearch = linesearchs[r][c]
            likelihood = likelihoods[r][c]
            try:
                likelihood.set_noise_factor(1e-15)
                likelihood.set_strict_scale(0.01)
            except:
                pass
            scores=[]
            for items in zip(lScale, lSigma):
                for parameters in zip(items[0],items[1]):
                    lscale=parameters[0]
                    lsigma=parameters[1]
                    (prob, inf_name)=learning_example1(inference, linesearch, likelihood,
                                                       x_train, y_train, x_test, y_test,
                                                       lscale, lsigma)
                    #compute the information bit
                    score=0.0
                    for (label_idx, prb_idx) in zip(y_test, prob):
                        if label_idx==1:
                            score+=(1.0+label_idx)*log(prb_idx,2)
                        else: #label_idx==-1
                            score+=(1.0-label_idx)*log(1.0-prb_idx,2)
                    score=score/(2.0*len(y_test))+B
                    scores.append(score)
            scores=np.asarray(scores).reshape(len(lScale),len(scores)/len(lScale))
            if len(plots.shape)==1:
                sub_plot=plots
            else:
                sub_plot=plots[r]
            CS = sub_plot[c].contour(lScale, lSigma, scores)
            sub_plot[c].clabel(CS, inline=1, fontsize=10)
            sub_plot[c].set_title('information bit for %s'%inf_name)
            sub_plot[c].set_xlabel('log_scale')
            sub_plot[c].set_ylabel('log_sigma')
            sub_plot[c].axis('equal')
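The "information bit" score computed above is the average predictive log probability measured in bits, relative to a baseline classifier that always predicts the training-set class frequency. Written out from the code in extract and approx_bit_plot, with $\pi$ the fraction of positive labels in the training set, $n$ the number of test points, $n_{+}$ and $n_{-}$ the counts of positive and negative test labels, and $p_i$ the predicted probability of class $+1$ for test point $i$:

$$
B = -\frac{1}{n}\Big( n_{+}\log_2 \pi + n_{-}\log_2 (1-\pi) \Big),
\qquad
\mathrm{score} = B + \frac{1}{n}\sum_{i=1}^{n}\left[ \tfrac{1+y_i}{2}\,\log_2 p_i + \tfrac{1-y_i}{2}\,\log_2 (1-p_i) \right].
$$

Larger values indicate better-calibrated predictions; a score of 0 means no improvement over the baseline.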
#an example for the sonar data set
train_size=108
(x_train, y_train, x_test, y_test, B)=extract('../../../data/uci/sonar/sonar.all-data', train_size)

inference_methods =[
    EPInferenceMethod,
    KLDualInferenceMethod,
    SingleLaplacianInferenceMethod,
    KLApproxDiagonalInferenceMethod,
    KLCholeskyInferenceMethod,
    KLCovarianceInferenceMethod,
]
likelihoods =[
    LogitLikelihood(),
    LogitDVGLikelihood(),   #the KLDual method uses a likelihood class that supports dual variational inference
    LogitLikelihood(),
    LogitVGLikelihood(),    #the KL methods use a likelihood class that supports variational inference
    LogitVGLikelihood(),    #the KL methods use a likelihood class that supports variational inference
    LogitVGLikelihood(),    #the KL methods use a likelihood class that supports variational inference
]
linesearchs =[
    3,
    1,   #using the type 3 linesearch method with the KLDual method would cause an error when computing the information bit
    3,
    3,
    3,
    3,
]

col_size=8
lscale_min=0.0
lscale_max=5.0
lsigma_min=0.0
lsigma_max=5.0
delta=0.5
scale=5.0

lscale_list = np.arange(lscale_min, lscale_max, delta)
lsigma_list = np.arange(lsigma_min, lsigma_max, delta*scale)
lScale, lSigma = np.meshgrid(lscale_list, lsigma_list)
f, plots = plt.subplots(2, 3, figsize=(col_size*3,col_size*2))
approx_bit_plot(inference_methods, linesearchs, likelihoods,
                x_train, y_train, x_test, y_test,
                plots, lScale, lSigma, B)
plt.show()

def binary_extract(idx, features, labels):
    """
    This function is used to extract a training set and a test set

    Parameters:
        idx - a 2-wide list of digits (eg, [0,3] means to extract a subset of images and labels for digit 3 and digit 0)
        features - the whole image set
        labels - the whole label set

    Returns:
        binary_features - X (images for binary classification)
        binary_labels - y (labels for binary classification)
    """
    #binary classification
    assert len(idx) == 2
    positive_idx = (labels[idx[0],:] == 1)
    negative_idx = (labels[idx[1],:] == 1)
    binary_idx = (positive_idx | negative_idx)
    ds = binary_idx.shape[-1]
    bin_labels = np.zeros(ds)
    bin_labels[positive_idx] = 1
    bin_labels[negative_idx] = -1

    binary_features = (features[:,binary_idx])
    binary_labels = (bin_labels[binary_idx])
    positive_count = bin_labels[positive_idx].shape[-1]
    negative_count = bin_labels[negative_idx].shape[-1]
    binary_count = binary_labels.shape[-1]
    print "There are %d positive samples and %d negative samples" %(positive_count, negative_count)
    return binary_features, binary_labels

def learning_example2(inference, likelihood, x_train, x_test, y_train, y_test):
    """
    This function is used to train a GP classifier given an inference method

    Parameters:
        inference - an inference method used to train the classifier
        likelihood - a likelihood function to model the data
        x_train, x_test - X for training and testing
        y_train, y_test - Y for training and testing

    Returns:
        Nothing
    """
    #train a GP classifier
    error_eval = ErrorRateMeasure()
    mean_func = ConstMean()
    kernel_log_sigma = 1.0
    kernel_sigma = 2*exp(2*kernel_log_sigma)
    kernel_func = GaussianKernel(10, kernel_sigma)

    #sample by 1
    labels_train = BinaryLabels(y_train)
    labels_test = BinaryLabels(y_test)
    #feature by sample
    features_train=RealFeatures(x_train)
    features_test=RealFeatures(x_test)

    kernel_log_scale = 1.0
    inf = inference(kernel_func, features_train, mean_func, labels_train, likelihood)
    print "\nusing %s"%inf.get_name()
    inf.set_scale(exp(kernel_log_scale))
    try:
        inf.set_lbfgs_parameters(100,80,0,80)
    except:
        pass

    start = time.time()
    gp = GaussianProcessClassification(inf)
    gp.train()
    end = time.time()
    print "cost %.2f seconds at training"%(end-start)

    nlz=inf.get_negative_log_marginal_likelihood()
    print "the negative_log_marginal_likelihood is %.4f"%nlz

    start = time.time()
    #classification on train_data
    pred_labels_train = gp.apply_binary(features_train)
    #classification on test_data
    pred_labels_test = gp.apply_binary(features_test)
    end = time.time()
    print "cost %.2f seconds at prediction"%(end-start)

    error_train = error_eval.evaluate(pred_labels_train, labels_train)
    error_test = error_eval.evaluate(pred_labels_test, labels_test)
    print "Train error : %.2f Test error: %.2f\n" % (error_train, error_test)
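learning_example2 reports hard-label error rates via apply_binary. If class probabilities are wanted instead, the same trained classifier can be queried for them with get_probabilities, exactly as learning_example1 does. A minimal sketch of two lines that could be appended at the end of learning_example2, re-using its gp and features_test objects (shown unindented here):

#sketch only: per-test-point probability of the positive class
prob_test = gp.get_probabilities(features_test)
print "mean P(y=+1) over the test set: %.4f" % prob_test.mean()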
#an example for the USPS data set
inf='../../../data/uci/usps/usps_resampled.mat'
data=scipy.io.loadmat(inf)
train_labels=data['train_labels']
test_labels=data['test_labels']
train_features=data['train_patterns']
test_features=data['test_patterns']

#using images of digit 3 and digit 5 from the data set
idx=[3,5]
#Note that
#y_train and y_test follow the definition in the first section
#the transposes of x_train and x_test follow the definition in the first section
print "Training set statistics"
(x_train, y_train)=binary_extract(idx, train_features, train_labels)
print "Test set statistics"
(x_test, y_test)=binary_extract(idx, test_features, test_labels)

inference_methods=[
    SingleLaplacianInferenceMethodWithLBFGS,  #using the LBFGS optimizer
    SingleLaplacianInferenceMethod,           #using the Newton-Raphson optimizer
]
likelihood = LogitLikelihood()
#using the default optimizer and the default linesearch method
for inference in inference_methods:
    learning_example2(inference, likelihood, x_train, x_test, y_train, y_test)

likelihood = LogitVGLikelihood()
#using the default linesearch method
learning_example2(KLCovarianceInferenceMethod, likelihood, x_train, x_test, y_train, y_test)

likelihood = LogitVGLikelihood()
#using the default linesearch method
learning_example2(KLApproxDiagonalInferenceMethod, likelihood, x_train, x_test, y_train, y_test)

likelihood = LogitVGLikelihood()
#using the default linesearch method
learning_example2(KLCholeskyInferenceMethod, likelihood, x_train, x_test, y_train, y_test)

likelihood = LogitDVGLikelihood()
likelihood.set_strict_scale(0.1)
#using the default linesearch method
learning_example2(KLDualInferenceMethod, likelihood, x_train, x_test, y_train, y_test)

likelihood = LogitVGPiecewiseBoundLikelihood()
likelihood.set_default_variational_bound()
likelihood.set_noise_factor(1e-15)
inference_methods=[
    KLCholeskyInferenceMethod,
    KLApproxDiagonalInferenceMethod,
    KLCovarianceInferenceMethod,
]
for inference in inference_methods:
    #using the default linesearch method
    learning_example2(inference, likelihood, x_train, x_test, y_train, y_test)
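learning_example3 below repeats the setup of learning_example2 with one difference: instead of fixing the kernel bandwidth and GP scale by hand, GradientEvaluation and GradientModelSelection choose them by optimizing the (approximate) marginal likelihood via its gradients with respect to the hyper-parameters, i.e.

$$
\hat{\theta} \;=\; \operatorname*{arg\,min}_{\theta}\; -\log p(\mathbf{y} \mid X, \theta),
\qquad \theta = (\text{GaussianKernel width},\ \text{GP scale}),
$$

where, for the non-Gaussian likelihoods used here, $-\log p(\mathbf{y}\mid X,\theta)$ is the approximation returned by get_negative_log_marginal_likelihood for the chosen inference method.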
def learning_example3(inference, likelihood, x_train, x_test, y_train, y_test):
    """
    This function is used to train a GP classifier given an inference method,
    where the hyper-parameters are automatically optimized

    Parameters:
        inference - an inference method used to train the classifier
        likelihood - a likelihood function to model the data
        x_train, x_test - X for training and testing
        y_train, y_test - Y for training and testing

    Returns:
        Nothing
    """
    error_eval = ErrorRateMeasure()
    mean_func = ZeroMean()
    kernel_log_sigma = 1.0
    kernel_sigma = 2*exp(2*kernel_log_sigma)
    kernel_func = GaussianKernel(10, kernel_sigma)

    #sample by 1
    labels_train = BinaryLabels(y_train)
    labels_test = BinaryLabels(y_test)
    #feature by sample
    features_train=RealFeatures(x_train)
    features_test=RealFeatures(x_test)

    kernel_log_scale = 1.0
    inf = inference(kernel_func, features_train, mean_func, labels_train, likelihood)
    print "\nusing %s"%inf.get_name()
    inf.set_scale(exp(kernel_log_scale))
    try:
        inf.set_lbfgs_parameters(100,80,3,80)
    except:
        pass

    gp = GaussianProcessClassification(inf)

    # evaluate our inference method for its derivatives
    grad = GradientEvaluation(gp, features_train, labels_train, GradientCriterion(), False)
    grad.set_function(inf)

    # handles all of the above structures in memory
    grad_search = GradientModelSelection(grad)

    # search for best parameters and store them
    best_combination = grad_search.select_model()

    # apply best parameters to GP
    best_combination.apply_to_machine(gp)

    # we have to "cast" objects to the specific kernel interface we used (soon to be easier)
    best_width=GaussianKernel.obtain_from_generic(inf.get_kernel()).get_width()
    best_scale=inf.get_scale()
    print "Selected kernel bandwidth:", best_width
    print "Selected kernel scale:", best_scale

    start = time.time()
    gp.train()
    end = time.time()
    print "cost %s seconds at training"%(end-start)

    nlz=inf.get_negative_log_marginal_likelihood()
    print "the negative_log_marginal_likelihood is %.4f"%nlz

    start = time.time()
    #classification on train_data
    pred_labels_train = gp.apply_binary(features_train)
    #classification on test_data
    pred_labels_test = gp.apply_binary(features_test)
    end = time.time()
    print "cost %s seconds at prediction"%(end-start)

    error_train = error_eval.evaluate(pred_labels_train, labels_train)
    error_test = error_eval.evaluate(pred_labels_test, labels_test)
    print "Train error : %.2f Test error: %.2f\n" % (error_train, error_test)

likelihood = LogitVGLikelihood()
likelihood.set_noise_factor(1e-15)
#using the default linesearch method
learning_example3(KLApproxDiagonalInferenceMethod, likelihood, x_train, x_test, y_train, y_test)