%matplotlib inline

# import all shogun classes
from modshogun import *

# import all required libraries
import scipy
import scipy.io
import numpy as np
from math import exp,sqrt
import time
import matplotlib.pyplot as plt

def Gaussian_points(Sigma,mu,xmin,xmax,ymin,ymax,delta=0.01):
    """
    This function is used to evaluate the likelihood of a Gaussian distribution on a mesh.

    Parameters:
        Sigma - covariance of the Gaussian
        mu - mean of the Gaussian
        xmin, xmax, ymin, ymax, delta - define the mesh

    Returns:
        X, Y, Z, where Z = log(p(X,Y)) and p is a bivariate Gaussian (mu, Sigma) evaluated at a mesh X,Y
    """
    xlist = np.arange(xmin, xmax, delta)
    ylist = np.arange(ymin, ymax, delta)
    X, Y = np.meshgrid(xlist, ylist)
    model = GaussianDistribution(mu, Sigma)
    Z = np.zeros(len(X)*len(Y))
    idx = 0
    for items in zip(X,Y):
        for sample in zip(items[0],items[1]):
            sample = np.asarray(sample)
            Z[idx] = model.log_pdf(sample)
            idx += 1
    Z = np.asarray(Z).reshape(len(X),len(Y))
    return (X,Y,Z)

def likelihood_points(X,Y,labels,likelihood):
    """
    This function is used to evaluate a given likelihood function on a given mesh of points

    Parameters:
        X, Y - coordinates of the mesh
        labels - labels for the likelihood
        likelihood - likelihood function

    Returns:
        Z - log(p(X,Y,labels)), where p comes from the likelihood
    """
    Z = np.zeros(len(X)*len(Y))
    idx = 0
    for items in zip(X,Y):
        for sample in zip(items[0],items[1]):
            sample = np.asarray(sample)
            lpdf = likelihood.get_log_probability_f(labels, sample).sum()
            Z[idx] = lpdf
            idx += 1
    Z = np.asarray(Z).reshape(len(X),len(Y))
    return Z

def approx_posterior_plot(methods, kernel_func, features, mean_func, labels, likelihoods,
                          kernel_log_scale, xmin, xmax, ymin, ymax, delta, plots):
    """
    This function is used to generate points drawn from an approximated posterior and plot them

    Parameters:
        methods - a list of inference methods used to approximate the posterior
        kernel_func - a covariance function for the GP
        features - X
        mean_func - a mean function for the GP
        labels - Y
        likelihoods - data likelihoods used to model the labels
        kernel_log_scale - a hyper-parameter of the covariance function
        xmin, ymin, xmax, ymax, delta - passed to the arange/meshgrid calls: maximum and minimum values
            used to generate points from an approximated posterior
        plots - plot objects

    Returns:
        Nothing
    """
    (rows, cols) = plots.shape
    methods = np.asarray(methods).reshape(rows, cols)
    likelihoods = np.asarray(likelihoods).reshape(rows, cols)
    for r in xrange(rows):
        for c in xrange(cols):
            inference = methods[r][c]
            likelihood = likelihoods[r][c]
            inf = inference(kernel_func, features, mean_func, labels, likelihood())
            inf.set_scale(exp(kernel_log_scale))
            #get the approximated Gaussian distribution
            mu = inf.get_posterior_mean()
            Sigma = inf.get_posterior_covariance()
            #normalized approximated posterior
            (X,Y,Z) = Gaussian_points(Sigma, mu, xmin, xmax, ymin, ymax, delta)
            plots[r][c].contour(X, Y, np.exp(Z))
            plots[r][c].set_title('posterior via %s'%inf.get_name())
            plots[r][c].axis('equal')

#a toy 2D example (data)
x=np.asarray([sqrt(2),-sqrt(2)]).reshape(1,2)
y=np.asarray([1,-1])

features = RealFeatures(x)
labels = BinaryLabels(y)

kernel_log_sigma = 1.0
kernel_log_scale = 1.5

#a mean function and a covariance function for the GP
mean_func = ConstMean()
#using log_sigma as a hyper-parameter of the GP instead of sigma
kernel_sigma = 2*exp(2*kernel_log_sigma)
kernel_func = GaussianKernel(10, kernel_sigma)
kernel_func.init(features, features)

#a prior distribution derived from the GP by applying the mean function and the covariance function to the data
Sigma = kernel_func.get_kernel_matrix()
Sigma = Sigma * exp(2.0*kernel_log_scale)
mu = mean_func.get_mean_vector(features)
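For orientation before the plots: assuming the standard Bernoulli-logit form implemented by LogitLikelihood, and the usual RBF parameterisation of GaussianKernel in which the width $\tau$ enters as $k(\mathbf{x},\mathbf{x}') = \exp(-\lVert \mathbf{x}-\mathbf{x}'\rVert^2/\tau)$ (so that $\tau = 2e^{2\,\mathrm{kernel\_log\_sigma}} = 2\sigma^2$ recovers the familiar bandwidth $\sigma$), the toy model being contoured is

$$
p(\mathbf{f}) = \mathcal{N}\!\left(\mathbf{f};\ \boldsymbol{\mu},\ e^{2\,\mathrm{kernel\_log\_scale}}\,K\right),
\qquad
p(y_i \mid f_i) = \frac{1}{1 + e^{-y_i f_i}},
\qquad
p(\mathbf{f} \mid \mathbf{y}) \ \propto\ p(\mathbf{f})\,\prod_{i} p(y_i \mid f_i).
$$

In the figures below, Z1, Z2 and Z3 = Z1 + Z2 are therefore the log prior, the log likelihood and the log of the unnormalized (non-Gaussian) true posterior, while each approximate inference method replaces that posterior with a Gaussian whose mean and covariance are read off via get_posterior_mean and get_posterior_covariance.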
delta = 0.1
xmin = -4
xmax = 6
ymin = -6
ymax = 4

#a prior (Gaussian) derived from the GP
(X,Y,Z1) = Gaussian_points(Sigma, mu, xmin, xmax, ymin, ymax, delta)

col_size = 6
f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(col_size*3,col_size))
ax1.contour(X, Y, np.exp(Z1))
ax1.set_title('Prior')
ax1.axis('equal')

#likelihood (inverse logit, A.K.A. Bernoulli-logistic)
#likelihood classes for the inference methods (see the table above)
likelihoods=[ LogitLikelihood, LogitLikelihood,
              LogitVGLikelihood, LogitVGLikelihood,
              LogitVGLikelihood, LogitDVGLikelihood ]

Z2 = likelihood_points(X,Y,labels,LogitLikelihood())
ax2.contour(X, Y, np.exp(Z2))
ax2.set_title('Likelihood')
ax2.axis('equal')

#an unnormalized true posterior (non-Gaussian)
Z3 = Z1+Z2
ax3.contour(X, Y, np.exp(Z3))
ax3.set_title('True posterior')
ax3.axis('equal')

f, plots = plt.subplots(2, 3, figsize=(col_size*3,col_size*2))
#inference methods used to approximate a posterior (see the table below)
methods=[ SingleLaplacianInferenceMethod, SingleLaplacianInferenceMethodWithLBFGS,
          KLApproxDiagonalInferenceMethod, KLCovarianceInferenceMethod,
          KLCholeskyInferenceMethod, KLDualInferenceMethod ]

approx_posterior_plot(methods, kernel_func, features, mean_func, labels, likelihoods,
                      kernel_log_scale, xmin, xmax, ymin, ymax, delta, plots)
plt.show()

def learning_example1(inference, linesearch, likelihood, x_train, y_train, x_test, y_test,
                      kernel_log_sigma, kernel_log_scale):
    """
    This function is used to train a GP classifier given an inference method

    Parameters:
        inference - an inference method used to train the classifier
        linesearch - a linesearch method used in the inference method (see the table below)
        likelihood - a likelihood function to model the data
        x_train, x_test - X for training and testing
        y_train, y_test - Y for training and testing
        kernel_log_sigma, kernel_log_scale - hyper-parameters of the covariance function

    Returns:
        predictive result for the testing data set
    """
    mean_func = ZeroMean()
    kernel_sigma = 2*exp(2*kernel_log_sigma)
    kernel_func = GaussianKernel(10, kernel_sigma)

    #Y is a sample-by-1 vector
    labels_train = BinaryLabels(y_train)
    labels_test = BinaryLabels(y_test)
    #X is a feature-by-sample matrix
    features_train=RealFeatures(x_train)
    features_test=RealFeatures(x_test)

    inf = inference(kernel_func, features_train, mean_func, labels_train, likelihood)
    inf.set_scale(exp(kernel_log_scale))
    try:
        #setting lbfgs parameters
        inf.set_lbfgs_parameters(100,2000,int(linesearch),2000)
        #used to make sure the kernel matrix is positive definite
        inf.set_noise_factor(1e-6)
        inf.set_min_coeff_kernel(1e-5)
        inf.set_max_attempt(10)
    except:
        pass

    gp = GaussianProcessClassification(inf)
    gp.train()
    prob=gp.get_probabilities(features_test)
    return prob, inf.get_name()
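As a quick sanity check of learning_example1, it can be pointed at the 2-point toy data defined earlier. The particular choices below (Laplace inference with LBFGS, the logit likelihood, linesearch type 3, training and testing on the same two points) are illustrative assumptions only and not part of the experiments that follow.

#a minimal sketch: train on the toy data from above and predict on the same points
prob, name = learning_example1(SingleLaplacianInferenceMethodWithLBFGS, 3, LogitLikelihood(),
                               x, y, x, y, kernel_log_sigma, kernel_log_scale)
print "%s predicts P(y=+1): %s" % (name, prob)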
import random

labels={
    "m":1,
    "r":-1,
}

from math import log

def extract(inf, train_size):
    """
    This function is used to process the raw data

    Parameters:
        inf - the path of the raw data
        train_size - number of data points used for training

    Returns:
        x_train, x_test - X for training and testing
        y_train, y_test - Y for training and testing
        B - for computing the information bit
    """
    random.seed(1)
    x=[]
    y=[]
    for line in open(inf):
        line=line.strip()
        info=line.split(',')
        label=labels[info[-1].lower()]
        x.append(map(float, info[:-1]))
        y.append(label)
    #train_size should be less than the size of all available data points
    assert train_size < len(x)
    idx=range(len(y))
    random.shuffle(idx)
    train_idx=set(idx[:train_size])
    test_idx=set(idx[train_size:])

    x_train = np.asarray([value for (idx, value) in enumerate(x) if idx in train_idx]).T
    y_train = np.asarray([label for (idx, label) in enumerate(y) if idx in train_idx])
    x_test = np.asarray([value for (idx, value) in enumerate(x) if idx in test_idx]).T
    y_test = np.asarray([label for (idx, label) in enumerate(y) if idx in test_idx])

    y_train_positive=(y_train==1).sum()
    y_train_negative=(y_train==-1).sum()
    y_test_positive=(y_test==1).sum()
    y_test_negative=(y_test==-1).sum()

    prb_positive=float(y_train_positive)/len(y_train)
    B=float(y_test_positive)*log(prb_positive,2)+float(y_test_negative)*log(1.0-prb_positive,2)
    B=-B/len(y_test)

    return x_train, y_train, x_test, y_test, B

def approx_bit_plot(methods, linesearchs, likelihoods, x_train, y_train, x_test, y_test, plots, lScale, lSigma, B):
    """
    This function is used to plot information bit figures

    Parameters:
        methods - a list of inference methods used to train the classifiers
        linesearchs - a list of linesearch methods used in the inference methods (see the table below)
        likelihoods - a list of likelihood functions to model the data
        x_train, x_test - X for training and testing
        y_train, y_test - Y for training and testing
        plots - plot objects
        lScale, lSigma - grids of hyper-parameter values for the covariance function
        B - for computing the information bits

    Returns:
        Nothing
    """
    #Note that the information bit is used to measure the effectiveness of an inference method
    if len(plots.shape)==1:
        rows=1
        cols=plots.shape[0]
    else:
        (rows, cols) = plots.shape
    methods=np.asarray(methods).reshape(rows, cols)
    linesearchs=np.asarray(linesearchs).reshape(rows, cols)
    likelihoods=np.asarray(likelihoods).reshape(rows, cols)
    for r in xrange(rows):
        for c in xrange(cols):
            inference = methods[r][c]
            linesearch = linesearchs[r][c]
            likelihood = likelihoods[r][c]
            try:
                likelihood.set_noise_factor(1e-15)
                likelihood.set_strict_scale(0.01)
            except:
                pass
            scores=[]
            for items in zip(lScale, lSigma):
                for parameters in zip(items[0],items[1]):
                    lscale=parameters[0]
                    lsigma=parameters[1]
                    (prob, inf_name)=learning_example1(inference, linesearch, likelihood,
                                                       x_train, y_train, x_test, y_test,
                                                       lscale, lsigma)
                    #compute the information bit
                    score=0.0
                    for (label_idx, prb_idx) in zip(y_test, prob):
                        if label_idx==1:
                            score+=(1.0+label_idx)*log(prb_idx,2)
                        else: #label_idx==-1
                            score+=(1.0-label_idx)*log(1.0-prb_idx,2)
                    score=score/(2.0*len(y_test))+B
                    scores.append(score)
            scores=np.asarray(scores).reshape(len(lScale),len(scores)/len(lScale))
            if len(plots.shape)==1:
                sub_plot=plots
            else:
                sub_plot=plots[r]
            CS = sub_plot[c].contour(lScale, lSigma, scores)
            sub_plot[c].clabel(CS, inline=1, fontsize=10)
            sub_plot[c].set_title('information bit for %s'%inf_name)
            sub_plot[c].set_xlabel('log_scale')
            sub_plot[c].set_ylabel('log_sigma')
            sub_plot[c].axis('equal')
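The "information bit" score computed above is the average predictive log probability measured in bits, relative to a baseline classifier that always predicts the training-set class frequency. Written out from the code in extract and approx_bit_plot, with $\pi$ the fraction of positive labels in the training set, $n$ the number of test points, $n_{+}$ and $n_{-}$ the counts of positive and negative test labels, and $p_i$ the predicted probability of class $+1$ for test point $i$:

$$
B = -\frac{1}{n}\Big( n_{+}\log_2 \pi + n_{-}\log_2 (1-\pi) \Big),
\qquad
\mathrm{score} = B + \frac{1}{n}\sum_{i=1}^{n}\left[ \tfrac{1+y_i}{2}\,\log_2 p_i + \tfrac{1-y_i}{2}\,\log_2 (1-p_i) \right].
$$

Larger values indicate better-calibrated predictions; a score of 0 means no improvement over the baseline.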
#an example for the sonar data set
train_size=108
(x_train, y_train, x_test, y_test, B)=extract('../../../data/uci/sonar/sonar.all-data', train_size)

inference_methods =[
    EPInferenceMethod,
    KLDualInferenceMethod,
    SingleLaplacianInferenceMethod,
    KLApproxDiagonalInferenceMethod,
    KLCholeskyInferenceMethod,
    KLCovarianceInferenceMethod,
]
likelihoods =[
    LogitLikelihood(),
    LogitDVGLikelihood(),   #the KLDual method uses a likelihood class that supports dual variational inference
    LogitLikelihood(),
    LogitVGLikelihood(),    #the KL methods use a likelihood class that supports variational inference
    LogitVGLikelihood(),    #the KL methods use a likelihood class that supports variational inference
    LogitVGLikelihood(),    #the KL methods use a likelihood class that supports variational inference
]
linesearchs =[
    3,
    1,   #using the type 3 linesearch method with the KLDual method would cause an error when computing the information bit
    3,
    3,
    3,
    3,
]

col_size=8
lscale_min=0.0
lscale_max=5.0
lsigma_min=0.0
lsigma_max=5.0
delta=0.5
scale=5.0

lscale_list = np.arange(lscale_min, lscale_max, delta)
lsigma_list = np.arange(lsigma_min, lsigma_max, delta*scale)
lScale, lSigma = np.meshgrid(lscale_list, lsigma_list)
f, plots = plt.subplots(2, 3, figsize=(col_size*3,col_size*2))
approx_bit_plot(inference_methods, linesearchs, likelihoods,
                x_train, y_train, x_test, y_test,
                plots, lScale, lSigma, B)
plt.show()

def binary_extract(idx, features, labels):
    """
    This function is used to extract a training set and a test set

    Parameters:
        idx - a 2-wide list of digits (eg, [0,3] means to extract a subset of images and labels for digit 3 and digit 0)
        features - the whole image set
        labels - the whole label set

    Returns:
        binary_features - X (images for binary classification)
        binary_labels - y (labels for binary classification)
    """
    #binary classification
    assert len(idx) == 2
    positive_idx = (labels[idx[0],:] == 1)
    negative_idx = (labels[idx[1],:] == 1)
    binary_idx = (positive_idx | negative_idx)
    ds = binary_idx.shape[-1]
    bin_labels = np.zeros(ds)
    bin_labels[positive_idx] = 1
    bin_labels[negative_idx] = -1

    binary_features = (features[:,binary_idx])
    binary_labels = (bin_labels[binary_idx])
    positive_count = bin_labels[positive_idx].shape[-1]
    negative_count = bin_labels[negative_idx].shape[-1]
    binary_count = binary_labels.shape[-1]
    print "There are %d positive samples and %d negative samples" %(positive_count, negative_count)
    return binary_features, binary_labels

def learning_example2(inference, likelihood, x_train, x_test, y_train, y_test):
    """
    This function is used to train a GP classifier given an inference method

    Parameters:
        inference - an inference method used to train the classifier
        likelihood - a likelihood function to model the data
        x_train, x_test - X for training and testing
        y_train, y_test - Y for training and testing

    Returns:
        Nothing
    """
    #train a GP classifier
    error_eval = ErrorRateMeasure()
    mean_func = ConstMean()
    kernel_log_sigma = 1.0
    kernel_sigma = 2*exp(2*kernel_log_sigma)
    kernel_func = GaussianKernel(10, kernel_sigma)

    #sample by 1
    labels_train = BinaryLabels(y_train)
    labels_test = BinaryLabels(y_test)
    #feature by sample
    features_train=RealFeatures(x_train)
    features_test=RealFeatures(x_test)

    kernel_log_scale = 1.0
    inf = inference(kernel_func, features_train, mean_func, labels_train, likelihood)
    print "\nusing %s"%inf.get_name()
    inf.set_scale(exp(kernel_log_scale))
    try:
        inf.set_lbfgs_parameters(100,80,0,80)
    except:
        pass

    start = time.time()
    gp = GaussianProcessClassification(inf)
    gp.train()
    end = time.time()
    print "cost %.2f seconds at training"%(end-start)

    nlz=inf.get_negative_log_marginal_likelihood()
    print "the negative_log_marginal_likelihood is %.4f"%nlz

    start = time.time()
    #classification on train_data
    pred_labels_train = gp.apply_binary(features_train)
    #classification on test_data
    pred_labels_test = gp.apply_binary(features_test)
    end = time.time()
    print "cost %.2f seconds at prediction"%(end-start)

    error_train = error_eval.evaluate(pred_labels_train, labels_train)
    error_test = error_eval.evaluate(pred_labels_test, labels_test)
    print "Train error : %.2f Test error: %.2f\n" % (error_train, error_test)
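learning_example2 reports hard-label error rates via apply_binary. If class probabilities are wanted instead, the same trained classifier can be queried for them with get_probabilities, exactly as learning_example1 does. A minimal sketch of two lines that could be appended at the end of learning_example2, re-using its gp and features_test objects (shown unindented here):

#sketch only: per-test-point probability of the positive class
prob_test = gp.get_probabilities(features_test)
print "mean P(y=+1) over the test set: %.4f" % prob_test.mean()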
#an example for the USPS data set
inf='../../../data/uci/usps/usps_resampled.mat'
data=scipy.io.loadmat(inf)
train_labels=data['train_labels']
test_labels=data['test_labels']
train_features=data['train_patterns']
test_features=data['test_patterns']

#using images of digit 3 and digit 5 from the data set
idx=[3,5]
#Note that
#y_train and y_test follow the definition in the first section
#the transposes of x_train and x_test follow the definition in the first section
print "Training set statistics"
(x_train, y_train)=binary_extract(idx, train_features, train_labels)
print "Test set statistics"
(x_test, y_test)=binary_extract(idx, test_features, test_labels)

inference_methods=[
    SingleLaplacianInferenceMethodWithLBFGS,  #using the LBFGS optimizer
    SingleLaplacianInferenceMethod,           #using the Newton-Raphson optimizer
]
likelihood = LogitLikelihood()
#using the default optimizer and the default linesearch method
for inference in inference_methods:
    learning_example2(inference, likelihood, x_train, x_test, y_train, y_test)

likelihood = LogitVGLikelihood()
#using the default linesearch method
learning_example2(KLCovarianceInferenceMethod, likelihood, x_train, x_test, y_train, y_test)

likelihood = LogitVGLikelihood()
#using the default linesearch method
learning_example2(KLApproxDiagonalInferenceMethod, likelihood, x_train, x_test, y_train, y_test)

likelihood = LogitVGLikelihood()
#using the default linesearch method
learning_example2(KLCholeskyInferenceMethod, likelihood, x_train, x_test, y_train, y_test)

likelihood = LogitDVGLikelihood()
likelihood.set_strict_scale(0.1)
#using the default linesearch method
learning_example2(KLDualInferenceMethod, likelihood, x_train, x_test, y_train, y_test)

likelihood = LogitVGPiecewiseBoundLikelihood()
likelihood.set_default_variational_bound()
likelihood.set_noise_factor(1e-15)
inference_methods=[
    KLCholeskyInferenceMethod,
    KLApproxDiagonalInferenceMethod,
    KLCovarianceInferenceMethod,
]
for inference in inference_methods:
    #using the default linesearch method
    learning_example2(inference, likelihood, x_train, x_test, y_train, y_test)
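learning_example3 below repeats the setup of learning_example2 with one difference: instead of fixing the kernel bandwidth and GP scale by hand, GradientEvaluation and GradientModelSelection choose them by optimizing the (approximate) marginal likelihood via its gradients with respect to the hyper-parameters, i.e.

$$
\hat{\theta} \;=\; \operatorname*{arg\,min}_{\theta}\; -\log p(\mathbf{y} \mid X, \theta),
\qquad \theta = (\text{GaussianKernel width},\ \text{GP scale}),
$$

where, for the non-Gaussian likelihoods used here, $-\log p(\mathbf{y}\mid X,\theta)$ is the approximation returned by get_negative_log_marginal_likelihood for the chosen inference method.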
def learning_example3(inference, likelihood, x_train, x_test, y_train, y_test):
    """
    This function is used to train a GP classifier given an inference method,
    where the hyper-parameters are automatically optimized

    Parameters:
        inference - an inference method used to train the classifier
        likelihood - a likelihood function to model the data
        x_train, x_test - X for training and testing
        y_train, y_test - Y for training and testing

    Returns:
        Nothing
    """
    error_eval = ErrorRateMeasure()
    mean_func = ZeroMean()
    kernel_log_sigma = 1.0
    kernel_sigma = 2*exp(2*kernel_log_sigma)
    kernel_func = GaussianKernel(10, kernel_sigma)

    #sample by 1
    labels_train = BinaryLabels(y_train)
    labels_test = BinaryLabels(y_test)
    #feature by sample
    features_train=RealFeatures(x_train)
    features_test=RealFeatures(x_test)

    kernel_log_scale = 1.0
    inf = inference(kernel_func, features_train, mean_func, labels_train, likelihood)
    print "\nusing %s"%inf.get_name()
    inf.set_scale(exp(kernel_log_scale))
    try:
        inf.set_lbfgs_parameters(100,80,3,80)
    except:
        pass

    gp = GaussianProcessClassification(inf)

    # evaluate our inference method for its derivatives
    grad = GradientEvaluation(gp, features_train, labels_train, GradientCriterion(), False)
    grad.set_function(inf)

    # handles all of the above structures in memory
    grad_search = GradientModelSelection(grad)

    # search for best parameters and store them
    best_combination = grad_search.select_model()

    # apply best parameters to GP
    best_combination.apply_to_machine(gp)

    # we have to "cast" objects to the specific kernel interface we used (soon to be easier)
    best_width=GaussianKernel.obtain_from_generic(inf.get_kernel()).get_width()
    best_scale=inf.get_scale()
    print "Selected kernel bandwidth:", best_width
    print "Selected kernel scale:", best_scale

    start = time.time()
    gp.train()
    end = time.time()
    print "cost %s seconds at training"%(end-start)

    nlz=inf.get_negative_log_marginal_likelihood()
    print "the negative_log_marginal_likelihood is %.4f"%nlz

    start = time.time()
    #classification on train_data
    pred_labels_train = gp.apply_binary(features_train)
    #classification on test_data
    pred_labels_test = gp.apply_binary(features_test)
    end = time.time()
    print "cost %s seconds at prediction"%(end-start)

    error_train = error_eval.evaluate(pred_labels_train, labels_train)
    error_test = error_eval.evaluate(pred_labels_test, labels_test)
    print "Train error : %.2f Test error: %.2f\n" % (error_train, error_test)

likelihood = LogitVGLikelihood()
likelihood.set_noise_factor(1e-15)
#using the default linesearch method
learning_example3(KLApproxDiagonalInferenceMethod, likelihood, x_train, x_test, y_train, y_test)