import numpy

x = numpy.array([[0,0],[-1,0.1],[0.3,-0.05],[0.7,0.3],[-0.2,-0.6],[-0.15,-0.63],[-0.25,0.55],[-0.28,0.67]])
y = numpy.array([0,0,0,0,1,1,2,2])

import matplotlib.pyplot as pyplot
%matplotlib inline

def plot_data(features, labels, axis, alpha=1.0):
    # separate features according to their class
    X0, X1, X2 = features[labels==0], features[labels==1], features[labels==2]

    # class 0 data
    axis.plot(X0[:,0], X0[:,1], 'o', color='green', markersize=12, alpha=alpha)
    # class 1 data
    axis.plot(X1[:,0], X1[:,1], 'o', color='red', markersize=12, alpha=alpha)
    # class 2 data
    axis.plot(X2[:,0], X2[:,1], 'o', color='blue', markersize=12, alpha=alpha)

    # set axes limits
    axis.set_xlim(-1.5, 1.5)
    axis.set_ylim(-1.5, 1.5)
    axis.set_aspect('equal')
    axis.set_xlabel('x')
    axis.set_ylabel('y')

figure, axis = pyplot.subplots(1, 1)
plot_data(x, y, axis)
axis.set_title('Toy data set')
pyplot.show()

def make_covariance_ellipse(covariance):
    import matplotlib.patches as patches
    import scipy.linalg as linalg

    # the ellipse is centered at (0,0)
    mean = numpy.array([0,0])

    # eigenvalue decomposition of the covariance matrix (w are eigenvalues and v eigenvectors),
    # keeping only the real part
    w, v = linalg.eigh(covariance)
    # normalize the eigenvector corresponding to the largest eigenvalue
    u = v[0]/linalg.norm(v[0])
    # angle in degrees
    angle = 180.0/numpy.pi*numpy.arctan(u[1]/u[0])
    # fill Gaussian ellipse at 2 standard deviations
    ellipse = patches.Ellipse(mean, 2*w[0]**0.5, 2*w[1]**0.5, 180+angle, color='orange', alpha=0.3)

    return ellipse

# represent the Euclidean distance
figure, axis = pyplot.subplots(1, 1)
plot_data(x, y, axis)
ellipse = make_covariance_ellipse(numpy.eye(2))
axis.add_artist(ellipse)
axis.set_title('Euclidean distance')
pyplot.show()

from modshogun import RealFeatures, MulticlassLabels

features = RealFeatures(x.T)
labels = MulticlassLabels(y.astype(numpy.float64))

from modshogun import LMNN

# number of target neighbours per example
k = 1
lmnn = LMNN(features, labels, k)
# set an initial transform as a start point of the optimization
init_transform = numpy.eye(2)
lmnn.set_maxiter(2000)
lmnn.train(init_transform)

# get the linear transform from LMNN
L = lmnn.get_linear_transform()
# square the linear transform to obtain the Mahalanobis distance matrix
M = numpy.matrix(numpy.dot(L.T, L))

# represent the distance given by LMNN
figure, axis = pyplot.subplots(1, 1)
plot_data(x, y, axis)
ellipse = make_covariance_ellipse(M.I)
axis.add_artist(ellipse)
axis.set_title('LMNN distance')
pyplot.show()

# project original data using L
lx = numpy.dot(L, x.T)

# represent the data in the projected space
figure, axis = pyplot.subplots(1, 1)
plot_data(lx.T, y, axis)
plot_data(x, y, axis, 0.3)
ellipse = make_covariance_ellipse(numpy.eye(2))
axis.add_artist(ellipse)
axis.set_title('LMNN\'s linear transform')
pyplot.show()
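# Optional sanity check (not part of the original demo): with M = L^T L as computed
# above, the Mahalanobis distance (a-b)^T M (a-b) should coincide with the squared
# Euclidean distance between the projected points La and Lb. This sketch assumes the
# arrays x, L and M defined in the cells above.
a, b = x[0], x[4]     # one point of class 0 and one of class 1
diff = a - b
d_mahalanobis = float(numpy.dot(diff, numpy.dot(numpy.asarray(M), diff)))
d_projected = numpy.sum((numpy.dot(L, a) - numpy.dot(L, b))**2)
print('Mahalanobis distance: %.6f, squared distance after projection: %.6f'
      % (d_mahalanobis, d_projected))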
import numpy
import matplotlib.pyplot as pyplot
%matplotlib inline

def sandwich_data():
    from numpy.random import normal

    # number of distinct classes
    num_classes = 6
    # number of points per class
    num_points = 9
    # distance between layers, the points of each class are in a layer
    dist = 0.7

    # memory pre-allocation
    x = numpy.zeros((num_classes*num_points, 2))
    y = numpy.zeros(num_classes*num_points)

    for i,j in zip(xrange(num_classes), xrange(-num_classes//2, num_classes//2 + 1)):
        for k,l in zip(xrange(num_points), xrange(-num_points//2, num_points//2 + 1)):
            x[i*num_points + k, :] = numpy.array([normal(l, 0.1), normal(dist*j, 0.1)])

        y[i*num_points:i*num_points + num_points] = i

    return x, y

def plot_sandwich_data(x, y, axis=pyplot, cols=['r', 'b', 'g', 'm', 'k', 'y']):
    for idx, val in enumerate(numpy.unique(y)):
        xi = x[y==val]
        axis.scatter(xi[:,0], xi[:,1], s=50, facecolors='none', edgecolors=cols[idx])

x, y = sandwich_data()

figure, axis = pyplot.subplots(1, 1, figsize=(5,5))
plot_sandwich_data(x, y, axis)
axis.set_aspect('equal')
axis.set_title('"Sandwich" toy data set')
axis.set_xlabel('x')
axis.set_ylabel('y')
pyplot.show()

from modshogun import KNN, EuclideanDistance, LMNN, RealFeatures, MulticlassLabels

def plot_neighborhood_graph(x, nn, axis=pyplot, cols=['r', 'b', 'g', 'm', 'k', 'y']):
    for i in xrange(x.shape[0]):
        xs = [x[i,0], x[nn[1,i], 0]]
        ys = [x[i,1], x[nn[1,i], 1]]
        axis.plot(xs, ys, cols[int(y[i])])

features = RealFeatures(x.T)
labels = MulticlassLabels(y)

fig, axes = pyplot.subplots(1, 3, figsize=(15, 10))

# use k = 2 instead of 1 because otherwise the method nearest_neighbors just returns the same
# points as their own 1-nearest neighbours
k = 2

knn = KNN(k, EuclideanDistance(features, features), labels)
plot_sandwich_data(x, y, axes[0])
plot_neighborhood_graph(x, knn.nearest_neighbors(), axes[0])
axes[0].set_title('Euclidean neighbourhood in the input space')

lmnn = LMNN(features, labels, k)
# set a large number of iterations. The data set is small so it does not cost a lot, and this way
# we ensure a robust solution
lmnn.set_maxiter(3000)
lmnn.train()

knn.set_distance(lmnn.get_distance())
plot_sandwich_data(x, y, axes[1])
plot_neighborhood_graph(x, knn.nearest_neighbors(), axes[1])
axes[1].set_title('LMNN neighbourhood in the input space')

# plot features in the transformed space, with the neighbourhood graph computed using the Euclidean distance
L = lmnn.get_linear_transform()
xl = numpy.dot(x, L.T)
features = RealFeatures(xl.T)

knn.set_distance(EuclideanDistance(features, features))
plot_sandwich_data(xl, y, axes[2])
plot_neighborhood_graph(xl, knn.nearest_neighbors(), axes[2])
axes[2].set_ylim(-3, 2.5)
axes[2].set_title('Euclidean neighbourhood in the transformed space')

[axes[i].set_xlabel('x') for i in xrange(len(axes))]
[axes[i].set_ylabel('y') for i in xrange(len(axes))]
[axes[i].set_aspect('equal') for i in xrange(len(axes))]
pyplot.show()
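# Optional sketch (not part of the original demo): quantify what the three panels show by
# computing, with plain numpy, the fraction of points whose nearest neighbour (the point
# itself excluded) belongs to the same class, before and after the LMNN transform. It
# assumes the arrays x, xl and y defined in the cells above.
def same_class_neighbour_fraction(points, point_labels):
    # pairwise squared Euclidean distances
    d = numpy.sum((points[:, numpy.newaxis, :] - points[numpy.newaxis, :, :])**2, axis=2)
    # exclude each point from being its own neighbour
    numpy.fill_diagonal(d, numpy.inf)
    nearest = numpy.argmin(d, axis=1)
    return numpy.mean(point_labels[nearest] == point_labels)

print('Same-class nearest neighbours in the input space: %.2f' % same_class_neighbour_fraction(x, y))
print('Same-class nearest neighbours in the transformed space: %.2f' % same_class_neighbour_fraction(xl, y))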
from modshogun import CSVFile, RealFeatures, MulticlassLabels

ape_features = RealFeatures(CSVFile('../../../data/multiclass/fm_ape_gut.dat'))
ape_labels = MulticlassLabels(CSVFile('../../../data/multiclass/label_ape_gut.dat'))

print('Number of examples = %d, number of features = %d.'
      % (ape_features.get_num_vectors(), ape_features.get_num_features()))

def visualize_tdsne(features, labels):
    from modshogun import TDistributedStochasticNeighborEmbedding
    import matplotlib.pyplot as pyplot

    converter = TDistributedStochasticNeighborEmbedding()
    converter.set_target_dim(2)
    converter.set_perplexity(25)

    embedding = converter.embed(features)

    x = embedding.get_feature_matrix()
    y = labels.get_labels()

    pyplot.scatter(x[0, y==0], x[1, y==0], color='green')
    pyplot.scatter(x[0, y==1], x[1, y==1], color='red')
    pyplot.scatter(x[0, y==2], x[1, y==2], color='blue')
    pyplot.show()

visualize_tdsne(ape_features, ape_labels)

from modshogun import KNN, EuclideanDistance
from modshogun import StratifiedCrossValidationSplitting, CrossValidation
from modshogun import CrossValidationResult, MulticlassAccuracy

# set up the classifier
knn = KNN()
knn.set_k(3)
knn.set_distance(EuclideanDistance())

# set up 5-fold cross-validation
splitting = StratifiedCrossValidationSplitting(ape_labels, 5)
# evaluation method
evaluator = MulticlassAccuracy()
cross_validation = CrossValidation(knn, ape_features, ape_labels, splitting, evaluator)
# locking is not supported for kNN, deactivate it to avoid a harmless warning
cross_validation.set_autolock(False)
# number of experiments; the more runs, the smaller the variance of the result
num_runs = 200
cross_validation.set_num_runs(num_runs)

# perform cross-validation and print the result
result = cross_validation.evaluate()
result = CrossValidationResult.obtain_from_generic(result)
print('kNN mean accuracy in a total of %d runs is %.4f.' % (num_runs, result.mean))

from modshogun import LMNN
import numpy

# to make training faster, use a portion of the features
fm = ape_features.get_feature_matrix()
ape_features_subset = RealFeatures(fm[:300, :])

# number of target neighbours in LMNN, here we just use the same value that was used for kNN before
k = 3

lmnn = LMNN(ape_features_subset, ape_labels, k)
lmnn.set_diagonal(True)
lmnn.set_maxiter(1200)
init_transform = numpy.eye(ape_features_subset.get_num_features())
lmnn.train(init_transform)

diagonal = numpy.diag(lmnn.get_linear_transform())
print('%d out of %d elements are non-zero.' % (numpy.sum(diagonal != 0), diagonal.size))

import matplotlib.pyplot as pyplot
%matplotlib inline

statistics = lmnn.get_statistics()

pyplot.plot(statistics.obj.get())
pyplot.grid(True)
pyplot.xlabel('Number of iterations')
pyplot.ylabel('LMNN objective')
pyplot.show()
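# Optional sketch (not part of the original demo): since the transform was restricted to be
# diagonal, the magnitude of each diagonal entry can be read as a crude relevance weight for
# the corresponding feature of the subset used above. Sorting the entries gives a rough
# feature ranking; this assumes the diagonal array computed in the previous cell.
ranking = numpy.argsort(diagonal)[::-1]
print('Indices of the ten features with the largest weights: %s' % str(ranking[:10]))
print('Their weights: %s' % str(diagonal[ranking[:10]]))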
from modshogun import CSVFile, RealFeatures, MulticlassLabels

wine_features = RealFeatures(CSVFile('../../../data/multiclass/fm_wine.dat'))
wine_labels = MulticlassLabels(CSVFile('../../../data/multiclass/label_wine.dat'))

assert(wine_features.get_num_vectors() == wine_labels.get_num_labels())

print('%d feature vectors with %d features from %d different classes.'
      % (wine_features.get_num_vectors(), wine_features.get_num_features(),
         wine_labels.get_num_classes()))

from modshogun import KNN, EuclideanDistance
from modshogun import StratifiedCrossValidationSplitting, CrossValidation
from modshogun import CrossValidationResult, MulticlassAccuracy
import numpy

# kNN classifier
k = 5
knn = KNN()
knn.set_k(k)
knn.set_distance(EuclideanDistance())

splitting = StratifiedCrossValidationSplitting(wine_labels, 5)
evaluator = MulticlassAccuracy()
cross_validation = CrossValidation(knn, wine_features, wine_labels, splitting, evaluator)
cross_validation.set_autolock(False)
num_runs = 200
cross_validation.set_num_runs(num_runs)

result = CrossValidationResult.obtain_from_generic(cross_validation.evaluate())

euclidean_means = numpy.zeros(3)
euclidean_means[0] = result.mean

print('kNN accuracy with the Euclidean distance %.4f.' % result.mean)

from modshogun import LMNN

# train LMNN
lmnn = LMNN(wine_features, wine_labels, k)
lmnn.set_maxiter(5000)
lmnn.train()

# evaluate kNN using the distance learnt by LMNN
knn.set_distance(lmnn.get_distance())
result = CrossValidationResult.obtain_from_generic(cross_validation.evaluate())

lmnn_means = numpy.zeros(3)
lmnn_means[0] = result.mean

print('kNN accuracy with the distance obtained by LMNN %.4f.' % result.mean)

print('minima = ' + str(numpy.min(wine_features.get_feature_matrix(), axis=1)))
print('maxima = ' + str(numpy.max(wine_features.get_feature_matrix(), axis=1)))

from modshogun import RescaleFeatures

# preprocess features so that all of them vary within [0,1]
preprocessor = RescaleFeatures()
preprocessor.init(wine_features)
wine_features.add_preprocessor(preprocessor)
wine_features.apply_preprocessor()

# sanity check
feature_matrix = wine_features.get_feature_matrix()
assert(numpy.min(feature_matrix) >= 0.0 and numpy.max(feature_matrix) <= 1.0)

# perform kNN classification after the feature rescaling
knn.set_distance(EuclideanDistance())
result = CrossValidationResult.obtain_from_generic(cross_validation.evaluate())
euclidean_means[1] = result.mean

print('kNN accuracy with the Euclidean distance after feature rescaling %.4f.' % result.mean)

# train LMNN on the rescaled features and classify with kNN
lmnn.train()
knn.set_distance(lmnn.get_distance())
result = CrossValidationResult.obtain_from_generic(cross_validation.evaluate())
lmnn_means[1] = result.mean

print('kNN accuracy with the distance obtained by LMNN after feature rescaling %.4f.' % result.mean)

import scipy.linalg as linalg

# shorthand for the feature matrix -- this makes a copy of the feature matrix
data = wine_features.get_feature_matrix()

# remove the mean
data = data.T
data -= numpy.mean(data, axis=0)

# compute the square root of the covariance matrix and its inverse
M = linalg.sqrtm(numpy.cov(data.T))
# keep only the real part; the imaginary part that pops up in the sqrtm operation should be zero
N = linalg.inv(M).real

# apply the whitening transform
white_data = numpy.dot(N, data.T)
wine_white_features = RealFeatures(white_data)
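# Optional check (not part of the original demo): after whitening, the sample covariance of
# the transformed data should be approximately the identity matrix. This assumes the
# white_data array computed in the previous cell.
print('Whitened covariance is approximately the identity: %s'
      % str(numpy.allclose(numpy.cov(white_data), numpy.eye(white_data.shape[0]), atol=1e-6)))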
import matplotlib.pyplot as pyplot
%matplotlib inline

fig, axarr = pyplot.subplots(1, 2)
axarr[0].matshow(numpy.cov(wine_features.get_feature_matrix()))
axarr[1].matshow(numpy.cov(wine_white_features.get_feature_matrix()))
pyplot.show()

wine_features = wine_white_features

# perform kNN classification after whitening
knn.set_distance(EuclideanDistance())
result = CrossValidationResult.obtain_from_generic(cross_validation.evaluate())
euclidean_means[2] = result.mean

print('kNN accuracy with the Euclidean distance after whitening %.4f.' % result.mean)

# train LMNN on the whitened features and classify with kNN
lmnn.train()
knn.set_distance(lmnn.get_distance())
result = CrossValidationResult.obtain_from_generic(cross_validation.evaluate())
lmnn_means[2] = result.mean

print('kNN accuracy with the distance obtained by LMNN after whitening %.4f.' % result.mean)

assert(euclidean_means.shape[0] == lmnn_means.shape[0])
N = euclidean_means.shape[0]

# the x locations for the groups
ind = 0.5*numpy.arange(N)
# bar width
width = 0.15

figure, axes = pyplot.subplots()
euclidean_rects = axes.bar(ind, euclidean_means, width, color='y')
lmnn_rects = axes.bar(ind+width, lmnn_means, width, color='r')

# attach information to the chart
axes.set_ylabel('Accuracies')
axes.set_ylim(top=1.4)
axes.set_title('kNN accuracy by distance and feature preprocessing')
axes.set_xticks(ind+width)
axes.set_xticklabels(('Raw', 'Rescaling', 'Whitening'))
axes.legend((euclidean_rects[0], lmnn_rects[0]), ('Euclidean', 'LMNN'), loc='upper right')

def autolabel(rects):
    # attach text labels to the bars
    for rect in rects:
        height = rect.get_height()
        axes.text(rect.get_x()+rect.get_width()/2., 1.05*height, '%.3f' % height,
                  ha='center', va='bottom')

autolabel(euclidean_rects)
autolabel(lmnn_rects)

pyplot.show()