import numpy

x = numpy.array([[0,0],[-1,0.1],[0.3,-0.05],[0.7,0.3],[-0.2,-0.6],[-0.15,-0.63],[-0.25,0.55],[-0.28,0.67]])
y = numpy.array([0,0,0,0,1,1,2,2])

import matplotlib.pyplot as pyplot
%matplotlib inline

def plot_data(features, labels, axis, alpha=1.0):
    # separate features according to their class
    X0, X1, X2 = features[labels==0], features[labels==1], features[labels==2]

    # class 0 data
    axis.plot(X0[:,0], X0[:,1], 'o', color='green', markersize=12, alpha=alpha)
    # class 1 data
    axis.plot(X1[:,0], X1[:,1], 'o', color='red', markersize=12, alpha=alpha)
    # class 2 data
    axis.plot(X2[:,0], X2[:,1], 'o', color='blue', markersize=12, alpha=alpha)

    # set axes limits
    axis.set_xlim(-1.5, 1.5)
    axis.set_ylim(-1.5, 1.5)
    axis.set_aspect('equal')
    axis.set_xlabel('x')
    axis.set_ylabel('y')

figure, axis = pyplot.subplots(1, 1)
plot_data(x, y, axis)
axis.set_title('Toy data set')
pyplot.show()

def make_covariance_ellipse(covariance):
    import matplotlib.patches as patches
    import scipy.linalg as linalg

    # the ellipse is centered at (0,0)
    mean = numpy.array([0,0])

    # eigenvalue decomposition of the covariance matrix (w are eigenvalues and v eigenvectors),
    # keeping only the real part
    w, v = linalg.eigh(covariance)
    # normalize the eigenvector corresponding to the largest eigenvalue
    u = v[0]/linalg.norm(v[0])
    # angle in degrees
    angle = 180.0/numpy.pi*numpy.arctan(u[1]/u[0])
    # fill Gaussian ellipse at 2 standard deviations
    ellipse = patches.Ellipse(mean, 2*w[0]**0.5, 2*w[1]**0.5, 180+angle, color='orange', alpha=0.3)

    return ellipse

# represent the Euclidean distance
figure, axis = pyplot.subplots(1, 1)
plot_data(x, y, axis)
ellipse = make_covariance_ellipse(numpy.eye(2))
axis.add_artist(ellipse)
axis.set_title('Euclidean distance')
pyplot.show()

from modshogun import RealFeatures, MulticlassLabels

features = RealFeatures(x.T)
labels = MulticlassLabels(y.astype(numpy.float64))

from modshogun import LMNN

# number of target neighbours per example
k = 1
lmnn = LMNN(features, labels, k)
# set an initial transform as a start point of the optimization
init_transform = numpy.eye(2)
lmnn.set_maxiter(2000)
lmnn.train(init_transform)

# get the linear transform from LMNN
L = lmnn.get_linear_transform()
# square the linear transform to obtain the Mahalanobis distance matrix
M = numpy.matrix(numpy.dot(L.T, L))

# represent the distance given by LMNN
figure, axis = pyplot.subplots(1, 1)
plot_data(x, y, axis)
ellipse = make_covariance_ellipse(M.I)
axis.add_artist(ellipse)
axis.set_title('LMNN distance')
pyplot.show()

# project original data using L
lx = numpy.dot(L, x.T)

# represent the data in the projected space
figure, axis = pyplot.subplots(1, 1)
plot_data(lx.T, y, axis)
plot_data(x, y, axis, 0.3)
ellipse = make_covariance_ellipse(numpy.eye(2))
axis.add_artist(ellipse)
axis.set_title('LMNN\'s linear transform')
pyplot.show()
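# Optional sanity check (not part of the original demo): with M = L^T L as computed
# above, the Mahalanobis distance (a-b)^T M (a-b) should coincide with the squared
# Euclidean distance between the projected points La and Lb. This sketch assumes the
# arrays x, L and M defined in the cells above.
a, b = x[0], x[4]     # one point of class 0 and one of class 1
diff = a - b
d_mahalanobis = float(numpy.dot(diff, numpy.dot(numpy.asarray(M), diff)))
d_projected = numpy.sum((numpy.dot(L, a) - numpy.dot(L, b))**2)
print('Mahalanobis distance: %.6f, squared distance after projection: %.6f'
      % (d_mahalanobis, d_projected))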
import numpy
import matplotlib.pyplot as pyplot
%matplotlib inline

def sandwich_data():
    from numpy.random import normal

    # number of distinct classes
    num_classes = 6
    # number of points per class
    num_points = 9
    # distance between layers, the points of each class are in a layer
    dist = 0.7

    # memory pre-allocation
    x = numpy.zeros((num_classes*num_points, 2))
    y = numpy.zeros(num_classes*num_points)

    for i,j in zip(xrange(num_classes), xrange(-num_classes//2, num_classes//2 + 1)):
        for k,l in zip(xrange(num_points), xrange(-num_points//2, num_points//2 + 1)):
            x[i*num_points + k, :] = numpy.array([normal(l, 0.1), normal(dist*j, 0.1)])

        y[i*num_points:i*num_points + num_points] = i

    return x, y

def plot_sandwich_data(x, y, axis=pyplot, cols=['r', 'b', 'g', 'm', 'k', 'y']):
    for idx, val in enumerate(numpy.unique(y)):
        xi = x[y==val]
        axis.scatter(xi[:,0], xi[:,1], s=50, facecolors='none', edgecolors=cols[idx])

x, y = sandwich_data()

figure, axis = pyplot.subplots(1, 1, figsize=(5,5))
plot_sandwich_data(x, y, axis)
axis.set_aspect('equal')
axis.set_title('"Sandwich" toy data set')
axis.set_xlabel('x')
axis.set_ylabel('y')
pyplot.show()

from modshogun import KNN, EuclideanDistance, LMNN, RealFeatures, MulticlassLabels

def plot_neighborhood_graph(x, nn, axis=pyplot, cols=['r', 'b', 'g', 'm', 'k', 'y']):
    for i in xrange(x.shape[0]):
        xs = [x[i,0], x[nn[1,i], 0]]
        ys = [x[i,1], x[nn[1,i], 1]]
        axis.plot(xs, ys, cols[int(y[i])])

features = RealFeatures(x.T)
labels = MulticlassLabels(y)

fig, axes = pyplot.subplots(1, 3, figsize=(15, 10))

# use k = 2 instead of 1 because otherwise the method nearest_neighbors just returns the same
# points as their own 1-nearest neighbours
k = 2

knn = KNN(k, EuclideanDistance(features, features), labels)
plot_sandwich_data(x, y, axes[0])
plot_neighborhood_graph(x, knn.nearest_neighbors(), axes[0])
axes[0].set_title('Euclidean neighbourhood in the input space')

lmnn = LMNN(features, labels, k)
# set a large number of iterations. The data set is small so it does not cost a lot, and this way
# we ensure a robust solution
lmnn.set_maxiter(3000)
lmnn.train()

knn.set_distance(lmnn.get_distance())
plot_sandwich_data(x, y, axes[1])
plot_neighborhood_graph(x, knn.nearest_neighbors(), axes[1])
axes[1].set_title('LMNN neighbourhood in the input space')

# plot features in the transformed space, with the neighbourhood graph computed using the Euclidean distance
L = lmnn.get_linear_transform()
xl = numpy.dot(x, L.T)
features = RealFeatures(xl.T)

knn.set_distance(EuclideanDistance(features, features))
plot_sandwich_data(xl, y, axes[2])
plot_neighborhood_graph(xl, knn.nearest_neighbors(), axes[2])
axes[2].set_ylim(-3, 2.5)
axes[2].set_title('Euclidean neighbourhood in the transformed space')

[axes[i].set_xlabel('x') for i in xrange(len(axes))]
[axes[i].set_ylabel('y') for i in xrange(len(axes))]
[axes[i].set_aspect('equal') for i in xrange(len(axes))]
pyplot.show()
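# Optional sketch (not part of the original demo): quantify what the three panels show by
# computing, with plain numpy, the fraction of points whose nearest neighbour (the point
# itself excluded) belongs to the same class, before and after the LMNN transform. It
# assumes the arrays x, xl and y defined in the cells above.
def same_class_neighbour_fraction(points, point_labels):
    # pairwise squared Euclidean distances
    d = numpy.sum((points[:, numpy.newaxis, :] - points[numpy.newaxis, :, :])**2, axis=2)
    # exclude each point from being its own neighbour
    numpy.fill_diagonal(d, numpy.inf)
    nearest = numpy.argmin(d, axis=1)
    return numpy.mean(point_labels[nearest] == point_labels)

print('Same-class nearest neighbours in the input space: %.2f' % same_class_neighbour_fraction(x, y))
print('Same-class nearest neighbours in the transformed space: %.2f' % same_class_neighbour_fraction(xl, y))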
from modshogun import CSVFile, RealFeatures, MulticlassLabels

ape_features = RealFeatures(CSVFile('../../../data/multiclass/fm_ape_gut.dat'))
ape_labels = MulticlassLabels(CSVFile('../../../data/multiclass/label_ape_gut.dat'))

print('Number of examples = %d, number of features = %d.'
      % (ape_features.get_num_vectors(), ape_features.get_num_features()))

def visualize_tdsne(features, labels):
    from modshogun import TDistributedStochasticNeighborEmbedding
    import matplotlib.pyplot as pyplot

    converter = TDistributedStochasticNeighborEmbedding()
    converter.set_target_dim(2)
    converter.set_perplexity(25)

    embedding = converter.embed(features)

    x = embedding.get_feature_matrix()
    y = labels.get_labels()

    pyplot.scatter(x[0, y==0], x[1, y==0], color='green')
    pyplot.scatter(x[0, y==1], x[1, y==1], color='red')
    pyplot.scatter(x[0, y==2], x[1, y==2], color='blue')
    pyplot.show()

visualize_tdsne(ape_features, ape_labels)

from modshogun import KNN, EuclideanDistance
from modshogun import StratifiedCrossValidationSplitting, CrossValidation
from modshogun import CrossValidationResult, MulticlassAccuracy

# set up the classifier
knn = KNN()
knn.set_k(3)
knn.set_distance(EuclideanDistance())

# set up 5-fold cross-validation
splitting = StratifiedCrossValidationSplitting(ape_labels, 5)
# evaluation method
evaluator = MulticlassAccuracy()
cross_validation = CrossValidation(knn, ape_features, ape_labels, splitting, evaluator)
# locking is not supported for kNN, deactivate it to avoid a harmless warning
cross_validation.set_autolock(False)
# number of experiments; the more runs, the smaller the variance of the result
num_runs = 200
cross_validation.set_num_runs(num_runs)

# perform cross-validation and print the result
result = cross_validation.evaluate()
result = CrossValidationResult.obtain_from_generic(result)
print('kNN mean accuracy in a total of %d runs is %.4f.' % (num_runs, result.mean))

from modshogun import LMNN
import numpy

# to make training faster, use a portion of the features
fm = ape_features.get_feature_matrix()
ape_features_subset = RealFeatures(fm[:300, :])

# number of target neighbours in LMNN, here we just use the same value that was used for kNN before
k = 3

lmnn = LMNN(ape_features_subset, ape_labels, k)
lmnn.set_diagonal(True)
lmnn.set_maxiter(1200)
init_transform = numpy.eye(ape_features_subset.get_num_features())
lmnn.train(init_transform)

diagonal = numpy.diag(lmnn.get_linear_transform())
print('%d out of %d elements are non-zero.' % (numpy.sum(diagonal != 0), diagonal.size))

import matplotlib.pyplot as pyplot
%matplotlib inline

statistics = lmnn.get_statistics()

pyplot.plot(statistics.obj.get())
pyplot.grid(True)
pyplot.xlabel('Number of iterations')
pyplot.ylabel('LMNN objective')
pyplot.show()
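# Optional sketch (not part of the original demo): since the transform was restricted to be
# diagonal, the magnitude of each diagonal entry can be read as a crude relevance weight for
# the corresponding feature of the subset used above. Sorting the entries gives a rough
# feature ranking; this assumes the diagonal array computed in the previous cell.
ranking = numpy.argsort(diagonal)[::-1]
print('Indices of the ten features with the largest weights: %s' % str(ranking[:10]))
print('Their weights: %s' % str(diagonal[ranking[:10]]))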
from modshogun import CSVFile, RealFeatures, MulticlassLabels

wine_features = RealFeatures(CSVFile('../../../data/multiclass/fm_wine.dat'))
wine_labels = MulticlassLabels(CSVFile('../../../data/multiclass/label_wine.dat'))

assert(wine_features.get_num_vectors() == wine_labels.get_num_labels())

print('%d feature vectors with %d features from %d different classes.'
      % (wine_features.get_num_vectors(), wine_features.get_num_features(),
         wine_labels.get_num_classes()))

from modshogun import KNN, EuclideanDistance
from modshogun import StratifiedCrossValidationSplitting, CrossValidation
from modshogun import CrossValidationResult, MulticlassAccuracy
import numpy

# kNN classifier
k = 5
knn = KNN()
knn.set_k(k)
knn.set_distance(EuclideanDistance())

splitting = StratifiedCrossValidationSplitting(wine_labels, 5)
evaluator = MulticlassAccuracy()
cross_validation = CrossValidation(knn, wine_features, wine_labels, splitting, evaluator)
cross_validation.set_autolock(False)
num_runs = 200
cross_validation.set_num_runs(num_runs)

result = CrossValidationResult.obtain_from_generic(cross_validation.evaluate())

euclidean_means = numpy.zeros(3)
euclidean_means[0] = result.mean

print('kNN accuracy with the Euclidean distance %.4f.' % result.mean)

from modshogun import LMNN

# train LMNN
lmnn = LMNN(wine_features, wine_labels, k)
lmnn.set_maxiter(5000)
lmnn.train()

# evaluate kNN using the distance learnt by LMNN
knn.set_distance(lmnn.get_distance())
result = CrossValidationResult.obtain_from_generic(cross_validation.evaluate())

lmnn_means = numpy.zeros(3)
lmnn_means[0] = result.mean

print('kNN accuracy with the distance obtained by LMNN %.4f.' % result.mean)

print('minima = ' + str(numpy.min(wine_features.get_feature_matrix(), axis=1)))
print('maxima = ' + str(numpy.max(wine_features.get_feature_matrix(), axis=1)))

from modshogun import RescaleFeatures

# preprocess features so that all of them vary within [0,1]
preprocessor = RescaleFeatures()
preprocessor.init(wine_features)
wine_features.add_preprocessor(preprocessor)
wine_features.apply_preprocessor()

# sanity check
feature_matrix = wine_features.get_feature_matrix()
assert(numpy.min(feature_matrix) >= 0.0 and numpy.max(feature_matrix) <= 1.0)

# perform kNN classification after the feature rescaling
knn.set_distance(EuclideanDistance())
result = CrossValidationResult.obtain_from_generic(cross_validation.evaluate())
euclidean_means[1] = result.mean

print('kNN accuracy with the Euclidean distance after feature rescaling %.4f.' % result.mean)

# train LMNN on the rescaled features and classify with kNN
lmnn.train()
knn.set_distance(lmnn.get_distance())
result = CrossValidationResult.obtain_from_generic(cross_validation.evaluate())
lmnn_means[1] = result.mean

print('kNN accuracy with the distance obtained by LMNN after feature rescaling %.4f.' % result.mean)

import scipy.linalg as linalg

# shorthand for the feature matrix -- this makes a copy of the feature matrix
data = wine_features.get_feature_matrix()

# remove the mean
data = data.T
data -= numpy.mean(data, axis=0)

# compute the square root of the covariance matrix and its inverse
M = linalg.sqrtm(numpy.cov(data.T))
# keep only the real part; the imaginary part that pops up in the sqrtm operation should be zero
N = linalg.inv(M).real

# apply the whitening transform
white_data = numpy.dot(N, data.T)
wine_white_features = RealFeatures(white_data)
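# Optional check (not part of the original demo): after whitening, the sample covariance of
# the transformed data should be approximately the identity matrix. This assumes the
# white_data array computed in the previous cell.
print('Whitened covariance is approximately the identity: %s'
      % str(numpy.allclose(numpy.cov(white_data), numpy.eye(white_data.shape[0]), atol=1e-6)))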
import matplotlib.pyplot as pyplot
%matplotlib inline

fig, axarr = pyplot.subplots(1, 2)
axarr[0].matshow(numpy.cov(wine_features.get_feature_matrix()))
axarr[1].matshow(numpy.cov(wine_white_features.get_feature_matrix()))
pyplot.show()

wine_features = wine_white_features

# perform kNN classification after whitening
knn.set_distance(EuclideanDistance())
result = CrossValidationResult.obtain_from_generic(cross_validation.evaluate())
euclidean_means[2] = result.mean

print('kNN accuracy with the Euclidean distance after whitening %.4f.' % result.mean)

# train LMNN on the whitened features and classify with kNN
lmnn.train()
knn.set_distance(lmnn.get_distance())
result = CrossValidationResult.obtain_from_generic(cross_validation.evaluate())
lmnn_means[2] = result.mean

print('kNN accuracy with the distance obtained by LMNN after whitening %.4f.' % result.mean)

assert(euclidean_means.shape[0] == lmnn_means.shape[0])
N = euclidean_means.shape[0]

# the x locations for the groups
ind = 0.5*numpy.arange(N)
# bar width
width = 0.15

figure, axes = pyplot.subplots()
euclidean_rects = axes.bar(ind, euclidean_means, width, color='y')
lmnn_rects = axes.bar(ind+width, lmnn_means, width, color='r')

# attach information to the chart
axes.set_ylabel('Accuracies')
axes.set_ylim(top=1.4)
axes.set_title('kNN accuracy by distance and feature preprocessing')
axes.set_xticks(ind+width)
axes.set_xticklabels(('Raw', 'Rescaling', 'Whitening'))
axes.legend((euclidean_rects[0], lmnn_rects[0]), ('Euclidean', 'LMNN'), loc='upper right')

def autolabel(rects):
    # attach text labels to the bars
    for rect in rects:
        height = rect.get_height()
        axes.text(rect.get_x()+rect.get_width()/2., 1.05*height, '%.3f' % height,
                  ha='center', va='bottom')

autolabel(euclidean_rects)
autolabel(lmnn_rects)

pyplot.show()