import addutils.toc ; addutils.toc.js(ipy_notebook=True)
import scipy.io
import numpy as np
import pandas as pd
from addutils import css_notebook
css_notebook()
Before trying any ML technique it is always a good idea to visualize the available data from different point of view but when the dimensionality of the problem is higher, it's difficult to use some that kind of plots and it is necessary to do dimenionality reduction.
We have already seen some simple visualization example made with scatter plot or scatter matrix on the tutorial ml01. In this lesson we're going furter by working with some of the mamy data projection techniques available in scikit-learn.
In the first example we will consider the digits dataset in which we have many 8x8 grayscale images of handwritten digits. In other words, this dataset is made by samples with 64 features
What we want to do is to obtain a descriptive 2D scatter plot.
from sklearn import datasets
digits = datasets.load_digits()
import addutils.imagegrid as ig
from bokeh.palettes import Greys9
print("Digits images: ", digits.images.shape[0])
# print only the first 80 digits images
num_imgs = 80
fig = ig.imagegrid_figure(images=[ digits.images[i][::-1, :] for i in range(num_imgs) ],
text=[ str(digits.target[i]) for i in range(num_imgs) ],
figure_title=None, palette=Greys9[::-1],
figure_plot_width=760, figure_plot_height=600,
text_color='red', padding=0.1,
grid_size=(10, 8))
bk.show(fig)
Digits images: 1797
We'll start our analysis with Principal Component Analysis (PCA): PCA seeks orthogonal linear combinations of the features which show the greatest variance. In this example we'll use RandomizedPCA
, because it's faster for large datasets.
import addutils.palette as pal
import seaborn as sns
from sklearn import decomposition
from bokeh.models.sources import ColumnDataSource
from bokeh.models.tools import HoverTool
pca = decomposition.PCA(copy=True, iterated_power=3, n_components=2,
whiten=False, svd_solver='randomized')
pca_proj = pca.fit_transform(digits.data)
colors = list(map(pal.to_hex, sns.color_palette('Paired', 10)))
pca_df = pd.DataFrame({'x': pca_proj[:,0],
'y': pca_proj[:,1],
'color': pal.linear_map(digits.target, colors),
'target': digits.target})
pca_src = ColumnDataSource(data=pca_df)
fig = bk.figure(title='PCA Projection - digits dataset',
x_axis_label='c1', y_axis_label='c2',
plot_width=750, plot_height=560)
fig.scatter(source=pca_src, x='x', y='y',
size=10, fill_alpha=0.9, line_alpha=0.5, line_color='black',
fill_color='color')
hover_tool = HoverTool(tooltips=[("target", "@target"),
("color", "$color[swatch]:color")])
fig.add_tools(hover_tool)
bk.show(fig)
We can notice some structure in this data but it's still difficult to understand if it would be possible to effectively apply some classification algorithm since the different data clusters do not show enough separation.
Principal Component Analysis (PCA) identifies the combination of attributes that account for the most variance in the data.
Linear Discriminant Analysis (LDA) instead tries to identify attributes that account for the most variance between classes. In particular, LDA, in contrast to PCA, is a supervised method, and can be used just when the class labels are available.
Let's see an LDA on the same Dataset:
import warnings
warnings.filterwarnings('ignore', message='Variables are collinear')
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis(n_components=2)
lda_proj = lda.fit(digits.data, digits.target).transform(digits.data)
lda_df = pd.DataFrame({ 'x': lda_proj[:,0],
'y': lda_proj[:,1],
'color': pal.linear_map(digits.target, colors),
'target': digits.target })
lda_src = ColumnDataSource(data=lda_df)
fig = bk.figure(title='LDA Projection - digits dataset',
x_axis_label='c1', y_axis_label='c2',
plot_width=750, plot_height=500)
fig.scatter(source=lda_src, x='x', y='y',
size=8, line_color='black', line_alpha=0.5,
fill_color='color')
hover_tool = HoverTool(tooltips=[("target", "@target"),
("color", "$color[swatch]:color")])
fig.add_tools(hover_tool)
bk.show(fig)
As in the previous example we don't see a clear separation of the different clusters. This is because this specific dataset present some non-linear features that can not be separate by linear projections.
The main weakness of the linear techniques seen so far is that they cannot detect non-linear features. A set of algorithms known as Manifold Learning have been developed to address this deficiency.
One of the earliest approaches to manifold learning is the Isomap algorithm, short for Isometric Mapping. Isomap
is a global, nonlinear, nonparametric dimentional reduction algorithm. Isomap can be viewed as an extension of Multi-dimensional Scaling (MDS) or Kernel PCA. Isomap seeks a lower-dimensional embedding which maintains geodesic distances between all points
warnings.filterwarnings('ignore', message='kneighbors*')
from sklearn import manifold
from bokeh.models import GlyphRenderer, Quad, Legend
fig_grid = []
for i in range(2):
row = []
for j in range(2):
iso = manifold.Isomap(n_neighbors=i+2, n_components=2)
proj = iso.fit_transform(digits.data)
df = pd.DataFrame({ 'x': proj[:,0], 'y': proj[:,1],
'color': pal.linear_map(digits.target, colors),
'target': digits.target })
src = ColumnDataSource(data=df)
fig = bk.figure(title="n_neighbors = %d" % (i+2),
plot_width=340, plot_height=300)
fig.title.text_font_size='12pt'
fig.scatter(source=src, x='x', y='y', fill_color='color',
size=8, line_alpha=0.5, line_color='black')
hover_tool = HoverTool(tooltips=[ ("target", "@target") ])
fig.add_tools(hover_tool)
row.append(fig)
fig_grid.append(row)
bk.show(bk.gridplot(fig_grid))
These visualizations show us that there is hope: even a simple classifier should be able to adequately identify the members of the various classes.
A canonical dataset used in Manifold learning is the S-curve, which we briefly saw in an earlier section:
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
X, y = datasets.make_s_curve(n_samples=1000)
fig, ax = plt.subplots(1, 1, figsize=(12,8))
ax1 = plt.axes(projection='3d')
s = ax1.scatter3D(X[:, 0], X[:, 1], X[:, 2], c=y, s=50, marker="o", cmap='GnBu')
fig.colorbar(s)
ax1.view_init(10, -60)
This is a 2-dimensional dataset embedded in three dimensions, but it is embedded in such a way that PCA cannot discover the underlying data orientation:
seqcolors = list(map(pal.to_hex, sns.color_palette('GnBu', 50)))
X_pca = decomposition.PCA(n_components=2).fit_transform(X)
fig = bk.figure(title='PCA Projection - S-Curve',
x_axis_label='c1', y_axis_label='c2',
plot_width=750, plot_height=500)
fig.scatter(X_pca[:, 0], X_pca[:, 1],
size=10, line_color='black', line_alpha=0.5,
fill_color=pal.linear_map(y, seqcolors))
bk.show(fig)
Manifold learning algorithms, however, available in the sklearn.manifold
submodule, are able to recover the underlying 2-dimensional manifold.
iso = manifold.Isomap(n_neighbors=25, n_components=2)
X_iso = iso.fit_transform(X)
fig = bk.figure(title='Isomap - S-Curve', x_axis_label='c1', y_axis_label='c2',
plot_width=750, plot_height=400)
fig.scatter(X_iso[:, 0], X_iso[:, 1], size=8, alpha=0.8, line_color='black',
fill_color=pal.linear_map(y, seqcolors))
bk.show(fig)
LocallyLinearEmbedding
(LLE) is a local, nonlinear, nonparametric algorithm. The main idea behind the Local Algorithms is to make the local configurations of points in the low-dimensional space resemble the local configurations in the high-dimensional space. LLE seeks a lower-dimensional projection of the data which preserves distances within local neighborhoods. It can be thought of as a series of local Principal Component Analyses which are globally compared to find the best non-linear embedding.
lle = manifold.LocallyLinearEmbedding(n_neighbors=15, n_components=2, method='modified')
X_lle = lle.fit_transform(X)
fig = bk.figure(title='LocallyLinearEmbedding - S-Curve', x_axis_label='c1', y_axis_label='c2',
plot_width=750, plot_height=400)
fig.scatter(X_lle[:, 0], X_lle[:, 1],
size=10, line_color='black', line_alpha=0.75,
fill_color=pal.linear_map(y, seqcolors))
bk.show(fig)
t-SNE
give more importance to local distances and lower importance to non-local distances. In other words, it try to keep closer in the projected space the points that are closer in the original space while neglecting the others.
Moreover t-SNE
has a probabilistic way to find pairwise local distances: it converts each high-dimension similarity into the probability that one data point will pick one other data point as its neighbor. This make t-SNE
almost insensitive to bad feature scaling.
On th other side, the relative local nature of t-SNE
makes it sensitive to the course of the dimensionality of the data.
t-SNE (TSNE) converts affinities of data points to probabilities. The affinities in the original space are represented by Gaussian joint probabilities and the affinities in the embedded space are represented by Student’s t-distributions. This allows t-SNE to be particularly sensitive to local structure and has a few other advantages over existing techniques:
While Isomap, LLE and variants are best suited to unfold a single continuous low dimensional manifold, t-SNE will focus on the local structure of the data and will tend to extract clustered local groups of samples as highlighted on the S-curve example. This ability to group samples based on the local structure might be beneficial to visually disentangle a dataset that comprises several manifolds at once as is the case in the digits dataset.
The Kullback-Leibler (KL) divergence of the joint probabilities in the original space and the embedded space will be minimized by gradient descent. Note that the KL divergence is not convex, i.e. multiple restarts with different initializations will end up in local minima of the KL divergence. Hence, it is sometimes useful to try different seeds and select the embedding with the lowest KL divergence.
tsne = manifold.TSNE(n_components=2, n_iter=500)
tsne_proj = tsne.fit_transform(digits.data)
df = pd.DataFrame({ 'x': tsne_proj[:,0],
'y': tsne_proj[:,1],
'color': pal.linear_map(digits.target, colors),
'target': digits.target
})
src = ColumnDataSource(data=df)
fig = bk.figure(title='t-SNE - digits dataset', x_axis_label='c1', y_axis_label='c2',
plot_width=750, plot_height=450)
fig.scatter(source=src, x='x', y='y', fill_color='color',
size=8, line_color='black', line_alpha=0.50)
hover_tool = HoverTool(tooltips=[('target', '@target')])
fig.add_tools(hover_tool)
bk.show(fig)
Visit www.add-for.com for more tutorials and updates.
This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.