#!/usr/bin/env python
# coding: utf-8

# # Visualizing the Data

# In[1]:

import addutils.toc ; addutils.toc.js(ipy_notebook=True)


# In[2]:

import scipy.io
import numpy as np
import pandas as pd
from addutils import css_notebook
css_notebook()


# ## 1 Principal Component Analysis (PCA)

# Before trying any ML technique it is always a good idea to visualize the available data from different points of view, but when the dimensionality of the problem is high it becomes difficult to use that kind of plot directly and it is necessary to perform **dimensionality reduction**.
# 
# We have already seen some simple visualization examples made with scatter plots or scatter matrices in tutorial ml01. In this lesson we go further by working with some of the many data projection techniques available in scikit-learn.
# 
# In the first example we will consider the *digits* dataset, which contains many 8x8 grayscale images of handwritten digits. In other words, this dataset is made of samples with 64 features.
# 
# What we want to do is to obtain a descriptive 2D scatter plot.

# In[3]:

import bokeh.plotting as bk
bk.output_notebook()


# In[4]:

from sklearn import datasets
digits = datasets.load_digits()


# In[5]:

import addutils.imagegrid as ig
from bokeh.palettes import Greys9

print("Digits images: ", digits.images.shape[0])

# print only the first 80 digits images
num_imgs = 80
fig = ig.imagegrid_figure(images=[digits.images[i][::-1, :] for i in range(num_imgs)],
                          text=[str(digits.target[i]) for i in range(num_imgs)],
                          figure_title=None,
                          palette=Greys9[::-1],
                          figure_plot_width=760, figure_plot_height=600,
                          text_color='red', padding=0.1,
                          grid_size=(10, 8))
bk.show(fig)


# We'll start our analysis with Principal Component Analysis (PCA): PCA seeks the orthogonal linear combinations of the features which show the greatest variance. In this example we'll use `PCA` with the randomized SVD solver (`svd_solver='randomized'`), which is faster for large datasets.

# In[6]:

import addutils.palette as pal
import seaborn as sns


# In[7]:

from sklearn import decomposition
from bokeh.models.sources import ColumnDataSource
from bokeh.models.tools import HoverTool

pca = decomposition.PCA(copy=True, iterated_power=3, n_components=2,
                        whiten=False, svd_solver='randomized')
pca_proj = pca.fit_transform(digits.data)

colors = list(map(pal.to_hex, sns.color_palette('Paired', 10)))
pca_df = pd.DataFrame({'x': pca_proj[:, 0],
                       'y': pca_proj[:, 1],
                       'color': pal.linear_map(digits.target, colors),
                       'target': digits.target})
pca_src = ColumnDataSource(data=pca_df)

fig = bk.figure(title='PCA Projection - digits dataset',
                x_axis_label='c1', y_axis_label='c2',
                plot_width=750, plot_height=560)
fig.scatter(source=pca_src, x='x', y='y', size=10,
            fill_alpha=0.9, line_alpha=0.5, line_color='black', fill_color='color')
hover_tool = HoverTool(tooltips=[("target", "@target"),
                                 ("color", "$color[swatch]:color")])
fig.add_tools(hover_tool)
bk.show(fig)


# We can notice some structure in this data, but it is still difficult to tell whether a classification algorithm could be applied effectively, since the different data clusters do not show enough separation.

# ## 2 Linear Discriminant Analysis (LDA)

# Principal Component Analysis (PCA) identifies the combination of attributes that accounts for the most variance in the data.
# 
# Linear Discriminant Analysis (LDA) instead tries to identify attributes that account for the most variance *between classes*. In particular, LDA, in contrast to PCA, is a supervised method, and can only be used when the class labels are available.
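# Before moving on to the LDA example, a quick aside on the PCA projection above: the fitted `pca` object exposes an `explained_variance_ratio_` attribute, which tells us how much of the total variance the two retained components actually capture. The cell below is a minimal sketch that reuses that object; the exact figures may vary slightly because of the randomized solver.

# In[ ]:

# Fraction of the total variance captured by each of the two principal components
ratios = pca.explained_variance_ratio_
print("Variance explained by c1: {:.1%}, by c2: {:.1%}".format(ratios[0], ratios[1]))
print("Total variance retained by the 2D projection: {:.1%}".format(ratios.sum()))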
# Let's see an LDA on the same dataset:

# In[8]:

import warnings
warnings.filterwarnings('ignore', message='Variables are collinear')

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis(n_components=2)
lda_proj = lda.fit(digits.data, digits.target).transform(digits.data)

lda_df = pd.DataFrame({'x': lda_proj[:, 0],
                       'y': lda_proj[:, 1],
                       'color': pal.linear_map(digits.target, colors),
                       'target': digits.target})
lda_src = ColumnDataSource(data=lda_df)

fig = bk.figure(title='LDA Projection - digits dataset',
                x_axis_label='c1', y_axis_label='c2',
                plot_width=750, plot_height=500)
fig.scatter(source=lda_src, x='x', y='y', size=8,
            line_color='black', line_alpha=0.5, fill_color='color')
hover_tool = HoverTool(tooltips=[("target", "@target"),
                                 ("color", "$color[swatch]:color")])
fig.add_tools(hover_tool)
bk.show(fig)


# As in the previous example, we don't see a clear separation of the different clusters. This is because this specific dataset presents some non-linear features that cannot be separated by linear projections.

# ## 3 Manifold Learning

# The main weakness of the linear techniques seen so far is that they cannot detect non-linear features. A family of algorithms known as *Manifold Learning* has been developed to address this deficiency.
# 
# One of the earliest approaches to manifold learning is the Isomap algorithm, short for Isometric Mapping. `Isomap` is a global, nonlinear, nonparametric dimensionality reduction algorithm. Isomap can be viewed as an extension of Multi-dimensional Scaling (MDS) or Kernel PCA: it seeks a lower-dimensional embedding which maintains geodesic distances between all points.

# In[9]:

warnings.filterwarnings('ignore', message='kneighbors*')

from sklearn import manifold
from bokeh.models import GlyphRenderer, Quad, Legend

fig_grid = []
for i in range(2):
    row = []
    for j in range(2):
        # vary n_neighbors across the 2x2 grid (2, 3, 4, 5)
        n_neighbors = 2*i + j + 2
        iso = manifold.Isomap(n_neighbors=n_neighbors, n_components=2)
        proj = iso.fit_transform(digits.data)
        df = pd.DataFrame({'x': proj[:, 0],
                           'y': proj[:, 1],
                           'color': pal.linear_map(digits.target, colors),
                           'target': digits.target})
        src = ColumnDataSource(data=df)
        fig = bk.figure(title="n_neighbors = %d" % n_neighbors,
                        plot_width=340, plot_height=300)
        fig.title.text_font_size = '12pt'
        fig.scatter(source=src, x='x', y='y', fill_color='color', size=8,
                    line_alpha=0.5, line_color='black')
        hover_tool = HoverTool(tooltips=[("target", "@target")])
        fig.add_tools(hover_tool)
        row.append(fig)
    fig_grid.append(row)
bk.show(bk.gridplot(fig_grid))


# These visualizations show us that there is hope: even a simple classifier should be able to adequately identify the members of the various classes.
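# As a rough check of that claim, here is a minimal sketch that fits a plain k-nearest-neighbors classifier on a 2D Isomap projection of the digits and reports the held-out accuracy. The neighborhood sizes and the train/test split below are arbitrary choices, so the exact score will vary.

# In[ ]:

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Project the 64-dimensional digits onto 2 Isomap components, then hold out a test set
proj_2d = manifold.Isomap(n_neighbors=5, n_components=2).fit_transform(digits.data)
X_tr, X_te, y_tr, y_te = train_test_split(proj_2d, digits.target,
                                          test_size=0.3, random_state=0)

clf = KNeighborsClassifier(n_neighbors=10).fit(X_tr, y_tr)
print("k-NN accuracy on the 2D Isomap projection: {:.2f}".format(clf.score(X_te, y_te)))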
# ### 3.1 Another example with a specific test dataset: the S-Curve

# A canonical dataset used in Manifold Learning is the *S-curve*, which we briefly saw in an earlier section:

# In[10]:

get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D


# In[11]:

X, y = datasets.make_s_curve(n_samples=1000)

fig = plt.figure(figsize=(12, 8))
ax1 = fig.add_subplot(111, projection='3d')
s = ax1.scatter3D(X[:, 0], X[:, 1], X[:, 2], c=y, s=50, marker="o", cmap='GnBu')
fig.colorbar(s)
ax1.view_init(10, -60)


# This is a 2-dimensional dataset embedded in three dimensions, but it is embedded in such a way that PCA cannot discover the underlying data orientation:

# In[12]:

seqcolors = list(map(pal.to_hex, sns.color_palette('GnBu', 50)))


# In[13]:

X_pca = decomposition.PCA(n_components=2).fit_transform(X)

fig = bk.figure(title='PCA Projection - S-Curve',
                x_axis_label='c1', y_axis_label='c2',
                plot_width=750, plot_height=500)
fig.scatter(X_pca[:, 0], X_pca[:, 1], size=10,
            line_color='black', line_alpha=0.5,
            fill_color=pal.linear_map(y, seqcolors))
bk.show(fig)


# The manifold learning algorithms available in the `sklearn.manifold` submodule, however, are able to recover the underlying 2-dimensional manifold.

# In[14]:

iso = manifold.Isomap(n_neighbors=25, n_components=2)
X_iso = iso.fit_transform(X)

fig = bk.figure(title='Isomap - S-Curve',
                x_axis_label='c1', y_axis_label='c2',
                plot_width=750, plot_height=400)
fig.scatter(X_iso[:, 0], X_iso[:, 1], size=8, alpha=0.8,
            line_color='black', fill_color=pal.linear_map(y, seqcolors))
bk.show(fig)


# `LocallyLinearEmbedding` (LLE) is a local, nonlinear, nonparametric algorithm. The main idea behind local algorithms is to make the local configurations of points in the low-dimensional space resemble the local configurations in the high-dimensional space. LLE seeks a lower-dimensional projection of the data which preserves distances within local neighborhoods. It can be thought of as a series of local Principal Component Analyses which are globally compared to find the best non-linear embedding.

# In[15]:

lle = manifold.LocallyLinearEmbedding(n_neighbors=15, n_components=2, method='modified')
X_lle = lle.fit_transform(X)

fig = bk.figure(title='LocallyLinearEmbedding - S-Curve',
                x_axis_label='c1', y_axis_label='c2',
                plot_width=750, plot_height=400)
fig.scatter(X_lle[:, 0], X_lle[:, 1], size=10,
            line_color='black', line_alpha=0.75,
            fill_color=pal.linear_map(y, seqcolors))
bk.show(fig)


# ### 3.2 Some Tips on Manifold Learning practical use

# * Make sure the same scale is used over all features. Because manifold learning methods are based on a nearest-neighbor search, the algorithm may perform poorly otherwise (a short sketch of feature scaling with a `Pipeline` follows this list).
# * The reconstruction error can be used to choose the optimal output dimension. For a *d-dimensional* manifold embedded in a *D-dimensional* parameter space, the reconstruction error will decrease as `n_components` is increased until `n_components == d`.
# * Noisy data can “short-circuit” the manifold, in essence acting as a bridge between parts of the manifold that would otherwise be well-separated.
# * Certain input configurations can lead to singular weight matrices, for example when more than two points in the dataset are identical, or when the data is split into disjoint groups. In this case, `solver='arpack'` will fail to find the null space. The easiest way to address this is to use `solver='dense'`, though it may be very slow depending on the number of input points.
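# As a minimal illustration of the first tip above, features can be standardized before the embedding by chaining a scaler and the manifold learner in a scikit-learn `Pipeline`. This is just a sketch with an arbitrary choice of `n_neighbors`; the digits features are already on a comparable 0-16 scale, so scaling is not strictly needed here.

# In[ ]:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize each feature to zero mean / unit variance, then embed with Isomap
scaled_iso = make_pipeline(StandardScaler(),
                           manifold.Isomap(n_neighbors=10, n_components=2))
X_scaled_iso = scaled_iso.fit_transform(digits.data)
print(X_scaled_iso.shape)    # (n_samples, 2)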
# ### 3.3 t-SNE: t-distributed Stochastic Neighbor Embedding

# `t-SNE` gives more importance to local distances and less importance to non-local distances. In other words, it tries to keep points that are close in the original space close in the projected space, while largely neglecting the others.
# 
# Moreover, `t-SNE` handles pairwise local distances in a probabilistic way: it converts each high-dimensional similarity into the probability that one data point would pick another data point as its neighbor. This makes `t-SNE` almost insensitive to bad feature scaling.
# 
# On the other hand, the strongly local nature of `t-SNE` makes it sensitive to the curse of dimensionality.
# 
# t-SNE (TSNE) converts affinities of data points to probabilities. The affinities in the original space are represented by Gaussian joint probabilities and the affinities in the embedded space are represented by Student’s t-distributions. This allows t-SNE to be particularly sensitive to local structure and has a few other advantages over existing techniques:
# 
# * Revealing the structure at many scales on a single map
# * Revealing data that lie in multiple, different manifolds or clusters
# * Reducing the tendency to crowd points together at the center
# 
# While Isomap, LLE and variants are best suited to unfold a single continuous low-dimensional manifold, t-SNE will focus on the local structure of the data and will tend to extract clustered local groups of samples, as highlighted in the S-curve example. This ability to group samples based on the local structure might be beneficial to visually disentangle a dataset that comprises several manifolds at once, as is the case in the digits dataset.
# 
# The Kullback-Leibler (KL) divergence between the joint probabilities in the original space and in the embedded space is minimized by gradient descent. Note that the KL divergence is not convex, i.e. restarts with different initializations can end up in different local minima of the KL divergence. Hence, it is sometimes useful to try different seeds and select the embedding with the lowest KL divergence (a short sketch of this idea is appended after the closing notes below).

# In[16]:

tsne = manifold.TSNE(n_components=2, n_iter=500)
tsne_proj = tsne.fit_transform(digits.data)


# In[17]:

df = pd.DataFrame({'x': tsne_proj[:, 0],
                   'y': tsne_proj[:, 1],
                   'color': pal.linear_map(digits.target, colors),
                   'target': digits.target})
src = ColumnDataSource(data=df)

fig = bk.figure(title='t-SNE - digits dataset',
                x_axis_label='c1', y_axis_label='c2',
                plot_width=750, plot_height=450)
fig.scatter(source=src, x='x', y='y', fill_color='color', size=8,
            line_color='black', line_alpha=0.50)
hover_tool = HoverTool(tooltips=[('target', '@target')])
fig.add_tools(hover_tool)
bk.show(fig)


# ---
# 
# Visit [www.add-for.com]() for more tutorials and updates.
# 
# This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.
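# Appendix: a minimal sketch of the seed-selection idea from the t-SNE section. Each fitted `TSNE` exposes its final KL divergence in the `kl_divergence_` attribute, so we can run the embedding with a few different `random_state` values (the seeds below are arbitrary) and keep the run that converged to the lowest value.

# In[ ]:

best_seed, best_kl, best_proj = None, float('inf'), None
for seed in (0, 1, 2):                      # arbitrary example seeds
    ts = manifold.TSNE(n_components=2, random_state=seed)
    proj = ts.fit_transform(digits.data)
    # keep the embedding with the lowest final KL divergence
    if ts.kl_divergence_ < best_kl:
        best_seed, best_kl, best_proj = seed, ts.kl_divergence_, proj
print("Best seed: {} (KL divergence {:.3f})".format(best_seed, best_kl))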