#!/usr/bin/env python
# coding: utf-8

# Neural Networks
# ===============
#
# *Important:* Please read the [installation page](http://gpeyre.github.io/numerical-tours/installation_python/) for details about how to install the toolboxes.
# $\newcommand{\dotp}[2]{\langle #1, #2 \rangle}$
# $\newcommand{\enscond}[2]{\lbrace #1, #2 \rbrace}$
# $\newcommand{\pd}[2]{ \frac{ \partial #1}{\partial #2} }$
# $\newcommand{\umin}[1]{\underset{#1}{\min}\;}$
# $\newcommand{\umax}[1]{\underset{#1}{\max}\;}$
# $\newcommand{\uargmin}[1]{\underset{#1}{argmin}\;}$
# $\newcommand{\norm}[1]{\|#1\|}$
# $\newcommand{\abs}[1]{\left|#1\right|}$
# $\newcommand{\choice}[1]{ \left\{ \begin{array}{l} #1 \end{array} \right. }$
# $\newcommand{\pa}[1]{\left(#1\right)}$
# $\newcommand{\diag}[1]{{diag}\left( #1 \right)}$
# $\newcommand{\qandq}{\quad\text{and}\quad}$
# $\newcommand{\qwhereq}{\quad\text{where}\quad}$
# $\newcommand{\qifq}{ \quad \text{if} \quad }$
# $\newcommand{\qarrq}{ \quad \Longrightarrow \quad }$
# $\newcommand{\ZZ}{\mathbb{Z}}$
# $\newcommand{\CC}{\mathbb{C}}$
# $\newcommand{\RR}{\mathbb{R}}$
# $\newcommand{\EE}{\mathbb{E}}$
# $\newcommand{\Zz}{\mathcal{Z}}$
# $\newcommand{\Ww}{\mathcal{W}}$
# $\newcommand{\Vv}{\mathcal{V}}$
# $\newcommand{\Nn}{\mathcal{N}}$
# $\newcommand{\NN}{\mathcal{N}}$
# $\newcommand{\Hh}{\mathcal{H}}$
# $\newcommand{\Bb}{\mathcal{B}}$
# $\newcommand{\Ee}{\mathcal{E}}$
# $\newcommand{\Cc}{\mathcal{C}}$
# $\newcommand{\Gg}{\mathcal{G}}$
# $\newcommand{\Ss}{\mathcal{S}}$
# $\newcommand{\Pp}{\mathcal{P}}$
# $\newcommand{\Ff}{\mathcal{F}}$
# $\newcommand{\Xx}{\mathcal{X}}$
# $\newcommand{\Mm}{\mathcal{M}}$
# $\newcommand{\Ii}{\mathcal{I}}$
# $\newcommand{\Dd}{\mathcal{D}}$
# $\newcommand{\Ll}{\mathcal{L}}$
# $\newcommand{\Tt}{\mathcal{T}}$
# $\newcommand{\si}{\sigma}$
# $\newcommand{\al}{\alpha}$
# $\newcommand{\la}{\lambda}$
# $\newcommand{\ga}{\gamma}$
# $\newcommand{\Ga}{\Gamma}$
# $\newcommand{\La}{\Lambda}$
# $\newcommand{\Si}{\Sigma}$
# $\newcommand{\be}{\beta}$
# $\newcommand{\de}{\delta}$
# $\newcommand{\De}{\Delta}$
# $\newcommand{\phi}{\varphi}$
# $\newcommand{\th}{\theta}$
# $\newcommand{\om}{\omega}$
# $\newcommand{\Om}{\Omega}$
# $\newcommand{\eqdef}{\equiv}$

# This tour details fully connected multi-layer neural networks.
# We recommend that after doing this Numerical Tour, you apply it to your
# own data, for instance using a dataset from [LibSVM](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/).
#
# _Disclaimer:_ these machine learning tours are intended to be
# overly-simplistic implementations and applications of baseline machine learning methods.
# For more advanced uses and implementations, we recommend
# to use a state-of-the-art library, the most well known being
# [Scikit-Learn](http://scikit-learn.org/).

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import matplotlib.pyplot as plt
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# In[2]:

# convert to a column vector
def MakeCol(y): return y.reshape(-1,1)
# convert to a row vector
def MakeRow(y): return y.reshape(1,-1)
# find non zero/true elements
def find(x): return np.nonzero(x)[0]

# Dataset Generation
# ------------------
# Build a synthetic data set for classification.
#
# Generate the data.

# In[3]:

n0 = 100 # number of points per class
p = 2    # dimensionality
k = 3    # number of classes
n = n0*k # total number of points
x = np.zeros((p,n))
y = np.zeros((1,n))
for j in np.arange(0,k):
    I = np.arange(n0*j,n0*(j+1))
    r = np.linspace(0.0,1,n0) # radius
    t = np.linspace(j*4,(j+1)*4,n0) + np.random.randn(1,n0)*0.2 # angle
    x[0,I] = r*np.sin(t)
    x[1,I] = r*np.cos(t)
    y[0,I] = j

# Display.

# In[4]:

col = np.array( [ [1,0,0], [0,1,0], [0,0,1], [0,0,0], [0,1,1], [1,0,1], [1,1,0], [1,.5,.5], [.5,1,.5], [.5,.5,1] ] ).transpose()
plt.clf()
for j in np.arange(0,k):
    I = find(y.flatten()==j)
    plt.plot(x[0,I], x[1,I], '.', color=col[:,j])
plt.axis('equal')
plt.axis('off');

# Class probability matrix.

# In[5]:

U = np.tile( MakeCol(np.arange(1,k+1)), (1,n) )
Y = np.double( U == np.tile( y+1, (k,1) ) )

# Building the Network
# --------------------
# We set up the network. It is parameterized by the dimensions of the layers.
#
# The network is composed of $R$ layers and operates by initializing $x_0=x$ and then iterating
# $$ \forall r=0,\ldots,R-1, \quad x_{r+1} \eqdef \rho(A_r x_r + b_r). $$
# Here $\rho : \RR \rightarrow \RR$ is a non-linear activation function which
# operates coordinate by coordinate. The intermediate variables are $x_r \in \RR^{d_r}$
# with $(d_0,d_R)=(p,k)$. The matrices have size $A_r \in \RR^{d_{r+1} \times d_r}$
# while the biases have size $b_r \in \RR^{d_{r+1}}$.
#
# The final value is obtained by comparing the predicted value $x_R$
# to the data $y$ using some loss function
# $$ \ell \eqdef \Ll(x_R,y). $$
#
# Load the loss and its gradient.
# Here we use a multi-class logistic loss
# $$ \Ll(z,y) \eqdef \log \sum_{i=1}^k e^{z_i} - \dotp{z}{y}. $$
#
# Note that in practice the computation is done in parallel over an array
# $x$ of size $(p,n)$ of $n$ points in $\RR^p$, the class
# probabilities to predict being stored in an array $Y$ of size $(k,n)$, where $k$ is
# the number of classes.
#
# Useful helpers.

# In[6]:

# def dotp(x,y): return x.flatten().dot( y.flatten().transpose() )
def dotp(x,y): return sum( x.flatten() * y.flatten() )
def max2(S): return np.tile( S.max(axis=1,keepdims=1), (1,S.shape[1]) )

# Stabilized log-sum-exp.

# In[7]:

def LSE0(S): return np.log( np.exp(S).sum(axis=1,keepdims=1))
def LSE(S): return LSE0( S-max2(S) ) + S.max(axis=1,keepdims=1)

# Stabilized soft-max.

# In[8]:

def SM0(S): return np.exp(S) / np.tile( np.exp(S).sum(axis=1,keepdims=1), (1, S.shape[1]) )
def SM(S): return SM0(S-max2(S))

# Energy, $y$ being a target probability distribution.

# In[9]:

def lossF(z,y): return LSE(z.transpose()).sum() - dotp(z,y)

# Gradient of the loss.

# In[10]:

def lossG(z,y): return SM(z.transpose()).transpose() - y

# Load the activation function and its derivative. Here we use an arctan sigmoid function.

# In[11]:

def rhoF(u): return np.arctan(u)
def rhoG(u): return 1/(1+u**2)
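# Before using these functions, it is worth checking the analytic gradients
# numerically. The following cell is an addition to the tour (a quick sanity
# check only, with indicative tolerances): it compares `lossG` and `rhoG`
# against centered finite differences.

# In[ ]:

eps = 1e-6
z  = np.random.randn(k,10)                  # random scores
y0 = SM(np.random.randn(10,k)).transpose()  # random target probabilities (columns sum to 1)
dz = np.random.randn(k,10)                  # random direction
fd = ( lossF(z+eps*dz,y0) - lossF(z-eps*dz,y0) ) / (2*eps)  # directional derivative by finite differences
print( 'loss gradient error: ' + str( np.abs( fd - dotp(lossG(z,y0),dz) ) ) )
u = np.linspace(-3,3,11)
print( 'activation gradient error: ' + str( np.abs( (rhoF(u+eps)-rhoF(u-eps))/(2*eps) - rhoG(u) ).max() ) )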
# Display the activation.

# In[12]:

t = np.linspace(-5,5,201)
plt.clf()
plt.plot(t, rhoF(t))
plt.axis('tight');

# Dimensions $d_r$ of the layers.

# In[13]:

D = np.array([p,15,k]) # here a single hidden layer

# Initialize the layers randomly.

# In[14]:

R = D.size-1
A = []
b = []
for r in np.arange(0,R):
    A.append(np.random.randn(D[r+1],D[r]))
    b.append(np.random.randn(D[r+1],1))

# Evaluate the network.
# Bookkeep the intermediate results: this is crucial for the computation of
# the gradient.

# In[15]:

def ForwardNN(A,b,Z,R):
    X = []
    X.append(Z)
    for r in np.arange(0,R):
        X.append( rhoF( np.dot(A[r],X[r]) + np.tile(b[r],[1,Z.shape[1]]) ) )
    return X

X = ForwardNN(A,b,x,R)
L = lossF(X[-1],Y)

# Network Optimization
# --------------------
# The network parameters are optimized by minimizing the non-convex
# empirical loss through gradient descent.
#
# Initialize the gradient as
# $$ \nabla_{x_R} \ell = \nabla \Ll(x_R,y). $$

# In[16]:

gx = lossG(X[R],Y)

# The successive gradients with respect to the intermediate variables $x_r$
# are solutions of a backward recursion, which
# corresponds to the celebrated backpropagation algorithm:
# $$ \forall r=R-1,\ldots,0, \quad
#       \nabla_{x_{r}} \ell = A_r^\top M_r $$
# where we denoted
# $$ M_r \eqdef \rho'(A_r x_r + b_r ) \odot \nabla_{x_{r+1}} \ell, $$
# $\odot$ being the entry-wise multiplication.
#
# From these gradients with respect to the intermediate variables, the
# gradients with respect to the network parameters are retrieved as
# $$ \nabla_{A_r} \ell = M_r x_r^\top, \qandq
#       \nabla_{b_r} \ell = M_r 1_n. $$
#
# Perform the back-propagation.

# In[17]:

def BackwardNN(A,b,X,R):
    gA = [] # gradients with respect to A
    gb = [] # gradients with respect to b
    for r in np.arange(0,R):
        gA.append([])
        gb.append([])
    gx = lossG(X[R],Y)  # initialize the gradient with nabla_{x_R}
    for r in np.arange(R-1,-1,-1):
        M = rhoG( A[r].dot(X[r]) + np.tile(b[r],[1,X[r].shape[1]]) ) * gx
        gx = A[r].transpose().dot(M)      # nabla_{x_r}
        gA[r] = M.dot(X[r].transpose())   # nabla_{A_r}
        gb[r] = MakeCol(M.sum(axis=1))    # nabla_{b_r}
    return [gA,gb]

# __Exercise 1__
#
# Implement the gradient descent.

# In[18]:

get_ipython().run_line_magic('run', '-i nt_solutions/ml_6_nn/exo1')

# In[19]:

## Insert your code here.
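# One possible minimal implementation of this gradient descent is sketched below
# (this cell is an addition; the step size `tau` and the number of iterations are
# arbitrary choices, and the reference solution remains the one loaded from
# `nt_solutions/ml_6_nn/exo1`).

# In[ ]:

tau = .01/n    # descent step size (assumed value)
niter = 4000   # number of iterations (assumed value)
Llist = np.zeros(niter)
for it in range(niter):
    X = ForwardNN(A,b,x,R)         # forward pass, storing the intermediate layers
    Llist[it] = lossF(X[-1],Y)     # record the current loss
    [gA,gb] = BackwardNN(A,b,X,R)  # backward pass
    for r in range(R):
        A[r] = A[r] - tau*gA[r]    # gradient step on the matrices
        b[r] = b[r] - tau*gb[r]    # gradient step on the biases
plt.clf()
plt.plot(Llist)
plt.xlabel('iteration')
plt.ylabel('loss');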
# Grid for evaluation.

# In[20]:

q = 100
t = np.linspace(-1,1,q)
[Yg,Xg] = np.meshgrid(t,t)
Z = np.vstack([Xg.flatten(), Yg.flatten()])

# Classification maps.

# In[21]:

V = ForwardNN(A,b,Z,R)
U = np.reshape(SM(V[R].transpose()), [q,q,k] )

# Turn it into color.

# In[22]:

C = np.zeros((q,q,3))
for i in np.arange(0,k):
    for a in np.arange(0,3):
        C[:,:,a] = C[:,:,a] + U[:,:,i] * col[a,i]

# Final display of the points and of the class probabilities.

# In[23]:

M = 1
plt.clf()
plt.imshow(C.transpose((1, 0, 2)), origin="lower", extent=[-M,M,-M,M])
for i in np.arange(0,k):
    I = find(y.flatten()==i)
    plt.plot(x[0,I], x[1,I], '.', color=col[:,i]*.8)
plt.axis('off');

# __Exercise 2__
#
# Check the influence of the number of layers.

# In[24]:

get_ipython().run_line_magic('run', '-i nt_solutions/ml_6_nn/exo2')

# In[25]:

## Insert your code here.

# Automatic Differentiation
# -------------------------
#
# Instead of computing the gradient "by hand", we use an automatic differentiation toolbox, here [AutoGrad](https://github.com/HIPS/autograd).
#
# It uses reverse-mode automatic differentiation to compute the gradient at the same expense as computing the function itself.
# See [this tutorial](https://rufflewind.com/2016-12-30/reverse-mode-automatic-differentiation) for more information.

# In[26]:

from __future__ import absolute_import
import autograd.numpy as np
import autograd as ag
from autograd import elementwise_grad as egrad

# Test on a scalar-valued function, computing and plotting higher-order derivatives.

# In[27]:

def tanh(x):
    return (1.0 - np.exp(-x)) / (1.0 + np.exp(-x))

t = np.linspace(-7, 7, 200)
plt.plot(t, tanh(t),
         t, egrad(tanh)(t),                                     # first derivative
         t, egrad(egrad(tanh))(t),                              # second derivative
         t, egrad(egrad(egrad(tanh)))(t),                       # third derivative
         t, egrad(egrad(egrad(egrad(tanh))))(t),                # fourth derivative
         t, egrad(egrad(egrad(egrad(egrad(tanh)))))(t),         # fifth derivative
         t, egrad(egrad(egrad(egrad(egrad(egrad(tanh))))))(t)); # sixth derivative

# Gradient of a quadratic function.

# In[28]:

M = np.random.randn(5,2)
def f(x):
    u = np.dot( M, x )
    return np.sum( u*u )

u = MakeCol( np.array( [1.0,2.0] ) )
print( 'f=' + str(f(u)) )
g = ag.grad(f)
print( 'nabla f=' + str( g( u ) ) )

# Function to be minimized for the neural network.

# In[29]:

def FuncNN(P): # P = (A,b)
    X = ForwardNN(P[0],P[1],x,R)
    return lossF(X[-1],Y)

# In[30]:

D = [p, 4, 5, 8, 4, k]
R = np.size(D)-1
A = []
b = []
for r in np.arange(0,R):
    A.append(np.random.randn(D[r+1],D[r]))
    b.append(np.random.randn(D[r+1],1))

# Compute the function and the gradient.

# In[31]:

FuncNNG = ag.value_and_grad(FuncNN)
[u,g] = FuncNNG((A,b))
gA = g[0] # gradient with respect to A
gb = g[1] # gradient with respect to b

# Compare with the gradient computed "by hand".

# In[32]:

X = ForwardNN(A,b,x,R)
[gA1,gb1] = BackwardNN(A,b,X,R)
R = np.size(A)
e = 0
for r in range(R):
    e += np.linalg.norm(gA[r]-gA1[r])
    e += np.linalg.norm(gb[r]-gb1[r])
print('Error=' + str(e))
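# As a possible follow-up (this cell is an addition to the tour), the autograd
# gradient can be plugged into the same plain gradient descent as before to train
# this deeper network; the step size `tau` and the number of iterations are
# arbitrary choices.

# In[ ]:

tau = .01/n    # descent step size (assumed value)
niter = 4000   # number of iterations (assumed value)
Lhist = np.zeros(niter)
for it in range(niter):
    [L_it, g] = FuncNNG((A,b))     # loss value and gradient with respect to (A,b)
    Lhist[it] = L_it
    for r in range(R):
        A[r] = A[r] - tau*g[0][r]  # gradient step on the matrices
        b[r] = b[r] - tau*g[1][r]  # gradient step on the biases
plt.clf()
plt.plot(Lhist)
plt.xlabel('iteration')
plt.ylabel('loss');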