#!/usr/bin/env python
# coding: utf-8

# # Logistic Regression in Theano
#
# Credits: Forked from [summerschool2015](https://github.com/mila-udem/summerschool2015) by mila-udem
#
# This notebook is inspired by [the tutorial on logistic regression](http://deeplearning.net/tutorial/logreg.html) on [deeplearning.net](http://deeplearning.net).
#
# In this notebook, we show how Theano can be used to implement the most basic classifier: **logistic regression**. We start off with a quick primer on the model, which serves both as a refresher and as a way to anchor the notation and show how mathematical expressions are mapped onto Theano graphs.
#
# In the deepest of machine learning traditions, this tutorial will tackle the exciting problem of MNIST digit classification.

# ## Get the data
#
# First, let's download a pre-packaged version of MNIST, and load each split of the dataset as NumPy ndarrays.

# In[ ]:

import os
import requests
import gzip
import six
from six.moves import cPickle

if not os.path.exists('mnist.pkl.gz'):
    r = requests.get('http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz')
    with open('mnist.pkl.gz', 'wb') as data_file:
        data_file.write(r.content)

with gzip.open('mnist.pkl.gz', 'rb') as data_file:
    if six.PY3:
        train_set, valid_set, test_set = cPickle.load(data_file, encoding='latin1')
    else:
        train_set, valid_set, test_set = cPickle.load(data_file)

train_set_x, train_set_y = train_set
valid_set_x, valid_set_y = valid_set
test_set_x, test_set_y = test_set


# ## The model
#
# Logistic regression is a probabilistic, linear classifier. It is parametrized by a weight matrix $W$ and a bias vector $b$. Classification is done by projecting an input vector onto a set of hyperplanes, each of which corresponds to a class. The distance from the input to a hyperplane reflects the probability that the input is a member of the corresponding class.
#
# Mathematically, the probability that an input vector $x$ is a member of class $i$, a value of a stochastic variable $Y$, can be written as:
#
# $$P(Y=i|x, W, b) = \operatorname{softmax}_i(W x + b) = \frac{e^{W_i x + b_i}}{\sum_j e^{W_j x + b_j}}$$
#
# The model's prediction $y_{pred}$ is the class whose probability is maximal, specifically:
#
# $$y_{pred} = \operatorname{argmax}_i P(Y=i|x, W, b)$$
#
# Now, let us define our input variables. First, we need to define the dimensions of our tensors:
# - `n_in` is the length of each training vector,
# - `n_out` is the number of classes.
#
# Our variables will be:
# - `x` is a matrix, where each row contains a different example of the dataset. Its shape is `(batch_size, n_in)`, but `batch_size` does not have to be specified in advance, and can change during training.
# - `W` is a shared matrix, of shape `(n_in, n_out)`, initialized with zeros. Column `k` of `W` represents the separation hyperplane for class `k`.
# - `b` is a shared vector, of length `n_out`, initialized with zeros. Element `k` of `b` represents the free parameter of hyperplane `k`.

# In[ ]:

import numpy
import theano
from theano import tensor

# Size of the data
n_in = 28 * 28
# Number of classes
n_out = 10

x = tensor.matrix('x')
W = theano.shared(value=numpy.zeros((n_in, n_out), dtype=theano.config.floatX),
                  name='W',
                  borrow=True)
b = theano.shared(value=numpy.zeros((n_out,), dtype=theano.config.floatX),
                  name='b',
                  borrow=True)
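# Shared variables such as `W` and `b` hold state that persists across function calls. As a quick optional check (not part of the original tutorial), we can read their current contents back as NumPy arrays with `get_value()`:

# In[ ]:

print(W.get_value().shape)  # (784, 10), all zeros at this point
print(b.get_value().shape)  # (10,)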
# Now, we can build a symbolic expression for the matrix of class-membership probabilities (`p_y_given_x`), and for the class whose probability is maximal (`y_pred`).

# In[ ]:

p_y_given_x = tensor.nnet.softmax(tensor.dot(x, W) + b)
y_pred = tensor.argmax(p_y_given_x, axis=1)


# ## Defining a loss function
#
# Learning optimal model parameters involves minimizing a loss function. In the case of multi-class logistic regression, it is very common to use the negative log-likelihood as the loss. This is equivalent to maximizing the likelihood of the data set $\mathcal{D}$ under the model parameterized by $\theta$. Let us first define the likelihood $\mathcal{L}$ and the loss $\ell$:
#
# $$\mathcal{L}(\theta=\{W,b\}, \mathcal{D}) = \sum_{i=0}^{|\mathcal{D}|-1} \log P(Y=y^{(i)} | x^{(i)}, W, b)$$
#
# $$\ell(\theta=\{W,b\}, \mathcal{D}) = - \mathcal{L}(\theta=\{W,b\}, \mathcal{D})$$
#
# Again, we will express these quantities in Theano. We have one additional input, the actual target class `y`:
# - `y` is an input vector of integers, of length `batch_size` (which will have to match the length of `x` at runtime). The length of `y` can be symbolically expressed by `y.shape[0]`.
# - `log_prob` is a `(batch_size, n_out)` matrix containing the log-probabilities of class membership for each example.
# - `arange(y.shape[0])` is a symbolic vector which will contain `[0, 1, 2, ..., batch_size - 1]`.
# - `log_likelihood` is a vector containing the log-probability of the target class, for each example.
# - `loss` is the mean of the negative `log_likelihood` over the examples in the minibatch.

# In[ ]:

y = tensor.lvector('y')
log_prob = tensor.log(p_y_given_x)
log_likelihood = log_prob[tensor.arange(y.shape[0]), y]
loss = - log_likelihood.mean()


# ## Training procedure
#
# This notebook will use the method of stochastic gradient descent with mini-batches (MSGD) to find values of `W` and `b` that minimize the loss.
#
# We can let Theano compute symbolic expressions for the gradient of the loss with respect to `W` and `b`.

# In[ ]:

g_W, g_b = theano.grad(cost=loss, wrt=[W, b])


# `g_W` and `g_b` are symbolic variables, which can be used as part of a computation graph. In particular, let us define the expressions for one step of gradient descent for `W` and `b`, for a fixed learning rate.

# In[ ]:

learning_rate = numpy.float32(0.13)
new_W = W - learning_rate * g_W
new_b = b - learning_rate * g_b


# We can then define **update expressions**, or pairs of (shared variable, expression for its update), that we will use when compiling the Theano function. The updates will be performed each time the function gets called.
#
# The following function, `train_model`, returns the loss on the current minibatch, then changes the values of the shared variables according to the update rules. It needs to be passed `x` and `y` as inputs, but not the shared variables, which are implicit inputs.
#
# The entire learning algorithm thus consists of looping over the dataset, one minibatch at a time, and repeatedly calling the `train_model` function.

# In[ ]:

train_model = theano.function(inputs=[x, y],
                              outputs=loss,
                              updates=[(W, new_W), (b, new_b)])
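# As a quick sanity check (not part of the original tutorial), we can evaluate the loss on one minibatch before any training, by compiling a function with no `updates` so that `W` and `b` are left untouched. Since both are still zero, the softmax output is uniform over the 10 classes, so the loss should be close to $\ln(10) \approx 2.30$.

# In[ ]:

# Same graph as train_model, but without the update pairs, so no learning happens
eval_loss = theano.function(inputs=[x, y], outputs=loss)
print(eval_loss(train_set_x[:100], train_set_y[:100]))  # approximately 2.30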
# ### Testing the model
#
# When testing the model, we are interested in the misclassification rate (and not only in the likelihood). Here, we build a symbolic expression for the misclassification rate on a minibatch.
#
# This will also be applied to the validation and test sets, in order to monitor the progress of the model during training, and to do early stopping.

# In[ ]:

misclass_nb = tensor.neq(y_pred, y)
misclass_rate = misclass_nb.mean()

test_model = theano.function(inputs=[x, y],
                             outputs=misclass_rate)


# ## Training the model
#
# Here is the main training loop of the algorithm:
# - For each *epoch*, or pass through the training set:
#   - split the training set into minibatches, and call `train_model` on each minibatch
#   - split the validation set into minibatches, and call `test_model` on each minibatch to measure the misclassification rate
#   - if the misclassification rate has not improved in a while, stop training
# - Measure performance on the test set
#
# The **early stopping procedure** is what decides whether performance has improved enough. There are many variants, and we will not go into the details of this one here.
#
# We first need to define a few parameters for the training loop and the early stopping procedure.

# In[ ]:

## Define a couple of helper variables and functions for the optimization
batch_size = 500

# compute the number of minibatches for training, validation and testing
n_train_batches = train_set_x.shape[0] // batch_size
n_valid_batches = valid_set_x.shape[0] // batch_size
n_test_batches = test_set_x.shape[0] // batch_size


def get_minibatch(i, dataset_x, dataset_y):
    start_idx = i * batch_size
    end_idx = (i + 1) * batch_size
    batch_x = dataset_x[start_idx:end_idx]
    batch_y = dataset_y[start_idx:end_idx]
    return (batch_x, batch_y)


## early-stopping parameters
# maximum number of epochs
n_epochs = 1000
# look at this many minibatches regardless
patience = 5000
# wait this much longer when a new best is found
patience_increase = 2
# a relative improvement of this much is considered significant
improvement_threshold = 0.995
# go through this many minibatches before checking the network on the validation set;
# in this case we check every epoch
validation_frequency = min(n_train_batches, patience // 2)
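# To make the schedule concrete (a small optional check, not part of the original tutorial): with the 50,000 training examples of this MNIST split and a batch size of 500, there are 100 training minibatches, so `validation_frequency = min(100, 2500) = 100` and the validation set is evaluated once per epoch.

# In[ ]:

print('n_train_batches:', n_train_batches)
print('validation_frequency:', validation_frequency)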
# In[ ]:

import timeit
from six.moves import xrange

best_validation_loss = numpy.inf
test_score = 0.
start_time = timeit.default_timer()

done_looping = False
epoch = 0

while (epoch < n_epochs) and (not done_looping):
    epoch = epoch + 1
    for minibatch_index in xrange(n_train_batches):
        minibatch_x, minibatch_y = get_minibatch(minibatch_index, train_set_x, train_set_y)
        minibatch_avg_cost = train_model(minibatch_x, minibatch_y)

        # iteration number
        iter = (epoch - 1) * n_train_batches + minibatch_index
        if (iter + 1) % validation_frequency == 0:
            # compute zero-one loss on validation set
            validation_losses = []
            for i in xrange(n_valid_batches):
                valid_xi, valid_yi = get_minibatch(i, valid_set_x, valid_set_y)
                validation_losses.append(test_model(valid_xi, valid_yi))
            this_validation_loss = numpy.mean(validation_losses)

            print('epoch %i, minibatch %i/%i, validation error %f %%' %
                  (epoch,
                   minibatch_index + 1,
                   n_train_batches,
                   this_validation_loss * 100.))

            # if we got the best validation score until now
            if this_validation_loss < best_validation_loss:
                # improve patience if loss improvement is good enough
                if this_validation_loss < best_validation_loss * improvement_threshold:
                    patience = max(patience, iter * patience_increase)

                best_validation_loss = this_validation_loss

                # test it on the test set
                test_losses = []
                for i in xrange(n_test_batches):
                    test_xi, test_yi = get_minibatch(i, test_set_x, test_set_y)
                    test_losses.append(test_model(test_xi, test_yi))
                test_score = numpy.mean(test_losses)

                print('    epoch %i, minibatch %i/%i, test error of best model %f %%' %
                      (epoch,
                       minibatch_index + 1,
                       n_train_batches,
                       test_score * 100.))

                # save the best parameters
                numpy.savez('best_model.npz', W=W.get_value(), b=b.get_value())

        if patience <= iter:
            done_looping = True
            break

end_time = timeit.default_timer()
print('Optimization complete with best validation score of %f %%, '
      'with test performance %f %%' %
      (best_validation_loss * 100., test_score * 100.))
print('The code ran for %d epochs, with %f epochs/sec' %
      (epoch, 1. * epoch / (end_time - start_time)))


# ## Visualization
#
# You can visualize the columns of `W`, which correspond to the separation hyperplanes for each class.

# In[ ]:

get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
from utils import tile_raster_images

plt.clf()

# Increase the size of the figure
plt.gcf().set_size_inches(15, 10)

plot_data = tile_raster_images(W.get_value(borrow=True).T,
                               img_shape=(28, 28),
                               tile_shape=(2, 5),
                               tile_spacing=(1, 1))
plt.imshow(plot_data, cmap='Greys', interpolation='none')
plt.axis('off')
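# Since the best parameters were saved to `best_model.npz` during training, here is a minimal sketch (not part of the original tutorial) of how they could be reloaded and used to classify new images:

# In[ ]:

best = numpy.load('best_model.npz')
W.set_value(best['W'])
b.set_value(best['b'])

# Compile a prediction function that maps a batch of images to predicted digit classes
predict_model = theano.function(inputs=[x], outputs=y_pred)
print(predict_model(test_set_x[:10]))  # predicted digits for the first 10 test images
print(test_set_y[:10])                 # ground-truth labels, for comparison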