#!/usr/bin/env python
# coding: utf-8

# In[1]:

import numpy as np
import theano
import theano.tensor as T
import lasagne

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

import gzip
import pickle


# In[2]:

# Seed for reproducibility
np.random.seed(42)


# In[3]:

# Download the MNIST digits dataset
get_ipython().system('wget -N http://deeplearning.net/data/mnist/mnist.pkl.gz')


# In[4]:

# Load the training, validation and test splits as numpy arrays.
# (Under Python 3, this Python 2 pickle may need pickle.load(..., encoding='latin-1').)
train, val, test = pickle.load(gzip.open('mnist.pkl.gz'))

X_train, y_train = train
X_val, y_val = val


# In[5]:

# The original 28x28 pixel images are flattened into 784-dimensional feature vectors
X_train.shape


# In[6]:

# Plot the first few examples
plt.figure(figsize=(12, 3))
for i in range(10):
    plt.subplot(1, 10, i + 1)
    plt.imshow(X_train[i].reshape((28, 28)), cmap='gray', interpolation='nearest')
    plt.axis('off')


# In[7]:

# For training, we want to sample examples at random in small batches
def batch_gen(X, y, N):
    while True:
        idx = np.random.choice(len(y), N)
        yield X[idx].astype('float32'), y[idx].astype('int32')


# In[8]:

# A very simple network: a single layer with one neuron per target class.
# Using the softmax activation function gives us a probability distribution at the output.
l_in = lasagne.layers.InputLayer((None, 784))
l_out = lasagne.layers.DenseLayer(
    l_in,
    num_units=10,
    nonlinearity=lasagne.nonlinearities.softmax)


# In[9]:

# Symbolic variables for our input features and targets
X_sym = T.matrix()
y_sym = T.ivector()


# In[10]:

# Theano expressions for the output distribution and predicted class
output = lasagne.layers.get_output(l_out, X_sym)
pred = output.argmax(-1)


# In[11]:

# The loss function is cross-entropy averaged over a minibatch;
# we also compute accuracy as an evaluation metric.
loss = T.mean(lasagne.objectives.categorical_crossentropy(output, y_sym))
acc = T.mean(T.eq(pred, y_sym))


# In[12]:

# We retrieve all the trainable parameters in our network - a single weight matrix and bias vector
params = lasagne.layers.get_all_params(l_out)
print(params)


# In[13]:

# Compute the gradient of the loss function with respect to the parameters.
# The stochastic gradient descent algorithm produces an update expression for each parameter.
grad = T.grad(loss, params)
updates = lasagne.updates.sgd(grad, params, learning_rate=0.05)
print(updates)


# In[14]:

# We define a training function that will compute the loss and accuracy, and take a single optimization step
f_train = theano.function([X_sym, y_sym], [loss, acc], updates=updates)


# In[15]:

# The validation function is similar, but does not update the parameters
f_val = theano.function([X_sym, y_sym], [loss, acc])


# In[16]:

# The prediction function doesn't require targets, and outputs only the predicted class values
f_predict = theano.function([X_sym], pred)


# In[17]:

# We'll choose a batch size, and calculate the number of batches in an "epoch"
# (approximately one pass through the data).
BATCH_SIZE = 64
N_BATCHES = len(X_train) // BATCH_SIZE
N_VAL_BATCHES = len(X_val) // BATCH_SIZE


# In[18]:

# Minibatch generators for the training and validation sets
train_batches = batch_gen(X_train, y_train, BATCH_SIZE)
val_batches = batch_gen(X_val, y_val, BATCH_SIZE)
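# In[ ]:

# A minimal NumPy sketch of what the layer from In[8] and the loss from In[11]
# compute: a softmax over the 10 classes and the averaged cross-entropy.
# This cell is illustrative only; softmax_np, W_demo and b_demo are names
# introduced here, not part of the model above. With random weights the class
# probabilities are nearly uniform, so the loss should come out near
# -log(0.1) ~= 2.3.
def softmax_np(z):
    # Subtract the row-wise max for numerical stability before exponentiating
    e = np.exp(z - z.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

Xb, yb = next(train_batches)
W_demo = 0.01 * np.random.randn(784, 10).astype('float32')
b_demo = np.zeros(10, dtype='float32')

probs = softmax_np(Xb.dot(W_demo) + b_demo)            # (BATCH_SIZE, 10) class probabilities
xent = -np.log(probs[np.arange(len(yb)), yb]).mean()   # cross-entropy against the integer targets
print(probs.shape, xent)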
# In[19]:

# Try sampling from the batch generator.
# Plot an image and corresponding label to verify they match.
X, y = next(train_batches)
plt.imshow(X[0].reshape((28, 28)), cmap='gray', interpolation='nearest')
print(y[0])


# In[20]:

# For each epoch, we call the training function N_BATCHES times,
# accumulating an estimate of the training loss and accuracy.
# Then we do the same thing for the validation set.
# Plotting the ratio of val to train loss can help recognize overfitting.
for epoch in range(10):
    train_loss = 0
    train_acc = 0
    for _ in range(N_BATCHES):
        X, y = next(train_batches)
        loss, acc = f_train(X, y)
        train_loss += loss
        train_acc += acc
    train_loss /= N_BATCHES
    train_acc /= N_BATCHES

    val_loss = 0
    val_acc = 0
    for _ in range(N_VAL_BATCHES):
        X, y = next(val_batches)
        loss, acc = f_val(X, y)
        val_loss += loss
        val_acc += acc
    val_loss /= N_VAL_BATCHES
    val_acc /= N_VAL_BATCHES

    print('Epoch {}, Train (val) loss {:.03f} ({:.03f}) ratio {:.03f}'.format(
        epoch, train_loss, val_loss, val_loss / train_loss))
    print('Train (val) accuracy {:.03f} ({:.03f})'.format(train_acc, val_acc))


# In[21]:

# We can retrieve the value of the trained weight matrix from the output layer.
# It can be interpreted as a collection of images, one per class.
weights = l_out.W.get_value()
print(weights.shape)


# In[22]:

# Plotting the weight images, we can recognize similarities to the target images
plt.figure(figsize=(12, 3))
for i in range(10):
    plt.subplot(1, 10, i + 1)
    plt.imshow(weights[:, i].reshape((28, 28)), cmap='gray', interpolation='nearest')
    plt.axis('off')


# Exercises
# =====

# 1. Logistic regression
# ----------------------
#
# The simple network we created is similar to a logistic regression model. Verify that the accuracy is close to that of `sklearn.linear_model.LogisticRegression`.

# In[23]:

# Uncomment and execute this cell for an example solution
#%load spoilers/logreg.py


# 2. Hidden layer
# ---------------
#
# Try adding one or more "hidden" `DenseLayers` between the input and output. Experiment with different numbers of hidden units.

# In[24]:

# Uncomment and execute this cell for an example solution
#%load spoilers/hiddenlayer.py


# 3. Optimizer
# ------------
#
# Try one of the other algorithms available in `lasagne.updates`. You may also want to adjust the learning rate.
# Visualize and compare the trained weights. Different optimization trajectories may lead to very different results, even if the performance is similar. This can be important when training more complicated networks.

# In[25]:

# Uncomment and execute this cell for an example solution
# %load spoilers/optimizer.py
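# In[ ]:

# A minimal sketch of one way to start Exercise 3 (separate from the spoiler
# solution above): swap plain SGD for Adam from lasagne.updates and recompile
# the training function. The learning rate is Adam's default, not a tuned value.
# The symbolic loss and accuracy are rebuilt here because the training loop in
# In[20] reused the names `loss` and `acc` for numeric batch results.
loss_expr = T.mean(lasagne.objectives.categorical_crossentropy(output, y_sym))
acc_expr = T.mean(T.eq(pred, y_sym))

updates_adam = lasagne.updates.adam(loss_expr, params, learning_rate=0.001)
f_train_adam = theano.function([X_sym, y_sym], [loss_expr, acc_expr], updates=updates_adam)

# Rerun the training loop from In[20] with f_train_adam in place of f_train
# (to compare fairly with SGD, re-create the network and its expressions first
# so the weights start from a fresh initialization), then visualize the trained
# weights again as in In[21]-In[22].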