#!/usr/bin/env python
# coding: utf-8

# # Implementing a Neural Network
# 
# *from Stanford CS231n assignment 2*
# 
# In this exercise we will develop a neural network with fully-connected layers to perform classification, and test it out on the CIFAR-10 dataset.

# In[1]:


# A bit of setup
import numpy as np
import matplotlib.pyplot as plt

get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

def rel_error(x, y):
  """ returns relative error """
  return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))


# In[2]:


from itertools import product
import pickle


# The neural network parameters will be stored in a dictionary (`model` below), where the keys are the parameter names and the values are numpy arrays. Below, we initialize toy data and a toy model that we will use to verify your implementations.

# In[3]:


# Create some toy data to check your implementations
input_size = 4
hidden_size = 10
num_classes = 3
num_inputs = 5

def init_toy_model():
    model = {}
    model['W1'] = np.linspace(-0.2, 0.6, num=input_size*hidden_size).reshape(input_size, hidden_size)
    model['b1'] = np.linspace(-0.3, 0.7, num=hidden_size)
    model['W2'] = np.linspace(-0.4, 0.1, num=hidden_size*num_classes).reshape(hidden_size, num_classes)
    model['b2'] = np.linspace(-0.5, 0.9, num=num_classes)
    return model

def init_toy_data():
    X = np.linspace(-0.2, 0.5, num=num_inputs*input_size).reshape(num_inputs, input_size)
    y = np.array([0, 1, 2, 2, 1])
    return X, y

model = init_toy_model()
X, y = init_toy_data()


# # Forward pass: compute scores
# Open the file `cs231n/classifiers/neural_net.py` and look at the function `two_layer_net`. This function is very similar to the loss functions you have written for the SVM and Softmax exercises: It takes the data and weights and computes the class scores, the loss, and the gradients on the parameters. 
# 
# Implement the first part of the forward pass which uses the weights and biases to compute the scores for all inputs.

# In[4]:


def two_layer_net(X, model, y=None, reg=0.0):
    """Compute the loss and gradients for a two layer fully connected NN.

    The net has an input dimension of D, a hidden layer dimension of H,
    and performs classification over C classes. We use a softmax loss function
    and L2 regularization the the weight matrices. The two layer net should
    use a ReLU nonlinearity after the first affine layer.

    The two layer net has the following architecture:

        input - fully connected layer - ReLU - fully connected layer - softmax

    The outputs of the second fully-connected layer are the scores for each
    class.

    Inputs:
    - X: Input data of shape (N, D). Each X[i] is a training sample.
    - model: Dictionary mapping parameter names to arrays of parameter values.
      It should contain the following:
      - W1: First layer weights; has shape (D, H)
      - b1: First layer biases; has shape (H,)
      - W2: Second layer weights; has shape (H, C)
      - b2: Second layer biases; has shape (C,)
    - y: Vector of training labels. y[i] is the label for X[i], and each y[i]
      is an integer in the range 0 <= y[i] < C. This parameter is optional;
      if it is not passed then we only return scores, and if it is passed then
      we instead return the loss and gradients.
    - reg: Regularization strength.

    Returns:
    If y is not passed, return a matrix scores of shape (N, C) where
    scores[i, c] is the score for class c on input X[i].

    If y is passed, instead return a tuple of:
    - loss: Loss (data loss and regularization loss) for this batch of training
      samples.
    - grads: Dictionary mapping parameter names to gradients of those
      parameters with respect to the loss function. This should have the same
      keys as model.
    """

    # unpack variables from the model dictionary
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    N, D = X.shape

    # compute the forward pass
    scores = None   # shape (N, C)

    # Layer 1
    # ReLU forward implementation
    # Ref: http://cs231n.github.io/neural-networks-1/
    s1 = X.dot(W1) + b1  # shape (N, H)
    resp1 = np.where(s1 > 0, s1, 0)  # shape (N, H)

    # Layer 2
    s2 = resp1.dot(W2) + b2  # shape (N, C)
    scores = s2

    # If the targets are not given then jump out, we're done
    if y is None:
        return scores

    # compute the loss
    loss = None
    f = scores.T - np.max(scores, axis=1)  # shape (C, N)
    f = np.exp(f)
    p = f / np.sum(f, axis=0)  # shape (C, N)

    # loss function
    _sample_ix = np.arange(N)
    loss = np.mean(-np.log(p[y, _sample_ix]))
    loss += (0.5 * reg) * np.sum(W1 * W1)
    loss += (0.5 * reg) * np.sum(W2 * W2)

    # compute the gradients
    grads = {}

    df = p  # (C, N)
    df[y, _sample_ix] -= 1
    # (H, C) = ((C, N) x (N, H)).T
    dW2 = df.dot(resp1).T / N  # (H, C)
    dW2 += reg * W2
    grads['W2'] = dW2

    # C = (C, N)
    db2 = np.mean(df, axis=1)  # C
    grads['b2'] = db2

    # (N, H) =  (H, C)
    dresp1 = W2.dot(df).T / N
    ds1 = np.where(s1 > 0, dresp1, 0)  # (N, H)
    dW1 = X.T.dot(ds1)  # (D, H)
    dW1 += reg * W1
    grads['W1'] = dW1

    db1 = np.sum(ds1, axis=0)  # H
    grads['b1'] = db1
    return loss, grads


# In[5]:


scores = two_layer_net(X, model)
print(scores)
correct_scores = [[-0.5328368, 0.20031504, 0.93346689],
 [-0.59412164, 0.15498488, 0.9040914 ],
 [-0.67658362, 0.08978957, 0.85616275],
 [-0.77092643, 0.01339997, 0.79772637],
 [-0.89110401, -0.08754544, 0.71601312]]

# the difference should be very small. We get 3e-8
print('Difference between your scores and correct scores:')
print(np.sum(np.abs(scores - correct_scores)))


# # Forward pass: compute loss
# In the same function, implement the second part that computes the data and regularizaion loss.

# In[6]:


reg = 0.1
loss, _ = two_layer_net(X, model, y, reg)
print(loss)
correct_loss = 1.38191946092

# should be very small, we get 5e-12
print('Difference between your loss and correct loss:')
print(np.sum(np.abs(loss - correct_loss)))


# # Backward pass
# Implement the rest of the function. This will compute the gradient of the loss with respect to the variables `W1`, `b1`, `W2`, and `b2`. Now that you (hopefully!) have a correctly implemented forward pass, you can debug your backward pass using a numeric gradient check:

# In[7]:


from cs231n.gradient_check import eval_numerical_gradient

# Use numeric gradient checking to check your implementation of the backward pass.
# If your implementation is correct, the difference between the numeric and
# analytic gradients should be less than 1e-8 for each of W1, W2, b1, and b2.

loss, grads = two_layer_net(X, model, y, reg)

# these should all be less than 1e-8 or so
for param_name in grads:
    param_grad_num = eval_numerical_gradient(
        lambda W: two_layer_net(X, model, y, reg)[0],
        model[param_name], 
        verbose=False
    )
    print('%s max relative error: %e' % (param_name, rel_error(param_grad_num, grads[param_name])))


# # Train the network
# To train the network we will use SGD with Momentum. Last assignment you implemented vanilla SGD. You will now implement the momentum update and the RMSProp update. Open the file `classifier_trainer.py` and familiarze yourself with the `ClassifierTrainer` class. It performs optimization given an arbitrary cost function data, and model. By default it uses vanilla SGD, which we have already implemented for you. First, run the optimization below using Vanilla SGD:

# In[8]:


from cs231n.classifier_trainer import ClassifierTrainer

model = init_toy_model()
trainer = ClassifierTrainer()
# call the trainer to optimize the loss
# Notice that we're using sample_batches=False, so we're performing Gradient Descent (no sampled batches of data)
best_model, loss_history, _, _ = trainer.train(X, y, X, y,
                                             model, two_layer_net,
                                             reg=0.001,
                                             learning_rate=1e-1, momentum=0.0, learning_rate_decay=1,
                                             update='sgd', sample_batches=False,
                                             num_epochs=100,
                                             verbose=False)
print('Final loss with vanilla SGD: %f' % (loss_history[-1], ))


# Now fill in the **momentum update** in the first missing code block inside the `train` function, and run the same optimization as above but with the momentum update. You should see a much better result in the final obtained loss:

# In[9]:


model = init_toy_model()
trainer = ClassifierTrainer()
# call the trainer to optimize the loss
# Notice that we're using sample_batches=False, so we're performing Gradient Descent (no sampled batches of data)
best_model, loss_history, _, _ = trainer.train(X, y, X, y,
                                             model, two_layer_net,
                                             reg=0.001,
                                             learning_rate=1e-1, momentum=0.9, learning_rate_decay=1,
                                             update='momentum', sample_batches=False,
                                             num_epochs=100,
                                             verbose=False)
correct_loss = 0.494394
print('Final loss with momentum SGD: %f. We get: %f' % (loss_history[-1], correct_loss))


# Now also implement the **RMSProp** update rule inside the `train` function and rerun the optimization:

# In[10]:


model = init_toy_model()
trainer = ClassifierTrainer()
# call the trainer to optimize the loss
# Notice that we're using sample_batches=False, so we're performing Gradient Descent (no sampled batches of data)
best_model, loss_history, _, _ = trainer.train(X, y, X, y,
                                             model, two_layer_net,
                                             reg=0.001,
                                             learning_rate=1e-1, momentum=0.9, learning_rate_decay=1,
                                             update='rmsprop', sample_batches=False,
                                             num_epochs=100,
                                             verbose=False)
correct_loss = 0.439368
print('Final loss with RMSProp: %f. We get: %f' % (loss_history[-1], correct_loss))


# # Load the data
# Now that you have implemented a two-layer network that passes gradient checks, it's time to load up our favorite CIFAR-10 data so we can use it to train a classifier.

# In[11]:


from cs231n.data_utils import load_CIFAR10

def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=1000):
    """
    Load the CIFAR-10 dataset from disk and perform preprocessing to prepare
    it for the two-layer neural net classifier. These are the same steps as
    we used for the SVM, but condensed to a single function.  
    """
    # Load the raw CIFAR-10 data
    cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'
    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)
        
    # Subsample the data
    mask = range(num_training, num_training + num_validation)
    X_val = X_train[mask]
    y_val = y_train[mask]
    mask = range(num_training)
    X_train = X_train[mask]
    y_train = y_train[mask]
    mask = range(num_test)
    X_test = X_test[mask]
    y_test = y_test[mask]

    # Normalize the data: subtract the mean image
    mean_image = np.mean(X_train, axis=0)
    X_train -= mean_image
    X_val -= mean_image
    X_test -= mean_image

    # Reshape data to rows
    X_train = X_train.reshape(num_training, -1)
    X_val = X_val.reshape(num_validation, -1)
    X_test = X_test.reshape(num_test, -1)

    return X_train, y_train, X_val, y_val, X_test, y_test


# Invoke the above function to get our data.
X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data()
print('Train data shape: ', X_train.shape)
print('Train labels shape: ', y_train.shape)
print('Validation data shape: ', X_val.shape)
print('Validation labels shape: ', y_val.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)


# # Train a network
# To train our network we will use SGD with momentum. In addition, we will adjust the learning rate with an exponential learning rate schedule as optimization proceeds; after each epoch, we will reduce the learning rate by multiplying it by a decay rate.

# In[12]:


from IPython.html import widgets
from IPython.display import display as ipydisplay
from cs231n.vis_utils import ParametersInspectorWindow


# In[13]:


def init_two_layer_model(input_size, hidden_size, output_size):
    """Initialize the weights and biases for a two-layer fully connected
    neural network.

    The net has an input dimension of D, a hidden layer dimension of H, and
    performs classification over C classes. Weights are initialized to small
    random values and biases are initialized to zero.

    Inputs:
    - input_size: The dimension D of the input data
    - hidden_size: The number of neurons H in the hidden layer
    - ouput_size: The number of classes C

    Returns:
    A dictionary mapping parameter names to arrays of parameter values.
    It has the following keys:
        - W1: First layer weights; has shape (D, H)
        - b1: First layer biases; has shape (H,)
        - W2: Second layer weights; has shape (H, C)
        - b2: Second layer biases; has shape (C,)
    """
    # initialize a model
    model = {}
    model['W1'] = 0.00001 * np.random.randn(input_size, hidden_size)
    model['b1'] = np.zeros(hidden_size)
    model['W2'] = 0.00001 * np.random.randn(hidden_size, output_size)
    model['b2'] = np.zeros(output_size)
    return model


# In[14]:


w = widgets.IntProgress()
ipydisplay(w)

model = init_two_layer_model(32*32*3, 50, 10) # input size, hidden size, number of classes
trainer = ClassifierTrainer()
best_model, loss_history, train_acc, val_acc = trainer.train(
    X_train, y_train, X_val, y_val,
    model, two_layer_net,
    # parameters to be tuned
    num_epochs=7, reg=1,
    momentum=0.9, learning_rate_decay = 0.95,
    learning_rate=2.5e-5, 
    # end of parameters
    progress_bar=w, verbose=True
)


# In[15]:


train_acc, val_acc


# # Debug the training
# With the default parameters we provided above, you should get a validation accuracy of about 0.37 on the validation set. This isn't very good.
# 
# One strategy for getting insight into what's wrong is to plot the loss function and the accuracies on the training and validation sets during optimization.
# 
# Another strategy is to visualize the weights that were learned in the first layer of the network. In most neural networks trained on visual data, the first layer weights typically show some visible structure when visualized.

# In[16]:


# Plot the loss function and train / validation accuracies
def vis_history(loss_history, train_acc, val_acc):
    fig = plt.figure()
    plt.subplot(2, 1, 1)
    plt.plot(loss_history)
    plt.title('Loss history')
    plt.xlabel('Iteration')
    plt.ylabel('Loss')

    plt.subplot(2, 1, 2)
    plt.plot(train_acc)
    plt.plot(val_acc)
    plt.legend(['Training accuracy', 'Validation accuracy'], loc='lower right')
    plt.xlabel('Epoch')
    plt.ylabel('Clasification accuracy')
    return fig


# In[17]:


fig = vis_history(loss_history, train_acc, val_acc)


# In[18]:


from cs231n.vis_utils import visualize_grid

# Visualize the weights of the network

def show_net_weights(model):
    plt.imshow(visualize_grid(model['W1'].T.reshape(-1, 32, 32, 3), padding=3).astype('uint8'))
    plt.gca().axis('off')
    plt.show()

show_net_weights(best_model)


# # Run on the test set
# When you are done experimenting, you should evaluate your final trained network on the test set. 
# 
# **We will give you extra bonus point for every 1% of accuracy above 56%.**

# In[19]:


scores_test = two_layer_net(X_test, best_model)
print('Test accuracy: ', np.mean(np.argmax(scores_test, axis=1) == y_test))