# Softmax exercise¶

Adapted from Stanford CS231n assignment 1; the original version can be found on the course website.

In this exercise we will:

• implement a fully-vectorized loss function for the Softmax classifier
• implement the fully-vectorized expression for its analytic gradient
• use a validation set to tune the learning rate and regularization strength
• optimize the loss function with SGD
• visualize the final learned weights
In :
import random
import time

import numpy as np
import matplotlib.pyplot as plt

# plotting setting
# IPython magic: render figures inline in the notebook
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'


Load the data and split into training / validation / testing datasets. See notebook 00 for more details.

In :
def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=1000):
    """
    Load the CIFAR-10 dataset from disk and perform preprocessing to prepare
    it for the linear classifier. These are the same steps as we used for the
    SVM, but condensed to a single function.

    Inputs:
    - num_training: number of training samples to keep
    - num_validation: number of validation samples (taken from the tail of
      the raw training set)
    - num_test: number of test samples to keep

    Returns a tuple (X_train, y_train, X_val, y_val, X_test, y_test) where
    each X is a D x N array of columns (D = 3073 including the bias
    dimension) and each y is a 1-d label array.
    """
    # Load the raw CIFAR-10 data
    cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'
    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

    # Subsample the data: the validation set is carved out of the tail of
    # the raw training set, then train/test are trimmed to size.
    # (bug fix: the mask was computed but never applied, leaving X_val/y_val
    # undefined)
    mask = range(num_training, num_training + num_validation)
    X_val = X_train[mask]
    y_val = y_train[mask]
    mask = range(num_training)
    X_train = X_train[mask]
    y_train = y_train[mask]
    mask = range(num_test)
    X_test = X_test[mask]
    y_test = y_test[mask]

    # Preprocessing: reshape the image data into rows
    # (bug fix: use shape[0] — the sample count — not the whole shape tuple)
    X_train = np.reshape(X_train, (X_train.shape[0], -1))
    X_val = np.reshape(X_val, (X_val.shape[0], -1))
    X_test = np.reshape(X_test, (X_test.shape[0], -1))

    # Normalize the data: subtract the mean image computed on training data
    mean_image = np.mean(X_train, axis=0)
    X_train -= mean_image
    X_val -= mean_image
    X_test -= mean_image

    # Add a constant bias dimension and transform into columns (D x N)
    X_train = np.hstack([X_train, np.ones((X_train.shape[0], 1))]).T
    X_val = np.hstack([X_val, np.ones((X_val.shape[0], 1))]).T
    X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))]).T

    return X_train, y_train, X_val, y_val, X_test, y_test

# Invoke the above function to get our data.
# Expected shapes: X_* are D x N column matrices (D = 3073 incl. bias),
# y_* are length-N label vectors (see the printed output below).
X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data()
print('Train data shape:', X_train.shape)
print('Train labels shape:', y_train.shape)
print('Validation data shape:', X_val.shape)
print('Validation labels shape:', y_val.shape)
print('Test data shape:', X_test.shape)
print('Test labels shape:', y_test.shape)

Train data shape: (3073, 49000)
Train labels shape: (49000,)
Validation data shape: (3073, 1000)
Validation labels shape: (1000,)
Test data shape: (3073, 1000)
Test labels shape: (1000,)

In :
class LinearClassifier:
    """Generic linear classifier trained with stochastic gradient descent.

    The weight matrix self.W has shape C x D (C classes, D features).
    Subclasses must override loss() to supply a concrete loss/gradient.
    """

    def __init__(self):
        # Lazily initialized on the first call to train()
        self.W = None

    def train(
        self, X, y,
        learning_rate=1e-3, reg=1e-5, num_iters=100, batch_size=200,
        verbose=False, seed=None
    ):
        """Train this linear classifier using stochastic gradient descent.

        Inputs:
        - X: D x N array of training data. Each training point is a
          D-dimensional column.
        - y: 1-dimensional array of length N with labels 0...K-1 for K classes.
        - learning_rate: (float) learning rate for optimization.
        - reg: (float) regularization strength.
        - num_iters: (integer) number of steps to take when optimizing.
        - batch_size: (integer) number of training examples to use
          at each step.
        - verbose: (boolean) If true, print progress during optimization.
        - seed: (int or None) seed for the batch-sampling RandomState, for
          reproducible mini-batch sequences.

        Outputs:
        A list containing the value of the loss function at each training
        iteration.
        """
        dim, num_train = X.shape
        # assume y takes values 0...K-1 where K is number of classes
        num_classes = np.max(y) + 1
        if self.W is None:
            # lazily initialize W with small random values
            self.W = np.random.randn(num_classes, dim) * 0.001

        batch_rs = np.random.RandomState(seed)
        # Run stochastic gradient descent to optimize W
        loss_history = []
        for it in range(num_iters):
            # Sample a mini-batch (with replacement, which is faster and
            # statistically fine for SGD)
            batch_ix = batch_rs.choice(
                np.arange(num_train),
                size=batch_size, replace=True
            )
            X_batch = X[:, batch_ix]
            y_batch = y[batch_ix]

            # evaluate loss and gradient, internally use self.W
            loss, grad = self.loss(X_batch, y_batch, reg)
            loss_history.append(loss)

            # perform parameter update
            # (bug fix: the SGD step itself was missing, so W never changed)
            self.W -= learning_rate * grad

            if verbose and it % 100 == 0:
                print('iteration %d / %d: loss %f' % (it, num_iters, loss))

        return loss_history

    def predict(self, X):
        """
        Use the trained weights of this linear classifier to predict labels for
        data points.

        Inputs:
        - X: D x N array of training data. Each column is a D-dimensional point.

        Returns:
        - y_pred: Predicted labels for the data in X. y_pred is a 1-dimensional
          array of length N, and each element is an integer giving the predicted
          class.
        """
        score_pred = self.W.dot(X)  # shape: C x N
        # The prediction for each column is the class with the highest score
        y_pred = score_pred.argmax(axis=0)
        return y_pred

    def loss(self, X_batch, y_batch, reg):
        """
        Compute the loss function and its derivative.
        Subclasses will override this.

        Inputs:
        - X_batch: D x N array of data; each column is a data point.
        - y_batch: 1-dimensional array of length N with labels 0...K-1, for K classes.
        - reg: (float) regularization strength.

        Returns: A tuple containing:
        - loss as a single float
        - gradient with respect to self.W; an array of the same shape as W
        """
        raise NotImplementedError


## Softmax Classifier¶

In :
def softmax_loss_naive(W, X, y, reg):
    """
    Softmax loss function, naive implementation (with loops)

    Inputs:
    - W: C x D array of weights
    - X: D x N array of data. Data are D-dimensional columns
    - y: 1-dimensional array of length N with labels 0...K-1, for K classes
    - reg: (float) regularization strength

    Returns:
    a tuple of:
    - loss as single float
    - gradient with respect to weights W, an array of same size as W
    """
    # Initialize the loss and gradient to zero.
    loss = 0.0
    dW = np.zeros_like(W)
    # bug fix: the number of samples is the second axis of X (D x N),
    # not the whole shape tuple
    num_train = X.shape[1]
    f = W.dot(X)  # class scores, shape: C x N
    # per-sample probability of the correct class
    # (np.float was removed from NumPy; the default dtype is float64 anyway)
    p = np.zeros(num_train)

    for i in range(num_train):
        f_i = f[:, i].copy()  # scores for sample i, shape (C,)
        f_i -= np.max(f_i)    # improve numerical stability before exp
        f_i = np.exp(f_i)
        x_i = X[:, i]
        all_class_p_i = f_i / np.sum(f_i)  # softmax probabilities, shape (C,)
        p[i] = all_class_p_i[y[i]]

        # all_class_p_i is not used later, so mutate it in place (no copy)
        dw_x_weight_i = all_class_p_i
        dw_x_weight_i[y[i]] -= 1
        dW -= dw_x_weight_i[:, np.newaxis] * x_i[np.newaxis, :]

    # cross-entropy averaged over the batch, plus L2 regularization
    loss += np.mean(-np.log(p))
    loss += 0.5 * reg * np.sum(W * W)

    # ref: http://ufldl.stanford.edu/wiki/index.php/Softmax_Regression
    dW /= -num_train
    dW += reg * W
    return loss, dW


### Sanity Check¶

With a randomly initialized weight matrix, the loss should be close to $-\log(\frac{1}{10}) = -\log(0.1) \approx 2.303$

In :
%%time
# Generate a random softmax weight matrix and use it to compute the loss.
W = np.random.randn(10, 3073) * 0.0001
loss, grad = softmax_loss_naive(W, X_train, y_train, 0.0)

# As a rough sanity check, our loss should be something close to -log(0.1).
print('loss: %f' % loss)
print('sanity check: %f' % (-np.log(0.1)))

loss: 2.361666
sanity check: 2.302585
CPU times: user 6.36 s, sys: 91.5 ms, total: 6.45 s
Wall time: 4.44 s

In :
%%time
# Complete the implementation of softmax_loss_naive and implement a (naive)
# version of the gradient that uses nested loops.
loss, grad = softmax_loss_naive(W, X_train, y_train, 0.0)

# As we did for the SVM, use numeric gradient checking as a debugging tool.
f = lambda w: softmax_loss_naive(w, X_train, y_train, 0.0)

[Class: 0 Dim: 2221] numerical: -1.882017 analytic: -1.882017, relative error: 2.026398e-08
[Class: 6 Dim: 2581] numerical: -0.205202 analytic: -0.205202, relative error: 3.305388e-08
[Class: 2 Dim: 2379] numerical: -0.884728 analytic: -0.884729, relative error: 2.291555e-08
[Class: 9 Dim: 2941] numerical: 0.779991 analytic: 0.779991, relative error: 2.284831e-08
[Class: 0 Dim: 710] numerical: -3.913095 analytic: -3.913095, relative error: 1.904841e-08
CPU times: user 1min 7s, sys: 1.06 s, total: 1min 8s
Wall time: 48.5 s


### Vectorized loss function¶

In :
def softmax_loss_vectorized(W, X, y, reg):
    """
    Softmax loss function, vectorized version.

    Inputs and outputs are the same as softmax_loss_naive.
    """
    # Initialize the loss and gradient to zero.
    loss = 0.0
    dW = np.zeros_like(W)
    # bug fix: the number of samples is the second axis of X (D x N),
    # not the whole shape tuple
    num_train = X.shape[1]
    _train_ix = np.arange(num_train)  # for sample coord 0...N-1

    f = W.dot(X)  # class scores, shape: C x N
    f -= np.max(f, axis=0)  # improve numerical stability before exp
    f = np.exp(f)
    p = f / np.sum(f, axis=0)  # softmax probabilities, shape: C x N

    # loss function: cross-entropy averaged over the batch + L2 regularization
    loss += np.mean(-np.log(p[y, _train_ix]))
    loss += 0.5 * reg * np.sum(W * W)

    # ref: http://ufldl.stanford.edu/wiki/index.php/Softmax_Regression
    dW_x_weight = p  # p is not used later, mutate in place (no copy)
    dW_x_weight[y, _train_ix] -= 1
    # CxD -= CxN dot NxD
    dW -= dW_x_weight.dot(X.T)
    dW /= -num_train
    dW += reg * W

    return loss, dW

In :
# Now that we have a naive implementation of the softmax loss function and its gradient,
# implement a vectorized version in softmax_loss_vectorized.
# The two versions should compute the same results, but the vectorized version should be
# much faster.
tic = time.time()
loss_naive, grad_naive = softmax_loss_naive(W, X_train, y_train, 0.00001)
toc = time.time()
print('naive loss: %e computed in %fs' % (loss_naive, toc - tic))

tic = time.time()
loss_vectorized, grad_vectorized = softmax_loss_vectorized(W, X_train, y_train, 0.00001)
toc = time.time()
print('vectorized loss: %e computed in %fs' % (loss_vectorized, toc - tic))

# The losses should match but your vectorized implementation should be much faster.
# The loss is a single number, so it is easy to compare the values computed
# by the two implementations. The gradient on the other hand is a matrix, so
# we use the Frobenius norm to compare them.
print('Loss difference: %f' % np.abs(loss_naive - loss_vectorized))

naive loss: 2.361666e+00 computed in 4.406073s
vectorized loss: 2.361666e+00 computed in 0.124524s
Loss difference: 0.000000


We now have vectorized and efficient expressions for the loss, the gradient and our gradient matches the numerical gradient. We are therefore ready to do SGD to minimize the loss.

In :
class Softmax(LinearClassifier):
    """Linear classifier using the Softmax + cross-entropy loss."""

    def loss(self, X_batch, y_batch, reg):
        # Delegate to the vectorized softmax loss with this model's weights.
        return softmax_loss_vectorized(self.W, X_batch, y_batch, reg)

In :
# Now implement SGD in LinearSVM.train() function and run it with the code below
softmax = Softmax()
tic = time.time()
loss_hist = softmax.train(
X_train, y_train,
learning_rate = 5e-8, reg=1e3,
# better params:
# learning_rate=5e-7, reg=5e4,
num_iters=1500, seed=9527, verbose=True
)
toc = time.time()
print('That took %fs' % (toc - tic))

iteration 0 / 1500: loss 771.351533
iteration 100 / 1500: loss 6.911816
iteration 200 / 1500: loss 2.059462
iteration 300 / 1500: loss 2.065780
iteration 400 / 1500: loss 2.084819
iteration 500 / 1500: loss 2.094321
iteration 600 / 1500: loss 2.060693
iteration 700 / 1500: loss 2.053175
iteration 800 / 1500: loss 2.047805
iteration 900 / 1500: loss 2.082586
iteration 1000 / 1500: loss 2.086963
iteration 1100 / 1500: loss 2.060702
iteration 1200 / 1500: loss 2.057573
iteration 1300 / 1500: loss 2.116810
iteration 1400 / 1500: loss 2.082615
That took 4.024618s

In :
# A useful debugging strategy is to plot the loss as a function of
# iteration number (bug fix: the notebook cell marker "In :" was fused onto
# the plt.show() line, making it invalid Python):
plt.plot(loss_hist)
plt.xlabel('Iteration number')
plt.ylabel('Loss value')
plt.show()
# Report training / validation accuracy of the trained classifier
y_train_pred = softmax.predict(X_train)
print('training accuracy: {:.3%}'.format(np.mean(y_train == y_train_pred)))
y_val_pred = softmax.predict(X_val)
print('validation accuracy: {:.3%}'.format(np.mean(y_val == y_val_pred)))

training accuracy: 32.312%
validation accuracy: 34.700%

In :
# Evaluate the trained softmax classifier on the test set
# (accuracy = fraction of test samples whose predicted label matches y_test)
y_test_pred = softmax.predict(X_test)
test_accuracy = np.mean(y_test == y_test_pred)
print('softmax on raw pixels final test set accuracy: %f' % (test_accuracy, ))

softmax on raw pixels final test set accuracy: 0.341000

In :
# Visualize the learned weights for each class
# (bug fix: the notebook cell marker "In [ ]:" was fused onto the final
# plt.show() line, making it invalid Python)
w = softmax.W[:, :-1]  # strip out the bias column (last feature dimension)
w = w.reshape(10, 32, 32, 3)

w_min, w_max = np.min(w), np.max(w)

classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
for i in range(10):
    plt.subplot(2, 5, i + 1)

    # Rescale the weights to be between 0 and 255 for display as an image
    wimg = 255.0 * (w[i].squeeze() - w_min) / (w_max - w_min)
    plt.imshow(wimg.astype('uint8'))
    plt.axis('off')
    plt.title(classes[i])

plt.show()