Adapted from the Stanford CS231n assignment 1; the original version can be found on the course website.
In this exercise we will implement a Softmax classifier, verify its analytic gradient numerically, and train it on CIFAR-10 with stochastic gradient descent.
import random
import time
import numpy as np
import matplotlib.pyplot as plt
from cs231n.data_utils import load_CIFAR10
from cs231n.gradient_check import grad_check_sparse
# Plotting setup: render figures inline in the notebook and choose
# notebook-friendly matplotlib defaults.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'  # no smoothing when zooming into images
plt.rcParams['image.cmap'] = 'gray'  # default to a grayscale colormap
Load the data and split it into training / validation / test sets. See notebook 00 for more details.
def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=1000):
    """
    Load the CIFAR-10 dataset from disk and preprocess it for a linear
    classifier: carve out train/validation/test splits, flatten each image
    to a row, subtract the training-set mean image, append a constant bias
    feature, and transpose so that every example becomes a column.
    """
    # Load the raw CIFAR-10 data.
    cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'
    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

    # Subsample: validation comes from the tail of the training range.
    val_idx = np.arange(num_training, num_training + num_validation)
    X_val, y_val = X_train[val_idx], y_train[val_idx]
    train_idx = np.arange(num_training)
    X_train, y_train = X_train[train_idx], y_train[train_idx]
    test_idx = np.arange(num_test)
    X_test, y_test = X_test[test_idx], y_test[test_idx]

    # Flatten each image into a single row vector.
    X_train = X_train.reshape(X_train.shape[0], -1)
    X_val = X_val.reshape(X_val.shape[0], -1)
    X_test = X_test.reshape(X_test.shape[0], -1)

    # Zero-center every split using the mean image of the TRAINING split only.
    mean_image = np.mean(X_train, axis=0)
    X_train -= mean_image
    X_val -= mean_image
    X_test -= mean_image

    # Append a bias dimension of ones and transform examples into columns.
    X_train = np.hstack([X_train, np.ones((X_train.shape[0], 1))]).T
    X_val = np.hstack([X_val, np.ones((X_val.shape[0], 1))]).T
    X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))]).T

    return X_train, y_train, X_val, y_val, X_test, y_test
# Invoke the above function to get our data and sanity-check the shapes:
# each data matrix is D x N with D = 3073 (3072 pixels + 1 bias feature),
# and each label array is 1-D of length N.
X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data()
print('Train data shape:', X_train.shape)
print('Train labels shape:', y_train.shape)
print('Validation data shape:', X_val.shape)
print('Validation labels shape:', y_val.shape)
print('Test data shape:', X_test.shape)
print('Test labels shape:', y_test.shape)
Train data shape: (3073, 49000) Train labels shape: (49000,) Validation data shape: (3073, 1000) Validation labels shape: (1000,) Test data shape: (3073, 1000) Test labels shape: (1000,)
class LinearClassifier:
    """Base class for linear classifiers trained with minibatch SGD.

    Weights live in ``self.W`` with shape C x D (one row per class).
    Subclasses provide the objective by overriding :meth:`loss`.
    """

    def __init__(self):
        # Weights are created lazily on the first call to train().
        self.W = None

    def train(
        self, X, y,
        learning_rate=1e-3, reg=1e-5, num_iters=100, batch_size=200,
        verbose=False, seed=None
    ):
        """Train this linear classifier using stochastic gradient descent.

        Inputs:
        - X: D x N array of training data. Each training point is a
          D-dimensional column.
        - y: 1-dimensional array of length N with labels 0...K-1 for K classes.
        - learning_rate: (float) step size of the SGD update.
        - reg: (float) regularization strength, forwarded to self.loss.
        - num_iters: (integer) number of optimization steps to take.
        - batch_size: (integer) number of examples sampled at each step.
        - verbose: (boolean) if True, print progress every 100 iterations.
        - seed: optional integer seeding the minibatch sampler so runs
          are reproducible.

        Outputs:
        A list containing the value of the loss function at each training
        iteration.
        """
        dim, num_train = X.shape
        # Labels are assumed to be 0...K-1, so the largest label fixes K.
        num_classes = np.max(y) + 1
        if self.W is None:
            # Lazy initialization: small random weights, one row per class.
            self.W = np.random.randn(num_classes, dim) * 0.001

        sampler = np.random.RandomState(seed)
        history = []
        for step in range(num_iters):
            # Draw a minibatch of column indices, sampling with replacement.
            picks = sampler.choice(
                np.arange(num_train),
                size=batch_size, replace=True
            )
            batch_X = X[:, picks]
            batch_y = y[picks]

            # Subclass-provided loss also returns the gradient w.r.t. self.W.
            cost, grad = self.loss(batch_X, batch_y, reg)
            history.append(cost)

            # Vanilla gradient-descent parameter update.
            self.W -= grad * learning_rate

            if verbose and step % 100 == 0:
                print('iteration %d / %d: loss %f' % (step, num_iters, cost))
        return history

    def predict(self, X):
        """
        Use the trained weights of this linear classifier to predict labels
        for data points.

        Inputs:
        - X: D x N array of data. Each column is a D-dimensional point.

        Returns:
        - y_pred: 1-dimensional integer array of length N holding the
          predicted class (argmax over class scores) for each column.
        """
        scores = self.W.dot(X)  # C x N class scores
        return scores.argmax(axis=0)

    def loss(self, X_batch, y_batch, reg):
        """
        Compute the loss function and its derivative.
        Subclasses will override this.

        Inputs:
        - X_batch: D x N array of data; each column is a data point.
        - y_batch: 1-dimensional array of length N with labels 0...K-1.
        - reg: (float) regularization strength.

        Returns: A tuple containing:
        - loss as a single float
        - gradient with respect to self.W; an array of the same shape as W
        """
        raise NotImplementedError
def softmax_loss_naive(W, X, y, reg):
    """
    Softmax loss function, naive implementation (with a loop over samples).

    Inputs:
    - W: C x D array of weights
    - X: D x N array of data. Data are D-dimensional columns
    - y: 1-dimensional array of length N with labels 0...K-1, for K classes
    - reg: (float) L2 regularization strength

    Returns:
    a tuple of:
    - loss as single float: mean cross-entropy plus 0.5 * reg * ||W||^2
    - gradient with respect to weights W, an array of same size as W
    """
    dW = np.zeros_like(W)
    num_train = X.shape[1]

    scores = W.dot(X)  # shape: C x N
    # Probability assigned to the correct class of each sample.
    # FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # use the explicit float64 dtype instead.
    correct_p = np.zeros(num_train, dtype=np.float64)
    for i in range(num_train):
        s_i = scores[:, i].copy()
        s_i -= np.max(s_i)  # shift scores to improve numerical stability
        exp_s = np.exp(s_i)
        probs = exp_s / np.sum(exp_s)  # softmax over classes for sample i
        correct_p[i] = probs[y[i]]

        # Gradient contribution of sample i: (p - onehot(y_i)) * x_i^T.
        # probs is not reused afterwards, so mutate it in place.
        probs[y[i]] -= 1
        dW += probs[:, np.newaxis] * X[:, i][np.newaxis, :]

    # Mean cross-entropy loss plus L2 regularization.
    loss = np.mean(-np.log(correct_p))
    loss += 0.5 * reg * np.sum(W * W)

    # Average the data gradient and add the regularization gradient.
    # ref: http://ufldl.stanford.edu/wiki/index.php/Softmax_Regression
    dW /= num_train
    dW += reg * W
    return loss, dW
With randomly initialized weights, the loss should be close to $-\log(\frac{1}{10}) = -\log(0.1) \approx 2.303$, the cross-entropy of a uniform prediction over 10 classes.
%%time
# Generate a random softmax weight matrix and use it to compute the loss.
# Shape is 10 classes x 3073 features (3072 pixels + bias); the tiny scale
# keeps the scores near zero so the softmax is close to uniform.
W = np.random.randn(10, 3073) * 0.0001
loss, grad = softmax_loss_naive(W, X_train, y_train, 0.0)
# As a rough sanity check, our loss should be something close to -log(0.1).
print('loss: %f' % loss)
print('sanity check: %f' % (-np.log(0.1)))
loss: 2.361666 sanity check: 2.302585 CPU times: user 6.36 s, sys: 91.5 ms, total: 6.45 s Wall time: 4.44 s
%%time
# Gradient check: compare the analytic gradient of softmax_loss_naive
# against a numerically evaluated gradient at a few random coordinates.
loss, grad = softmax_loss_naive(W, X_train, y_train, 0.0)
# As we did for the SVM, use numeric gradient checking as a debugging tool.
# The numeric gradient should be close to the analytic gradient
# (relative errors around 1e-8 or smaller).
f = lambda w: softmax_loss_naive(w, X_train, y_train, 0.0)[0]
grad_numerical = grad_check_sparse(f, W, grad, 5) # increase 5 here to check for more times
[Class: 0 Dim: 2221] numerical: -1.882017 analytic: -1.882017, relative error: 2.026398e-08 [Class: 6 Dim: 2581] numerical: -0.205202 analytic: -0.205202, relative error: 3.305388e-08 [Class: 2 Dim: 2379] numerical: -0.884728 analytic: -0.884729, relative error: 2.291555e-08 [Class: 9 Dim: 2941] numerical: 0.779991 analytic: 0.779991, relative error: 2.284831e-08 [Class: 0 Dim: 710] numerical: -3.913095 analytic: -3.913095, relative error: 1.904841e-08 CPU times: user 1min 7s, sys: 1.06 s, total: 1min 8s Wall time: 48.5 s
def softmax_loss_vectorized(W, X, y, reg):
    """
    Softmax loss function, fully vectorized version.
    Inputs and outputs are the same as softmax_loss_naive.
    """
    num_train = X.shape[1]
    cols = np.arange(num_train)  # column index of every sample, 0...N-1

    # Class scores, shifted per column for numerical stability.
    scores = W.dot(X)  # shape: C x N
    scores -= np.max(scores, axis=0)
    exp_scores = np.exp(scores)
    probs = exp_scores / np.sum(exp_scores, axis=0)  # softmax, C x N

    # Mean cross-entropy of the correct-class probabilities + L2 penalty.
    loss = np.mean(-np.log(probs[y, cols]))
    loss += 0.5 * reg * np.sum(W * W)

    # Gradient: (p - onehot) X^T averaged over the batch, plus reg term.
    # ref: http://ufldl.stanford.edu/wiki/index.php/Softmax_Regression
    probs[y, cols] -= 1  # probs is not reused afterwards, mutate in place
    dW = probs.dot(X.T)  # CxD = CxN dot NxD
    dW /= num_train
    dW += reg * W
    return loss, dW
# Now that we have a naive implementation of the softmax loss function and
# its gradient, compare it against the vectorized version: the two should
# compute the same results, but the vectorized version should be much faster.
tic = time.time()
loss_naive, grad_naive = softmax_loss_naive(W, X_train, y_train, 0.00001)
toc = time.time()
print('naive loss: %e computed in %fs' % (loss_naive, toc - tic))
tic = time.time()
loss_vectorized, grad_vectorized = softmax_loss_vectorized(W, X_train, y_train, 0.00001)
toc = time.time()
print('vectorized loss: %e computed in %fs' % (loss_vectorized, toc - tic))
# The loss is a single number, so it is easy to compare the values computed
# by the two implementations. The gradient on the other hand is a matrix, so
# we use the Frobenius norm of the difference to compare them.
grad_difference = np.linalg.norm(grad_naive - grad_vectorized, ord='fro')
print('Loss difference: %f' % np.abs(loss_naive - loss_vectorized))
print('Gradient difference: %f' % grad_difference)
naive loss: 2.361666e+00 computed in 4.406073s vectorized loss: 2.361666e+00 computed in 0.124524s Loss difference: 0.000000 Gradient difference: 0.000000
We now have vectorized and efficient expressions for the loss, the gradient and our gradient matches the numerical gradient. We are therefore ready to do SGD to minimize the loss.
class Softmax(LinearClassifier):
    """Linear classifier that uses the softmax + cross-entropy loss."""

    def loss(self, X_batch, y_batch, reg):
        # Delegate to the vectorized softmax loss, evaluated at self.W.
        return softmax_loss_vectorized(self.W, X_batch, y_batch, reg)
# Run SGD (implemented in LinearClassifier.train) on the Softmax classifier.
softmax = Softmax()
tic = time.time()
loss_hist = softmax.train(
X_train, y_train,
learning_rate = 5e-8, reg=1e3,
# better hyperparameters found by validation:
# learning_rate=5e-7, reg=5e4,
num_iters=1500, seed=9527, verbose=True
)
toc = time.time()
print('That took %fs' % (toc - tic))
iteration 0 / 1500: loss 771.351533 iteration 100 / 1500: loss 6.911816 iteration 200 / 1500: loss 2.059462 iteration 300 / 1500: loss 2.065780 iteration 400 / 1500: loss 2.084819 iteration 500 / 1500: loss 2.094321 iteration 600 / 1500: loss 2.060693 iteration 700 / 1500: loss 2.053175 iteration 800 / 1500: loss 2.047805 iteration 900 / 1500: loss 2.082586 iteration 1000 / 1500: loss 2.086963 iteration 1100 / 1500: loss 2.060702 iteration 1200 / 1500: loss 2.057573 iteration 1300 / 1500: loss 2.116810 iteration 1400 / 1500: loss 2.082615 That took 4.024618s
# A useful debugging strategy is to plot the loss as a function of
# iteration number: it should drop quickly and then flatten out.
plt.plot(loss_hist)
plt.xlabel('Iteration number')
plt.ylabel('Loss value')
plt.show()
# Report accuracy on the training and validation splits.
y_train_pred = softmax.predict(X_train)
print('training accuracy: {:.3%}'.format(np.mean(y_train == y_train_pred)))
y_val_pred = softmax.predict(X_val)
print('validation accuracy: {:.3%}'.format(np.mean(y_val == y_val_pred)))
training accuracy: 32.312% validation accuracy: 34.700%
# Evaluate the trained softmax classifier on the held-out test set.
y_test_pred = softmax.predict(X_test)
test_accuracy = np.mean(y_test == y_test_pred)
print('softmax on raw pixels final test set accuracy: %f' % (test_accuracy, ))
softmax on raw pixels final test set accuracy: 0.341000
# Visualize the learned weight vector of each class as a 32x32 RGB image.
templates = softmax.W[:, :-1]  # drop the bias column (last feature)
templates = templates.reshape(10, 32, 32, 3)
t_min, t_max = np.min(templates), np.max(templates)
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
for idx in range(10):
    plt.subplot(2, 5, idx + 1)
    # Rescale the weights into the displayable 0..255 range.
    img = 255.0 * (templates[idx].squeeze() - t_min) / (t_max - t_min)
    plt.imshow(img.astype('uint8'))
    plt.axis('off')
    plt.title(classes[idx])
plt.show()