%matplotlib inline
from __future__ import print_function
import numpy as np
from sklearn.datasets import load_digits

digits = load_digits()

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (8, 5)
plt.gray()
plt.figure(1, figsize=(3, 3))
plt.imshow(digits.images[0], cmap=plt.cm.gray_r, interpolation='nearest')

data = np.asarray(digits.data, dtype='float32')
target = np.asarray(digits.target, dtype='int32')
x = data
y = target

from sklearn import preprocessing
x = preprocessing.scale(x)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=42)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
assert len(set(y_train)) == len(set(y_test))  # should hold with random_state=42 ;-)

def softmax(W, x):
    """
    P(y=j | x) = exp(x . W[:, j]) / sum_k exp(x . W[:, k])
    W: (n_features, n_outputs)
    x: (n_features,)
    """
    tmp = np.exp(np.dot(x, W))
    return tmp / np.sum(tmp)

from scipy.optimize import check_grad
from scipy.optimize import fmin_l_bfgs_b

class LogisticRegression:
    def __init__(self):
        self.name = "Logistic Regression"

    def loss(self, W):
        """ Negative log-likelihood as described above """
        l = 0.
        tmp_W = W.reshape((self.x.shape[1], self.n_classes))
        for i, xi in enumerate(self.x):
            p_i = softmax(tmp_W, xi)
            l += np.log(p_i[self.y[i]])
        return - l / self.x.shape[0]

    def loss_derivative(self, W):
        """ flattened (1-D) gradient vector of the negative log-likelihood """
        d = np.zeros((self.x.shape[1], self.n_classes))  # that's the Jacobian
        tmp_W = W.reshape((self.x.shape[1], self.n_classes))
        for i, xi in enumerate(self.x):
            p_i = softmax(tmp_W, xi)
            for j, v in enumerate(p_i):
                if j == self.y[i]:  # because we don't have a "one-hot" encoding
                    d[:, j] += (1 - p_i[j]) * xi
                else:
                    d[:, j] += - p_i[j] * xi
        d = d.reshape(W.shape)  # and we flatten it
        return - d / self.x.shape[0]

    def train(self, x_train, y_train, debug=False):
        self.x = np.hstack((x_train, np.ones((x_train.shape[0], 1))))  # append a constant 1 for the bias
        self.y = y_train
        self.n_classes = len(set(y_train))
        self.W = np.ones((x_train.shape[1] + 1, self.n_classes))
        print("W shape:", self.W.shape)
        print("x shape:", self.x.shape)
        print("loss before training:", self.loss(self.W))
        print("prediction on x[0] before training:", softmax(self.W, self.x[0]))
        if debug:
            tmp = self.loss_derivative(self.W.reshape((self.W.shape[0] * self.W.shape[1],)))
            print("loss derivative shape:", tmp.shape)
            print("loss derivative value:", tmp)
            print("checking the gradient with finite differences, should be low (something E-6):",
                  check_grad(self.loss, self.loss_derivative,
                             self.W.reshape((self.W.shape[0] * self.W.shape[1],))))
            # that's nothing more than checking that f'(x) ~= (f(x+h)-f(x))/h with h->0
        tmp = fmin_l_bfgs_b(self.loss,
                            self.W.reshape((self.W.shape[0] * self.W.shape[1],)),
                            fprime=self.loss_derivative)
        self.W = tmp[0].reshape((self.x.shape[1], self.n_classes))
        print("loss after training:", self.loss(self.W))
        print("prediction on x[0] after training:", softmax(self.W, self.x[0]))
        print("true x[0] label:", self.y[0])

    def score(self, x_test, y_test):
        x = np.hstack((x_test, np.ones((x_test.shape[0], 1))))
        y = y_test
        c = 0
        for i, xi in enumerate(x):
            a = np.argmax(softmax(self.W, xi))
            if a == y[i]:
                c += 1
        return c * 1. / y.shape[0]
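# Aside (not in the original notebook): the softmax above exponentiates the raw
# scores directly, which can overflow for large activations.  A numerically
# stable variant subtracts the maximum score first; the shift cancels out in
# the ratio, so the result is unchanged.  A minimal sketch:
def softmax_stable(W, x):
    scores = np.dot(x, W)
    scores = scores - np.max(scores)  # largest score becomes 0, so exp() cannot overflow
    tmp = np.exp(scores)
    return tmp / np.sum(tmp)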
lr = LogisticRegression()
lr.train(x_train, y_train)

plt.imshow(lr.W.transpose(), interpolation='none')
plt.show()
print("weights for 0:")
plt.imshow(lr.W.transpose()[0][:64].reshape((8, 8)), interpolation='none')
plt.show()
print("weights for 5:")
plt.imshow(lr.W.transpose()[5][:64].reshape((8, 8)), interpolation='none')
plt.show()
print("weights for 8 minus weights for 3:")
plt.imshow(lr.W.transpose()[8][:64].reshape((8, 8)) - lr.W.transpose()[3][:64].reshape((8, 8)),
           interpolation='none')
plt.show()

print("training accuracy:", lr.score(x_train, y_train))
print("test accuracy:", lr.score(x_test, y_test))
print("We should overfit, i.e. test score << train score; you can add an L2 regularization")

from sklearn import svm

clf = svm.SVC(kernel='linear')
clf.fit(x_train, y_train)
print("Linear SVM")
print("training accuracy:", clf.score(x_train, y_train))
print("test accuracy:", clf.score(x_test, y_test))

clf = svm.SVC(kernel='rbf')
clf.fit(x_train, y_train)
print("with an RBF kernel")
print("training accuracy:", clf.score(x_train, y_train))
print("test accuracy:", clf.score(x_test, y_test))

def init(lr, x_train, y_train):
    lr.x = np.hstack((x_train, np.ones((x_train.shape[0], 1))))
    lr.y = y_train
    lr.n_classes = len(set(y_train))
    lr.W = np.zeros((x_train.shape[1] + 1, lr.n_classes))

def one_epoch_of_SGD(lr, learning_rate):
    """ an "epoch" is a full pass/iteration on the dataset """
    tmp_W = lr.W[:]  # NB: this is a view on lr.W (not a copy), so the updates happen in place
    for i, xi in enumerate(lr.x):
        p_i = softmax(tmp_W, xi)
        for j, v in enumerate(p_i):
            if j == lr.y[i]:
                tmp_W[:, j] += learning_rate * (xi * (1 - v))
            else:
                tmp_W[:, j] += learning_rate * (- xi * v)
    return tmp_W

class LogisticRegression2:
    def __init__(self):
        self.name = "Logistic Regression"

    def loss(self):
        """ Negative log-likelihood as described above """
        # redefined so that we don't have to do all the reshaping,
        # as we don't use scipy's L-BFGS anymore
        l = 0.
        for i, xi in enumerate(self.x):
            p_i = softmax(self.W, xi)
            l += np.log(p_i[self.y[i]])
        return - l / self.x.shape[0]

    def score(self, x_test, y_test):
        x = np.hstack((x_test, np.ones((x_test.shape[0], 1))))
        y = y_test
        c = 0
        for i, xi in enumerate(x):
            a = np.argmax(softmax(self.W, xi))
            if a == y[i]:
                c += 1
        return c * 1. / y.shape[0]
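# Aside (not in the original notebook): a mini-batch variant of one_epoch_of_SGD.
# Averaging the gradient over a small batch (32 here, an arbitrary choice) gives a
# less noisy update than the purely per-sample rule above, at the same overall cost
# per epoch.
def one_epoch_of_minibatch_SGD(lr, learning_rate, batch_size=32):
    order = np.random.permutation(lr.x.shape[0])
    for start in range(0, lr.x.shape[0], batch_size):
        batch = order[start:start + batch_size]
        grad = np.zeros_like(lr.W)
        for i in batch:
            p_i = softmax(lr.W, lr.x[i])
            error = -p_i
            error[lr.y[i]] += 1.               # one-hot target minus prediction
            grad += np.outer(lr.x[i], error)
        lr.W += learning_rate * grad / len(batch)  # ascent on the log-likelihood, like above
    return lr.W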
def stupid_train_with_SGD(lr, x_train, y_train):
    init(lr, x_train, y_train)
    print("loss before training:", lr.loss())
    lr.W = one_epoch_of_SGD(lr, learning_rate=0.01)
    old_loss = 1.E31
    new_loss = lr.loss()
    losses = [new_loss]
    print("loss after one epoch of training:", new_loss)
    lr.W = one_epoch_of_SGD(lr, learning_rate=0.01)
    new_loss = lr.loss()
    losses.append(new_loss)
    print("loss after two epochs of training:", new_loss)
    while old_loss - new_loss > 1.E-5:  # optimize until the loss decreases by less than 1.E-5 ==> STUPIDLY BAD
        old_loss = new_loss
        lr.W = one_epoch_of_SGD(lr, learning_rate=0.01)
        new_loss = lr.loss()
        losses.append(new_loss)
    print("best loss:", new_loss)
    plt.plot(np.arange(len(losses)), np.log(losses), label="stupid", linewidth=2, linestyle="--")

def slightly_less_stupid_train_with_SGD(lr, x_train, y_train):
    init(lr, x_train, y_train)
    print("loss before training:", lr.loss())
    old_loss = 1.E31
    losses = []
    new_loss = 1.E30
    while abs(old_loss - new_loss) > 1.E-5:  # optimize until the loss changes by less than 1.E-5 ==> BAD
        old_loss = new_loss
        lr.W = one_epoch_of_SGD(lr, learning_rate=0.01)
        new_loss = lr.loss()
        losses.append(new_loss)
    print("best loss:", new_loss)
    plt.plot(np.arange(len(losses)), np.log(losses), label="slightly less stupid", linewidth=2, linestyle=":")

def early_stopping_train_with_SGD(lr, x_train, y_train):
    """ using the training set as validation set,
        DO NOT DO THIS IN PRACTICE! USE A SEPARATE VALIDATION SET """
    init(lr, x_train, y_train)
    print("loss before training:", lr.loss())
    old_score = -1.
    new_score = lr.score(x_train, y_train)
    losses = []
    patience = 3  # number of epochs of patience
    while new_score > old_score or patience:
        if new_score > old_score:
            patience = 3
        else:
            patience -= 1
        old_score = new_score
        lr.W = one_epoch_of_SGD(lr, learning_rate=0.01)
        new_score = lr.score(x_train, y_train)
        losses.append(lr.loss())
    print("best loss:", losses[-1])
    plt.plot(np.arange(len(losses)), np.log(losses),
             label="crude early stopping\n3 epochs of patience", linewidth=1.5)

lr = LogisticRegression2()

print("stupid stopping condition [will stop at the first bump in the loss function]")
stupid_train_with_SGD(lr, x_train, y_train)
print("training accuracy:", lr.score(x_train, y_train))
print("test accuracy:", lr.score(x_test, y_test))
print()

print("slightly less stupid stopping condition [will stop when the loss function is flat]")
slightly_less_stupid_train_with_SGD(lr, x_train, y_train)
print("training accuracy:", lr.score(x_train, y_train))
print("test accuracy:", lr.score(x_test, y_test))
print()

print("(very) early stopping (crude, with 3 epochs of patience)")
print("[optimizes for the right thing, but do it on a _separate_ validation set!]")
early_stopping_train_with_SGD(lr, x_train, y_train)  # using the training set as validation set,
                                                     # DO NOT DO THIS IN PRACTICE!
                                                     # USE A SEPARATE VALIDATION SET
print("training accuracy:", lr.score(x_train, y_train))
print("test accuracy:", lr.score(x_test, y_test))
plt.ylabel("log of the loss function")
plt.xlabel("#epochs")
plt.legend()
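# Making the warning above concrete (this cell is not in the original notebook):
# early stopping should monitor a held-out validation set, not the training set.
# A minimal sketch of the split; the x_tr / x_val names are ours:
from sklearn.model_selection import train_test_split
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train,
                                            test_size=0.15, random_state=42)
# an early-stopping loop would then run one_epoch_of_SGD on (x_tr, y_tr) and
# track lr.score(x_val, y_val), keeping (x_test, y_test) untouched until the
# very end.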
from random import randint

def train_with_pure_SGD(lr, x_train, y_train, init_learning_rate=0.1):
    init(lr, x_train, y_train)
    old_loss = 1.E31
    new_loss = lr.loss()
    print("loss before training:", new_loss)

    def get_learning_rate(t):
        return init_learning_rate / (1 + init_learning_rate * np.sqrt(t))
        #return init_learning_rate / (1 + init_learning_rate * t)

    learning_rate = get_learning_rate(0)
    losses = [new_loss]
    gradients = []
    # here we do purely stochastic gradient descent
    t = 0
    while t < 1000:  # here we see a fixed number of samples
        t += 1
        old_loss = new_loss
        # pick a random sample
        i = randint(0, lr.x.shape[0] - 1)
        xi = lr.x[i]
        p_i = softmax(lr.W, xi)
        error = np.zeros(p_i.shape)
        error[lr.y[i]] = 1
        error = error - p_i
        grad = np.outer(xi, error)
        lr.W += get_learning_rate(t) * grad
        new_loss = lr.loss()
        losses.append(new_loss)
        gradients.append(grad)

    plt.plot(np.arange(len(losses)), np.log(losses))
    plt.ylabel("log of the loss function")
    plt.xlabel("#samples seen")
    plt.show()

    from sklearn.decomposition import PCA
    pca = PCA(n_components=3)
    gradients = np.array(gradients)
    #gradients = np.log(1 + np.abs(gradients.reshape(gradients.shape[0], gradients.shape[1]*gradients.shape[2])))
    gradients = gradients.reshape(gradients.shape[0], gradients.shape[1] * gradients.shape[2])
    grad = pca.fit_transform(gradients).transpose()
    #grad = np.log(1 + np.abs(grad))
    plt.plot(np.arange(grad.shape[1]), grad[0], label="#1 direction of the gradient")
    plt.plot(np.arange(grad.shape[1]), grad[1], label="#2 direction of the gradient")
    plt.plot(np.arange(grad.shape[1]), grad[2], label="#3 direction of the gradient", linestyle=":")
    m = np.mean(np.abs(gradients), axis=-1)
    plt.plot(np.arange(grad.shape[1] - 50),
             [10 * np.sum(m[i:i + 50]) for i in range(grad.shape[1] - 50)],
             label=r"smoothed $\propto$ mean(abs(gradients))", linestyle="--")
    #plt.ylabel("gradient log-magnitude")
    plt.ylabel("gradient values")
    plt.xlabel("#samples seen")
    plt.legend(loc=4)

train_with_pure_SGD(lr, x_train, y_train)
print("training accuracy:", lr.score(x_train, y_train))
print("test accuracy:", lr.score(x_test, y_test))

a = np.arange(-3, 3, 0.01)
plt.plot(a, a, label="linear", linewidth=2, linestyle='--')
plt.plot(a, 1. / (1 + np.exp(-4 * a)), label="sigmoid")  # multiplied by 4 so that it's less flat
plt.plot(a, np.tanh(4 * a), label="tanh")  # multiplied by 4 so that it's less flat
plt.plot(a, (a + abs(a)) / 2, label="ReLU", linewidth=1.5)
plt.legend(loc=4)

a = np.random.standard_normal((20, 20))

def non_linearities_imshows(a):
    plt.imshow(a, label="linear", cmap=plt.cm.gray_r, interpolation="none")
    print("linear")
    plt.colorbar()
    plt.show()
    plt.imshow(1. / (1 + np.exp(-4 * a)), label="sigmoid", cmap=plt.cm.gray_r, interpolation="none")  # /!\ mult by 4
    print("sigmoid")
    plt.colorbar()
    plt.show()
    plt.imshow(np.tanh(4 * a), label="tanh", cmap=plt.cm.gray_r, interpolation="none")  # /!\ mult by 4
    print("tanh")
    plt.colorbar()
    plt.show()
    plt.imshow((a + abs(a)) / 2, label="ReLU", cmap=plt.cm.gray_r, interpolation="none")
    print("Rectified Linear Unit")
    plt.colorbar()
    plt.show()

non_linearities_imshows(a)
#non_linearities_imshows(digits.images[0])
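# Aside (not in the original notebook): the derivatives of the nonlinearities
# plotted above.  sigmoid' is at most 0.25 and tanh' at most 1, and both vanish
# away from 0, while ReLU' is exactly 1 wherever the unit is active -- one
# reason (among others) the ReLU comment further down suggests a smaller
# learning rate.
a_d = np.arange(-3, 3, 0.01)
sig = 1. / (1 + np.exp(-a_d))
plt.plot(a_d, sig * (1 - sig), label="sigmoid'")
plt.plot(a_d, 1 - np.tanh(a_d) ** 2, label="tanh'")
plt.plot(a_d, (a_d > 0).astype(float), label="ReLU'")
plt.ylabel("derivative of the activation")
plt.xlabel("pre-activation")
plt.legend()
plt.show()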
def relu_u(W, x, clip=6):
    """ ReLU unit function: max(0, W.x)
    timing note on two equivalent implementations:
        In [1]: a = np.random.standard_normal((1000,1000))
        In [2]: %timeit (a+abs(a))/2
        100 loops, best of 3: 4.27 ms per loop
        In [3]: %timeit a[a<0]=0
        1000 loops, best of 3: 736 µs per loop
    """
    #tmp = np.dot(x, W)
    #tmp[tmp < 0] = 0
    #return np.append(tmp, 1)  # for the biases of the next layer
    return np.append(np.clip(np.dot(x, W), 0, clip), 1)

def relu_d(W, x, clip=4):  # NB: clip should match the clip used in relu_u
    #return np.asarray(np.dot(x, W) >= 0, dtype='float')
    tmp = np.dot(x, W)
    return np.asarray((tmp >= 0) & (tmp < clip), dtype='float')

def tanh_u(W, x):
    tmp = np.dot(x, W)
    return np.append(np.tanh(tmp), 1)  # for the biases of the next layer

def tanh_d(W, x):
    tmp = tanh_u(W, x)[:-1]
    return 1 - tmp**2

class NeuralNet:
    def __init__(self):
        self.name = "Neural Net"
        #self.act_u = relu_u  # try ReLUs, but with a small learning rate
        #self.act_d = relu_d  # (think about why)
        self.act_u = tanh_u
        self.act_d = tanh_d

    def loss(self):
        """ Negative log-likelihood of logistic regression using the hidden layer """
        l = 0.
        for i, xi in enumerate(self.x):
            p_i = self.pred(xi)
            l += np.log(p_i[self.y[i]])
        return - l / self.x.shape[0]

    def one_epoch_of_backprop_SGD(self, learning_rate=0.01):
        """ applying the backprop equations explained above """
        for i, xi in enumerate(self.x):
            act_xi = self.act_u(self.W_hid, xi)
            p_i = softmax(self.W_log, act_xi)
            err_o = np.zeros(p_i.shape)
            err_o[self.y[i]] = 1
            err_o = err_o - p_i
            backprop_o = np.dot(err_o, self.W_log[:-1, :].T)
            act_d = self.act_d(self.W_hid, xi)
            err_h = act_d * backprop_o
            self.W_hid += learning_rate * np.outer(xi, err_h)
            self.W_log += learning_rate * np.outer(act_xi, err_o)

    def pred(self, xi):
        return softmax(self.W_log, self.act_u(self.W_hid, xi))

    def train(self, x_train, y_train, n_hidden=42):
        # try different numbers of hidden units, e.g. 4, 64, 128, 200
        self.x = np.hstack((x_train, np.ones((x_train.shape[0], 1))))
        self.y = y_train
        self.n_classes = len(set(y_train))
        self.n_hidden = n_hidden
        self.W_hid = np.random.random((x_train.shape[1] + 1, self.n_hidden)) - 0.5
        self.W_hid *= 2 * 6. / (x_train.shape[1] + self.n_hidden)
        self.W_log = np.zeros((self.n_hidden + 1, self.n_classes))  # +1 for the biases
        print("x shape:", self.x.shape)
        print("W_hid shape:", self.W_hid.shape)
        print("W_log shape:", self.W_log.shape)
        tmp_loss = self.loss()
        print("training loss before training:", tmp_loss)
        print("prediction on x[0] before training:", self.pred(self.x[0]))
        # training with SGD
        losses = [tmp_loss]
        l_r = 0.01
        for _ in range(10):  # 10 epochs
            self.one_epoch_of_backprop_SGD(learning_rate=l_r)
            losses.append(self.loss())
            # play with the learning rate schemes
        print("best loss:", losses[-1])
        print("training loss after training:", self.loss())
        print("prediction on x[0] after training:", self.pred(self.x[0]))
        print("true x[0] label:", self.y[0])
        plt.plot(np.arange(len(losses)), np.log(losses), linewidth=1)
        plt.ylabel("log of the loss function")
        plt.xlabel("#epochs")
        plt.show()
        plt.imshow(self.W_hid, interpolation='none')
        plt.colorbar()
        plt.show()
        #plt.imshow(self.W_log, interpolation='none')
        #plt.colorbar()
        #plt.show()

    def score(self, x_test, y_test):
        x = np.hstack((x_test, np.ones((x_test.shape[0], 1))))
        y = y_test
        c = 0
        for i, xi in enumerate(x):
            a = np.argmax(self.pred(xi))
            if a == y[i]:
                c += 1
        return c * 1. / y.shape[0]

nn = NeuralNet()
nn.train(x_train, y_train)
print("Train set accuracy:", nn.score(x_train, y_train))
print("Test set accuracy:", nn.score(x_test, y_test))
#nn.train(x_train, y_train, n_hidden=10)
#print("Train set accuracy:", nn.score(x_train, y_train))
#print("Test set accuracy:", nn.score(x_test, y_test))
print("you can also regularize a neural net, just change the loss function appropriately")
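# Aside (not in the original notebook): a sanity check against scikit-learn's
# own one-hidden-layer network, assuming a sklearn version that ships
# MLPClassifier (>= 0.18).  Hyperparameters are chosen to roughly mirror the
# hand-written net above, not tuned.
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(42,), activation='tanh', solver='sgd',
                    learning_rate_init=0.01, max_iter=500, random_state=42)
mlp.fit(x_train, y_train)
print("sklearn MLP train set accuracy:", mlp.score(x_train, y_train))
print("sklearn MLP test set accuracy:", mlp.score(x_test, y_test))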
from theano import tensor as T

A = T.fmatrix()
B = T.fmatrix()
C = T.dot(A, B)
C

A = np.random.random((10, 10))
B = np.random.random((10, 10))
C = T.dot(A, B)
C.eval()

from theano import shared
A = shared(A)
B = shared(B)
A
C = T.dot(A, B)
C.eval()

# solution for the first bonus question (L2 regularization)
alpha = 0.001

def loss_l2(self, W):
    tmp_W = W.reshape((self.x.shape[1], self.n_classes))
    return self.loss(W) + alpha * np.sum([np.linalg.norm(tmp)**2 for tmp in tmp_W])

def loss_derivative_l2(self, W):
    return self.loss_derivative(W) + 2 * alpha * W

# in train:
# tmp = fmin_l_bfgs_b(self.loss_l2,
#                     self.W.reshape((self.W.shape[0]*self.W.shape[1],)),
#                     fprime=self.loss_derivative_l2)

# solution for the second bonus question (multiclass hinge loss + L2)
def loss_hinge_l2(self, W):
    l = 0.
    tmp_W = W.reshape((self.x.shape[1], self.n_classes))
    for i, xi in enumerate(self.x):
        good = np.dot(tmp_W[:, self.y[i]], xi)
        tmp_max = -1.E31
        for j in range(self.n_classes):
            if j == self.y[i]:
                continue
            delta = np.dot(tmp_W[:, j], xi) - good  # wrong-class score minus right-class score
            if delta > tmp_max:
                tmp_max = delta
        l += max(0, 1 + tmp_max)  # hinge on the hardest wrong class
    return l / self.x.shape[0] + alpha * np.sum([np.linalg.norm(tmp)**2 for tmp in tmp_W])

def loss_derivative_hinge_l2(self, W):
    d = np.zeros((self.x.shape[1], self.n_classes))  # that's the Jacobian
    tmp_W = W.reshape((self.x.shape[1], self.n_classes))
    for i, xi in enumerate(self.x):
        scores = np.dot(xi, tmp_W)
        good = scores[self.y[i]]
        scores[self.y[i]] = -np.inf        # only look at the wrong classes
        j_star = np.argmax(scores)         # hardest wrong class
        if 1 + scores[j_star] - good > 0:  # sample is inside the margin
            d[:, j_star] += xi
            d[:, self.y[i]] -= xi
    d = d.reshape(W.shape)  # and we flatten it
    return d / self.x.shape[0] + 2 * alpha * W

def score(self, x_test, y_test):
    x = np.hstack((x_test, np.ones((x_test.shape[0], 1))))
    y = y_test
    c = 0
    for i, xi in enumerate(x):
        #a = np.argmax(softmax(self.W, xi))
        a = np.argmax(np.dot(self.W.transpose(), xi))
        if a == y[i]:
            c += 1
    return c * 1. / y.shape[0]

# in train:
# tmp = fmin_l_bfgs_b(self.loss_hinge_l2,
#                     self.W.reshape((self.W.shape[0]*self.W.shape[1],)),
#                     fprime=self.loss_derivative_hinge_l2)
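# Picking up the Theano teaser above (this cell is not in the original notebook):
# a minimal sketch of letting Theano derive the softmax-regression gradient
# symbolically instead of coding loss_derivative by hand.  The names here
# (x_b, W_sym, ...) are ours; it assumes the classic theano.function /
# T.grad / T.nnet.softmax API.
import theano
import theano.tensor as T

x_b = np.hstack((x_train, np.ones((x_train.shape[0], 1))))  # bias column, as above
X_sym = T.dmatrix('X')
Y_sym = T.ivector('Y')
W_sym = theano.shared(np.zeros((x_b.shape[1], len(set(y_train)))), name='W')

P = T.nnet.softmax(T.dot(X_sym, W_sym))
nll = -T.mean(T.log(P)[T.arange(Y_sym.shape[0]), Y_sym])  # same negative log-likelihood as before
grad_W = T.grad(nll, W_sym)                               # symbolic gradient, no manual Jacobian

train_step = theano.function([X_sym, Y_sym], nll,
                             updates=[(W_sym, W_sym - 0.5 * grad_W)])
for epoch in range(20):  # a few full-batch gradient steps
    current_nll = train_step(np.asarray(x_b, dtype='float64'),
                             np.asarray(y_train, dtype='int32'))
print("Theano NLL after 20 full-batch steps:", float(current_nll))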