%matplotlib inline
from __future__ import print_function
import numpy as np
from sklearn.datasets import load_digits

digits = load_digits()

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (8, 5)
plt.gray()
plt.figure(1, figsize=(3, 3))
plt.imshow(digits.images[0], cmap=plt.cm.gray_r, interpolation='nearest')

data = np.asarray(digits.data, dtype='float32')
target = np.asarray(digits.target, dtype='int32')
x = data
y = target

from sklearn import preprocessing
x = preprocessing.scale(x)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=42)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
assert len(set(y_train)) == len(set(y_test))  # should hold with random_state=42 ;-)

def softmax(W, x):
    """
    P(y=j | x) = exp(x . W[:, j]) / sum_k exp(x . W[:, k])
    W: (n_features, n_outputs)
    x: (n_features,)
    """
    tmp = np.exp(np.dot(x, W))
    return tmp / np.sum(tmp)

from scipy.optimize import check_grad
from scipy.optimize import fmin_l_bfgs_b

class LogisticRegression:
    def __init__(self):
        self.name = "Logistic Regression"

    def loss(self, W):
        """ Negative log-likelihood as described above """
        l = 0.
        tmp_W = W.reshape((self.x.shape[1], self.n_classes))
        for i, xi in enumerate(self.x):
            p_i = softmax(tmp_W, xi)
            l += np.log(p_i[self.y[i]])
        return - l / self.x.shape[0]

    def loss_derivative(self, W):
        """ flattened (1-D) gradient vector of the negative log-likelihood """
        d = np.zeros((self.x.shape[1], self.n_classes))  # that's the Jacobian
        tmp_W = W.reshape((self.x.shape[1], self.n_classes))
        for i, xi in enumerate(self.x):
            p_i = softmax(tmp_W, xi)
            for j, v in enumerate(p_i):
                if j == self.y[i]:  # because we don't have a "one-hot" encoding
                    d[:, j] += (1 - p_i[j]) * xi
                else:
                    d[:, j] += - p_i[j] * xi
        d = d.reshape(W.shape)  # and we flatten it
        return - d / self.x.shape[0]

    def train(self, x_train, y_train, debug=False):
        self.x = np.hstack((x_train, np.ones((x_train.shape[0], 1))))  # append a constant 1 for the bias
        self.y = y_train
        self.n_classes = len(set(y_train))
        self.W = np.ones((x_train.shape[1] + 1, self.n_classes))
        print("W shape:", self.W.shape)
        print("x shape:", self.x.shape)
        print("loss before training:", self.loss(self.W))
        print("prediction on x[0] before training:", softmax(self.W, self.x[0]))
        if debug:
            tmp = self.loss_derivative(self.W.reshape((self.W.shape[0] * self.W.shape[1],)))
            print("loss derivative shape:", tmp.shape)
            print("loss derivative value:", tmp)
            print("checking the gradient with finite differences, should be low (something E-6):",
                  check_grad(self.loss, self.loss_derivative,
                             self.W.reshape((self.W.shape[0] * self.W.shape[1],))))
            # that's nothing more than checking that f'(x) ~= (f(x+h)-f(x))/h with h->0
        tmp = fmin_l_bfgs_b(self.loss,
                            self.W.reshape((self.W.shape[0] * self.W.shape[1],)),
                            fprime=self.loss_derivative)
        self.W = tmp[0].reshape((self.x.shape[1], self.n_classes))
        print("loss after training:", self.loss(self.W))
        print("prediction on x[0] after training:", softmax(self.W, self.x[0]))
        print("true x[0] label:", self.y[0])

    def score(self, x_test, y_test):
        x = np.hstack((x_test, np.ones((x_test.shape[0], 1))))
        y = y_test
        c = 0
        for i, xi in enumerate(x):
            a = np.argmax(softmax(self.W, xi))
            if a == y[i]:
                c += 1
        return c * 1. / y.shape[0]
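# Aside (not in the original notebook): the softmax above exponentiates the raw
# scores directly, which can overflow for large activations.  A numerically
# stable variant subtracts the maximum score first; the shift cancels out in
# the ratio, so the result is unchanged.  A minimal sketch:
def softmax_stable(W, x):
    scores = np.dot(x, W)
    scores = scores - np.max(scores)  # largest score becomes 0, so exp() cannot overflow
    tmp = np.exp(scores)
    return tmp / np.sum(tmp)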
lr = LogisticRegression()
lr.train(x_train, y_train)

plt.imshow(lr.W.transpose(), interpolation='none')
plt.show()
print("weights for 0:")
plt.imshow(lr.W.transpose()[0][:64].reshape((8, 8)), interpolation='none')
plt.show()
print("weights for 5:")
plt.imshow(lr.W.transpose()[5][:64].reshape((8, 8)), interpolation='none')
plt.show()
print("weights for 8 minus weights for 3:")
plt.imshow(lr.W.transpose()[8][:64].reshape((8, 8)) - lr.W.transpose()[3][:64].reshape((8, 8)),
           interpolation='none')
plt.show()

print("training accuracy:", lr.score(x_train, y_train))
print("test accuracy:", lr.score(x_test, y_test))
print("We should overfit, i.e. test score << train score; you can add an L2 regularization")

from sklearn import svm

clf = svm.SVC(kernel='linear')
clf.fit(x_train, y_train)
print("Linear SVM")
print("training accuracy:", clf.score(x_train, y_train))
print("test accuracy:", clf.score(x_test, y_test))

clf = svm.SVC(kernel='rbf')
clf.fit(x_train, y_train)
print("with an RBF kernel")
print("training accuracy:", clf.score(x_train, y_train))
print("test accuracy:", clf.score(x_test, y_test))

def init(lr, x_train, y_train):
    lr.x = np.hstack((x_train, np.ones((x_train.shape[0], 1))))
    lr.y = y_train
    lr.n_classes = len(set(y_train))
    lr.W = np.zeros((x_train.shape[1] + 1, lr.n_classes))

def one_epoch_of_SGD(lr, learning_rate):
    """ an "epoch" is a full pass/iteration on the dataset """
    tmp_W = lr.W[:]  # NB: this is a view on lr.W (not a copy), so the updates happen in place
    for i, xi in enumerate(lr.x):
        p_i = softmax(tmp_W, xi)
        for j, v in enumerate(p_i):
            if j == lr.y[i]:
                tmp_W[:, j] += learning_rate * (xi * (1 - v))
            else:
                tmp_W[:, j] += learning_rate * (- xi * v)
    return tmp_W

class LogisticRegression2:
    def __init__(self):
        self.name = "Logistic Regression"

    def loss(self):
        """ Negative log-likelihood as described above """
        # redefined so that we don't have to do all the reshaping,
        # as we don't use scipy's L-BFGS anymore
        l = 0.
        for i, xi in enumerate(self.x):
            p_i = softmax(self.W, xi)
            l += np.log(p_i[self.y[i]])
        return - l / self.x.shape[0]

    def score(self, x_test, y_test):
        x = np.hstack((x_test, np.ones((x_test.shape[0], 1))))
        y = y_test
        c = 0
        for i, xi in enumerate(x):
            a = np.argmax(softmax(self.W, xi))
            if a == y[i]:
                c += 1
        return c * 1. / y.shape[0]
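# Aside (not in the original notebook): a mini-batch variant of one_epoch_of_SGD.
# Averaging the gradient over a small batch (32 here, an arbitrary choice) gives a
# less noisy update than the purely per-sample rule above, at the same overall cost
# per epoch.
def one_epoch_of_minibatch_SGD(lr, learning_rate, batch_size=32):
    order = np.random.permutation(lr.x.shape[0])
    for start in range(0, lr.x.shape[0], batch_size):
        batch = order[start:start + batch_size]
        grad = np.zeros_like(lr.W)
        for i in batch:
            p_i = softmax(lr.W, lr.x[i])
            error = -p_i
            error[lr.y[i]] += 1.               # one-hot target minus prediction
            grad += np.outer(lr.x[i], error)
        lr.W += learning_rate * grad / len(batch)  # ascent on the log-likelihood, like above
    return lr.W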
def stupid_train_with_SGD(lr, x_train, y_train):
    init(lr, x_train, y_train)
    print("loss before training:", lr.loss())
    lr.W = one_epoch_of_SGD(lr, learning_rate=0.01)
    old_loss = 1.E31
    new_loss = lr.loss()
    losses = [new_loss]
    print("loss after one epoch of training:", new_loss)
    lr.W = one_epoch_of_SGD(lr, learning_rate=0.01)
    new_loss = lr.loss()
    losses.append(new_loss)
    print("loss after two epochs of training:", new_loss)
    while old_loss - new_loss > 1.E-5:  # optimize until the loss decreases by less than 1.E-5 ==> STUPIDLY BAD
        old_loss = new_loss
        lr.W = one_epoch_of_SGD(lr, learning_rate=0.01)
        new_loss = lr.loss()
        losses.append(new_loss)
    print("best loss:", new_loss)
    plt.plot(np.arange(len(losses)), np.log(losses), label="stupid", linewidth=2, linestyle="--")

def slightly_less_stupid_train_with_SGD(lr, x_train, y_train):
    init(lr, x_train, y_train)
    print("loss before training:", lr.loss())
    old_loss = 1.E31
    losses = []
    new_loss = 1.E30
    while abs(old_loss - new_loss) > 1.E-5:  # optimize until the loss changes by less than 1.E-5 ==> BAD
        old_loss = new_loss
        lr.W = one_epoch_of_SGD(lr, learning_rate=0.01)
        new_loss = lr.loss()
        losses.append(new_loss)
    print("best loss:", new_loss)
    plt.plot(np.arange(len(losses)), np.log(losses), label="slightly less stupid", linewidth=2, linestyle=":")

def early_stopping_train_with_SGD(lr, x_train, y_train):
    """ using the training set as validation set,
        DO NOT DO THIS IN PRACTICE! USE A SEPARATE VALIDATION SET """
    init(lr, x_train, y_train)
    print("loss before training:", lr.loss())
    old_score = -1.
    new_score = lr.score(x_train, y_train)
    losses = []
    patience = 3  # number of epochs of patience
    while new_score > old_score or patience:
        if new_score > old_score:
            patience = 3
        else:
            patience -= 1
        old_score = new_score
        lr.W = one_epoch_of_SGD(lr, learning_rate=0.01)
        new_score = lr.score(x_train, y_train)
        losses.append(lr.loss())
    print("best loss:", losses[-1])
    plt.plot(np.arange(len(losses)), np.log(losses),
             label="crude early stopping\n3 epochs of patience", linewidth=1.5)

lr = LogisticRegression2()

print("stupid stopping condition [will stop at the first bump in the loss function]")
stupid_train_with_SGD(lr, x_train, y_train)
print("training accuracy:", lr.score(x_train, y_train))
print("test accuracy:", lr.score(x_test, y_test))
print()

print("slightly less stupid stopping condition [will stop when the loss function is flat]")
slightly_less_stupid_train_with_SGD(lr, x_train, y_train)
print("training accuracy:", lr.score(x_train, y_train))
print("test accuracy:", lr.score(x_test, y_test))
print()

print("(very) early stopping (crude, with 3 epochs of patience)")
print("[optimizes for the right thing, but do it on a _separate_ validation set!]")
early_stopping_train_with_SGD(lr, x_train, y_train)  # using the training set as validation set,
                                                     # DO NOT DO THIS IN PRACTICE!
                                                     # USE A SEPARATE VALIDATION SET
print("training accuracy:", lr.score(x_train, y_train))
print("test accuracy:", lr.score(x_test, y_test))
plt.ylabel("log of the loss function")
plt.xlabel("#epochs")
plt.legend()
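# Making the warning above concrete (this cell is not in the original notebook):
# early stopping should monitor a held-out validation set, not the training set.
# A minimal sketch of the split; the x_tr / x_val names are ours:
from sklearn.model_selection import train_test_split
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train,
                                            test_size=0.15, random_state=42)
# an early-stopping loop would then run one_epoch_of_SGD on (x_tr, y_tr) and
# track lr.score(x_val, y_val), keeping (x_test, y_test) untouched until the
# very end.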
from random import randint

def train_with_pure_SGD(lr, x_train, y_train, init_learning_rate=0.1):
    init(lr, x_train, y_train)
    old_loss = 1.E31
    new_loss = lr.loss()
    print("loss before training:", new_loss)

    def get_learning_rate(t):
        return init_learning_rate / (1 + init_learning_rate * np.sqrt(t))
        #return init_learning_rate / (1 + init_learning_rate * t)

    learning_rate = get_learning_rate(0)
    losses = [new_loss]
    gradients = []
    # here we do purely stochastic gradient descent
    t = 0
    while t < 1000:  # here we see a fixed number of samples
        t += 1
        old_loss = new_loss
        # pick a random sample
        i = randint(0, lr.x.shape[0] - 1)
        xi = lr.x[i]
        p_i = softmax(lr.W, xi)
        error = np.zeros(p_i.shape)
        error[lr.y[i]] = 1
        error = error - p_i
        grad = np.outer(xi, error)
        lr.W += get_learning_rate(t) * grad
        new_loss = lr.loss()
        losses.append(new_loss)
        gradients.append(grad)

    plt.plot(np.arange(len(losses)), np.log(losses))
    plt.ylabel("log of the loss function")
    plt.xlabel("#samples seen")
    plt.show()

    from sklearn.decomposition import PCA
    pca = PCA(n_components=3)
    gradients = np.array(gradients)
    #gradients = np.log(1 + np.abs(gradients.reshape(gradients.shape[0], gradients.shape[1]*gradients.shape[2])))
    gradients = gradients.reshape(gradients.shape[0], gradients.shape[1] * gradients.shape[2])
    grad = pca.fit_transform(gradients).transpose()
    #grad = np.log(1 + np.abs(grad))
    plt.plot(np.arange(grad.shape[1]), grad[0], label="#1 direction of the gradient")
    plt.plot(np.arange(grad.shape[1]), grad[1], label="#2 direction of the gradient")
    plt.plot(np.arange(grad.shape[1]), grad[2], label="#3 direction of the gradient", linestyle=":")
    m = np.mean(np.abs(gradients), axis=-1)
    plt.plot(np.arange(grad.shape[1] - 50),
             [10 * np.sum(m[i:i + 50]) for i in range(grad.shape[1] - 50)],
             label=r"smoothed $\propto$ mean(abs(gradients))", linestyle="--")
    #plt.ylabel("gradient log-magnitude")
    plt.ylabel("gradient values")
    plt.xlabel("#samples seen")
    plt.legend(loc=4)

train_with_pure_SGD(lr, x_train, y_train)
print("training accuracy:", lr.score(x_train, y_train))
print("test accuracy:", lr.score(x_test, y_test))

a = np.arange(-3, 3, 0.01)
plt.plot(a, a, label="linear", linewidth=2, linestyle='--')
plt.plot(a, 1. / (1 + np.exp(-4 * a)), label="sigmoid")  # multiplied by 4 so that it's less flat
plt.plot(a, np.tanh(4 * a), label="tanh")  # multiplied by 4 so that it's less flat
plt.plot(a, (a + abs(a)) / 2, label="ReLU", linewidth=1.5)
plt.legend(loc=4)

a = np.random.standard_normal((20, 20))

def non_linearities_imshows(a):
    plt.imshow(a, label="linear", cmap=plt.cm.gray_r, interpolation="none")
    print("linear")
    plt.colorbar()
    plt.show()
    plt.imshow(1. / (1 + np.exp(-4 * a)), label="sigmoid", cmap=plt.cm.gray_r, interpolation="none")  # /!\ mult by 4
    print("sigmoid")
    plt.colorbar()
    plt.show()
    plt.imshow(np.tanh(4 * a), label="tanh", cmap=plt.cm.gray_r, interpolation="none")  # /!\ mult by 4
    print("tanh")
    plt.colorbar()
    plt.show()
    plt.imshow((a + abs(a)) / 2, label="ReLU", cmap=plt.cm.gray_r, interpolation="none")
    print("Rectified Linear Unit")
    plt.colorbar()
    plt.show()

non_linearities_imshows(a)
#non_linearities_imshows(digits.images[0])
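# Aside (not in the original notebook): the derivatives of the nonlinearities
# plotted above.  sigmoid' is at most 0.25 and tanh' at most 1, and both vanish
# away from 0, while ReLU' is exactly 1 wherever the unit is active -- one
# reason (among others) the ReLU comment further down suggests a smaller
# learning rate.
a_d = np.arange(-3, 3, 0.01)
sig = 1. / (1 + np.exp(-a_d))
plt.plot(a_d, sig * (1 - sig), label="sigmoid'")
plt.plot(a_d, 1 - np.tanh(a_d) ** 2, label="tanh'")
plt.plot(a_d, (a_d > 0).astype(float), label="ReLU'")
plt.ylabel("derivative of the activation")
plt.xlabel("pre-activation")
plt.legend()
plt.show()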
def relu_u(W, x, clip=6):
    """ ReLU unit function: max(0, W.x)
    timing note on two equivalent implementations:
        In [1]: a = np.random.standard_normal((1000,1000))
        In [2]: %timeit (a+abs(a))/2
        100 loops, best of 3: 4.27 ms per loop
        In [3]: %timeit a[a<0]=0
        1000 loops, best of 3: 736 µs per loop
    """
    #tmp = np.dot(x, W)
    #tmp[tmp < 0] = 0
    #return np.append(tmp, 1)  # for the biases of the next layer
    return np.append(np.clip(np.dot(x, W), 0, clip), 1)

def relu_d(W, x, clip=4):  # NB: clip should match the clip used in relu_u
    #return np.asarray(np.dot(x, W) >= 0, dtype='float')
    tmp = np.dot(x, W)
    return np.asarray((tmp >= 0) & (tmp < clip), dtype='float')

def tanh_u(W, x):
    tmp = np.dot(x, W)
    return np.append(np.tanh(tmp), 1)  # for the biases of the next layer

def tanh_d(W, x):
    tmp = tanh_u(W, x)[:-1]
    return 1 - tmp**2

class NeuralNet:
    def __init__(self):
        self.name = "Neural Net"
        #self.act_u = relu_u  # try ReLUs, but with a small learning rate
        #self.act_d = relu_d  # (think about why)
        self.act_u = tanh_u
        self.act_d = tanh_d

    def loss(self):
        """ Negative log-likelihood of logistic regression using the hidden layer """
        l = 0.
        for i, xi in enumerate(self.x):
            p_i = self.pred(xi)
            l += np.log(p_i[self.y[i]])
        return - l / self.x.shape[0]

    def one_epoch_of_backprop_SGD(self, learning_rate=0.01):
        """ applying the backprop equations explained above """
        for i, xi in enumerate(self.x):
            act_xi = self.act_u(self.W_hid, xi)
            p_i = softmax(self.W_log, act_xi)
            err_o = np.zeros(p_i.shape)
            err_o[self.y[i]] = 1
            err_o = err_o - p_i
            backprop_o = np.dot(err_o, self.W_log[:-1, :].T)
            act_d = self.act_d(self.W_hid, xi)
            err_h = act_d * backprop_o
            self.W_hid += learning_rate * np.outer(xi, err_h)
            self.W_log += learning_rate * np.outer(act_xi, err_o)

    def pred(self, xi):
        return softmax(self.W_log, self.act_u(self.W_hid, xi))

    def train(self, x_train, y_train, n_hidden=42):
        # try different numbers of hidden units, e.g. 4, 64, 128, 200
        self.x = np.hstack((x_train, np.ones((x_train.shape[0], 1))))
        self.y = y_train
        self.n_classes = len(set(y_train))
        self.n_hidden = n_hidden
        self.W_hid = np.random.random((x_train.shape[1] + 1, self.n_hidden)) - 0.5
        self.W_hid *= 2 * 6. / (x_train.shape[1] + self.n_hidden)
        self.W_log = np.zeros((self.n_hidden + 1, self.n_classes))  # +1 for the biases
        print("x shape:", self.x.shape)
        print("W_hid shape:", self.W_hid.shape)
        print("W_log shape:", self.W_log.shape)
        tmp_loss = self.loss()
        print("training loss before training:", tmp_loss)
        print("prediction on x[0] before training:", self.pred(self.x[0]))
        # training with SGD
        losses = [tmp_loss]
        l_r = 0.01
        for _ in range(10):  # 10 epochs
            self.one_epoch_of_backprop_SGD(learning_rate=l_r)
            losses.append(self.loss())
            # play with the learning rate schemes
        print("best loss:", losses[-1])
        print("training loss after training:", self.loss())
        print("prediction on x[0] after training:", self.pred(self.x[0]))
        print("true x[0] label:", self.y[0])
        plt.plot(np.arange(len(losses)), np.log(losses), linewidth=1)
        plt.ylabel("log of the loss function")
        plt.xlabel("#epochs")
        plt.show()
        plt.imshow(self.W_hid, interpolation='none')
        plt.colorbar()
        plt.show()
        #plt.imshow(self.W_log, interpolation='none')
        #plt.colorbar()
        #plt.show()

    def score(self, x_test, y_test):
        x = np.hstack((x_test, np.ones((x_test.shape[0], 1))))
        y = y_test
        c = 0
        for i, xi in enumerate(x):
            a = np.argmax(self.pred(xi))
            if a == y[i]:
                c += 1
        return c * 1. / y.shape[0]

nn = NeuralNet()
nn.train(x_train, y_train)
print("Train set accuracy:", nn.score(x_train, y_train))
print("Test set accuracy:", nn.score(x_test, y_test))
#nn.train(x_train, y_train, n_hidden=10)
#print("Train set accuracy:", nn.score(x_train, y_train))
#print("Test set accuracy:", nn.score(x_test, y_test))
print("you can also regularize a neural net, just change the loss function appropriately")
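# Aside (not in the original notebook): a sanity check against scikit-learn's
# own one-hidden-layer network, assuming a sklearn version that ships
# MLPClassifier (>= 0.18).  Hyperparameters are chosen to roughly mirror the
# hand-written net above, not tuned.
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(42,), activation='tanh', solver='sgd',
                    learning_rate_init=0.01, max_iter=500, random_state=42)
mlp.fit(x_train, y_train)
print("sklearn MLP train set accuracy:", mlp.score(x_train, y_train))
print("sklearn MLP test set accuracy:", mlp.score(x_test, y_test))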
from theano import tensor as T

A = T.fmatrix()
B = T.fmatrix()
C = T.dot(A, B)
C

A = np.random.random((10, 10))
B = np.random.random((10, 10))
C = T.dot(A, B)
C.eval()

from theano import shared
A = shared(A)
B = shared(B)
A
C = T.dot(A, B)
C.eval()

# solution for the first bonus question (L2 regularization)
alpha = 0.001

def loss_l2(self, W):
    tmp_W = W.reshape((self.x.shape[1], self.n_classes))
    return self.loss(W) + alpha * np.sum([np.linalg.norm(tmp)**2 for tmp in tmp_W])

def loss_derivative_l2(self, W):
    return self.loss_derivative(W) + 2 * alpha * W

# in train:
# tmp = fmin_l_bfgs_b(self.loss_l2,
#                     self.W.reshape((self.W.shape[0]*self.W.shape[1],)),
#                     fprime=self.loss_derivative_l2)

# solution for the second bonus question (multiclass hinge loss + L2)
def loss_hinge_l2(self, W):
    l = 0.
    tmp_W = W.reshape((self.x.shape[1], self.n_classes))
    for i, xi in enumerate(self.x):
        good = np.dot(tmp_W[:, self.y[i]], xi)
        tmp_max = -1.E31
        for j in range(self.n_classes):
            if j == self.y[i]:
                continue
            delta = np.dot(tmp_W[:, j], xi) - good  # wrong-class score minus right-class score
            if delta > tmp_max:
                tmp_max = delta
        l += max(0, 1 + tmp_max)  # hinge on the hardest wrong class
    return l / self.x.shape[0] + alpha * np.sum([np.linalg.norm(tmp)**2 for tmp in tmp_W])

def loss_derivative_hinge_l2(self, W):
    d = np.zeros((self.x.shape[1], self.n_classes))  # that's the Jacobian
    tmp_W = W.reshape((self.x.shape[1], self.n_classes))
    for i, xi in enumerate(self.x):
        scores = np.dot(xi, tmp_W)
        good = scores[self.y[i]]
        scores[self.y[i]] = -np.inf        # only look at the wrong classes
        j_star = np.argmax(scores)         # hardest wrong class
        if 1 + scores[j_star] - good > 0:  # sample is inside the margin
            d[:, j_star] += xi
            d[:, self.y[i]] -= xi
    d = d.reshape(W.shape)  # and we flatten it
    return d / self.x.shape[0] + 2 * alpha * W

def score(self, x_test, y_test):
    x = np.hstack((x_test, np.ones((x_test.shape[0], 1))))
    y = y_test
    c = 0
    for i, xi in enumerate(x):
        #a = np.argmax(softmax(self.W, xi))
        a = np.argmax(np.dot(self.W.transpose(), xi))
        if a == y[i]:
            c += 1
    return c * 1. / y.shape[0]

# in train:
# tmp = fmin_l_bfgs_b(self.loss_hinge_l2,
#                     self.W.reshape((self.W.shape[0]*self.W.shape[1],)),
#                     fprime=self.loss_derivative_hinge_l2)
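# Picking up the Theano teaser above (this cell is not in the original notebook):
# a minimal sketch of letting Theano derive the softmax-regression gradient
# symbolically instead of coding loss_derivative by hand.  The names here
# (x_b, W_sym, ...) are ours; it assumes the classic theano.function /
# T.grad / T.nnet.softmax API.
import theano
import theano.tensor as T

x_b = np.hstack((x_train, np.ones((x_train.shape[0], 1))))  # bias column, as above
X_sym = T.dmatrix('X')
Y_sym = T.ivector('Y')
W_sym = theano.shared(np.zeros((x_b.shape[1], len(set(y_train)))), name='W')

P = T.nnet.softmax(T.dot(X_sym, W_sym))
nll = -T.mean(T.log(P)[T.arange(Y_sym.shape[0]), Y_sym])  # same negative log-likelihood as before
grad_W = T.grad(nll, W_sym)                               # symbolic gradient, no manual Jacobian

train_step = theano.function([X_sym, Y_sym], nll,
                             updates=[(W_sym, W_sym - 0.5 * grad_W)])
for epoch in range(20):  # a few full-batch gradient steps
    current_nll = train_step(np.asarray(x_b, dtype='float64'),
                             np.asarray(y_train, dtype='int32'))
print("Theano NLL after 20 full-batch steps:", float(current_nll))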