import numpy as np
from numpy.linalg import norm

%matplotlib inline
import matplotlib.pylab as plt


def least_squares(x, features, labels):
    """Evaluates the least squares loss."""
    n_samples, n_features = features.shape
    # Accepts a single parameter vector or a stack of them (the stacked case is
    # used by the contour plots at the end of the notebook).
    x = x.reshape(-1, n_features)
    loss_array = (features.dot(x.T) - labels) ** 2
    return np.sum(loss_array, axis=0) / (2. * n_samples)


def least_squares_grad(x, features, labels):
    """Evaluates the gradient of the least squares loss."""
    n_samples, n_features = features.shape
    x = x.reshape(1, n_features)  # Added for scipy.optimize compatibility
    grad_array = (features.dot(x.T) - labels) * features
    return np.sum(grad_array, axis=0) / n_samples


logistic = lambda x: 1. / (1. + np.exp(-x))


def logistic_loss(x, features, labels):
    """Evaluates the logistic loss (labels in {-1, +1})."""
    n_samples, n_features = features.shape
    x = x.reshape(-1, n_features)
    loss_array = np.log(1 + np.exp(-labels * features.dot(x.T)))
    return np.sum(loss_array, axis=0) / n_samples


def logistic_loss_grad(x, features, labels):
    """Evaluates the gradient of the logistic loss."""
    n_samples, n_features = features.shape
    x = x.reshape(1, n_features)  # Added for scipy.optimize compatibility
    grad_array = -labels / (1 + np.exp(labels * features.dot(x.T))) * features
    return np.sum(grad_array, axis=0) / n_samples


def prox_l1(x, l=1.):
    """Proximal operator of the l1 norm (soft-thresholding)."""
    x_abs = np.abs(x)
    return np.sign(x) * (x_abs - l) * (x_abs > l)


def prox_l2(x, l=1.):
    """Proximal operator of the squared l2 norm (shrinkage)."""
    return 1. / (1 + l) * x


def prox_enet(x, l_l1, l_l2, t=1.):
    """Proximal operator of the elastic net penalty at x."""
    x_abs = np.abs(x)
    prox_l1 = np.sign(x) * (x_abs - t * l_l1) * (x_abs > t * l_l1)
    return prox_l1 / (1. + t * l_l2)


def inspector(loss_fun, x_real, verbose=False):
    """Returns a closure called after each iteration to record metrics."""
    objectives = []
    errors = []
    it = [0]  # This is a hack to be able to modify 'it' inside the closure.

    def inspector_cl(xk):
        obj = loss_fun(xk)
        err = norm(xk - x_real) / norm(x_real)
        objectives.append(obj)
        errors.append(err)
        if verbose:
            # 'n_iter' is read from the global scope (set below).
            if it[0] == 0:
                print ' | '.join([name.center(8) for name in ["it", "obj", "err"]])
            if it[0] % (n_iter / 5) == 0:
                print ' | '.join([("%d" % it[0]).rjust(8),
                                  ("%.2e" % obj).rjust(8),
                                  ("%.2e" % err).rjust(8)])
        it[0] += 1

    inspector_cl.obj = objectives
    inspector_cl.err = errors
    return inspector_cl


def gd(x_init, grad, n_iter=100, step=1., callback=None):
    """Basic gradient descent algorithm."""
    x = x_init.copy()
    for _ in range(n_iter):
        x -= step * grad(x)
        # Update metrics after each iteration.
        if callback is not None:
            callback(x)
    return x


def ista(x_init, grad, prox, n_iter=100, step=1., callback=None):
    """ISTA (proximal gradient) algorithm."""
    x = x_init.copy()
    for _ in range(n_iter):
        x = prox(x - step * grad(x), step)
        # Update metrics after each iteration.
        if callback is not None:
            callback(x)
    return x


def fista(x_init, grad, prox, n_iter=100, step=1., callback=None):
    """FISTA (accelerated proximal gradient) algorithm."""
    x = x_init.copy()
    y = x_init.copy()
    t = 1.
    for _ in range(n_iter):
        x_new = prox(y - step * grad(y), step)
        t_new = (1. + (1. + 4. * t ** 2) ** .5) / 2
        y = x_new + (t - 1) / t_new * (x_new - x)
        t = t_new
        x = x_new
        # Update metrics after each iteration.
        if callback is not None:
            callback(x)
    return x
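# Optional sanity check (an addition, not part of the original notebook):
# prox_l1 should coincide with the usual soft-thresholding formula, and
# prox_enet with soft-thresholding followed by the l2 shrinkage of prox_l2.
v = np.array([-2., -0.5, 0., 0.5, 2.])
assert np.allclose(prox_l1(v, 1.), np.sign(v) * np.maximum(np.abs(v) - 1., 0.))
assert np.allclose(prox_enet(v, l_l1=1., l_l2=3., t=1.),
                   prox_l2(prox_l1(v, 1.), 3.))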
def sgd(x_init, features, labels, grad, n_iter=100, step=1., callback=None):
    """Stochastic gradient descent algorithm."""
    x = x_init.copy()
    n_samples, n_features = features.shape
    for i in range(n_iter):
        idx = np.random.randint(n_samples)
        feature = features[idx].reshape(1, n_features)
        label = labels[idx]
        # Single-sample gradient step with step size step / (n_samples * sqrt(i + 1)).
        x -= step * grad(x, feature, label) / (n_samples * np.sqrt(i + 1))
        # Update metrics after each iteration.
        if callback is not None:
            callback(x)
    return x


def sag(x_init, features, labels, grad, n_iter=100, step=1., callback=None):
    """Stochastic average gradient (SAG) algorithm."""
    x = x_init.copy()
    n_samples, n_features = features.shape
    y_old = np.zeros((n_samples, n_features))  # Last gradient seen for each sample.
    y = np.zeros(n_features)                   # Running average of the stored gradients.
    for _ in range(n_iter):
        idx = np.random.randint(n_samples)
        feature = features[idx].reshape(1, n_features)
        label = labels[idx]
        y_new = grad(x, feature, label)
        y += (y_new - y_old[idx]) / n_samples
        y_old[idx] = y_new
        # 'y' is already the average of the stored gradients, so it is used directly.
        x -= step * y
        # Update metrics after each iteration.
        if callback is not None:
            callback(x)
    return x


def miso(x_init, features, labels, grad, prox, n_iter=100, step=1., callback=None):
    """MISO (proximal) algorithm."""
    x = x_init.copy()
    n_samples, n_features = features.shape
    z_old = np.zeros((n_samples, n_features))  # Last gradient seen for each sample.
    z = np.zeros(n_features)                   # Running average of the stored gradients.
    for _ in range(n_iter):
        idx = np.random.randint(n_samples)
        feature = features[idx].reshape(1, n_features)
        label = labels[idx]
        z_new = grad(x, feature, label)
        z += (z_new - z_old[idx]) / n_samples
        z_old[idx] = z_new
        x = prox(x - step * z, step)
        # Update metrics after each iteration.
        if callback is not None:
            callback(x)
    return x


def svrg(x_init, features, labels, grad, n_iter=100, batch_size=10, step=1.,
         callback=None):
    """Stochastic variance reduced gradient (SVRG) algorithm."""
    x = x_init.copy()
    n_samples, n_features = features.shape
    # Warm start with a few SGD iterations, then compute the full gradient there.
    x_old = sgd(x_init, features, labels, grad, n_iter=batch_size, step=step,
                callback=None)
    mu = grad(x_old, features, labels)
    x = x_old
    for i in range(n_iter):
        # Refresh the snapshot and its full gradient every 'batch_size' iterations.
        # The copy matters: the in-place update below must not modify the snapshot.
        if i % batch_size == 0:
            x_old = x.copy()
            mu = grad(x, features, labels)
        idx = np.random.randint(n_samples)
        feature = features[idx].reshape(1, n_features)
        label = labels[idx]
        z_new = grad(x, feature, label)
        z_old = grad(x_old, feature, label)
        # Variance-reduced stochastic gradient step.
        x -= step * (z_new - z_old + mu)
        # Update metrics after each iteration.
        if callback is not None:
            callback(x)
    return x


def sdca(x_init, features, labels, grad, n_iter=100, step=1., callback=None):
    """Stochastic dual coordinate ascent."""
    x = x_init.copy()
    # TODO -- not implemented; a tentative sketch for the ridge-regression case
    # is given after the data generation below.
    # Update metrics after each iteration.
    if callback is not None:
        callback(x)
    return x


# Generate a synthetic dataset
n_samples = 2000
n_features = 50
idx = np.arange(n_features).reshape(1, n_features)
params = 2 * (-1.) ** (idx - 1) * .9 ** idx
params[0, 20:50] = 0  # Make the true parameter vector sparse.
diag = np.random.rand(n_features)
features = np.random.multivariate_normal(np.zeros(n_features), np.diag(diag),
                                          n_samples)

# Show the condition number of the covariance used to generate the features.
print "cond = %.2f" % (diag.max() / diag.min())
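# The 'sdca' stub above is left as a TODO.  Below is a tentative sketch (an
# addition, not from the original notebook) of SDCA for the special case of
# ridge regression, i.e. squared loss plus the l2 penalty 0.5 * l_l2 * ||x||^2,
# using the closed-form coordinate update of Shalev-Shwartz & Zhang (2013).
# The name 'sdca_ridge' is hypothetical.
def sdca_ridge(features, labels, l_l2=1., n_iter=100, callback=None):
    """Sketch of stochastic dual coordinate ascent for ridge regression."""
    n_samples, n_features = features.shape
    alpha = np.zeros(n_samples)    # One dual variable per sample.
    x = np.zeros((1, n_features))  # Primal iterate: x = features.T.dot(alpha) / (l_l2 * n_samples).
    sq_norms = np.sum(features ** 2, axis=1)
    for _ in range(n_iter):
        idx = np.random.randint(n_samples)
        feature = features[idx]
        label = float(labels[idx])
        # Exact maximization of the dual objective in coordinate 'idx'
        # (closed form for the squared loss).
        delta = (label - feature.dot(x[0]) - alpha[idx]) \
            / (1. + sq_norms[idx] / (l_l2 * n_samples))
        alpha[idx] += delta
        x += delta * feature / (l_l2 * n_samples)
        # Update metrics after each iteration.
        if callback is not None:
            callback(x)
    return x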
# Change this to 'False' to estimate the logistic regression model.
linear = True

if linear:
    residuals = np.random.randn(n_samples, 1)
    labels = features.dot(params.T) + residuals
else:
    # Labels in {-1, +1}, as assumed by the logistic loss defined above.
    probas = logistic(features.dot(params.T))
    labels = np.where(np.random.rand(n_samples, 1) < probas, 1., -1.)

plt.figure(figsize=(8, 4))
plt.stem(params[0])
plt.title("True parameters", fontsize=16)
plt.show()

# Initialize stuff
x_init = 1 - 2 * np.random.rand(1, n_features)
n_iter = 30
l_l1 = 0.0
l_l2 = 0.1

# f and gradient
if linear:
    f = lambda x: least_squares(x, features, labels)[0]
    grad_f = lambda x: least_squares_grad(x, features, labels)
    # step = 1 / L, where L is the Lipschitz constant of the least-squares gradient.
    step = 1. / norm(features.T.dot(features) / n_samples, 2)
else:
    f = lambda x: logistic_loss(x, features, labels)[0]
    grad_f = lambda x: logistic_loss_grad(x, features, labels)
    step = 1.  # np.sum(features**2, axis=1).max() / (4 * n_samples)

# g, F and prox.
g = lambda x: l_l1 * np.abs(x).sum() + 0.5 * l_l2 * np.sum(x ** 2)
F = lambda x: f(x) + g(x)
prox_g = lambda x, l: prox_enet(x, l_l1, l_l2, l)

print "Type: %s" % ('linear' if linear else 'logistic')
print "n_iter: %d" % n_iter
print "step size: %.2f" % step

# Sanity check: compare the closed-form logistic gradient to finite differences.
import scipy.optimize
ls = lambda x: logistic_loss(x, features, labels)[0]
print scipy.optimize.approx_fprime(x_init.ravel(), ls, 1e-3)
print logistic_loss_grad(x_init, features, labels)

plt.figure(figsize=(8, 4))
plt.stem(x_init[0])
plt.title("Initial guess", fontsize=16)
plt.show()

# ISTA
ista_inspector = inspector(loss_fun=F, x_real=params, verbose=True)
x_ista = ista(x_init, grad=grad_f, prox=prox_g, n_iter=n_iter, step=step,
              callback=ista_inspector)

# FISTA
fista_inspector = inspector(loss_fun=F, x_real=params, verbose=True)
x_fista = fista(x_init, grad=grad_f, prox=prox_g, n_iter=n_iter, step=step,
                callback=fista_inspector)

# Gradient descent, using sign(x) as a subgradient of the l1 term.
grad_gd = lambda x: grad_f(x) + l_l1 * np.sign(x) + l_l2 * x
gd_inspector = inspector(loss_fun=F, x_real=params, verbose=True)
x_gd = gd(x_init, grad=grad_gd, n_iter=n_iter, step=step, callback=gd_inspector)

plt.figure(figsize=(18, 5))
plt.suptitle("Final estimates", fontsize=18)
plt.subplot(1, 4, 1)
plt.title("Real params")
plt.stem(params[0])
plt.subplot(1, 4, 2)
plt.title("ISTA")
plt.stem(x_ista[0])
plt.subplot(1, 4, 3)
plt.title("FISTA")
plt.stem(x_fista[0])
plt.subplot(1, 4, 4)
plt.title("GD")
plt.stem(x_gd[0])
plt.show()

plt.figure(figsize=(17, 5))
plt.subplot(1, 2, 1)
plt.plot(gd_inspector.obj, 'b')
plt.plot(ista_inspector.obj, 'r')
plt.plot(fista_inspector.obj, 'g')
plt.title("Loss", fontsize=18)
plt.xlabel("iteration", fontsize=14)
plt.ylabel("value", fontsize=14)
plt.legend(["gd", "ista", "fista"])
plt.subplot(1, 2, 2)
plt.plot(gd_inspector.err, 'b')
plt.plot(ista_inspector.err, 'r')
plt.plot(fista_inspector.err, 'g')
plt.title("Distance to x_real", fontsize=18)
plt.xlabel("iteration", fontsize=14)
plt.ylabel("distance", fontsize=14)
plt.legend(["gd", "ista", "fista"])
plt.show()

n_iter = 10 * n_samples

# Initialize stuff for the stochastic solvers
if linear:
    f = lambda x: least_squares(x, features, labels)[0]
    grad_stoc = least_squares_grad
    L = norm(features.T.dot(features), 2) / n_samples
else:
    f = lambda x: logistic_loss(x, features, labels)[0]
    grad_stoc = logistic_loss_grad
    L = 1.  # 4. * n_samples / np.sum(features**2, axis=1).max()

print "Type: %s" % ('linear' if linear else 'logistic')
print "n_iter: %d" % n_iter

plt.figure(figsize=(8, 4))
plt.stem(x_init[0])
plt.title("Initial guess", fontsize=16)
plt.show()
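# For reference (an addition, not from the original notebook): a per-sample
# smoothness bound that is often used to tune step sizes of SAG / SVRG-type
# methods.  Each least-squares term f_i is ||x_i||^2-smooth, and each logistic
# term is ||x_i||^2 / 4-smooth.
max_sq_norm = np.sum(features ** 2, axis=1).max()
L_sample = max_sq_norm if linear else max_sq_norm / 4.
print "per-sample smoothness bound: %.2f" % L_sample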
# SGD
sgd_step = 10. / L
grad_sgd = lambda x, feat, lab: grad_stoc(x, feat, lab) + l_l2 * x[0]
sgd_inspector = inspector(loss_fun=F, x_real=params, verbose=True)
x_sgd = sgd(x_init, features=features, labels=labels, grad=grad_sgd,
            n_iter=n_iter, step=sgd_step, callback=sgd_inspector)

# SAG
sag_step = 1. / L
grad_sag = lambda x, feat, lab: grad_stoc(x, feat, lab) + l_l2 * x[0]
sag_inspector = inspector(loss_fun=F, x_real=params, verbose=True)
x_sag = sag(x_init, features=features, labels=labels, grad=grad_sag,
            n_iter=n_iter, step=sag_step, callback=sag_inspector)

# MISO
miso_step = 1. / L
prox_g = lambda x, l: prox_enet(x, l_l1, l_l2, l)
miso_inspector = inspector(loss_fun=F, x_real=params, verbose=True)
x_miso = miso(x_init, features, labels, grad=grad_stoc, prox=prox_g,
              n_iter=n_iter, step=miso_step, callback=miso_inspector)

# SVRG
svrg_step = 1. / L
grad_svrg = lambda x, feat, lab: grad_stoc(x, feat, lab) + l_l2 * x[0]
svrg_inspector = inspector(loss_fun=F, x_real=params, verbose=True)
x_svrg = svrg(x_init, features, labels, grad=grad_svrg, n_iter=n_iter,
              batch_size=100, step=svrg_step, callback=svrg_inspector)

plt.figure(figsize=(17, 5))
plt.subplot(1, 2, 1)
it = np.linspace(0, float(n_iter) / n_samples, n_iter)
plt.plot(it, sgd_inspector.obj, 'b')
plt.plot(it, sag_inspector.obj, 'r')
plt.plot(it, miso_inspector.obj, 'g')
plt.plot(it, svrg_inspector.obj, 'y')
plt.title("Loss", fontsize=18)
plt.xlabel("epoch", fontsize=14)
plt.ylabel("value", fontsize=14)
plt.legend(["sgd", "sag", "miso", "svrg"])
plt.subplot(1, 2, 2)
plt.plot(it, sgd_inspector.err, 'b')
plt.plot(it, sag_inspector.err, 'r')
plt.plot(it, miso_inspector.err, 'g')
plt.plot(it, svrg_inspector.err, 'y')
plt.title("Distance to x_real", fontsize=18)
plt.xlabel("epoch", fontsize=14)
plt.ylabel("distance", fontsize=14)
plt.legend(["sgd", "sag", "miso", "svrg"])
plt.show()

plt.figure(figsize=(18, 10))
plt.suptitle("Final estimates", fontsize=18)
plt.subplot(2, 4, 1)
plt.title("Real params")
plt.stem(params[0])
plt.subplot(2, 4, 2)
plt.title("SGD")
plt.stem(x_sgd[0])
plt.subplot(2, 4, 3)
plt.title("SAG")
plt.stem(x_sag[0])
plt.subplot(2, 4, 4)
plt.title("MISO")
plt.stem(x_miso[0])
plt.subplot(2, 4, 5)
plt.title("SVRG")
plt.stem(x_svrg[0])
plt.show()

n_iter = 50

# Conjugate gradient descent
from scipy.optimize import fmin_cg
cg_inspector = inspector(loss_fun=F, x_real=params, verbose=True)
res_cg = fmin_cg(F, x_init.ravel(), fprime=grad_gd, maxiter=n_iter,
                 callback=cg_inspector)

# L-BFGS-B
from scipy.optimize import fmin_l_bfgs_b
lbfgs_inspector = inspector(loss_fun=F, x_real=params, verbose=True)
res_lbfgs = fmin_l_bfgs_b(F, x_init.ravel(), fprime=grad_gd, maxiter=n_iter,
                          callback=lbfgs_inspector)

plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
plt.plot(gd_inspector.obj, 'b')
plt.plot(ista_inspector.obj, 'r')
plt.plot(fista_inspector.obj, 'g')
plt.plot(cg_inspector.obj, 'y')
plt.plot(lbfgs_inspector.obj, 'pink')
plt.title("Loss", fontsize=18)
plt.xlabel("iteration", fontsize=14)
plt.ylabel("value", fontsize=14)
plt.legend(["gd", "ista", "fista", "cg", "l-bfgs-b"])
plt.subplot(1, 2, 2)
plt.plot(gd_inspector.err, 'b')
plt.plot(ista_inspector.err, 'r')
plt.plot(fista_inspector.err, 'g')
plt.plot(cg_inspector.err, 'y')
plt.plot(lbfgs_inspector.err, 'pink')
plt.title("Distance to x_real", fontsize=18)
plt.xlabel("iteration", fontsize=14)
plt.ylabel("distance", fontsize=14)
plt.legend(["gd", "ista", "fista", "cg", "l-bfgs-b"])
plt.show()
def contours(f, xlim=(-10, 10), ylim=(-10, 10)):
    """Plots the contours of the loss 'f' over a 2d grid of parameters."""

    def apply_to_grid(f, xlim, ylim, s=100j):
        X, Y = np.mgrid[xlim[0]:xlim[1]:s,
                        ylim[0]:ylim[1]:s]
        z = np.vstack([X.ravel(), Y.ravel()]).T
        return X, Y, np.reshape(f(z), X.shape)

    loss = lambda x: f(x, features, labels)
    X, Y, Z = apply_to_grid(loss, xlim, ylim)
    plt.xlabel(r"$x_1$", fontsize=18)
    plt.ylabel(r"$x_2$", fontsize=18)
    plt.pcolor(X, Y, Z, cmap=plt.cm.RdBu)
    CS = plt.contour(X, Y, Z, colors='k')
    plt.clabel(CS, inline=1, fontsize=10)
    plt.grid(True)


plt.figure(figsize=(14, 6))
plt.suptitle("Contours of the loss", fontsize=18)
n_samples = 2000
n_features = 2

# Well-conditioned problem.
plt.subplot(1, 2, 1)
plt.title(r"$\lambda_{max} / \lambda_{min} = 1$", fontsize=16)
params = 1 - 2 * np.random.rand(1, n_features)
diag = np.array([.1, .1])
features = np.random.multivariate_normal(np.zeros(n_features), np.diag(diag),
                                          n_samples)
labels = np.array([np.random.rand() < p for p in logistic(features.dot(params.T))])
contours(least_squares)

# Ill-conditioned problem.
plt.subplot(1, 2, 2)
plt.title(r"$\lambda_{max} / \lambda_{min} = 10$", fontsize=16)
params = 1 - 2 * np.random.rand(1, n_features)
diag = np.array([.01, .1])
features = np.random.multivariate_normal(np.zeros(n_features), np.diag(diag),
                                          n_samples)
labels = np.array([np.random.rand() < p for p in logistic(features.dot(params.T))])
contours(least_squares)
plt.show()