Notebook to play with a few first order optimization methods. Currently covers proximal algorithms for l1 and l2 regularization as well as more recent stochastic methods like SAG and MISO. Optimization is done either on the least squares or logistic loss.
1. Loss function and gradients
2. Proximal gradient derived from a majorization-minimization method
3. Proximal operators
4. Full gradient methods (GD, ISTA, FISTA)
5. Stochastic methods (SGD, SAG, MISO, SDCA, SVRG)
6. Generate dataset
7. Optimize (full gradient)
8. Optimize (stochastic)
Appendix
import numpy as np
from numpy.linalg import norm
%matplotlib inline
import matplotlib.pylab as plt
Least squares:
$$ \begin{align*} \ell(\theta; \mathbf{x}, \mathbf{y}) = \frac{1}{2n}\sum_{i=1}^n(\theta^Tx_i - y_i)^2 \\ \frac{\partial \ell(\theta; \mathbf{x}, \mathbf{y})}{\partial\theta} = \frac{1}{n}\sum_{i=1}^n(\theta^Tx_i - y_i)x_i \\ \end{align*} $$We use 2D numpy array for features
and labels
to take advantage of array magic.
def least_squares(x, features, labels):
"""Evaluates the least square function."""
n_samples = features.shape[0]
x = x.reshape(1, n_features)
loss_array = (features.dot(x.T) - labels) ** 2
return np.sum(loss_array, axis=0) / (2. * n_samples)
def least_squares_grad(x, features, labels):
"""Evaluates the gradient of the least square function."""
n_samples = features.shape[0]
x = x.reshape(1, n_features) # Added for scipy.optimize compatibility
grad_array = (features.dot(x.T) - labels) * features
return np.sum(grad_array, axis=0) / n_samples
Logistic and loss: $$ \begin{align*} s(x) & = \frac{1}{1+e^{-x}} \\ \ell(\theta; \mathbf{x}, \mathbf{y}) & = \frac{1}{n}\sum_{i=1}^n\log(1 + e^{(-y_i\theta^Tx_i)}) \\ \frac{\partial \ell(\theta; \mathbf{x}, \mathbf{y})}{\partial\theta} & = \frac{1}{n}\sum_{i=1}^n\frac{-y_i x_i}{1+\exp(y_i\theta^T x_i)} \end{align*} $$
logistic = lambda x: 1. / (1. + np.exp(-x))
def logistic_loss(x, features, labels):
"""Evaluates the logistic loss function."""
n_samples, n_features = features.shape
x = x.reshape(1, n_features)
loss_array = np.log(1 + np.exp(-labels * features.dot(x.T)))
return np.sum(loss_array, axis=0) / n_samples
def logistic_loss_grad(x, features, labels):
"""Evaluates the gradient of the logistic loss."""
n_samples = features.shape[0]
x = x.reshape(1, n_features) # Added for scipy.optimize compatibility
grad_array = -labels / (1 + np.exp(labels * features.dot(x.T))) * features
return np.sum(grad_array, axis=0) / n_samples
Let's say we would like to minimize $f + g$, with $f: \mathbb{R}^n \rightarrow \mathbb{R}$ a convex, $\beta$-smooth function and $g$ a convex function with a simple proximal operator.
A simple upper bound on $f+g$ is:
$$ \begin{align*} f(y) + g(y) & \leq \{f(x) + \nabla{f(x)^T}(y - x) + \frac{1}{2\lambda}\|x-y\|_{2}^2 + g(y)\\ & = q_\lambda(x, y), \end{align*} $$where $\lambda \in (0, 1/\beta]$.
Iteratively minimizing $q_\lambda(x^t, \cdot)$ will converge the minimum of $f+g$ $$ \DeclareMathOperator*{\argmin}{argmin} \DeclareMathOperator*{\prox}{prox} $$ $$ \begin{align*} \argmin_x q_\lambda(x^t, x) & = \argmin_x \{f(x^t) + \nabla{f(x^t)^T}(x - x^t) + \frac{1}{2\lambda}\|x-x^t\|_{2}^2 + g(x)\} \\ & = \argmin_x \{\frac{1}{2}\|x - (x^t - \lambda\nabla f(x^t))\|_2^2 + \lambda g(x)\} \\ & = \text{prox}_{\lambda g}\left(x^t - \lambda\nabla f(x^t)\right) \\ \end{align*} $$
The derivation of the proximal operators leads to:
$$ \begin{align*} \text{prox}_{\lambda\|\cdot\|_1}(v) & = \text{sign}(v) \odot (|v| − \lambda)_+ \\ \text{prox}_{\lambda\|\cdot\|_2^2}(v) & = \frac{1}{1 + \lambda} v \\ \text{prox}_{\lambda_1\|\cdot\|_1 + \lambda_2 / 2\|\cdot\|_2^2}(v) & = \frac{1}{1 + \lambda_2} \text{sign}(v) \odot (|v| - \lambda_1)_+ \end{align*} $$def prox_l1(x, l=1.):
""" Proximal operator of the l1 norm."""
x_abs = np.abs(x)
return np.sign(x) * (x_abs - l) * (x_abs > l)
def prox_l2(x, l=1.):
""" Proximal operator of the l2 norm."""
return 1. / (1 + l) * x
def prox_enet(x, l_l1, l_l2, t=1.):
"""Proximal operator for the elastic net at x"""
x_abs = np.abs(x)
prox_l1 = np.sign(x) * (x_abs - t * l_l1) * (x_abs > t * l_l1)
return prox_l1 / (1. + t * l_l2)
Before implementing the logic of GD, ISTA or FISTA we provide a simple function to be called after each iteration to gather and display a few metrics about current the minimization process.
def inspector(loss_fun, x_real, verbose=False):
"""A closure called to update metrics after each iteration."""
objectives = []
errors = []
it = [0] # This is a hack to be able to modify 'it' inside the closure.
def inspector_cl(xk):
obj = loss_fun(xk)
err = norm(xk - x_real) / norm(x_real)
objectives.append(obj)
errors.append(err)
if verbose == True:
if it[0] == 0:
print ' | '.join([name.center(8) for name in ["it", "obj", "err"]])
if it[0] % (n_iter / 5) == 0:
print ' | '.join([("%d" % it[0]).rjust(8), ("%.2e" % obj).rjust(8), ("%.2e" % err).rjust(8)])
it[0] += 1
inspector_cl.obj = objectives
inspector_cl.err = errors
return inspector_cl
And now we can implement the algorithms.
def gd(x_init, grad, n_iter=100, step=1., callback=None):
"""Basic gradient descent algorithm."""
x = x_init.copy()
for _ in range(n_iter):
x -= step * grad(x)
# Update metrics after each iteration.
if callback is not None:
callback(x)
return x
def ista(x_init, grad, prox, n_iter=100, step=1., callback=None):
"""ISTA algorithm."""
x = x_init.copy()
for _ in range(n_iter):
x = prox(x - step * grad(x), step)
# Update metrics after each iteration.
if callback is not None:
callback(x)
return x
def fista(x_init, grad, prox, n_iter=100, step=1., callback=None):
"""FISTA algorithm."""
x = x_init.copy()
y = x_init.copy()
t = 1.
for _ in range(n_iter):
x_new = prox(y - step * grad(y), step)
t_new = (1. + (1. + 4. * t**2)**.5) / 2
y = x_new + (t - 1) / t_new * (x_new - x)
t = t_new
x = x_new
# Update metrics after each iteration.
if callback is not None:
callback(x)
return x
Same but for (more recent) stochastic algorithms.
def sgd(x_init, features, labels, grad, n_iter=100, step=1., callback=None):
"""Stochastic gradient descent algorithm."""
x = x_init.copy()
n_samples, n_features = features.shape
for i in range(n_iter):
idx = np.random.randint(n_samples)
feature = features[idx].reshape(1, n_features)
label = labels[idx]
x -= step * grad(x, feature, label) / (n_samples * np.sqrt(i + 1))
# Update metrics after each iteration.
if callback is not None:
callback(x)
return x
def sag(x_init, features, labels, grad, n_iter=100, step=1., callback=None):
"""Stochastic average gradient algorithm."""
x = x_init.copy()
n_samples, n_features = features.shape
y_old = np.zeros((n_samples, n_features))
y = np.zeros(n_features)
for _ in range(n_iter):
idx = np.random.randint(n_samples)
feature = features[idx].reshape(1, n_features)
label = labels[idx]
y_new = grad(x, feature, label)
y += (y_new - y_old[idx]) / n_samples
y_old[idx] = y_new
x -= step * y / n_samples
# Update metrics after each iteration.
if callback is not None:
callback(x)
return x
def miso(x_init, features, labels, grad, prox, n_iter=100, step=1., callback=None):
"""MISO algorithm."""
x = x_init.copy()
n_samples, n_features = features.shape
z_old = np.zeros((n_samples, n_features))
z = np.zeros(n_features)
for _ in range(n_iter):
idx = np.random.randint(n_samples)
feature = features[idx].reshape(1, n_features)
label = labels[idx]
z_new = grad(x, feature, label)
z += (z_new - z_old[idx]) / n_samples
z_old[idx] = z_new
x = prox(x - step * z, step)
# Update metrics after each iteration.
if callback is not None:
callback(x)
return x
def svrg(x_init, features, labels, grad, n_iter=100, batch_size=10, step=1., callback=None):
"""Stochastic variance reduction gradient algorithm."""
x = x_init.copy()
n_samples, n_features = features.shape
x_old = sgd(x_init, features, labels, grad, n_iter=batch_size, step=step, callback=None)
mu = grad(x_old, features, labels)
x = x_old
for i in range(n_iter):
if i % batch_size == 0:
x_old = x
mu = grad(x, features, labels)
idx = np.random.randint(n_samples)
feature = features[idx].reshape(1, n_features)
label = labels[idx]
z_new = grad(x, feature, label)
z_old = grad(x_old, feature, label)
x -= step * (z_new - z_old + mu) / n_samples
# Update metrics after each iteration.
if callback is not None:
callback(x)
return x
def sdca(x_init, features, labels, grad, n_iter=100, step=1., callback=None):
"""Stochastic dual coordinate ascent."""
x = x_init.copy()
# TODO
# Update metrics after each iteration.
if callback is not None:
callback(x)
return x
File "<ipython-input-16-2ee839f8e03f>", line 8 if callback is not None: ^ IndentationError: unexpected indent
We test the above algorithms on a generated dataset.
# Generate a fake dataset
n_samples = 2000
n_features = 50
idx = np.arange(n_features).reshape(1, n_features)
params = 2 * (-1) ** (idx - 1) * .9**idx
params[0, 20:50] = 0
diag = np.random.rand(n_features)
features = np.random.multivariate_normal(np.zeros(n_features), np.diag(diag), n_samples)
# Show the condition number of the gram matrix
print "cond = %.2f" % (diag.max() / diag.min())
cond = 83.77
# Change this to 'False' to estimate the logitic regression model.
linear = True
if linear == True:
residuals = np.random.randn(n_samples, 1)
labels = features.dot(params.T) + residuals
else:
labels = np.array([[float(np.random.rand() < p)] for p in logistic(features.dot(params.T))])
plt.figure(figsize=(8, 4))
plt.stem(params[0])
plt.title("True parameters", fontsize=16)
plt.show()
All algorithms use a constant step size set to $1/L$, where $L$ is a Lipschitz constant for the gradient of the loss.
1.Least squares
$$L = \frac{ \|\mathbf{x}^T \mathbf{x}\|_{op}}{n}$$2.Logistic loss $$L = \frac{\underset{i}{\max}(\|x_i\|_2^2)}{4n} $$
# Initialize stuff
x_init = 1 - 2 * np.random.rand(1, n_features)
n_iter = 30
l_l1 = 0.0
l_l2 = 0.1
# f and gradient
if linear == True:
f = lambda x: least_squares(x, features, labels)
grad_f = lambda x: least_squares_grad(x, features, labels)
step = norm(features.T.dot(features) / n_samples, 2)
else:
f = lambda x: logistic_loss(x, features, labels)[0]
grad_f = lambda x: logistic_loss_grad(x, features, labels)
step = 1. # np.sum(features**2, axis=1).max() / (4 * n_samples)
# g, F and prox.
g = lambda x: l_l1 * np.abs(x).sum() + 0.5 * l_l2 * np.sum(x**2)
F = lambda x: f(x) + g(x)
prox_g = lambda x, l: prox_enet(x, l_l1, l_l2, l)
print "Type: %s" % ('linear' if linear else 'logistic')
print "n_iter: %d" % n_iter
print "step size: %.2f" % step
Type: linear n_iter: 30 step size: 1.04
import scipy.optimize
ls = lambda x: logistic_loss(x, features, labels)
print scipy.optimize.approx_fprime(x_init.ravel(), ls, 1e-3)
print logistic_loss_grad(x_init, features, labels)
[ 0.30424101 -0.26230376 0.4674678 -0.16728611 0.11170092 -0.08324431 0.35837712 -0.1988963 0.10854892 -0.10317867 0.10031488 0.00586717 0.0221764 -0.12311822 0.00355377 -0.11660948 0.02513881 -0.06978285 0.27948178 -0.20718274 -0.15874555 -0.13193407 0.04810521 0.13370537 -0.00458892 0.05095335 -0.06442196 0.07053449 0.05177863 0.10642691 -0.09323656 -0.21188971 -0.0784917 0.33497437 0.07748511 -0.20380341 0.0670902 0.02193815 0.15782084 -0.07780451 -0.03801027 -0.12387279 0.09561104 0.06024432 -0.11468991 0.2260843 0.07523159 -0.23414648 0.01797566 0.08944417] [ 0.30413249 -0.26240177 0.46732437 -0.16733445 0.11164676 -0.08330004 0.3582644 -0.19901026 0.10852345 -0.10325725 0.10021726 0.00585475 0.02217071 -0.12326196 0.00354838 -0.11665473 0.02512696 -0.06986414 0.27938023 -0.20736488 -0.15881298 -0.13204417 0.04805784 0.13355314 -0.0045907 0.05082515 -0.06452787 0.07044994 0.05168006 0.10637235 -0.0933726 -0.21198799 -0.07857703 0.33478578 0.07737347 -0.20389057 0.06703433 0.02192291 0.15774254 -0.07792483 -0.03811912 -0.1239431 0.09546933 0.0601289 -0.114753 0.22595846 0.07515347 -0.23427506 0.01789923 0.0893184 ]
plt.figure(figsize=(8, 4))
plt.stem(x_init[0])
plt.title("Initial guess", fontsize=16)
plt.show()
# ISTA
ista_inspector = inspector(loss_fun=F, x_real=params, verbose=True)
x_ista = ista(x_init, grad=grad_f, prox=prox_g, n_iter=n_iter, step=step, callback=ista_inspector)
it | obj | err 0 | 2.47e+00 | 8.32e-01 6 | 1.27e+00 | 4.47e-01 12 | 1.25e+00 | 3.98e-01 18 | 1.24e+00 | 3.84e-01 24 | 1.24e+00 | 3.78e-01
# FISTA
fista_inspector = inspector(loss_fun=F, x_real=params, verbose=True)
x_fista = fista(x_init, grad=grad_f, prox=prox_g, n_iter=n_iter, step=step, callback=fista_inspector)
it | obj | err 0 | 2.47e+00 | 8.32e-01 6 | 1.25e+00 | 3.92e-01 12 | 1.24e+00 | 3.69e-01 18 | 1.24e+00 | 3.75e-01 24 | 1.24e+00 | 3.75e-01
# Gradient descent
grad_gd = lambda x: grad_f(x) + l_l1 * np.abs(x) + l_l2 * x
gd_inspector = inspector(loss_fun=F, x_real=params, verbose=True)
x_gd = gd(x_init, grad=grad_gd, n_iter=n_iter, step=step, callback=gd_inspector)
it | obj | err 0 | 2.30e+00 | 7.96e-01 6 | 1.26e+00 | 4.35e-01 12 | 1.25e+00 | 3.93e-01 18 | 1.24e+00 | 3.81e-01 24 | 1.24e+00 | 3.77e-01
plt.figure(figsize=(18, 5))
plt.suptitle("Final estimates", fontsize=18)
plt.subplot(1, 4, 1)
plt.title("Real params")
plt.stem(params[0])
plt.subplot(1, 4, 2)
plt.title("ISTA")
plt.stem(x_ista[0], color='red')
plt.subplot(1, 4, 3)
plt.title("FISTA")
plt.stem(x_fista[0])
plt.subplot(1, 4, 4)
plt.title("GD")
plt.stem(x_gd[0])
plt.show()
plt.figure(figsize=(17, 5))
plt.subplot(1, 2, 1)
plt.plot(gd_inspector.obj, 'b')
plt.plot(ista_inspector.obj, 'r')
plt.plot(fista_inspector.obj, 'g')
plt.title("Loss", fontsize=18)
plt.xlabel("iteration", fontsize=14)
plt.ylabel("value", fontsize=14)
plt.legend(["gd", "ista", "fista"])
plt.subplot(1, 2, 2)
plt.plot(gd_inspector.err, 'b')
plt.plot(ista_inspector.err, 'r')
plt.plot(fista_inspector.err, 'g')
plt.title("Distance to x_real", fontsize=18)
plt.xlabel("iteration", fontsize=14)
plt.ylabel("distance", fontsize=14)
plt.legend(["gd", "ista", "fista"])
plt.show()
n_iter = 10 * n_samples
# Initialize stuff
if linear == True:
f = lambda x: least_squares(x, features, labels)[0]
grad_stoc = least_squares_grad
L = norm(features.T.dot(features), 2) / n_samples
else:
f = lambda x: logistic_loss(x, features, labels)[0]
grad_stoc = logistic_loss_grad
L = 1. #4. * n_samples / np.sum(features**2, axis=1).max()
print "Type: %s" % ('linear' if linear else 'logistic')
print "n_iter: %d" % n_iter
print "step size: %.2f" % step
Type: linear n_iter: 20000 step size: 1.09
plt.figure(figsize=(8, 4))
plt.stem(x_init[0])
plt.title("Initial guess", fontsize=16)
plt.show()
grad_step = 10. / L
grad_sgd = lambda x, feat, lab: grad_stoc(x, feat, lab) + l_l2 * x[0]
sgd_inspector = inspector(loss_fun=F, x_real=params, verbose=True)
x_sgd = sgd(x_init, features=features, labels=labels, grad=grad_sgd, n_iter=n_iter, step=step, callback=sgd_inspector)
it | obj | err 0 | 1.28e+01 | 1.48e+00 4000 | 1.16e+01 | 1.43e+00 8000 | 1.12e+01 | 1.41e+00 12000 | 1.08e+01 | 1.39e+00 16000 | 1.05e+01 | 1.38e+00
sag_step = 1. / L
grad_sag = lambda x, feat, lab: grad_stoc(x, feat, lab) + l_l2 * x[0]
sag_inspector = inspector(loss_fun=F, x_real=params, verbose=True)
x_sag = sag(x_init, features=features, labels=labels, grad=grad_sag, n_iter=n_iter, step=step, callback=sag_inspector)
it | obj | err 0 | 1.28e+01 | 1.49e+00 4000 | 2.43e+00 | 8.13e-01 8000 | 1.53e+00 | 5.41e-01 12000 | 1.30e+00 | 4.55e-01 16000 | 1.26e+00 | 4.24e-01
miso_step = 1. / L
prox_g = lambda x, l: prox_enet(x, l_l1, l_l2, l)
miso_inspector = inspector(loss_fun=F, x_real=params, verbose=True)
x_miso = miso(x_init, features, labels, grad=grad_stoc, prox=prox_g, n_iter=n_iter, step=step,
callback=miso_inspector)
it | obj | err 0 | 1.17e+01 | 1.42e+00 4000 | 1.26e+00 | 3.76e-01 8000 | 1.25e+00 | 3.76e-01 12000 | 1.24e+00 | 3.74e-01 16000 | 1.24e+00 | 3.74e-01
svrg_step = 1. / L
grad_svrg = lambda x, feat, lab: grad_stoc(x, feat, lab) + l_l2 * x[0]
svrg_inspector = inspector(loss_fun=F, x_real=params, verbose=True)
x_svrg = svrg(x_init, features, labels, grad=grad_svrg, n_iter=n_iter, batch_size=100, step=svrg_step, callback=svrg_inspector)
it | obj | err 0 | 1.27e+01 | 1.48e+00 4000 | 2.22e+00 | 7.60e-01 8000 | 1.46e+00 | 5.75e-01 12000 | 1.32e+00 | 4.96e-01 16000 | 1.28e+00 | 4.54e-01
plt.figure(figsize=(17, 5))
plt.subplot(1, 2, 1)
it = np.linspace(0, float(n_iter) / n_samples, n_iter)
plt.plot(it, sgd_inspector.obj, 'b')
plt.plot(it, sag_inspector.obj, 'r')
plt.plot(it, miso_inspector.obj, 'g')
plt.plot(it, svrg_inspector.obj, 'y')
plt.title("Loss", fontsize=18)
plt.xlabel("iteration", fontsize=14)
plt.ylabel("value", fontsize=14)
plt.legend(["sgd", "sag", "miso", "svrg"])
plt.subplot(1, 2, 2)
plt.plot(it, sgd_inspector.err, 'b')
plt.plot(it, sag_inspector.err, 'r')
plt.plot(it, miso_inspector.err, 'g')
plt.plot(it, svrg_inspector.err, 'y')
plt.title("Distance to x_real", fontsize=18)
plt.xlabel("iteration", fontsize=14)
plt.ylabel("distance", fontsize=14)
plt.legend(["sgd", "sag", "miso", "svrg"])
plt.show()
plt.figure(figsize=(18, 10))
plt.suptitle("Final estimates", fontsize=18)
plt.subplot(2, 4, 1)
plt.title("Real params")
plt.stem(params[0])
plt.subplot(2, 4, 2)
plt.title("SGD")
plt.stem(x_sgd[0], color='red')
plt.subplot(2, 4, 3)
plt.title("SAG")
plt.stem(x_sag[0])
plt.subplot(2, 4, 4)
plt.title("MISO")
plt.stem(x_miso[0])
plt.subplot(2, 4, 5)
plt.title("SVRG")
plt.stem(x_svrg[0])
plt.show()
n_iter = 50
# Conjugate gradient descent
from scipy.optimize import fmin_cg
cg_inspector = inspector(loss_fun=F, x_real=params, verbose=True)
res_cg = fmin_cg(F, x_init.ravel(), fprime=grad_gd, maxiter=n_iter, callback=cg_inspector)
it | obj | err 0 | 6.24e+00 | 8.27e-01 Warning: Desired error not necessarily achieved due to precision loss. Current function value: 6.239769 Iterations: 1 Function evaluations: 27 Gradient evaluations: 15
# L-BFGS-B
from scipy.optimize import fmin_l_bfgs_b
lbfgs_inspector = inspector(loss_fun=F, x_real=params, verbose=True)
res_lbfgs = fmin_l_bfgs_b(F, x_init.ravel(), fprime=grad_gd, maxiter=n_iter, callback=lbfgs_inspector)
it | obj | err 0 | 6.21e+00 | 1.30e+00
plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
plt.plot(gd_inspector.obj, 'b')
plt.plot(ista_inspector.obj, 'r')
plt.plot(fista_inspector.obj, 'g')
plt.plot(cg_inspector.obj, 'y')
plt.plot(lbfgs_inspector.obj, 'pink')
plt.title("Loss", fontsize=18)
plt.xlabel("iteration", fontsize=14)
plt.ylabel("value", fontsize=14)
plt.legend(["gd", "ista", "fista", "cg", "l-bfgs-b"])
plt.subplot(1, 2, 2)
plt.plot(gd_inspector.err, 'b')
plt.plot(ista_inspector.err, 'r')
plt.plot(fista_inspector.err, 'g')
plt.plot(cg_inspector.err, 'y')
plt.plot(lbfgs_inspector.err, 'pink')
plt.title("Distance to x_real", fontsize=18)
plt.xlabel("iteration", fontsize=14)
plt.ylabel("distance", fontsize=14)
plt.legend(["gd", "ista", "fista", "cg", "l-bfgs-b"])
plt.show()
def contours(f, xlim=(-10, 10), ylim=(-10, 10)):
def apply_to_grid(f, xlim, ylim, s=100j):
X, Y = np.mgrid[xlim[0]:xlim[1]:s, ylim[0]:ylim[1]:s]
z = positions = np.vstack([X.ravel(), Y.ravel()]).T
return X, Y, np.reshape(f(z), (100, 100))
loss = lambda x: f(x, features, labels)
X, Y, Z = apply_to_grid(loss, xlim, ylim)
plt.xlabel(r"$x_1$", fontsize=18)
plt.ylabel(r"$x_2$", fontsize=18)
plt.pcolor(X, Y, Z, cmap=plt.cm.RdBu)
CS = plt.contour(X, Y, Z, colors='k')
plt.clabel(CS, inline=1, fontsize=10)
plt.grid(True)
plt.figure(figsize=(14, 6))
plt.suptitle("Contours of the loss", fontsize=18)
n_samples = 2000
n_features = 2
plt.subplot(1, 2, 1)
plt.title(r"$\lambda_{max} / \lambda_{min} = 1$", fontsize=16)
params = 1 - 2 * np.random.rand(1, n_features)
diag = np.array([.1, .1])
features = np.random.multivariate_normal(np.zeros(n_features), np.diag(diag), n_samples)
labels = np.array([np.random.rand() < p for p in logistic(features.dot(params.T))])
contours(least_squares)
plt.subplot(1, 2, 2)
plt.title(r"$\lambda_{max} / \lambda_{min} = 10$", fontsize=16)
params = 1 - 2 * np.random.rand(1, n_features)
diag = np.array([.01, .1])
features = np.random.multivariate_normal(np.zeros(n_features), np.diag(diag), n_samples)
labels = np.array([np.random.rand() < p for p in logistic(features.dot(params.T))])
contours(least_squares)
plt.show()
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-37-503e4a8580e4> in <module>() 28 features = np.random.multivariate_normal(np.zeros(n_features), np.diag(diag), n_samples) 29 labels = np.array([np.random.rand() < p for p in logistic(features.dot(params.T))]) ---> 30 contours(least_squares) 31 32 plt.subplot(1, 2, 2) <ipython-input-37-503e4a8580e4> in contours(f, xlim, ylim) 7 8 loss = lambda x: f(x, features, labels) ----> 9 X, Y, Z = apply_to_grid(loss, xlim, ylim) 10 11 plt.xlabel(r"$x_1$", fontsize=18) <ipython-input-37-503e4a8580e4> in apply_to_grid(f, xlim, ylim, s) 4 X, Y = np.mgrid[xlim[0]:xlim[1]:s, ylim[0]:ylim[1]:s] 5 z = positions = np.vstack([X.ravel(), Y.ravel()]).T ----> 6 return X, Y, np.reshape(f(z), (100, 100)) 7 8 loss = lambda x: f(x, features, labels) <ipython-input-37-503e4a8580e4> in <lambda>(x) 6 return X, Y, np.reshape(f(z), (100, 100)) 7 ----> 8 loss = lambda x: f(x, features, labels) 9 X, Y, Z = apply_to_grid(loss, xlim, ylim) 10 <ipython-input-34-4473aa8093dc> in least_squares(x, features, labels) 2 """Evaluates the least square function.""" 3 n_samples = features.shape[0] ----> 4 x = x.reshape(1, n_features) 5 loss_array = (features.dot(x.T) - labels) ** 2 6 return np.sum(loss_array, axis=0) / (2. * n_samples) ValueError: total size of new array must be unchanged