from time import time
import sys

import numpy as np
from scipy.sparse import csr_matrix, issparse
from pandas import DataFrame
from sklearn.utils.validation import check_arrays
from sklearn.utils.extmath import safe_sparse_dot
# Random dense base problems: 'wide' has more features than samples,
# 'tall' the opposite.
X_wide = np.random.uniform(low=0, high=1, size=(50, 100))
X_tall = np.random.uniform(low=0, high=1, size=(100, 50))
Y_wide = np.random.uniform(low=0, high=1, size=(50, 200))
Y_tall = np.random.uniform(low=0, high=1, size=(100, 200))
def prepare_data(shape='wide', sparse=1, n_targets=10):
    X, Y = (X_wide, Y_wide) if shape == 'wide' else (X_tall, Y_tall)
    X, Y = X.copy(), Y[:, :n_targets].copy()
    # Thresholding uniform(0, 1) data at `sparse` keeps roughly a fraction
    # `sparse` of the entries, so `sparse` acts as the density.
    X[X > sparse] = 0
    Y[Y > sparse] = 0
    if sparse < 0.5:
        Y = csr_matrix(Y)
        # rescale the surviving entries back to the [0, 1] range
        Y.data /= sparse
    return X, Y
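As a quick sanity check (illustrative only, on freshly drawn data, not part of the benchmark), the density produced by prepare_data can be inspected directly:

Xd, Yd = prepare_data('wide', sparse=0.1)
print(np.mean(Xd != 0))                   # roughly 0.1
print(Yd.nnz / float(np.prod(Yd.shape)))  # roughly 0.1; Y is CSR here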
def all_configurations(all_n_targets=[200]):
for shape in ('wide', 'tall'):
for sparsity in (0.1, 0.4, 1.0):
for n_targets in all_n_targets:
X, Y = prepare_data(shape, sparsity, n_targets)
yield shape, sparsity, n_targets, X, Y
def safe_fro(X, squared=False):
    """Frobenius norm that handles dense, sparse and np.matrix input."""
    if issparse(X):
        nrm = np.sum(X.data ** 2)
    else:
        if hasattr(X, 'A'):  # np.matrix, e.g. from dense - sparse arithmetic
            X = X.A
        nrm = np.sum(X ** 2)
    return nrm if squared else np.sqrt(nrm)
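A minimal check (illustrative only): safe_fro should agree with np.linalg.norm on a dense array and on its sparse copy.

A_check = np.random.uniform(size=(5, 4))
assert np.isclose(safe_fro(A_check), np.linalg.norm(A_check))
assert np.isclose(safe_fro(csr_matrix(A_check)), np.linalg.norm(A_check))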
def time_it(f, *args, **kwargs):
    """Return the best wall-clock time out of three runs, plus the result."""
    best_time = np.inf
    for _ in range(3):
        t0 = time()
        res = f(*args, **kwargs)
        timing = time() - t0
        if timing < best_time:
            best_time = timing
    return best_time, res
def filter_plot(frame, x, y, label, ls, marker, **kwargs):
    # Select the rows of `frame` matching all keyword criteria.
    these_res = frame.select(lambda idx: all(frame[key][idx] == val
                                             for key, val in kwargs.items()))
    plt.plot(x(these_res), y(these_res), label=label, ls=ls, marker=marker)
from scipy.optimize import nnls
def nls_activeset(X, Y, W_init=None):
"""Non-negative least squares by the active set method
solves min ||Y - XW||_F, no regularization
Parameters
----------
X, Y : array-like
The known parts of the problem.
W_init : array-like,
Preallocated memory to store results in. Its contents are not used.
Returns
-------
W : array-like
The solution.
    residual : float
        The norm of the residual, ||Y - XW||_F, at the solution.
"""
X, Y = check_arrays(X, Y, check_ccontiguous=True, sparse_format='csr')
n_samples, n_features = X.shape
n_samples, n_targets = Y.shape
residual = 0
    if W_init is None or W_init.shape != (n_features, n_targets):
        W = np.empty((n_features, n_targets))
    else:
        W = W_init
    for k in range(n_targets):
this_Y = Y[:, k]
# densify the column vector if sparse
if hasattr(this_Y, 'toarray'):
this_Y = this_Y.toarray().squeeze()
W[:, k], this_res = nnls(X, this_Y)
residual += this_res ** 2
return W, np.sqrt(residual)
from scipy.optimize import fmin_l_bfgs_b
# Authors: Mathieu Blondel, Vlad Niculae
def nls_lbfgs_b(X, Y, W_init=None, l1_reg=0, l2_reg=0, max_iter=5000,
                tol=1e-3, callback=None):
"""Non-negative least squares solver using L-BFGS-B.
Solves for W in
        min 0.5 * ||Y - XW||^2_F + l1_reg * sum(W) + 0.5 * l2_reg * ||W||^2_F
"""
X, Y = check_arrays(X, Y, sparse_format='csr')
n_samples, n_features = X.shape
n_targets = Y.shape[1]
G = safe_sparse_dot(X.T, X)
Xy = safe_sparse_dot(X.T, Y)
def f(w, *args):
W = w.reshape((n_features, n_targets))
diff = (safe_sparse_dot(X, W) - Y)
diff = diff.A if hasattr(diff, 'A') else diff
res = 0.5 * np.sum(diff ** 2)
if l2_reg:
res += 0.5 * l2_reg * np.sum(W ** 2)
if l1_reg:
res += l1_reg * np.sum(W)
return res
def fprime(w, *args):
W = w.reshape((n_features, n_targets))
grad = (np.dot(G, W) - Xy).ravel()
if l2_reg:
grad += l2_reg * w
if l1_reg:
grad += l1_reg
return grad
if W_init is None:
W = np.zeros((n_features * n_targets,), dtype=np.float64)
else:
W = W_init.ravel().copy()
W, residual, d = fmin_l_bfgs_b(
f, x0=W, fprime=fprime, pgtol=tol,
bounds=[(0, None)] * n_features * n_targets,
maxiter=max_iter,
callback=callback)
# testing reveals that sometimes, very small negative values occur
W[W < 0] = 0
if l1_reg:
residual -= l1_reg * np.sum(W)
if l2_reg:
residual -= 0.5 * l2_reg * np.sum(W ** 2)
residual = np.sqrt(2 * residual)
if d['warnflag'] > 0:
print("L-BFGS-B failed to converge")
return W.reshape((n_features, n_targets)), residual
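As a sanity check on the analytic gradient (a minimal sketch on random data; f_small and fprime_small simply mirror the closures inside nls_lbfgs_b, since those are not reachable from outside):

from scipy.optimize import check_grad

rng = np.random.RandomState(0)
Xs, Ys = rng.uniform(size=(20, 5)), rng.uniform(size=(20, 3))
Gs, XYs = np.dot(Xs.T, Xs), np.dot(Xs.T, Ys)

def f_small(w):
    W = w.reshape(5, 3)
    return 0.5 * np.sum((np.dot(Xs, W) - Ys) ** 2)

def fprime_small(w):
    W = w.reshape(5, 3)
    return (np.dot(Gs, W) - XYs).ravel()

# check_grad returns the norm of the difference between the analytic and a
# finite-difference gradient; it should be tiny (on the order of 1e-6).
print(check_grad(f_small, fprime_small, rng.uniform(size=15)))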
# Author: Vlad Niculae <vlad@vene.ro>
# Lars Buitinck <L.J.Buitinck@uva.nl>
# Author: Chih-Jen Lin, National Taiwan University (original projected gradient
# NMF implementation)
# Author: Anthony Di Franco (original Python and NumPy port)
# License: BSD 3 clause
def nls_projgrad(X, Y, W_init=None, l1_reg=0, l2_reg=0, tol=1e-3, max_iter=5000,
sigma=0.01, beta=0.1, callback=None):
"""Non-negative least square solver
Solves a non-negative least squares subproblem using the
projected gradient descent algorithm.
min 0.5 * || XW - Y ||^2_F + l1_reg * sum(W) + 0.5 * l2_reg * * ||W||^2_F
Parameters
----------
Y, X : array-like
Constant matrices.
W_init : array-like
Initial guess for the solution.
l1_reg, l2_reg : float,
Regularization factors
tol : float
Tolerance of the stopping condition.
max_iter : int
Maximum number of iterations before timing out.
sigma : float
Constant used in the sufficient decrease condition checked by the line
search. Smaller values lead to a looser sufficient decrease condition,
thus reducing the time taken by the line search, but potentially
increasing the number of iterations of the projected gradient
procedure. 0.01 is a commonly used value in the optimization
literature.
beta : float
Factor by which the step size is decreased (resp. increased) until
(resp. as long as) the sufficient decrease condition is satisfied.
Larger values allow to find a better step size but lead to longer line
search. 0.1 is a commonly used value in the optimization literature.
    Returns
    -------
    W : array-like
        Solution to the non-negative least squares problem.
    residual : float
        The norm of the residual, ||XW - Y||_F, at the solution.

    References
    ----------
    C.-J. Lin. Projected gradient methods for non-negative matrix
    factorization. Neural Computation, 19 (2007), 2756-2779.
    http://www.csie.ntu.edu.tw/~cjlin/nmf/
"""
X, Y = check_arrays(X, Y, sparse_format='csr')
n_samples, n_features = X.shape
n_targets = Y.shape[1]
XY = safe_sparse_dot(X.T, Y)
G = np.dot(X.T, X)
if W_init is None:
W = np.zeros((n_features, n_targets), dtype=np.float64)
else:
W = W_init.copy()
init_grad_norm = safe_fro(np.dot(G, W) - XY + l2_reg * W + l1_reg)
# values justified in the paper
alpha = 1
for n_iter in range(1, max_iter + 1):
grad = np.dot(G, W) - XY # X'(XW - Y) using precomputation
if l2_reg:
grad += l2_reg * W
if l1_reg:
grad += l1_reg
# The following multiplication with a boolean array is more than twice
# as fast as indexing into grad.
proj_grad_norm = np.linalg.norm(grad * np.logical_or(grad < 0, W > 0))
if proj_grad_norm / init_grad_norm < tol:
break
W_prev = W
for inner_iter in range(20):
# Gradient step.
W_next = W - alpha * grad
# Projection step.
W_next *= W_next > 0
d = W_next - W
gradd = np.dot(grad.ravel(), d.ravel())
            dGd = np.dot(np.dot(G, d).ravel(), d.ravel())
            if l2_reg:
                # the l2 penalty contributes l2_reg * I to the Hessian, i.e.
                # an extra l2_reg * ||d||^2 term (G + l2_reg would wrongly add
                # the scalar to every entry of G)
                dGd += l2_reg * np.dot(d.ravel(), d.ravel())
suff_decr = (1 - sigma) * gradd + 0.5 * dGd < 0
if inner_iter == 0:
decr_alpha = not suff_decr
if decr_alpha:
if suff_decr:
W = W_next
break
else:
alpha *= beta
elif not suff_decr or (W_prev == W_next).all():
W = W_prev
break
else:
alpha /= beta
W_prev = W_next
if callback:
callback(W)
if n_iter == max_iter:
print("PG failed to converge")
residual = safe_fro(np.dot(X, W) - Y)
return W, residual
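A minimal cross-check (illustrative only, on random data): on a small dense problem, the projected gradient solution should essentially match scipy's active set solver.

rng = np.random.RandomState(42)
Xs, Ys = rng.uniform(size=(30, 5)), rng.uniform(size=(30, 2))
W_pg, _ = nls_projgrad(Xs, Ys, tol=1e-8, max_iter=10000)
W_as = np.column_stack([nnls(Xs, Ys[:, j])[0] for j in range(2)])
print(np.abs(W_pg - W_as).max())  # expected to be small, e.g. below 1e-4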
solvers = dict(activeset=nls_activeset, lbfgsb=nls_lbfgs_b, projgrad=nls_projgrad)
from numpy.testing import assert_almost_equal
from sklearn.utils.testing import assert_true
def test_residual():
for _, _, _, X, Y in all_configurations():
for name, nls in solvers.items():
W, resid = nls(X, Y)
true_resid = safe_fro(Y - np.dot(X, W))
assert_almost_equal(resid, true_resid, err_msg=name)
test_residual()
def test_non_negative():
for _, _, _, X, Y in all_configurations():
for name, nls in solvers.items():
W, resid = nls(X, Y)
assert_true(np.all(W >= 0), name)
test_non_negative()
# Test actual correctness
# using the active set solver as reference
%pylab inline --no-import-all
def cb_append_norm(W, W_ref, dest_norm, dest_time):
dest_norm.append(safe_fro(W - W_ref))
dest_time.append(time())
plt.figure(figsize=(22, 10))
for k, (shape, sparsity, _, X, Y) in enumerate(all_configurations()):
mse_projgrad = []
time_projgrad = []
mse_lbfgs_b = []
time_lbfgs_b = []
W_ref, _ = nls_activeset(X, Y)
t0 = time()
nls_projgrad(X, Y,
callback=lambda W: cb_append_norm(W, W_ref, mse_projgrad, time_projgrad),
tol=1e-6, max_iter=30000)
time_projgrad = np.array(time_projgrad) - t0
t0 = time()
nls_lbfgs_b(X, Y,
callback=lambda W: cb_append_norm(W.reshape(W_ref.shape), W_ref, mse_lbfgs_b, time_lbfgs_b),
tol=1e-6, max_iter=30000)
time_lbfgs_b = np.array(time_lbfgs_b) - t0
plt.subplot(2, 6, k + 1)
plt.semilogy(mse_projgrad, label='projgrad', ls=':', c='k')
plt.semilogy(mse_lbfgs_b, label='lbfgsb', c='k')
plt.title("{} sp={}".format(shape, sparsity))
plt.ylabel("MSE")
plt.xlabel("n_iterations")
plt.legend()
plt.subplot(2, 6, k + 7)
plt.semilogy(time_projgrad, mse_projgrad, label='projgrad', ls=':', c='k')
plt.semilogy(time_lbfgs_b, mse_lbfgs_b, label='lbfgsb', c='k')
plt.title("{} sp={}".format(shape, sparsity))
plt.ylabel("MSE")
plt.xlabel("time")
plt.legend()
plt.show()
PG failed to converge
results = DataFrame(columns='solver shape sparsity n_targets time residual'.split())
for shape, sparsity, n_targets, X, Y in all_configurations([1] + list(range(25, 201, 25))):
    sys.stdout.write('.')
    sys.stdout.flush()
for solver_name, solver, args in (
('activeset', nls_activeset, {}),
('lbfgsb', nls_lbfgs_b, {}),
('pg-l2', nls_projgrad, {'l2_reg': 2.0}),
('pg-l1', nls_projgrad, {'l1_reg': 2.0}),
('pg', nls_projgrad, {}),
('lbfgsb-l2', nls_lbfgs_b, {'l2_reg': 2.0}),
('lbfgsb-l1', nls_lbfgs_b, {'l1_reg': 2.0})
):
timing, (W, residual) = time_it(solver, X, Y, **args)
results = results.append(
dict(solver=solver_name,
shape=shape,
sparsity=sparsity,
n_targets=n_targets,
time=timing,
residual=residual / safe_fro(Y)),
ignore_index=True)
styles = {1.0: '-', 0.4: '--', 0.1: ':'}
markers = {'activeset': '.', 'lbfgsb': 'v', 'lbfgsb-l1': '^', 'lbfgsb-l2': 's', 'pg': 'x', 'pg-l2': 'o', 'pg-l1': '+'}
plt.figure(figsize=(22, 10))
for i, sparsity in enumerate((1, 0.4, 0.1)):
for k, shape in enumerate(('tall', 'wide')):
plt.subplot(2, 6, (2 * i + k) + 1)
for solver in markers.keys():
filter_plot(results,
lambda x: x['n_targets'].tolist(),
lambda x: x['time'].tolist(),
label="{}".format(solver),
ls='-', # styles[sparsity],
marker=markers[solver],
solver=solver, shape=shape, sparsity=sparsity)
plt.title("{} sp={}".format(shape, sparsity))
plt.xlabel("n_targets")
plt.ylabel("time")
plt.legend(loc='upper left')
plt.subplot(2, 6, 7 + (2 * i + k))
for solver in markers.keys():
filter_plot(results,
lambda x: x['n_targets'].tolist(),
lambda x: x['residual'].tolist(),
label="{}".format(solver),
ls='-', # styles[sparsity],
marker=markers[solver],
solver=solver, shape=shape, sparsity=sparsity)
plt.title("{} sp={}".format(shape, sparsity))
plt.xlabel("n_targets")
plt.ylabel("residual")
plt.legend(loc='upper left')
Non-negative sparse coding of the 20 newsgroups dataset against a random dictionary.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
dataset = fetch_20newsgroups(shuffle=True, random_state=0)
X = TfidfVectorizer(max_features=1000, max_df=0.9, use_idf=True).fit_transform(dataset.data[:300])
print(X.shape)
(300, 1000)
# Random nonnegative dictionary of 300 atoms with roughly 1% density,
# rescaled back to [0, 1] as in prepare_data above.
D = np.random.uniform(size=(300, 1000))
D[D > 0.01] = 0
D /= 0.01
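For orientation, a minimal sketch (illustrative only) of how this sparse coding problem maps onto the NNLS solvers above: the transposed dictionary plays the role of the design matrix and each transposed document is a target column.

# Encode the first document against the random dictionary; the code vector
# has one nonnegative weight per dictionary atom.
w0, r0 = nls_lbfgs_b(D.T, X.T[:, :1].toarray())
print(w0.shape)  # (300, 1)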
news_res = DataFrame(columns='solver n_targets time residual'.split())
tol = 1e-2
max_iter = 300
for n_targets in [1] + list(range(50, 301, 50)):
sys.stdout.write('.')
sys.stdout.flush()
this_D = D[:n_targets, :]
for solver_name, solver, args in (
('activeset', nls_activeset, {}),
('lbfgsb', nls_lbfgs_b, {'tol': tol, 'max_iter': max_iter}),
('pg-l2', nls_projgrad, {'l2_reg': 2.0, 'tol': tol, 'max_iter': max_iter}),
('pg-l1', nls_projgrad, {'l1_reg': 2.0, 'tol': tol, 'max_iter': max_iter}),
('pg', nls_projgrad, {'tol': tol, 'max_iter': max_iter}),
('lbfgsb-l2', nls_lbfgs_b, {'l2_reg': 2.0, 'tol': tol, 'max_iter': max_iter}),
('lbfgsb-l1', nls_lbfgs_b, {'l1_reg': 2.0, 'tol': tol, 'max_iter': max_iter})
):
        timing, (W, residual) = time_it(solver, this_D.T, X.T, **args)
news_res = news_res.append(
dict(solver=solver_name,
n_targets=n_targets,
time=timing,
                 residual=residual / safe_fro(X)),  # normalize by the data, not the stale Y
ignore_index=True)
plt.figure(figsize=(22, 4))
plt.subplot(1, 3, 1)
for solver in markers.keys():
filter_plot(news_res,
lambda x: x['n_targets'].tolist(),
lambda x: x['time'].tolist(),
label="{}".format(solver),
ls='-',
marker=markers[solver],
solver=solver)
plt.xlabel("n_targets")
plt.ylabel("time")
plt.legend(loc='upper left')
# skip activeset as it's way too slow
plt.subplot(1, 3, 2)
for solver in markers.keys():
if solver == 'activeset':
continue
filter_plot(news_res,
lambda x: x['n_targets'].tolist(),
lambda x: x['time'].tolist(),
label="{}".format(solver),
ls='-',
marker=markers[solver],
solver=solver)
plt.xlabel("n_targets")
plt.ylabel("time")
plt.legend(loc='upper left')
plt.subplot(1, 3, 3)
for solver in markers.keys():
filter_plot(news_res,
lambda x: x['n_targets'].tolist(),
lambda x: x['residual'].tolist(),
label="{}".format(solver),
ls='-',
marker=markers[solver],
solver=solver)
plt.xlabel("n_targets")
plt.ylabel("residual")
plt.legend(loc='upper left')