from time import time
import sys

import numpy as np
from scipy.sparse import csr_matrix, issparse
from sklearn.utils.validation import check_arrays
from sklearn.utils.extmath import safe_sparse_dot
from pandas import DataFrame

X_wide = np.random.uniform(low=0, high=1, size=(50, 100))
X_tall = np.random.uniform(low=0, high=1, size=(100, 50))
Y_wide = np.random.uniform(low=0, high=1, size=(50, 200))
Y_tall = np.random.uniform(low=0, high=1, size=(100, 200))


def prepare_data(shape='wide', sparse=1, n_targets=10):
    X, Y = (X_wide, Y_wide) if shape == 'wide' else (X_tall, Y_tall)
    X, Y = X.copy(), Y[:, :n_targets].copy()
    X[X > sparse] = 0
    Y[Y > sparse] = 0
    if sparse < 0.5:
        Y = csr_matrix(Y)
        Y.data /= sparse
    return X, Y


def all_configurations(all_n_targets=[200]):
    for shape in ('wide', 'tall'):
        for sparsity in (0.1, 0.4, 1.0):
            for n_targets in all_n_targets:
                X, Y = prepare_data(shape, sparsity, n_targets)
                yield shape, sparsity, n_targets, X, Y


def safe_fro(X, squared=False):
    """Frobenius norm that works for dense, sparse and np.matrix inputs."""
    if issparse(X):
        nrm = np.sum(X.data ** 2)
    else:
        if hasattr(X, 'A'):
            X = X.A
        nrm = np.sum(X ** 2)
    return nrm if squared else np.sqrt(nrm)


def time_it(f, *args, **kwargs):
    """Return the best of three timings of f(*args, **kwargs), and its result."""
    best_time = np.inf
    for _ in xrange(3):
        t0 = time()
        res = f(*args, **kwargs)
        timing = time() - t0
        if timing < best_time:
            best_time = timing
    return best_time, res


def filter_plot(frame, x, y, label, ls, marker, **kwargs):
    """Plot y against x for the rows of frame matching the given column values."""
    # plt is injected into the namespace later by the %pylab magic
    these_res = frame.select(lambda i: all(frame[key][i] == val
                                           for (key, val) in kwargs.items()))
    plt.plot(x(these_res), y(these_res), label=label, ls=ls, marker=marker)


from scipy.optimize import nnls


def nls_activeset(X, Y, W_init=None):
    """Non-negative least squares by the active set method.

    Solves min ||Y - XW||_F with no regularization, one column of W at a time.

    Parameters
    ----------
    X, Y : array-like
        The known parts of the problem.

    W_init : array-like,
        Preallocated memory to store the result in. Its contents are not used.

    Returns
    -------
    W : array-like
        The solution.

    residual : float,
        The value ||Y - XW||_F at the solution.
    """
    X, Y = check_arrays(X, Y, check_ccontiguous=True, sparse_format='csr')
    n_samples, n_features = X.shape
    n_samples, n_targets = Y.shape
    residual = 0
    if W_init is None or W_init.shape != (n_features, n_targets):
        W = np.empty((n_features, n_targets))
    else:
        W = W_init
    for k in xrange(n_targets):
        this_Y = Y[:, k]
        # densify the column vector if sparse
        if hasattr(this_Y, 'toarray'):
            this_Y = this_Y.toarray().squeeze()
        W[:, k], this_res = nnls(X, this_Y)
        residual += this_res ** 2
    return W, np.sqrt(residual)
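# A hypothetical quick check, not part of the original benchmark: on a tiny
# exactly-representable problem, nls_activeset should recover a non-negative W
# with a residual close to zero.
X_demo = np.random.uniform(size=(20, 5))
Y_demo = np.dot(X_demo, np.abs(np.random.randn(5, 3)))
W_demo, resid_demo = nls_activeset(X_demo, Y_demo)
print(resid_demo)  # near zero; W_demo has shape (5, 3) and W_demo >= 0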
from scipy.optimize.lbfgsb import fmin_l_bfgs_b

# Authors: Mathieu Blondel, Vlad Niculae


def nls_lbfgs_b(X, Y, W_init=None, l1_reg=0, l2_reg=0, max_iter=5000,
                tol=1e-3, callback=None):
    """Non-negative least squares solver using L-BFGS-B.

    Solves for W >= 0 in

        min 0.5 * ||Y - XW||^2_F + l1_reg * sum(W) + 0.5 * l2_reg * ||W||^2_F
    """
    X, Y = check_arrays(X, Y, sparse_format='csr')
    n_samples, n_features = X.shape
    n_targets = Y.shape[1]
    G = safe_sparse_dot(X.T, X)
    Xy = safe_sparse_dot(X.T, Y)

    def f(w, *args):
        W = w.reshape((n_features, n_targets))
        diff = (safe_sparse_dot(X, W) - Y)
        diff = diff.A if hasattr(diff, 'A') else diff
        res = 0.5 * np.sum(diff ** 2)
        if l2_reg:
            res += 0.5 * l2_reg * np.sum(W ** 2)
        if l1_reg:
            res += l1_reg * np.sum(W)
        return res

    def fprime(w, *args):
        W = w.reshape((n_features, n_targets))
        # gradient of the smooth part, X'(XW - Y), using the precomputed G and Xy
        grad = (np.dot(G, W) - Xy).ravel()
        if l2_reg:
            grad += l2_reg * w
        if l1_reg:
            grad += l1_reg
        return grad

    if W_init is None:
        W = np.zeros((n_features * n_targets,), dtype=np.float64)
    else:
        W = W_init.ravel().copy()

    W, residual, d = fmin_l_bfgs_b(
        f, x0=W, fprime=fprime, pgtol=tol,
        bounds=[(0, None)] * n_features * n_targets,
        maxiter=max_iter,
        callback=callback)

    # testing reveals that sometimes, very small negative values occur
    W[W < 0] = 0

    # strip the penalty terms so the returned residual is ||Y - XW||_F
    if l1_reg:
        residual -= l1_reg * np.sum(W)
    if l2_reg:
        residual -= 0.5 * l2_reg * np.sum(W ** 2)
    residual = np.sqrt(2 * residual)
    if d['warnflag'] > 0:
        print("L-BFGS-B failed to converge")
    return W.reshape((n_features, n_targets)), residual
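# A hypothetical usage sketch, not part of the original benchmark: a small
# regularized solve with nls_lbfgs_b. Since the penalty terms are subtracted
# before taking the square root, the returned residual should roughly match the
# plain Frobenius residual at the regularized solution.
X_demo = np.random.uniform(size=(30, 10))
Y_demo = np.dot(X_demo, np.abs(np.random.randn(10, 4)))
W_demo, resid_demo = nls_lbfgs_b(X_demo, Y_demo, l1_reg=0.1, l2_reg=0.1)
print(abs(resid_demo - safe_fro(Y_demo - np.dot(X_demo, W_demo))))  # small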
# Author: Vlad Niculae
#         Lars Buitinck
# Author: Chih-Jen Lin, National Taiwan University (original projected gradient
#         NMF implementation)
# Author: Anthony Di Franco (original Python and NumPy port)
# License: BSD 3 clause


def nls_projgrad(X, Y, W_init=None, l1_reg=0, l2_reg=0, tol=1e-3,
                 max_iter=5000, sigma=0.01, beta=0.1, callback=None):
    """Non-negative least squares solver.

    Solves a non-negative least squares subproblem using the projected
    gradient descent algorithm:

        min 0.5 * ||XW - Y||^2_F + l1_reg * sum(W) + 0.5 * l2_reg * ||W||^2_F

    Parameters
    ----------
    Y, X : array-like
        Constant matrices.

    W_init : array-like
        Initial guess for the solution.

    l1_reg, l2_reg : float,
        Regularization factors.

    tol : float
        Tolerance of the stopping condition.

    max_iter : int
        Maximum number of iterations before timing out.

    sigma : float
        Constant used in the sufficient decrease condition checked by the line
        search. Smaller values lead to a looser sufficient decrease condition,
        thus reducing the time taken by the line search, but potentially
        increasing the number of iterations of the projected gradient
        procedure. 0.01 is a commonly used value in the optimization
        literature.

    beta : float
        Factor by which the step size is decreased (resp. increased) until
        (resp. as long as) the sufficient decrease condition is satisfied.
        Larger values allow finding a better step size but lead to a longer
        line search. 0.1 is a commonly used value in the optimization
        literature.

    Returns
    -------
    W : array-like
        Solution to the non-negative least squares problem.

    residual : float
        The Frobenius norm of Y - XW at the solution.

    Reference
    ---------
    C.-J. Lin. Projected gradient methods for non-negative matrix
    factorization. Neural Computation, 19 (2007), 2756-2779.
    http://www.csie.ntu.edu.tw/~cjlin/nmf/
    """
    X, Y = check_arrays(X, Y, sparse_format='csr')
    n_samples, n_features = X.shape
    n_targets = Y.shape[1]
    XY = safe_sparse_dot(X.T, Y)
    G = np.dot(X.T, X)

    if W_init is None:
        W = np.zeros((n_features, n_targets), dtype=np.float64)
    else:
        W = W_init.copy()

    init_grad_norm = safe_fro(np.dot(G, W) - XY + l2_reg * W + l1_reg)
    alpha = 1  # initial step size; values justified in the paper

    for n_iter in range(1, max_iter + 1):
        grad = np.dot(G, W) - XY  # X'(XW - Y), using the precomputed G and XY
        if l2_reg:
            grad += l2_reg * W
        if l1_reg:
            grad += l1_reg

        # The following multiplication with a boolean array is more than twice
        # as fast as indexing into grad.
        proj_grad_norm = np.linalg.norm(grad * np.logical_or(grad < 0, W > 0))
        if proj_grad_norm / init_grad_norm < tol:
            break

        W_prev = W
        for inner_iter in range(20):
            # Gradient step.
            W_next = W - alpha * grad
            # Projection step.
            W_next *= W_next > 0
            d = W_next - W
            gradd = np.dot(grad.ravel(), d.ravel())
            # curvature of the regularized objective along d: d'(G + l2_reg*I)d
            dGd = np.dot(np.dot(G, d).ravel(), d.ravel())
            if l2_reg:
                dGd += l2_reg * np.dot(d.ravel(), d.ravel())
            suff_decr = (1 - sigma) * gradd + 0.5 * dGd < 0
            if inner_iter == 0:
                decr_alpha = not suff_decr

            if decr_alpha:
                if suff_decr:
                    W = W_next
                    break
                else:
                    alpha *= beta
            elif not suff_decr or (W_prev == W_next).all():
                W = W_prev
                break
            else:
                alpha /= beta
                W_prev = W_next

        if callback:
            callback(W)

    if n_iter == max_iter:
        print("PG failed to converge")

    residual = safe_fro(np.dot(X, W) - Y)
    return W, residual
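# A hypothetical usage sketch, not part of the original benchmark: nls_projgrad
# on a small random problem with a loose tolerance. The solution should be
# non-negative, and the returned residual is the Frobenius norm of Y - XW.
X_demo = np.random.uniform(size=(30, 10))
Y_demo = np.dot(X_demo, np.abs(np.random.randn(10, 4)))
W_demo, resid_demo = nls_projgrad(X_demo, Y_demo, l2_reg=0.1, tol=1e-4,
                                  max_iter=1000)
print(np.all(W_demo >= 0))  # True
print(resid_demo)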
solvers = dict(activeset=nls_activeset,
               lbfgsb=nls_lbfgs_b,
               projgrad=nls_projgrad)

from numpy.testing import assert_almost_equal
from sklearn.utils.testing import assert_true


def test_residual():
    for _, _, _, X, Y in all_configurations():
        for name, nls in solvers.items():
            W, resid = nls(X, Y)
            true_resid = safe_fro(Y - np.dot(X, W))
            assert_almost_equal(resid, true_resid, err_msg=name)

test_residual()


def test_non_negative():
    for _, _, _, X, Y in all_configurations():
        for name, nls in solvers.items():
            W, resid = nls(X, Y)
            assert_true(np.all(W >= 0), name)

test_non_negative()

# Test actual correctness, using the active set solver as reference.
%pylab inline --no-import-all


def cb_append_norm(W, W_ref, dest_norm, dest_time):
    dest_norm.append(safe_fro(W - W_ref))
    dest_time.append(time())

plt.figure(figsize=(22, 10))
for k, (shape, sparsity, _, X, Y) in enumerate(all_configurations()):
    mse_projgrad = []
    time_projgrad = []
    mse_lbfgs_b = []
    time_lbfgs_b = []
    W_ref, _ = nls_activeset(X, Y)

    t0 = time()
    nls_projgrad(X, Y,
                 callback=lambda W: cb_append_norm(W, W_ref, mse_projgrad,
                                                   time_projgrad),
                 tol=1e-6, max_iter=30000)
    time_projgrad = np.array(time_projgrad) - t0

    t0 = time()
    nls_lbfgs_b(X, Y,
                callback=lambda W: cb_append_norm(W.reshape(W_ref.shape), W_ref,
                                                  mse_lbfgs_b, time_lbfgs_b),
                tol=1e-6, max_iter=30000)
    time_lbfgs_b = np.array(time_lbfgs_b) - t0

    plt.subplot(2, 6, k + 1)
    plt.semilogy(mse_projgrad, label='projgrad', ls=':', c='k')
    plt.semilogy(mse_lbfgs_b, label='lbfgsb', c='k')
    plt.title("{} sp={}".format(shape, sparsity))
    plt.ylabel("MSE")
    plt.xlabel("n_iterations")
    plt.legend()

    plt.subplot(2, 6, k + 7)
    plt.semilogy(time_projgrad, mse_projgrad, label='projgrad', ls=':', c='k')
    plt.semilogy(time_lbfgs_b, mse_lbfgs_b, label='lbfgsb', c='k')
    plt.title("{} sp={}".format(shape, sparsity))
    plt.ylabel("MSE")
    plt.xlabel("time")
    plt.legend()
plt.show()

results = DataFrame(columns='solver shape sparsity n_targets time residual'.split())

for shape, sparsity, n_targets, X, Y in all_configurations([1] + range(25, 201, 25)):
    sys.stdout.write('.')
    sys.stdout.flush()
    for solver_name, solver, args in (
            ('activeset', nls_activeset, {}),
            ('lbfgsb', nls_lbfgs_b, {}),
            ('pg-l2', nls_projgrad, {'l2_reg': 2.0}),
            ('pg-l1', nls_projgrad, {'l1_reg': 2.0}),
            ('pg', nls_projgrad, {}),
            ('lbfgsb-l2', nls_lbfgs_b, {'l2_reg': 2.0}),
            ('lbfgsb-l1', nls_lbfgs_b, {'l1_reg': 2.0})):
        timing, (W, residual) = time_it(solver, X, Y, **args)
        results = results.append(
            dict(solver=solver_name, shape=shape, sparsity=sparsity,
                 n_targets=n_targets, time=timing,
                 residual=residual / safe_fro(Y)),
            ignore_index=True)

%pylab inline --no-import-all
styles = {1: '-', 0.4: '--', 0.1: ':'}
markers = {'activeset': '.', 'lbfgsb': 'v', 'lbfgsb-l1': '^', 'lbfgsb-l2': 's',
           'pg': 'x', 'pg-l2': 'o', 'pg-l1': '+'}

plt.figure(figsize=(22, 10))
for i, sparsity in enumerate((1, 0.4, 0.1)):
    for k, shape in enumerate(('tall', 'wide')):
        plt.subplot(2, 6, (2 * i + k) + 1)
        for solver in markers.keys():
            filter_plot(results,
                        lambda x: x['n_targets'].tolist(),
                        lambda x: x['time'].tolist(),
                        label="{}".format(solver),
                        ls='-',  # styles[sparsity],
                        marker=markers[solver],
                        solver=solver, shape=shape, sparsity=sparsity)
        plt.title("{} sp={}".format(shape, sparsity))
        plt.xlabel("n_targets")
        plt.ylabel("time")
        plt.legend(loc='upper left')

        plt.subplot(2, 6, 7 + (2 * i + k))
        for solver in markers.keys():
            filter_plot(results,
                        lambda x: x['n_targets'].tolist(),
                        lambda x: x['residual'].tolist(),
                        label="{}".format(solver),
                        ls='-',  # styles[sparsity],
                        marker=markers[solver],
                        solver=solver, shape=shape, sparsity=sparsity)
        plt.title("{} sp={}".format(shape, sparsity))
        plt.xlabel("n_targets")
        plt.ylabel("residual")
        plt.legend(loc='upper left')

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

dataset = fetch_20newsgroups(shuffle=True, random_state=0)
X = TfidfVectorizer(max_features=1000, max_df=0.9,
                    use_idf=True).fit_transform(dataset.data[:300])
print(X.shape)

# random sparse non-negative dictionary: 300 atoms over the 1000 tf-idf features
D = np.random.uniform(size=(300, 1000))
D[D > 0.01] = 0
D /= 0.01

news_res = DataFrame(columns='solver n_targets time residual'.split())
tol = 1e-2
max_iter = 300
for n_targets in [1] + range(50, 301, 50):
    sys.stdout.write('.')
    sys.stdout.flush()
    this_D = D[:n_targets, :]
    for solver_name, solver, args in (
            ('activeset', nls_activeset, {}),
            ('lbfgsb', nls_lbfgs_b, {'tol': tol, 'max_iter': max_iter}),
            ('pg-l2', nls_projgrad, {'l2_reg': 2.0, 'tol': tol, 'max_iter': max_iter}),
            ('pg-l1', nls_projgrad, {'l1_reg': 2.0, 'tol': tol, 'max_iter': max_iter}),
            ('pg', nls_projgrad, {'tol': tol, 'max_iter': max_iter}),
            ('lbfgsb-l2', nls_lbfgs_b, {'l2_reg': 2.0, 'tol': tol, 'max_iter': max_iter}),
            ('lbfgsb-l1', nls_lbfgs_b, {'l1_reg': 2.0, 'tol': tol, 'max_iter': max_iter})):
        timing, (W, residual) = time_it(solver, this_D.T, X.T, **args)
        news_res = news_res.append(
            dict(solver=solver_name, n_targets=n_targets, time=timing,
                 residual=residual / safe_fro(X)),
            ignore_index=True)
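# An optional convenience, not part of the original notebook: a quick tabular
# view of the timings by solver and number of targets before plotting.
print(news_res.pivot(index='n_targets', columns='solver', values='time'))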
plt.figure(figsize=(22, 4))

plt.subplot(1, 3, 1)
for solver in markers.keys():
    filter_plot(news_res,
                lambda x: x['n_targets'].tolist(),
                lambda x: x['time'].tolist(),
                label="{}".format(solver),
                ls='-', marker=markers[solver],
                solver=solver)
plt.xlabel("n_targets")
plt.ylabel("time")
plt.legend(loc='upper left')

# skip activeset as it's way too slow
plt.subplot(1, 3, 2)
for solver in markers.keys():
    if solver == 'activeset':
        continue
    filter_plot(news_res,
                lambda x: x['n_targets'].tolist(),
                lambda x: x['time'].tolist(),
                label="{}".format(solver),
                ls='-', marker=markers[solver],
                solver=solver)
plt.xlabel("n_targets")
plt.ylabel("time")
plt.legend(loc='upper left')

plt.subplot(1, 3, 3)
for solver in markers.keys():
    filter_plot(news_res,
                lambda x: x['n_targets'].tolist(),
                lambda x: x['residual'].tolist(),
                label="{}".format(solver),
                ls='-', marker=markers[solver],
                solver=solver)
plt.xlabel("n_targets")
plt.ylabel("residual")
plt.legend(loc='upper left')