from time import time
import sys

import numpy as np
from scipy.sparse import csr_matrix, issparse
from sklearn.utils.validation import check_arrays
from sklearn.utils.extmath import safe_sparse_dot
from pandas import DataFrame

X_wide = np.random.uniform(low=0, high=1, size=(50, 100))
X_tall = np.random.uniform(low=0, high=1, size=(100, 50))
Y_wide = np.random.uniform(low=0, high=1, size=(50, 200))
Y_tall = np.random.uniform(low=0, high=1, size=(100, 200))


def prepare_data(shape='wide', sparse=1, n_targets=10):
    X, Y = (X_wide, Y_wide) if shape == 'wide' else (X_tall, Y_tall)
    X, Y = X.copy(), Y[:, :n_targets].copy()
    X[X > sparse] = 0
    Y[Y > sparse] = 0
    if sparse < 0.5:
        Y = csr_matrix(Y)
        Y.data /= sparse
    return X, Y


def all_configurations(all_n_targets=[200]):
    for shape in ('wide', 'tall'):
        for sparsity in (0.1, 0.4, 1.0):
            for n_targets in all_n_targets:
                X, Y = prepare_data(shape, sparsity, n_targets)
                yield shape, sparsity, n_targets, X, Y


def safe_fro(X, squared=False):
    """Frobenius norm that works for dense, sparse and np.matrix inputs."""
    if issparse(X):
        nrm = np.sum(X.data ** 2)
    else:
        if hasattr(X, 'A'):
            X = X.A
        nrm = np.sum(X ** 2)
    return nrm if squared else np.sqrt(nrm)


def time_it(f, *args, **kwargs):
    """Return the best of three timings of f(*args, **kwargs), and its result."""
    best_time = np.inf
    for _ in xrange(3):
        t0 = time()
        res = f(*args, **kwargs)
        timing = time() - t0
        if timing < best_time:
            best_time = timing
    return best_time, res


def filter_plot(frame, x, y, label, ls, marker, **kwargs):
    """Plot y against x for the rows of frame matching the given column values."""
    # plt is injected into the namespace later by the %pylab magic
    these_res = frame.select(lambda i: all(frame[key][i] == val
                                           for (key, val) in kwargs.items()))
    plt.plot(x(these_res), y(these_res), label=label, ls=ls, marker=marker)


from scipy.optimize import nnls


def nls_activeset(X, Y, W_init=None):
    """Non-negative least squares by the active set method.

    Solves min ||Y - XW||_F with no regularization, one column of W at a time.

    Parameters
    ----------
    X, Y : array-like
        The known parts of the problem.

    W_init : array-like,
        Preallocated memory to store the result in. Its contents are not used.

    Returns
    -------
    W : array-like
        The solution.

    residual : float,
        The value ||Y - XW||_F at the solution.
    """
    X, Y = check_arrays(X, Y, check_ccontiguous=True, sparse_format='csr')
    n_samples, n_features = X.shape
    n_samples, n_targets = Y.shape
    residual = 0
    if W_init is None or W_init.shape != (n_features, n_targets):
        W = np.empty((n_features, n_targets))
    else:
        W = W_init
    for k in xrange(n_targets):
        this_Y = Y[:, k]
        # densify the column vector if sparse
        if hasattr(this_Y, 'toarray'):
            this_Y = this_Y.toarray().squeeze()
        W[:, k], this_res = nnls(X, this_Y)
        residual += this_res ** 2
    return W, np.sqrt(residual)
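# A hypothetical quick check, not part of the original benchmark: on a tiny
# exactly-representable problem, nls_activeset should recover a non-negative W
# with a residual close to zero.
X_demo = np.random.uniform(size=(20, 5))
Y_demo = np.dot(X_demo, np.abs(np.random.randn(5, 3)))
W_demo, resid_demo = nls_activeset(X_demo, Y_demo)
print(resid_demo)  # near zero; W_demo has shape (5, 3) and W_demo >= 0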
from scipy.optimize.lbfgsb import fmin_l_bfgs_b

# Authors: Mathieu Blondel, Vlad Niculae


def nls_lbfgs_b(X, Y, W_init=None, l1_reg=0, l2_reg=0, max_iter=5000,
                tol=1e-3, callback=None):
    """Non-negative least squares solver using L-BFGS-B.

    Solves for W >= 0 in

        min 0.5 * ||Y - XW||^2_F + l1_reg * sum(W) + 0.5 * l2_reg * ||W||^2_F
    """
    X, Y = check_arrays(X, Y, sparse_format='csr')
    n_samples, n_features = X.shape
    n_targets = Y.shape[1]
    G = safe_sparse_dot(X.T, X)
    Xy = safe_sparse_dot(X.T, Y)

    def f(w, *args):
        W = w.reshape((n_features, n_targets))
        diff = (safe_sparse_dot(X, W) - Y)
        diff = diff.A if hasattr(diff, 'A') else diff
        res = 0.5 * np.sum(diff ** 2)
        if l2_reg:
            res += 0.5 * l2_reg * np.sum(W ** 2)
        if l1_reg:
            res += l1_reg * np.sum(W)
        return res

    def fprime(w, *args):
        W = w.reshape((n_features, n_targets))
        # gradient of the smooth part, X'(XW - Y), using the precomputed G and Xy
        grad = (np.dot(G, W) - Xy).ravel()
        if l2_reg:
            grad += l2_reg * w
        if l1_reg:
            grad += l1_reg
        return grad

    if W_init is None:
        W = np.zeros((n_features * n_targets,), dtype=np.float64)
    else:
        W = W_init.ravel().copy()

    W, residual, d = fmin_l_bfgs_b(
        f, x0=W, fprime=fprime, pgtol=tol,
        bounds=[(0, None)] * n_features * n_targets,
        maxiter=max_iter,
        callback=callback)

    # testing reveals that sometimes, very small negative values occur
    W[W < 0] = 0

    # strip the penalty terms so the returned residual is ||Y - XW||_F
    if l1_reg:
        residual -= l1_reg * np.sum(W)
    if l2_reg:
        residual -= 0.5 * l2_reg * np.sum(W ** 2)
    residual = np.sqrt(2 * residual)
    if d['warnflag'] > 0:
        print("L-BFGS-B failed to converge")
    return W.reshape((n_features, n_targets)), residual
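# A hypothetical usage sketch, not part of the original benchmark: a small
# regularized solve with nls_lbfgs_b. Since the penalty terms are subtracted
# before taking the square root, the returned residual should roughly match the
# plain Frobenius residual at the regularized solution.
X_demo = np.random.uniform(size=(30, 10))
Y_demo = np.dot(X_demo, np.abs(np.random.randn(10, 4)))
W_demo, resid_demo = nls_lbfgs_b(X_demo, Y_demo, l1_reg=0.1, l2_reg=0.1)
print(abs(resid_demo - safe_fro(Y_demo - np.dot(X_demo, W_demo))))  # small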
# Author: Vlad Niculae
#         Lars Buitinck
# Author: Chih-Jen Lin, National Taiwan University (original projected gradient
#         NMF implementation)
# Author: Anthony Di Franco (original Python and NumPy port)
# License: BSD 3 clause


def nls_projgrad(X, Y, W_init=None, l1_reg=0, l2_reg=0, tol=1e-3,
                 max_iter=5000, sigma=0.01, beta=0.1, callback=None):
    """Non-negative least squares solver.

    Solves a non-negative least squares subproblem using the projected
    gradient descent algorithm:

        min 0.5 * ||XW - Y||^2_F + l1_reg * sum(W) + 0.5 * l2_reg * ||W||^2_F

    Parameters
    ----------
    Y, X : array-like
        Constant matrices.

    W_init : array-like
        Initial guess for the solution.

    l1_reg, l2_reg : float,
        Regularization factors.

    tol : float
        Tolerance of the stopping condition.

    max_iter : int
        Maximum number of iterations before timing out.

    sigma : float
        Constant used in the sufficient decrease condition checked by the line
        search. Smaller values lead to a looser sufficient decrease condition,
        thus reducing the time taken by the line search, but potentially
        increasing the number of iterations of the projected gradient
        procedure. 0.01 is a commonly used value in the optimization
        literature.

    beta : float
        Factor by which the step size is decreased (resp. increased) until
        (resp. as long as) the sufficient decrease condition is satisfied.
        Larger values allow finding a better step size but lead to a longer
        line search. 0.1 is a commonly used value in the optimization
        literature.

    Returns
    -------
    W : array-like
        Solution to the non-negative least squares problem.

    residual : float
        The Frobenius norm of Y - XW at the solution.

    Reference
    ---------
    C.-J. Lin. Projected gradient methods for non-negative matrix
    factorization. Neural Computation, 19 (2007), 2756-2779.
    http://www.csie.ntu.edu.tw/~cjlin/nmf/
    """
    X, Y = check_arrays(X, Y, sparse_format='csr')
    n_samples, n_features = X.shape
    n_targets = Y.shape[1]
    XY = safe_sparse_dot(X.T, Y)
    G = np.dot(X.T, X)

    if W_init is None:
        W = np.zeros((n_features, n_targets), dtype=np.float64)
    else:
        W = W_init.copy()

    init_grad_norm = safe_fro(np.dot(G, W) - XY + l2_reg * W + l1_reg)
    alpha = 1  # initial step size; values justified in the paper

    for n_iter in range(1, max_iter + 1):
        grad = np.dot(G, W) - XY  # X'(XW - Y), using the precomputed G and XY
        if l2_reg:
            grad += l2_reg * W
        if l1_reg:
            grad += l1_reg

        # The following multiplication with a boolean array is more than twice
        # as fast as indexing into grad.
        proj_grad_norm = np.linalg.norm(grad * np.logical_or(grad < 0, W > 0))
        if proj_grad_norm / init_grad_norm < tol:
            break

        W_prev = W
        for inner_iter in range(20):
            # Gradient step.
            W_next = W - alpha * grad
            # Projection step.
            W_next *= W_next > 0
            d = W_next - W
            gradd = np.dot(grad.ravel(), d.ravel())
            # curvature of the regularized objective along d: d'(G + l2_reg*I)d
            dGd = np.dot(np.dot(G, d).ravel(), d.ravel())
            if l2_reg:
                dGd += l2_reg * np.dot(d.ravel(), d.ravel())
            suff_decr = (1 - sigma) * gradd + 0.5 * dGd < 0
            if inner_iter == 0:
                decr_alpha = not suff_decr

            if decr_alpha:
                if suff_decr:
                    W = W_next
                    break
                else:
                    alpha *= beta
            elif not suff_decr or (W_prev == W_next).all():
                W = W_prev
                break
            else:
                alpha /= beta
                W_prev = W_next

        if callback:
            callback(W)

    if n_iter == max_iter:
        print("PG failed to converge")

    residual = safe_fro(np.dot(X, W) - Y)
    return W, residual
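# A hypothetical usage sketch, not part of the original benchmark: nls_projgrad
# on a small random problem with a loose tolerance. The solution should be
# non-negative, and the returned residual is the Frobenius norm of Y - XW.
X_demo = np.random.uniform(size=(30, 10))
Y_demo = np.dot(X_demo, np.abs(np.random.randn(10, 4)))
W_demo, resid_demo = nls_projgrad(X_demo, Y_demo, l2_reg=0.1, tol=1e-4,
                                  max_iter=1000)
print(np.all(W_demo >= 0))  # True
print(resid_demo)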
solvers = dict(activeset=nls_activeset,
               lbfgsb=nls_lbfgs_b,
               projgrad=nls_projgrad)

from numpy.testing import assert_almost_equal
from sklearn.utils.testing import assert_true


def test_residual():
    for _, _, _, X, Y in all_configurations():
        for name, nls in solvers.items():
            W, resid = nls(X, Y)
            true_resid = safe_fro(Y - np.dot(X, W))
            assert_almost_equal(resid, true_resid, err_msg=name)

test_residual()


def test_non_negative():
    for _, _, _, X, Y in all_configurations():
        for name, nls in solvers.items():
            W, resid = nls(X, Y)
            assert_true(np.all(W >= 0), name)

test_non_negative()

# Test actual correctness, using the active set solver as reference.
%pylab inline --no-import-all


def cb_append_norm(W, W_ref, dest_norm, dest_time):
    dest_norm.append(safe_fro(W - W_ref))
    dest_time.append(time())

plt.figure(figsize=(22, 10))
for k, (shape, sparsity, _, X, Y) in enumerate(all_configurations()):
    mse_projgrad = []
    time_projgrad = []
    mse_lbfgs_b = []
    time_lbfgs_b = []
    W_ref, _ = nls_activeset(X, Y)

    t0 = time()
    nls_projgrad(X, Y,
                 callback=lambda W: cb_append_norm(W, W_ref, mse_projgrad,
                                                   time_projgrad),
                 tol=1e-6, max_iter=30000)
    time_projgrad = np.array(time_projgrad) - t0

    t0 = time()
    nls_lbfgs_b(X, Y,
                callback=lambda W: cb_append_norm(W.reshape(W_ref.shape), W_ref,
                                                  mse_lbfgs_b, time_lbfgs_b),
                tol=1e-6, max_iter=30000)
    time_lbfgs_b = np.array(time_lbfgs_b) - t0

    plt.subplot(2, 6, k + 1)
    plt.semilogy(mse_projgrad, label='projgrad', ls=':', c='k')
    plt.semilogy(mse_lbfgs_b, label='lbfgsb', c='k')
    plt.title("{} sp={}".format(shape, sparsity))
    plt.ylabel("MSE")
    plt.xlabel("n_iterations")
    plt.legend()

    plt.subplot(2, 6, k + 7)
    plt.semilogy(time_projgrad, mse_projgrad, label='projgrad', ls=':', c='k')
    plt.semilogy(time_lbfgs_b, mse_lbfgs_b, label='lbfgsb', c='k')
    plt.title("{} sp={}".format(shape, sparsity))
    plt.ylabel("MSE")
    plt.xlabel("time")
    plt.legend()
plt.show()

results = DataFrame(columns='solver shape sparsity n_targets time residual'.split())

for shape, sparsity, n_targets, X, Y in all_configurations([1] + range(25, 201, 25)):
    sys.stdout.write('.')
    sys.stdout.flush()
    for solver_name, solver, args in (
            ('activeset', nls_activeset, {}),
            ('lbfgsb', nls_lbfgs_b, {}),
            ('pg-l2', nls_projgrad, {'l2_reg': 2.0}),
            ('pg-l1', nls_projgrad, {'l1_reg': 2.0}),
            ('pg', nls_projgrad, {}),
            ('lbfgsb-l2', nls_lbfgs_b, {'l2_reg': 2.0}),
            ('lbfgsb-l1', nls_lbfgs_b, {'l1_reg': 2.0})):
        timing, (W, residual) = time_it(solver, X, Y, **args)
        results = results.append(
            dict(solver=solver_name, shape=shape, sparsity=sparsity,
                 n_targets=n_targets, time=timing,
                 residual=residual / safe_fro(Y)),
            ignore_index=True)

%pylab inline --no-import-all
styles = {1: '-', 0.4: '--', 0.1: ':'}
markers = {'activeset': '.', 'lbfgsb': 'v', 'lbfgsb-l1': '^', 'lbfgsb-l2': 's',
           'pg': 'x', 'pg-l2': 'o', 'pg-l1': '+'}

plt.figure(figsize=(22, 10))
for i, sparsity in enumerate((1, 0.4, 0.1)):
    for k, shape in enumerate(('tall', 'wide')):
        plt.subplot(2, 6, (2 * i + k) + 1)
        for solver in markers.keys():
            filter_plot(results,
                        lambda x: x['n_targets'].tolist(),
                        lambda x: x['time'].tolist(),
                        label="{}".format(solver),
                        ls='-',  # styles[sparsity],
                        marker=markers[solver],
                        solver=solver, shape=shape, sparsity=sparsity)
        plt.title("{} sp={}".format(shape, sparsity))
        plt.xlabel("n_targets")
        plt.ylabel("time")
        plt.legend(loc='upper left')

        plt.subplot(2, 6, 7 + (2 * i + k))
        for solver in markers.keys():
            filter_plot(results,
                        lambda x: x['n_targets'].tolist(),
                        lambda x: x['residual'].tolist(),
                        label="{}".format(solver),
                        ls='-',  # styles[sparsity],
                        marker=markers[solver],
                        solver=solver, shape=shape, sparsity=sparsity)
        plt.title("{} sp={}".format(shape, sparsity))
        plt.xlabel("n_targets")
        plt.ylabel("residual")
        plt.legend(loc='upper left')

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

dataset = fetch_20newsgroups(shuffle=True, random_state=0)
X = TfidfVectorizer(max_features=1000, max_df=0.9,
                    use_idf=True).fit_transform(dataset.data[:300])
print(X.shape)

# random sparse non-negative dictionary: 300 atoms over the 1000 tf-idf features
D = np.random.uniform(size=(300, 1000))
D[D > 0.01] = 0
D /= 0.01

news_res = DataFrame(columns='solver n_targets time residual'.split())
tol = 1e-2
max_iter = 300
for n_targets in [1] + range(50, 301, 50):
    sys.stdout.write('.')
    sys.stdout.flush()
    this_D = D[:n_targets, :]
    for solver_name, solver, args in (
            ('activeset', nls_activeset, {}),
            ('lbfgsb', nls_lbfgs_b, {'tol': tol, 'max_iter': max_iter}),
            ('pg-l2', nls_projgrad, {'l2_reg': 2.0, 'tol': tol, 'max_iter': max_iter}),
            ('pg-l1', nls_projgrad, {'l1_reg': 2.0, 'tol': tol, 'max_iter': max_iter}),
            ('pg', nls_projgrad, {'tol': tol, 'max_iter': max_iter}),
            ('lbfgsb-l2', nls_lbfgs_b, {'l2_reg': 2.0, 'tol': tol, 'max_iter': max_iter}),
            ('lbfgsb-l1', nls_lbfgs_b, {'l1_reg': 2.0, 'tol': tol, 'max_iter': max_iter})):
        timing, (W, residual) = time_it(solver, this_D.T, X.T, **args)
        news_res = news_res.append(
            dict(solver=solver_name, n_targets=n_targets, time=timing,
                 residual=residual / safe_fro(X)),
            ignore_index=True)
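# An optional convenience, not part of the original notebook: a quick tabular
# view of the timings by solver and number of targets before plotting.
print(news_res.pivot(index='n_targets', columns='solver', values='time'))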
plt.figure(figsize=(22, 4))

plt.subplot(1, 3, 1)
for solver in markers.keys():
    filter_plot(news_res,
                lambda x: x['n_targets'].tolist(),
                lambda x: x['time'].tolist(),
                label="{}".format(solver),
                ls='-', marker=markers[solver],
                solver=solver)
plt.xlabel("n_targets")
plt.ylabel("time")
plt.legend(loc='upper left')

# skip activeset as it's way too slow
plt.subplot(1, 3, 2)
for solver in markers.keys():
    if solver == 'activeset':
        continue
    filter_plot(news_res,
                lambda x: x['n_targets'].tolist(),
                lambda x: x['time'].tolist(),
                label="{}".format(solver),
                ls='-', marker=markers[solver],
                solver=solver)
plt.xlabel("n_targets")
plt.ylabel("time")
plt.legend(loc='upper left')

plt.subplot(1, 3, 3)
for solver in markers.keys():
    filter_plot(news_res,
                lambda x: x['n_targets'].tolist(),
                lambda x: x['residual'].tolist(),
                label="{}".format(solver),
                ls='-', marker=markers[solver],
                solver=solver)
plt.xlabel("n_targets")
plt.ylabel("residual")
plt.legend(loc='upper left')