# Author: Olivier Grisel # Vlad Niculae # License: BSD 3 clause from __future__ import print_function from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.decomposition import ProjectedGradientNMF, MultiplicativeNMF from sklearn import datasets from time import time n_topics = 10 %%time dataset = datasets.fetch_20newsgroups(shuffle=True, random_state=0) rec_errors = [] for n_samples, n_features in ((1000, 900),): vect = TfidfVectorizer(max_features=n_features, max_df=0.9, use_idf=True) X = vect.fit_transform(dataset.data[:n_samples]) for init in ('random', 'nndsvda', 'nndsvdar', 'nndsvd'): print(init) #print("n_samples={}".format(n_samples)) #print("n_features={}".format(n_features)) #print("actual_n_features={}".format(len(vect.vocabulary_))) for n_components in (5, 10): print("n_components={}".format(n_components)) for tol, tol_label in zip((1e-2, 1e-4), ('high', 'low')): for n_iter in xrange(1, 77, 5): t0 = time() nmf = ProjectedGradientNMF(n_components=n_components, max_iter=n_iter, init=init, tol=tol) pgnmf_err = nmf.fit(X).reconstruction_err_ pgnmf_time = time() - t0 rec_errors.append((init, n_samples, n_features, n_components, 0, tol_label, nmf.n_iter_, pgnmf_err, pgnmf_time)) t0 = time() nmf = MultiplicativeNMF(n_components=n_components, max_iter=n_iter, init=init, tol=tol) multnmf_err = nmf.fit(X).reconstruction_err_ multnmf_time = time() - t0 rec_errors.append((init, n_samples, n_features, n_components, 1, tol_label, nmf.n_iter_, multnmf_err, multnmf_time)) #t0 = time() #nmf = LBfgsNMF(n_components=n_components, max_iter=n_iter, init=init, tol=1e-2, # nls_max_iter=20) #lbnmf_err = nmf.fit(X).reconstruction_err_ #lbnmf_time = time() - t0 #rec_errors.append((init, n_samples, n_features, n_components, # 2, n_iter, lbnmf_err, lbnmf_time)) import pandas rec_errors_df = pandas.DataFrame(rec_errors, columns="init n_samples n_features n_components " "method tol n_iter err time".split()) print(len(rec_errors_df)) %pylab inline --no-import-all 
# Figure 1: for every init scheme (rows) and tolerance (column pairs), plot
# reconstruction error vs. n_iter (left subplot) and vs. wall-clock time
# (right subplot), one curve per (solver, n_components) combination.
colors = {5: 'b', 10: 'g'}          # line color keyed by n_components
styles = (":", "-", '--')           # line style keyed by method id


def _select(df, **criteria):
    """Rows of *df* where every given column equals the given value.

    BUGFIX: replaces the original chained boolean indexing
    ``df[m1][m2][m3]...`` (a pandas anti-pattern that emits warnings and
    relies on index alignment of full-length masks against already-filtered
    frames) with a single combined mask.
    """
    mask = np.ones(len(df), dtype=bool)
    for col, val in criteria.items():
        mask &= (df[col] == val).values
    return df[mask]


plt.figure(figsize=(22, 25))
for i, init_name in enumerate(np.unique(rec_errors_df['init'])):
    for n_samples, n_features in [(1000, 900)]:
        for j, tol_label in enumerate(('high', 'low')):
            # Left subplot (offset 1): error vs. iterations.
            # Right subplot (offset 2): error vs. fit time.
            for offset, x_col, x_label in ((1, 'n_iter', 'n_iter'),
                                           (2, 'time', 'Time (s)')):
                plt.subplot(4, 4, 4 * i + 2 * j + offset)
                for n_components in np.unique(rec_errors_df['n_components']):
                    # BUGFIX: the original zipped (0, 1, 2) against only two
                    # names, silently truncating; method 2 (LBFGS) was never
                    # benchmarked, so enumerate just the two real solvers.
                    for method, method_name in zip((0, 1), ('PG', 'MULT')):
                        sel = _select(rec_errors_df,
                                      init=init_name,
                                      n_features=n_features,
                                      n_samples=n_samples,
                                      n_components=n_components,
                                      method=method,
                                      tol=tol_label)
                        plt.plot(sel[x_col], sel['err'],
                                 color=colors[n_components],
                                 ls=styles[method],
                                 label="{} n_comp={}".format(method_name,
                                                             n_components))
                plt.xlabel(x_label)
                plt.legend()
                plt.title("{} tol={}".format(init_name, tol_label))

# Figure 2: compare final error and time-to-convergence for the
# 1000 samples x 900 features x 10 components runs.
# BUGFIX: the original comment claimed "2000 x 10000 x 10", contradicting the
# actual selection (and the only grid benchmarked above).
selected_items = _select(rec_errors_df,
                         n_features=900, n_samples=1000, n_components=10)

# Separate dict name -- the original silently rebound `colors` from
# per-n_components colors to per-init colors.
init_colors = {'random': 'r', 'nndsvd': 'g', 'nndsvda': 'b', 'nndsvdar': 'y'}
method_names = ['PG', 'MULT']
markers = "xov"                     # marker keyed by method id

plt.figure(figsize=(15, 6))
for k, tol_name in enumerate(['low', 'high']):
    plt.subplot(1, 2, k + 1)
    for init in ('random', 'nndsvd', 'nndsvda', 'nndsvdar'):
        for method in (0, 1):
            these_items = _select(selected_items,
                                  init=init, method=method, tol=tol_name)
            # Keep only the run with the largest iteration budget, i.e. the
            # most converged point of each curve.
            these_items = these_items[
                these_items['n_iter'] == these_items['n_iter'].max()]
            t = these_items['time'].tolist()[0]
            err = these_items['err'].tolist()[0]
            # Error normalized by the Frobenius norm of the data matrix X
            # (X is the sparse TF-IDF matrix built in the benchmark cell).
            plt.scatter(t, err / np.sqrt(np.sum(X.data ** 2)),
                        marker=markers[method], color=init_colors[init],
                        label="{}-{}".format(method_names[method], init))
    plt.legend()
    plt.xlabel('Time (s)')
    plt.ylabel('Reconstruction error')
    plt.title("tol={}".format(tol_name))