# Author: Olivier Grisel # Vlad Niculae # License: BSD 3 clause from __future__ import print_function from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.decomposition import ProjectedGradientNMF, MultiplicativeNMF from sklearn import datasets from time import time n_topics = 10 %%time dataset = datasets.fetch_20newsgroups(shuffle=True, random_state=0) rec_errors = [] for n_samples, n_features in ((1000, 900),): vect = TfidfVectorizer(max_features=n_features, max_df=0.9, use_idf=True) X = vect.fit_transform(dataset.data[:n_samples]) for init in ('random', 'nndsvda', 'nndsvdar', 'nndsvd'): print(init) #print("n_samples={}".format(n_samples)) #print("n_features={}".format(n_features)) #print("actual_n_features={}".format(len(vect.vocabulary_))) for n_components in (5, 10): print("n_components={}".format(n_components)) for tol, tol_label in zip((1e-2, 1e-4), ('high', 'low')): for n_iter in xrange(1, 77, 5): t0 = time() nmf = ProjectedGradientNMF(n_components=n_components, max_iter=n_iter, init=init, tol=tol) pgnmf_err = nmf.fit(X).reconstruction_err_ pgnmf_time = time() - t0 rec_errors.append((init, n_samples, n_features, n_components, 0, tol_label, nmf.n_iter_, pgnmf_err, pgnmf_time)) t0 = time() nmf = MultiplicativeNMF(n_components=n_components, max_iter=n_iter, init=init, tol=tol) multnmf_err = nmf.fit(X).reconstruction_err_ multnmf_time = time() - t0 rec_errors.append((init, n_samples, n_features, n_components, 1, tol_label, nmf.n_iter_, multnmf_err, multnmf_time)) #t0 = time() #nmf = LBfgsNMF(n_components=n_components, max_iter=n_iter, init=init, tol=1e-2, # nls_max_iter=20) #lbnmf_err = nmf.fit(X).reconstruction_err_ #lbnmf_time = time() - t0 #rec_errors.append((init, n_samples, n_features, n_components, # 2, n_iter, lbnmf_err, lbnmf_time)) import pandas rec_errors_df = pandas.DataFrame(rec_errors, columns="init n_samples n_features n_components " "method tol n_iter err time".split()) print(len(rec_errors_df)) %pylab inline --no-import-all 
# Figure 1: for every init scheme (rows) and tolerance (column pairs), plot
# reconstruction error vs. n_iter (left subplot) and vs. wall-clock time
# (right subplot), one curve per (solver, n_components) combination.
colors = {5: 'b', 10: 'g'}          # line color keyed by n_components
styles = (":", "-", '--')           # line style keyed by method id


def _select(df, **criteria):
    """Rows of *df* where every given column equals the given value.

    BUGFIX: replaces the original chained boolean indexing
    ``df[m1][m2][m3]...`` (a pandas anti-pattern that emits warnings and
    relies on index alignment of full-length masks against already-filtered
    frames) with a single combined mask.
    """
    mask = np.ones(len(df), dtype=bool)
    for col, val in criteria.items():
        mask &= (df[col] == val).values
    return df[mask]


plt.figure(figsize=(22, 25))
for i, init_name in enumerate(np.unique(rec_errors_df['init'])):
    for n_samples, n_features in [(1000, 900)]:
        for j, tol_label in enumerate(('high', 'low')):
            # Left subplot (offset 1): error vs. iterations.
            # Right subplot (offset 2): error vs. fit time.
            for offset, x_col, x_label in ((1, 'n_iter', 'n_iter'),
                                           (2, 'time', 'Time (s)')):
                plt.subplot(4, 4, 4 * i + 2 * j + offset)
                for n_components in np.unique(rec_errors_df['n_components']):
                    # BUGFIX: the original zipped (0, 1, 2) against only two
                    # names, silently truncating; method 2 (LBFGS) was never
                    # benchmarked, so enumerate just the two real solvers.
                    for method, method_name in zip((0, 1), ('PG', 'MULT')):
                        sel = _select(rec_errors_df,
                                      init=init_name,
                                      n_features=n_features,
                                      n_samples=n_samples,
                                      n_components=n_components,
                                      method=method,
                                      tol=tol_label)
                        plt.plot(sel[x_col], sel['err'],
                                 color=colors[n_components],
                                 ls=styles[method],
                                 label="{} n_comp={}".format(method_name,
                                                             n_components))
                plt.xlabel(x_label)
                plt.legend()
                plt.title("{} tol={}".format(init_name, tol_label))

# Figure 2: compare final error and time-to-convergence for the
# 1000 samples x 900 features x 10 components runs.
# BUGFIX: the original comment claimed "2000 x 10000 x 10", contradicting the
# actual selection (and the only grid benchmarked above).
selected_items = _select(rec_errors_df,
                         n_features=900, n_samples=1000, n_components=10)

# Separate dict name -- the original silently rebound `colors` from
# per-n_components colors to per-init colors.
init_colors = {'random': 'r', 'nndsvd': 'g', 'nndsvda': 'b', 'nndsvdar': 'y'}
method_names = ['PG', 'MULT']
markers = "xov"                     # marker keyed by method id

plt.figure(figsize=(15, 6))
for k, tol_name in enumerate(['low', 'high']):
    plt.subplot(1, 2, k + 1)
    for init in ('random', 'nndsvd', 'nndsvda', 'nndsvdar'):
        for method in (0, 1):
            these_items = _select(selected_items,
                                  init=init, method=method, tol=tol_name)
            # Keep only the run with the largest iteration budget, i.e. the
            # most converged point of each curve.
            these_items = these_items[
                these_items['n_iter'] == these_items['n_iter'].max()]
            t = these_items['time'].tolist()[0]
            err = these_items['err'].tolist()[0]
            # Error normalized by the Frobenius norm of the data matrix X
            # (X is the sparse TF-IDF matrix built in the benchmark cell).
            plt.scatter(t, err / np.sqrt(np.sum(X.data ** 2)),
                        marker=markers[method], color=init_colors[init],
                        label="{}-{}".format(method_names[method], init))
    plt.legend()
    plt.xlabel('Time (s)')
    plt.ylabel('Reconstruction error')
    plt.title("tol={}".format(tol_name))