from sklearn import datasets

# Load the 20 newsgroups corpus; shuffling with a fixed seed keeps runs reproducible.
dataset = datasets.fetch_20newsgroups(shuffle=True, random_state=1)
print(dataset.target_names[dataset.target[0]])
print(dataset.data[0])

n_samples = 1000
n_features = 1000

from sklearn.feature_extraction import text

# Build a bag-of-words matrix, keeping at most n_features terms and dropping
# terms that appear in more than 95% of the documents.
vectorizer = text.CountVectorizer(max_df=0.95, max_features=n_features)
counts = vectorizer.fit_transform(dataset.data[:n_samples])

# Reweight the raw counts with TF-IDF.
tfidf = text.TfidfTransformer().fit_transform(counts)
# In an interactive session, these echo the sparse matrix and its dense form:
tfidf
tfidf.toarray()

from sklearn import decomposition

# Factorize the TF-IDF matrix into n_topics non-negative components.
n_topics = 5
nmf = decomposition.NMF(n_components=n_topics).fit(tfidf)
print(nmf)
print(nmf.components_)

# Print the highest-weighted words for each topic. The fitted vocabulary maps
# word -> column index, so invert it to recover words from indices.
n_top_words = 12
inverse_vocabulary = dict((v, k) for k, v in vectorizer.vocabulary_.items())
for topic_idx, topic in enumerate(nmf.components_):
    print("Topic #%d:" % topic_idx, end=" ")
    print(" ".join([inverse_vocabulary[i] for i in topic.argsort()[:-(n_top_words + 1):-1]]))
    print()
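
# Not shown above: besides the topic-word matrix (nmf.components_), NMF also
# gives per-document topic weights. A minimal sketch, assuming the `nmf` and
# `tfidf` objects from the session above:
doc_topic = nmf.transform(tfidf)  # shape: (n_samples, n_topics)
print(doc_topic[0].argmax())      # index of the dominant topic for the first document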