#!/usr/bin/env python
# coding: utf-8

# # `pyLDAvis.lda_model`
#
# pyLDAvis also supports LDA models from scikit-learn. Let's look at this in
# more detail, using the 20 newsgroups dataset as provided by scikit-learn.

# In[1]:

import warnings

# Silence the noisy warning categories so the notebook output stays readable.
for _category in (DeprecationWarning, FutureWarning, UserWarning):
    warnings.filterwarnings('ignore', category=_category)


# In[2]:

import pyLDAvis
import pyLDAvis.lda_model

pyLDAvis.enable_notebook()


# In[3]:

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation


# ## Load 20 newsgroups dataset
#
# First, the 20 newsgroups dataset available in sklearn is loaded. As always,
# the headers, footers and quotes are removed.

# In[4]:

newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))
docs_raw = newsgroups.data
print(len(docs_raw))


# ## Convert to document-term matrix
#
# Next, the raw documents are converted into a document-term matrix, either as
# raw counts or in TF-IDF form.

# In[5]:

tf_vectorizer = CountVectorizer(
    strip_accents='unicode',
    stop_words='english',
    lowercase=True,
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    max_df=0.5,
    min_df=10,
)
dtm_tf = tf_vectorizer.fit_transform(docs_raw)
print(dtm_tf.shape)


# In[6]:

# Reuse the count vectorizer's parameters so both matrices are built with the
# same tokenisation and filtering settings.
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)
print(dtm_tfidf.shape)


# ## Fit Latent Dirichlet Allocation models
#
# Finally, the LDA models are fitted.

# In[7]:

# for TF DTM
lda_tf = LatentDirichletAllocation(n_components=20, random_state=0).fit(dtm_tf)

# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=20, random_state=0).fit(dtm_tfidf)


# ## Visualizing the models with pyLDAvis

# In[8]:

pyLDAvis.lda_model.prepare(lda_tf, dtm_tf, tf_vectorizer)


# In[9]:

pyLDAvis.lda_model.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)


# ### Using different MDS functions
#
# With `sklearn` installed, other MDS functions, such as MMDS and t-SNE, can be
# used for plotting if the default PCoA is not satisfactory.

# In[10]:

pyLDAvis.lda_model.prepare(lda_tf, dtm_tf, tf_vectorizer, mds='mmds')


# In[11]:

pyLDAvis.lda_model.prepare(lda_tf, dtm_tf, tf_vectorizer, mds='tsne')