!head -n 10 txtdm.txt
The Neatest Little Guide to Stock Market Investing Investing For Dummies, 4th Edition The Little Book of Common Sense Investing: The Only Way to Guarantee Your Fair Share of Stock Market Returns The Little Book of Value Investing Value Investing: From Graham to Buffett and Beyond Rich Dad's Guide to Investing: What the Rich Invest in, That the Poor and the Middle Class Do Not! Investing in Real Estate, 5th Edition Stock Investing For Dummies Rich Dad's Advisors: The ABC's of Real Estate Investing: The Secrets of Finding Hidden Profits Most Investors Miss
import re

# Regex alternation of the punctuation characters stripped from every token.
ignore = ",|:|!|'"
# Words dropped from every title before vectorising.
stopwords = ['and','edition','for','in','little','of','the','to']

# Each line of the file becomes one whitespace-tokenised document.
# The context manager closes the file handle as soon as reading is done
# instead of leaving it open for the garbage collector to reclaim.
with open('txtdm.txt') as title_file:
    txt = [line.split() for line in title_file]

# Compile once and use a set so the per-token work is a single C-level
# substitution plus an O(1) membership test.
ignore_re = re.compile(ignore)
stopword_set = set(stopwords)

# Lower-case, strip punctuation, drop stopwords, then re-join each document
# into a single space-separated string.
txt = [[ignore_re.sub('', w.lower()) for w in s] for s in txt]
txt = [[w for w in s if w not in stopword_set] for s in txt]
txt = [' '.join(s) for s in txt]
txt
['neatest guide stock market investing', 'investing dummies 4th', 'book common sense investing only way guarantee your fair share stock market returns', 'book value investing', 'value investing from graham buffett beyond', 'rich dads guide investing what rich invest that poor middle class do not', 'investing real estate 5th', 'stock investing dummies', 'rich dads advisors abcs real estate investing secrets finding hidden profits most investors miss']
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the cleaned documents first, then encode the
# same documents as a sparse document-term count matrix.
model = CountVectorizer()
model.fit(txt)
xvec = model.transform(txt)
xvec
<9x44 sparse matrix of type '<type 'numpy.int64'>' with 63 stored elements in Compressed Sparse Row format>
from sklearn.decomposition import NMF

# Number of latent topics to extract from the document-term matrix.
n_topics = 2

# NOTE(review): `sparseness` is only accepted by older scikit-learn
# releases -- confirm against the installed version before re-running.
nmf = NMF(
    n_components=n_topics,
    sparseness='data',
    init='nndsvd',
    random_state=0,
)

# Fit the factorisation and show the per-document topic weights
# (one row per document, one column per topic).
doc_topic_weights = nmf.fit_transform(xvec)
doc_topic_weights
array([[ 0.06206478, 0.32759923], [ 0.07006666, 0.14309777], [-0. , 0.91268535], [ 0.04919433, 0.21403733], [ 0.06947109, 0.18135713], [ 0.80019019, -0. ], [ 0.1820973 , 0.08738539], [ 0.04344757, 0.23748179], [ 0.74776168, -0. ]])
import numpy as np

# Per-topic term weights (one row per topic, one column per vocabulary
# entry), rounded to two decimals purely for display.
np.around(nmf.components_, decimals=2)
array([[ 0.05, 0.13, 0.55, 0.55, 0.04, 0. , 0.04, 0.59, 0. , 1.15, 0.59, 0.07, 0.69, 0. , 0.55, 0.04, 0.04, 0. , 0.62, 0.55, 0.59, 1.4 , 0.55, 0. , 0.59, 0.55, 0.55, 0.03, 0.59, 0. , 0.59, 0.55, 0.69, 0. , 1.74, 0.55, 0. , 0. , 0.01, 0.59, 0.07, 0. , 0.59, 0. ], [ 0.12, 0.06, 0. , 0. , 0.15, 0.94, 0.15, 0. , 0.76, 0. , 0. , 0.31, 0.03, 0.76, 0. , 0.15, 0.15, 0.76, 0.23, 0. , 0. , 1.66, 0. , 1.03, 0. , 0. , 0. , 0.27, 0. , 0.76, 0. , 0. , 0.03, 0.76, 0. , 0. , 0.76, 0.76, 1.23, 0. , 0.32, 0.76, 0. , 0.76]])
# Vocabulary terms, indexed by column of the document-term matrix.
# Newer scikit-learn releases expose `get_feature_names_out` (and drop
# `get_feature_names`); prefer it when present and fall back otherwise.
# `list(...)` keeps `feature_names` a plain list in either case.
_get_names = getattr(model, 'get_feature_names_out', None) or model.get_feature_names
feature_names = list(_get_names())

# How many of the highest-weighted terms to show per topic.
n_top_words=5

for topic_idx, topic in enumerate(nmf.components_):
    print("Topic #%d:" % topic_idx)
    # Indices sorted by descending weight, truncated to the top entries.
    top_indices = topic.argsort()[::-1][:n_top_words]
    print(" ".join([feature_names[i] for i in top_indices]))
    # A bare `print()` is a print statement applied to an empty tuple on
    # Python 2 and emits "()"; printing an empty string yields a blank
    # separator line on both Python 2 and Python 3.
    print("")
Topic #0: rich investing dads estate real () Topic #1: investing stock market book returns ()