In [141]:

# Preview the raw input: ask the shell for the first 10 lines of txtdm.txt.
!head -n 10 txtdm.txt

The Neatest Little Guide to Stock Market Investing
Investing For Dummies, 4th Edition
The Little Book of Common Sense Investing: The Only Way to Guarantee Your Fair Share of Stock Market Returns
The Little Book of Value Investing
Value Investing: From Graham to Buffett and Beyond
Rich Dad's Guide to Investing: What the Rich Invest in, That the Poor and the Middle Class Do Not!
Investing in Real Estate, 5th Edition
Stock Investing For Dummies
Rich Dad's Advisors: The ABC's of Real Estate Investing: The Secrets of Finding Hidden Profits Most Investors Miss

In [142]:

# Read the titles file and split every line into whitespace-separated tokens.
# The context manager closes the file handle as soon as the list is built,
# instead of leaving it open until the interpreter happens to collect it.
with open('txtdm.txt') as title_file:
    txt = [line.split() for line in title_file]

In [143]:

# Regex alternation of the punctuation characters to delete from each token.
ignore = ",|:|!|'"

# Words that are removed from every title before vectorising.
stopwords = [
    'and',
    'edition',
    'for',
    'in',
    'little',
    'of',
    'the',
    'to',
]

In [144]:

import re

In [145]:

# Normalise every token: lower-case it, then delete each character matched
# by the `ignore` pattern.
strip_ignored = re.compile(ignore).sub
txt = [[strip_ignored('', token.lower()) for token in tokens] for tokens in txt]

In [146]:

def _is_kept(token):
    """Return True when `token` is not listed in `stopwords`."""
    return token not in stopwords


# Drop stopwords from every tokenised title, keeping the remaining tokens in order.
txt = [list(filter(_is_kept, tokens)) for tokens in txt]

In [147]:

# Glue each title's remaining tokens back into one space-separated string.
txt = list(map(' '.join, txt))

In [148]:

# Bare expression: display the cleaned, re-joined title strings as the cell output.
txt

Out[148]:

['neatest guide stock market investing',
 'investing dummies 4th',
 'book common sense investing only way guarantee your fair share stock market returns',
 'book value investing',
 'value investing from graham buffett beyond',
 'rich dads guide investing what rich invest that poor middle class do not',
 'investing real estate 5th',
 'stock investing dummies',
 'rich dads advisors abcs real estate investing secrets finding hidden profits most investors miss']

In [149]:

from sklearn.feature_extraction.text import CountVectorizer

In [150]:

# Fit a CountVectorizer on the cleaned titles and build the term-count matrix
# in one step.
model = CountVectorizer() 
xvec = model.fit_transform(txt)
# Bare expression: display the sparse matrix summary as the cell output.
xvec

Out[150]:

<9x44 sparse matrix of type '<type 'numpy.int64'>'
	with 63 stored elements in Compressed Sparse Row format>

In [151]:

from sklearn.decomposition import NMF

In [152]:

# Number of topics to extract; passed to NMF as n_components.
n_topics = 2

In [153]:

# Configure the factorisation: n_topics components, 'nndsvd' initialisation,
# and a fixed random_state.
# NOTE(review): `sparseness` appears to have been deprecated in scikit-learn
# 0.17 and removed in 0.19; on newer releases this call would raise a
# TypeError. Confirm the target scikit-learn version before dropping it,
# since removing the argument changes the fitted factors.
nmf = NMF(n_components=n_topics,
                    sparseness='data', init='nndsvd', random_state=0)

In [154]:

# Fit the model on the count matrix and display the transformed data; the
# recorded output has one row per title and one column per topic.
nmf.fit_transform(xvec)

Out[154]:

array([[ 0.06206478,  0.32759923],
       [ 0.07006666,  0.14309777],
       [-0.        ,  0.91268535],
       [ 0.04919433,  0.21403733],
       [ 0.06947109,  0.18135713],
       [ 0.80019019, -0.        ],
       [ 0.1820973 ,  0.08738539],
       [ 0.04344757,  0.23748179],
       [ 0.74776168, -0.        ]])

In [155]:

import numpy as np

In [167]:

# Display the fitted components (one row per topic in the recorded output),
# rounded to two decimals.
np.around(nmf.components_, decimals=2)

Out[167]:

array([[ 0.05,  0.13,  0.55,  0.55,  0.04,  0.  ,  0.04,  0.59,  0.  ,
         1.15,  0.59,  0.07,  0.69,  0.  ,  0.55,  0.04,  0.04,  0.  ,
         0.62,  0.55,  0.59,  1.4 ,  0.55,  0.  ,  0.59,  0.55,  0.55,
         0.03,  0.59,  0.  ,  0.59,  0.55,  0.69,  0.  ,  1.74,  0.55,
         0.  ,  0.  ,  0.01,  0.59,  0.07,  0.  ,  0.59,  0.  ],
       [ 0.12,  0.06,  0.  ,  0.  ,  0.15,  0.94,  0.15,  0.  ,  0.76,
         0.  ,  0.  ,  0.31,  0.03,  0.76,  0.  ,  0.15,  0.15,  0.76,
         0.23,  0.  ,  0.  ,  1.66,  0.  ,  1.03,  0.  ,  0.  ,  0.  ,
         0.27,  0.  ,  0.76,  0.  ,  0.  ,  0.03,  0.76,  0.  ,  0.  ,
         0.76,  0.76,  1.23,  0.  ,  0.32,  0.76,  0.  ,  0.76]])

In [160]:

# Vocabulary terms indexed by feature (column) number of the count matrix.
# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(), so prefer the new accessor when it exists and
# fall back to the old one on releases that predate it.
if hasattr(model, 'get_feature_names_out'):
    # The new accessor returns an array; keep the list type callers had before.
    feature_names = list(model.get_feature_names_out())
else:
    feature_names = model.get_feature_names()

In [165]:

# How many of the highest-weighted terms to list for each topic.
n_top_words = 5

In [166]:

# Print the n_top_words highest-weighted vocabulary terms of every topic.
for topic_idx, topic in enumerate(nmf.components_):
    # argsort() is ascending, so the reversed tail slice yields the indices
    # of the n_top_words largest weights, biggest first.
    top_indices = topic.argsort()[:-n_top_words - 1:-1]
    print("Topic #%d:" % topic_idx)
    print(" ".join(feature_names[i] for i in top_indices))
    # print("") rather than print(): under Python 2's print statement,
    # print() shows an empty tuple "()" instead of a blank separator line.
    print("")

Topic #0:
rich investing dads estate real
()
Topic #1:
investing stock market book returns
()

In [ ]: