In this notebook I will try to implement and analyze LDA algorithm using PyMC package. The implementation is replication of the code provided at: http://stats.stackexchange.com/questions/104771/latent-dirichlet-allocation-in-pymc
The LDA model is described below:
import numpy as np
import pymc as pm
#K, V, D = 2, 4, 3 # number of topics, words, documents
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("poster")
%matplotlib inline
K, V, D = 5, 10, 20 # number of topics, words, documents
#data = np.array([[1, 1, 1, 1], [1, 1, 1, 1], [0, 0, 0, 0]])
data_temp = np.random.randint(0,10,size=(D,V))
class LDA(object):
def __init__(self, data, topics=K, vocab=V):
"""
Takes the data variable and outputs a model
"""
self.data = data
self.topics = topics
self.vocab = vocab+1
self.docs = len(self.data)
self.alpha = np.ones(self.topics)
self.beta = np.ones(self.vocab)
self.theta = pm.Container([pm.CompletedDirichlet("theta_%s" % i,\
pm.Dirichlet("ptheta_%s" % i, theta=self.alpha))
for i in range(self.docs)])
self.phi = pm.Container([pm.CompletedDirichlet("phi_%s" % i,\
pm.Dirichlet("pphi_%s" % i, theta=self.beta))
for i in range(self.topics)])
self.Wd = [len(doc) for doc in self.data]
self.Z = pm.Container([pm.Categorical("z_%s" % d,
p=self.theta[d],
size=self.Wd[d],
value=np.random.randint(self.topics,size=self.Wd[d]))
for d in range(self.docs)])
self.W = pm.Container([pm.Categorical("w_%s,%s" % (d,i),
p=pm.Lambda("phi_z_%s_%s" % (d,i),
lambda z=self.Z[d][i], phi=self.phi: phi[z]),
value=self.data[d][i],
observed=True)
for d in range(self.docs) for i in range(self.Wd[d])])
self.model = pm.Model([self.theta, self.phi, self.Z, self.W])
self.mcmc = pm.MCMC(self.model)
def fit(self, iterations=1000, burn_in=10):
# Fit the model by sampling from the data iterations times with burn in of burn_in.
self.mcmc.sample(iterations, burn=burn_in)
def show_topics(self):
# Show distribution of topics over words
return self.phi.value
def show_words(self):
# Show distribution of words in documents over topics
return self.W.value
def KLDiv(self, p,q):
return np.sum(p*np.log10(p/q))
def cosine_sim(self, x,y):
return np.dot(x,y)/np.sqrt(np.dot(x,x)*np.dot(y,y))
def sorted_docs_sim(self):
kldivs_docs = [(i, j, self.KLDiv(self.theta[i].value,self.theta[j].value),
self.cosine_sim(self.data[i], self.data[j]))
for i in range(len(self.theta)) for j in range(len(self.theta))
if i != j]
return sorted(kldivs_docs, key=lambda x: x[3], reverse=True)
def show_topic_words(self, n=10, idwords):
for i, t in enumerate(self.phi.value):
print "Topic %i : " % i, ", ".join(idwords[w_] for w_ in np.argsort(t[0])[-10:] if w_ < (self.vocab-1-1))
def plot_data(self):
plt.clf()
plt.matshow(data, fignum=1000, cmap=plt.cm.Reds)
plt.gca().set_aspect('auto')
plt.xlabel("Words")
plt.ylabel("Documents")
def plot_words_per_topic(self, ax=None):
if ax is None:
plt.clf()
fig, ax = plt.subplots(1,1)
words = self.Z.value
topic_dist = dict()
for k_i in words:
for k in k_i:
if k not in topic_dist:
topic_dist[k] = 0
topic_dist[k] += 1
ax.bar(topic_dist.keys(), topic_dist.values())
ax.set_xlabel("Topics")
ax.set_ylabel("Counts")
ax.set_title("Document words per topics")
plt.show()
def plot_word_dist(self, ax=None):
topics = self.phi.value
if ax is None:
plt.clf()
fig, ax = plt.subplots((len(topics)+1)/2, 2, figsize=(10,10))
for i, t in enumerate(topics):
ax[i/2][i%2].bar(range(len(t[0])), t[0])
ax[i/2][i%2].set_title("Topic %s" % i)
plt.suptitle("Vocab word proportions per topic")
fig.subplots_adjust(hspace=0.5, wspace=0.5)
File "<ipython-input-42-f08bbc70fdc3>", line 60 def show_topic_words(self, n=10, idwords): SyntaxError: non-default argument follows default argument
lda = LDA(data_temp)
lda.fit()
lda.plot_words_per_topic()
[-----------------100%-----------------] 1000 of 1000 complete in 39.4 sec
<matplotlib.figure.Figure at 0x13151b4a8>
lda.plot_word_dist()
<matplotlib.figure.Figure at 0x20602a58>
kld_sorted = lda.sorted_docs_sim()
kld_sorted[:10]
[(11, 12, 0.12794402850656339, 0.91550703216514051), (12, 11, 0.23233086565032021, 0.91550703216514051), (7, 16, 0.069081961101030681, 0.91529433139300531), (16, 7, 0.084241263235092026, 0.91529433139300531), (11, 14, 0.11389710569477, 0.89897158850740189), (14, 11, 0.11807633003998763, 0.89897158850740189), (0, 15, 0.27583826339271433, 0.89552039314012999), (15, 0, 0.22658878049302303, 0.89552039314012999), (8, 16, 0.20735934380211171, 0.88181818181818183), (16, 8, 0.28884846182266505, 0.88181818181818183)]
Now lets move to trying on some real world data. I will use the reuters corpus.
from nltk.corpus import inaugural
from nltk.corpus import stopwords
stops = set(stopwords.words("english"))
# Create a vocabulary from the corpus
vocab = dict()
for fileid in inaugural.fileids():
for word in inaugural.words(fileid):
word = word.lower()
if word not in stops and word.isalpha():
if word not in vocab:
vocab[word] = 0
vocab[word] += 1
"""
Sort the vocab keep only words which occur more than 50 times
Then Create word to id and id to word dictionaries
"""
vocab_sorted = filter(lambda x: x[1] > 50, sorted(vocab.items(), key=lambda x: x[1], reverse=True))
wordids = {v[0]: i for i, v in enumerate(vocab_sorted)}
idwords = {i: v[0] for i, v in enumerate(vocab_sorted)}
vocab_size = len(wordids)
print vocab_size
191
# Generate corpus document vectors
data = []
for fileid in inaugural.fileids():
data.append([0]*vocab_size)
for word in inaugural.words(fileid):
word = word.lower()
if word in wordids:
data[-1][wordids[word]] += 1
len(data)
56
# Plot the document word matrix
print data[0][:10]
data = np.array(data)
plt.clf()
plt.matshow(data, fignum=1000, cmap=plt.cm.Reds)
plt.gca().set_aspect('auto')
plt.xlabel("Words")
plt.ylabel("Documents")
[8, 4, 1, 0, 3, 6, 3, 1, 2, 3]
<matplotlib.text.Text at 0x2378a710>
<matplotlib.figure.Figure at 0x2382e2b0>
inaugural_lda = LDA(data, topics=10, vocab=vocab_size)
inaugural_lda.fit()
inaugural_lda.plot_words_per_topic()
[-----------------100%-----------------] 1001 of 1000 complete in 756.2 sec
<matplotlib.figure.Figure at 0x47367c88>
inaugural_lda.plot_word_dist()
<matplotlib.figure.Figure at 0x2301fc18>
# Above diagram is wrong for Document words per topics
plt.clf()
fig, ax = plt.subplots(1,1)
words = inaugural_lda.Z.value
topic_dist = dict()
for k_i in words:
for k in k_i:
if k not in topic_dist:
topic_dist[k] = 0
topic_dist[k] += 1
ax.bar(topic_dist.keys(), topic_dist.values())
ax.set_xlabel("Topics")
ax.set_ylabel("Counts")
ax.set_title("Document words per topics")
plt.show()
<matplotlib.figure.Figure at 0x131965cc0>
for i, t in enumerate(inaugural_lda.phi.value):
print "Topic %i : " % i, ", ".join(idwords[w_] for w_ in np.argsort(t[0])[-10:] if w_ < vocab_size -1)
Topic 0 : must, without, subject, best, true, institutions, help, beyond, state Topic 1 : make, home, much, strong, spirit, interest, know, high, commerce, long Topic 2 : future, federal, duties, congress, seek, purpose, public, time, nations, administration Topic 3 : united, people, authority, power, constitution, free, administration, state, become, trust Topic 4 : whose, liberty, service, right, national, trust, proper, secure, help, state Topic 5 : far, character, security, fellow, administration, together, subject, seek, faith, things Topic 6 : responsibility, old, future, every, nation, called, party, public, interests Topic 7 : united, union, character, general, subject, always, century, americans, future, people Topic 8 : live, justice, might, fellow, action, duties, give, high, done, ought Topic 9 : done, full, peace, secure, president, god, administration, home, may, way