# Count how often each lower-cased, whitespace-separated word occurs in
# test/file00.txt through test/file39.txt.
count = {}
for n in range(40):
    # the with-block closes each file as soon as its lines have been read
    with open("test/file%02d.txt" % n) as infile:
        for line in infile:
            # split() with no argument already discards the trailing newline
            for w in line.lower().split():
                count[w] = count.get(w, 0) + 1
# word counts ordered from the largest to the smallest
ydata = list(count.values())
ydata.sort(reverse=True)
# the ten largest word counts
ydata[:10]
# Out: [1648, 794, 651, 623, 576, 518, 357, 273, 265, 176]
# the words themselves, ranked by count with the most frequent first
# (negating the count sorts descending and keeps ties in iteration order)
topwords = sorted(count, key=lambda word: -count[word])
topwords[:10]
# Out: ['the', 'of', 'to', 'a', 'and', 'in', 'that', 'is', 'for', 'are']
# reference Zipf distribution: the value at each rank is 1/rank
ranks = range(1, len(ydata) + 1)
zipf = [1. / rank for rank in ranks]

# observed word counts against rank, on log-log axes
figure(figsize=(6, 6))
plot(ranks, ydata, 'o')
yscale('log')
xscale('log')
grid('on')
ylim(1, 10000)
xlabel('word rank')
ylabel('word count')

# Earlier exploration, kept for reference: guess the y-intercept by drawing
# the reference line for y = 1000 and four successive doublings.
#y = 1000
#for i in range(5):
#    plot(ranks, y*array(zipf))
#    y *= 2

# chosen y-intercept: the value of the reference line at rank 1
y = 2500
# overlay the scaled Zipf reference line
plot(ranks, y * array(zipf))

# annotate the ten highest-ranked words, 100 counts above their markers
for rank in range(1, 11):
    text(rank, ydata[rank - 1] + 100, topwords[rank - 1])

# and sample from the rest of the range at indices 16, 32, ..., 2048
for exponent in range(4, 12):
    k = 2 ** exponent
    text(k + 1, ydata[k] + 5, topwords[k])

# finally label the lowest-ranked word
text(len(ydata), ydata[-1] + 2, topwords[-1])
# Out: <matplotlib.text.Text at 0x10a1bd510>
# size of the vocabulary: the number of distinct words that were counted
len(count)
# Out: 5077
# total number of word occurrences, i.e. the sum of all the counts
# (40 files, roughly 600 words / file, i.e. *small*)
sum(count.values())
# Out: 25155
import gzip
import re #for regular expressions
# this time the text comes from a single gzip-compressed file
with gzip.open("oz.txt.gz") as textfile:
    raw_text = textfile.read()
# lower-case everything, delete every character that is neither a word
# character nor whitespace, then split on whitespace
words = re.sub(r'[^\w\s]', '', raw_text.lower()).split()
#same as before, but now define function for everything
def zipfplot(words, source):
    """Plot word count against word rank on log-log axes, with a Zipf reference line.

    words  -- iterable of word tokens; each distinct token is counted
    source -- text used as the plot title

    Opens a new 6x6 figure, plots the counts sorted from largest to
    smallest against their rank, overlays a line proportional to 1/rank,
    labels a sample of the points with their words, and prints the
    vocabulary size, the total number of tokens, the ten most frequent
    words and the five least frequent ones.

    The plotting names used here (figure, plot, text, array, ...) are not
    defined in this function; they are expected to exist at module level.
    Vocabularies of any size, including an empty one, are handled:
    annotations and the reference line are limited to ranks that exist.
    """
    count = {}
    for w in words:
        count[w] = count.get(w, 0) + 1
    ydata = sorted(count.values(), reverse=True)
    topwords = sorted(count, reverse=True, key=count.get)
    nwords = len(ydata)
    ranks = range(1, nwords + 1)
    zipf = [1. / i for i in ranks]

    figure(figsize=(6, 6))
    plot(ranks, ydata, 'o')
    ylim(1, 10000)
    yscale('log')
    xscale('log')
    grid('on')
    xlabel('word rank')
    ylabel('word count')
    title(source)

    if nwords:
        # Scale the 1/rank line so that it passes through the observed count
        # at rank 300 (list index 299), or through the last rank when the
        # vocabulary has fewer than 300 words.
        fit_rank = min(300, nwords)
        y = fit_rank * ydata[fit_rank - 1]
        plot(ranks, y * array(zipf))

    # annotate the top 10 words; the upward offset shrinks with the rank
    for i in range(min(10, nwords)):
        text(i + 1, ydata[i] + 1000. / (i + 1), topwords[i])
    # and sample from the rest of the range at indices 16, 32, ..., 2048,
    # stopping once the vocabulary runs out
    for k in (2 ** e for e in range(4, 12)):
        if k >= nwords:
            break
        text(k + 1, ydata[k] + 5, topwords[k])
    # plus the last
    if nwords:
        text(nwords, ydata[-1] + 2, topwords[-1])

    # Single-argument print(...) produces the same text under the Python 2
    # print statement and the Python 3 print function.
    print('vocab size = %d , total terms = %d' % (len(count), sum(count.values())))
    print('topwords =  %s' % (topwords[:10],))
    print('lastwords =  %s' % (topwords[-5:],))
# draw the Zipf plot and print the summary statistics for the Oz word list
zipfplot(words,'Oz')
# Output:
# vocab size = 2919 , total terms = 39256
# topwords = ['the', 'and', 'to', 'of', 'a', 'i', 'was', 'you', 'in', 'he']
# lastwords = ['dangerthat', 'gruffly', 'accidents', 'baked', 'gardens']
# now Sherlock Holmes, an even bigger text
with gzip.open("sherlock.txt.gz") as textfile:
    raw_text = textfile.read()
# lower-case everything, strip out all punctuation, and split on whitespace
words = re.sub(r'[^\w\s]', '', raw_text.lower()).split()
zipfplot(words, 'Sherlock')
# Output:
# vocab size = 8410 , total terms = 104410
# topwords = ['the', 'and', 'i', 'to', 'of', 'a', 'in', 'that', 'it', 'you']
# lastwords = ['glint', 'illegally', 'accomplish', 'volumes', 'confronted']
# exp and log functions, drawn side by side in one figure
fig = figure(figsize=(10, 4.5))

# left panel: exp(x) sampled at 256 points on [0, 5], y-axis limited to 0..100
x_exp = np.linspace(0, 5, 256, endpoint=True)
ax_exp = fig.add_subplot(121)
ax_exp.plot(x_exp, exp(x_exp))
ax_exp.set_title('exp(x)')
ax_exp.set_ylim(0, 100)

# right panel: log(x) sampled at 256 points on [1, 100]
# X stays bound to this grid because the power-law plots further down read it
X = np.linspace(1, 100, 256, endpoint=True)
ax_log = fig.add_subplot(122)
ax_log.plot(X, log(X))
ax_log.set_title('log(x)')
# Out: <matplotlib.text.Text at 0x10a2f6490>
# Power laws y = k*x**b are linear in log-log plots: each row shows the same
# curves on linear axes (left) and on log-log axes (right).
fig = figure(figsize=(10, 9.5))


def _draw_panel(position, ymin, curves, legend_args, loglog):
    """Add one subplot to fig, plot each (y-values, plot-kwargs) pair against X, and return its axes."""
    ax = fig.add_subplot(position)
    ax.grid('on')
    ax.set_ylim(ymin, 100)
    for yvalues, style in curves:
        ax.plot(X, yvalues, **style)
    ax.legend(*legend_args)
    if loglog:
        # the scales are switched last, after limits, curves and legend
        ax.set_xscale('log')
        ax.set_yscale('log')
    return ax


# top row: x**2, x and sqrt(x), named through an explicit legend list
growing = [(X*X, {}), (X, {}), (sqrt(X), {})]
growing_names = (['x**2','x','sqrt(x)'],)
l = _draw_panel(221, 0, growing, growing_names, False)
r = _draw_panel(222, 1, growing, growing_names, True)

# bottom row: 100/sqrt(x), 100/x and 100/x**2, each with its own label and colour
decaying = [
    (100/sqrt(X), {'label': '100/sqrt(x)', 'color': 'r'}),
    (100/X, {'label': '100/x', 'color': 'g'}),
    (100/(X*X), {'label': '100/x**2', 'color': 'b'}),
]
l = _draw_panel(223, 0, decaying, (), False)
r = _draw_panel(224, 1, decaying, (), True)