from nltk.corpus.reader import CategorizedPlaintextCorpusReader
# Corpus reader over the per-collection word dumps: every *.txt file under
# the root directory is a document, and cat_pattern captures the file's base
# name as its category label.
reader = CategorizedPlaintextCorpusReader(
    '/media/storage/dpla-data/new/words-6mill/colls/',
    r'.*\.txt',
    cat_pattern=r'(\w+)\.txt',
)
# Sanity check: list the categories assigned to gpo.txt.
reader.categories(fileids='gpo.txt')
['gpo']
# Tokenized view of the GPO collection.  This is a lazy
# StreamBackedCorpusView (see the type() check at the end of the notebook),
# not an in-memory list.
gpowords = reader.words(fileids='gpo.txt')
# Total number of tokens in the collection.
len(gpowords)
36646650
# Vocabulary size: number of distinct lowercased tokens in gpo.txt.
# A set comprehension deduplicates while streaming, instead of first
# materialising a throwaway list with one entry per token.
len({w.lower() for w in reader.words('gpo.txt')})
438479
import nltk
# Per-collection word counts: each (condition, sample) pair is
# (collection name, word), so cfd[collection] holds that collection's
# word frequencies.
cfd = nltk.ConditionalFreqDist(
    (collection, token)
    for collection in ['gpo', 'artstor']
    for token in reader.words(categories=collection)
)
# NOTE: this cell reflects out-of-order notebook execution.  `gpofiltered`
# is only assigned further down in this file; when this cell ran it was bound
# to a lazy `filter` object, which does not support slicing (hence the
# TypeError recorded below).
gpofiltered[:10]
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-17-fee8e157bba3> in <module>() ----> 1 gpofiltered[:10] TypeError: 'filter' object is not subscriptable
import re
# Tokenize the raw text into runs of word characters.  re.findall(r'\w+')
# yields the same tokens as re.split(r'\W+') but without the empty strings
# that split() emits when the text starts or ends with a non-word character
# (e.g. a trailing newline); those would otherwise be counted as tokens.
gpotokens = re.findall(r'\w+', reader.raw('gpo.txt'))
# Confirm we now hold a plain in-memory list rather than a lazy corpus view.
type(gpotokens)
list
from nltk.corpus import stopwords
# Build the stopword set once.  Evaluating stopwords.words('english') inside
# the comprehension re-ran the call and did a linear list scan for every
# single token; a set gives O(1) membership tests and the same result.
english_stopwords = set(stopwords.words('english'))
# Keep tokens whose lowercased form is not an English stopword; the original
# casing of the kept tokens is preserved.
gpofiltered = [w for w in gpotokens if w.lower() not in english_stopwords]
# Preview the first ten surviving tokens.
gpofiltered[:10]
['United', 'States', 'National', 'Archives', 'Records', 'Service', '1975', '1975', 'Includes', 'bibliographical']
# Frequency distribution over the stopword-filtered tokens.  FreqDist
# consumes any iterable of samples, so the list is passed directly rather
# than through a pass-through generator expression.
fd = nltk.FreqDist(gpofiltered)
# Alternative kept for reference: count the corpus reader's own tokenization
# of the whole 'gpo' category instead of the regex-split, filtered tokens.
#fd = nltk.FreqDist(reader.words(categories='gpo'))
# Check what most_common() returns when called without a limit.
type(fd.most_common())
#fd[1]
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-3-f64a9ad921dc> in <module>() ----> 1 fd = nltk.FreqDist((token) for token in gpofiltered) 2 #fd = nltk.FreqDist((word) for word in reader.words(categories='gpo')) 3 type(fd.most_common()) 4 #fd[1] NameError: name 'gpofiltered' is not defined
#fd.most_common(10)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Plot the frequency distribution, limited to 50 samples.
fd.plot(50)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Same 50-sample plot, but with cumulative counts.
fd.plot(50, cumulative=True)
import pickle
# Cache the filtered token list on disk so it does not have to be recomputed.
# The with-block guarantees the file is flushed and closed even if dump()
# raises; the bare open() used before left the handle open.
with open("/media/storage/dpla-data/pickles/gpofiltered.p", "wb") as pickle_file:
    pickle.dump(gpofiltered, pickle_file)
import pickle
# gpowords is a StreamBackedCorpusView that wraps an open file stream, so it
# cannot be pickled directly ("cannot serialize '_io.BufferedReader'").
# Materialise the tokens into a plain list first; note this loads every
# token into memory.  The with-block closes the output file even on failure.
with open("/media/storage/dpla-data/pickles/gpowords.p", "wb") as pickle_file:
    pickle.dump(list(gpowords), pickle_file)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-5-3f88e95676c4> in <module>() 1 import pickle ----> 2 pickle.dump( gpowords, open( "/media/storage/dpla-data/pickles/gpowords.p", "wb" ) ) TypeError: cannot serialize '_io.BufferedReader' object
# Inspect the concrete type of gpowords to see why pickling it failed above.
type(gpowords)
nltk.corpus.reader.util.StreamBackedCorpusView