%matplotlib inline
import itertools
import nltk
import string

with open("data/stories/A_THE FIR TREE.txt", 'r') as handle:
    text = handle.read()

text[0:100]

tokens = nltk.word_tokenize(text)
tokens[60:110]

nltk.wordpunct_tokenize(text)[60:120]

nltk.sent_tokenize(text)[0:10]

from nltk.corpus import stopwords
english_stops = stopwords.words('english')
english_stops

tokens = nltk.word_tokenize(text)
tokens[0:15]
len(tokens)

# Try this without .lower() in the if-statement and check the size!
tokens = [token.lower() for token in tokens if token.lower() not in english_stops]
len(tokens)
tokens[0:15]

string.punctuation

tokens = [token for token in tokens if token not in string.punctuation]
len(tokens)

# There's some awful stuff still in here:
sorted(tokens)[0:20]
[token for token in tokens if len(token) <= 2][0:20]

def clean_tokens(tokens):
    """Lowercase, and drop punctuation, stopwords, and short strings."""
    return [token.lower() for token in tokens
            if (token not in string.punctuation)
            and (token.lower() not in english_stops)
            and len(token) > 2]

clean = clean_tokens(tokens)
clean[0:20]
len(clean)

from nltk import Text
text = Text(clean)
text.vocab().most_common()[0:20]

def makeTextCollection(files):
    from nltk import Text
    from nltk import TextCollection
    textlist = [open(filen).read() for filen in files]
    texts = [Text(clean_tokens(nltk.word_tokenize(text))) for text in textlist]
    collection = TextCollection(texts)
    return collection

filelist = !ls data/stories/*  # IPython shell capture: a list of the story files
filelist

fairyCol = makeTextCollection(filelist)
fairyCol.concordance('witch')
# Note that a concordance view should really retain the original formatting; see below
# for how to do this with an IndexedText object.

fairyCol.similar('king')
fairyCol.similar('queen')
fairyCol.collocations(num=25, window_size=3)
fairyCol.dispersion_plot(['queen', 'king', 'girl', 'boy', 'daughter', 'son'])
fairyCol.plot(50)
fairyCol.vocab().most_common()[0:15]
fairyCol.common_contexts(['queen'])
len(fairyCol.vocab())

nltk.download()  # opens the interactive NLTK data downloader (tagger models, corpora)

text = nltk.word_tokenize("And now for something completely different")
tagged = nltk.pos_tag(text)  # there are a few options for taggers; details in the NLTK book
tagged
nltk.untag(tagged)

# Stemming removes affixes. Porter is the default choice for stemming, although other algorithms exist.
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmer.stem('believes')

# Lemmatizing transforms words to root forms using grammar rules. It is slower; stemming is more common.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('said', pos='v')
# If you don't specify the POS, you get zilch: the default is noun, so 'said' would come back unchanged.
lemmatizer.lemmatize('cookbooks')
stemmer.stem('cookbooks')
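# Aside (a sketch, not from the original notebook): pos_tag returns Penn Treebank
# tags, while WordNetLemmatizer wants WordNet POS letters ('n', 'v', 'a', 'r').
# A small mapping helper avoids the "zilch" problem above; penn_to_wordnet is a
# hypothetical name used here just for illustration.
from nltk.corpus import wordnet

def penn_to_wordnet(tag):
    """Map a Penn Treebank tag to a WordNet POS constant (noun by default)."""
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

[lemmatizer.lemmatize(word, pos=penn_to_wordnet(tag))
 for word, tag in nltk.pos_tag(nltk.word_tokenize("She said the mice were running"))]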
# An apparently recommended compression recipe in Perkins' Python 3 NLTK book? Not sure I agree.
stemmer.stem(lemmatizer.lemmatize('buses'))

def tokenize_and_stem(text):
    tokens = clean_tokens(nltk.word_tokenize(text))
    stems = [stemmer.stem(t) for t in tokens]
    return stems

def makeStemTextCollection(files):
    from nltk import Text
    from nltk import TextCollection
    textlist = [open(filen).read() for filen in files]
    tokens = [tokenize_and_stem(text) for text in textlist]
    texts = [Text(text) for text in tokens]
    collection = TextCollection(texts)
    return collection

fairyStems = makeStemTextCollection(filelist)
fairyStems.concordance('witch')

fairyStems.collocations(num=25, window_size=2)
# reminder of the unstemmed version above, for comparison:
fairyCol.collocations(num=25, window_size=2)

fairyStems.vocab().most_common()[0:10]

len(fairyStems.vocab())
# a huge savings over the vocabulary size of the unstemmed collection!
len(fairyCol.vocab())

fairyStems.common_contexts(['witch'])
fairyCol.common_contexts(['witch'])
fairyStems.similar('witch')

nltk.download()  # the grail text below ships with the webtext corpus

porter = nltk.PorterStemmer()
grail = nltk.corpus.webtext.words('grail.txt')

class IndexedText(object):
    """Concordance over the raw, unmodified text, with stem-based lookup."""

    def __init__(self, stemmer, text):
        self._text = text
        self._stemmer = stemmer
        self._index = nltk.Index((self._stem(word), i)
                                 for (i, word) in enumerate(text))

    def concordance(self, word, width=40):
        key = self._stem(word)
        wc = int(width / 4)  # words of context
        for i in self._index[key]:
            lcontext = ' '.join(self._text[i - wc:i])
            rcontext = ' '.join(self._text[i:i + wc])
            ldisplay = '{:>{width}}'.format(lcontext[-width:], width=width)  # right justify
            rdisplay = '{:{width}}'.format(rcontext[:width], width=width)    # left justify
            print(ldisplay, rdisplay)

    def _stem(self, word):
        return self._stemmer.stem(word).lower()

text = IndexedText(porter, grail)
text.concordance('lie')
text.concordance('say')
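# Follow-up sketch (not in the original notebook): the same IndexedText trick
# applied to one of the fairy-tale files, closing the loop on the earlier note
# that a concordance view should retain the original formatting. Assumes
# filelist[0] is the Fir Tree story; 'tree' is just a word likely to occur in it.
with open(filelist[0]) as handle:
    story_tokens = nltk.word_tokenize(handle.read())
story = IndexedText(porter, story_tokens)
story.concordance('tree')  # lookup is stem-based, so 'trees' matches too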