%matplotlib inline
import itertools
import nltk
import string

with open("data/stories/A_THE FIR TREE.txt", 'r') as handle:
    text = handle.read()

text[0:100]

tokens = nltk.word_tokenize(text)
tokens[60:110]

nltk.wordpunct_tokenize(text)[60:120]

nltk.sent_tokenize(text)[0:10]

from nltk.corpus import stopwords
english_stops = stopwords.words('english')
english_stops

tokens = nltk.word_tokenize(text)
tokens[0:15]
len(tokens)

# Try this without .lower() in the if-statement and check the size!
tokens = [token.lower() for token in tokens if token.lower() not in english_stops]
len(tokens)
tokens[0:15]

string.punctuation

tokens = [token for token in tokens if token not in string.punctuation]
len(tokens)

# There's some awful stuff still in here:
sorted(tokens)[0:20]
[token for token in tokens if len(token) <= 2][0:20]

def clean_tokens(tokens):
    """Lowercase, and drop punctuation, stopwords, and short strings."""
    return [token.lower() for token in tokens
            if (token not in string.punctuation)
            and (token.lower() not in english_stops)
            and len(token) > 2]

clean = clean_tokens(tokens)
clean[0:20]
len(clean)

from nltk import Text
text = Text(clean)
text.vocab().most_common()[0:20]

def makeTextCollection(files):
    from nltk import Text
    from nltk import TextCollection
    textlist = [open(filen).read() for filen in files]
    texts = [Text(clean_tokens(nltk.word_tokenize(text))) for text in textlist]
    collection = TextCollection(texts)
    return collection

filelist = !ls data/stories/*  # IPython shell capture: a list of the story files
filelist

fairyCol = makeTextCollection(filelist)
fairyCol.concordance('witch')
# Note that a concordance view should really retain the original formatting; see below
# for how to do this with an IndexedText object.

fairyCol.similar('king')
fairyCol.similar('queen')
fairyCol.collocations(num=25, window_size=3)
fairyCol.dispersion_plot(['queen', 'king', 'girl', 'boy', 'daughter', 'son'])
fairyCol.plot(50)
fairyCol.vocab().most_common()[0:15]
fairyCol.common_contexts(['queen'])
len(fairyCol.vocab())

nltk.download()  # opens the interactive NLTK data downloader (tagger models, corpora)

text = nltk.word_tokenize("And now for something completely different")
tagged = nltk.pos_tag(text)  # there are a few options for taggers; details in the NLTK book
tagged
nltk.untag(tagged)

# Stemming removes affixes. Porter is the default choice for stemming, although other algorithms exist.
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmer.stem('believes')

# Lemmatizing transforms words to root forms using grammar rules. It is slower; stemming is more common.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('said', pos='v')
# If you don't specify the POS, you get zilch: the default is noun, so 'said' would come back unchanged.
lemmatizer.lemmatize('cookbooks')
stemmer.stem('cookbooks')
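# Aside (a sketch, not from the original notebook): pos_tag returns Penn Treebank
# tags, while WordNetLemmatizer wants WordNet POS letters ('n', 'v', 'a', 'r').
# A small mapping helper avoids the "zilch" problem above; penn_to_wordnet is a
# hypothetical name used here just for illustration.
from nltk.corpus import wordnet

def penn_to_wordnet(tag):
    """Map a Penn Treebank tag to a WordNet POS constant (noun by default)."""
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

[lemmatizer.lemmatize(word, pos=penn_to_wordnet(tag))
 for word, tag in nltk.pos_tag(nltk.word_tokenize("She said the mice were running"))]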
# An apparently recommended compression recipe in Perkins' Python 3 NLTK book? Not sure I agree.
stemmer.stem(lemmatizer.lemmatize('buses'))

def tokenize_and_stem(text):
    tokens = clean_tokens(nltk.word_tokenize(text))
    stems = [stemmer.stem(t) for t in tokens]
    return stems

def makeStemTextCollection(files):
    from nltk import Text
    from nltk import TextCollection
    textlist = [open(filen).read() for filen in files]
    tokens = [tokenize_and_stem(text) for text in textlist]
    texts = [Text(text) for text in tokens]
    collection = TextCollection(texts)
    return collection

fairyStems = makeStemTextCollection(filelist)
fairyStems.concordance('witch')

fairyStems.collocations(num=25, window_size=2)
# reminder of the unstemmed version above, for comparison:
fairyCol.collocations(num=25, window_size=2)

fairyStems.vocab().most_common()[0:10]

len(fairyStems.vocab())
# a huge savings over the vocabulary size of the unstemmed collection!
len(fairyCol.vocab())

fairyStems.common_contexts(['witch'])
fairyCol.common_contexts(['witch'])
fairyStems.similar('witch')

nltk.download()  # the grail text below ships with the webtext corpus

porter = nltk.PorterStemmer()
grail = nltk.corpus.webtext.words('grail.txt')

class IndexedText(object):
    """Concordance over the raw, unmodified text, with stem-based lookup."""

    def __init__(self, stemmer, text):
        self._text = text
        self._stemmer = stemmer
        self._index = nltk.Index((self._stem(word), i)
                                 for (i, word) in enumerate(text))

    def concordance(self, word, width=40):
        key = self._stem(word)
        wc = int(width / 4)  # words of context
        for i in self._index[key]:
            lcontext = ' '.join(self._text[i - wc:i])
            rcontext = ' '.join(self._text[i:i + wc])
            ldisplay = '{:>{width}}'.format(lcontext[-width:], width=width)  # right justify
            rdisplay = '{:{width}}'.format(rcontext[:width], width=width)    # left justify
            print(ldisplay, rdisplay)

    def _stem(self, word):
        return self._stemmer.stem(word).lower()

text = IndexedText(porter, grail)
text.concordance('lie')
text.concordance('say')
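# Follow-up sketch (not in the original notebook): the same IndexedText trick
# applied to one of the fairy-tale files, closing the loop on the earlier note
# that a concordance view should retain the original formatting. Assumes
# filelist[0] is the Fir Tree story; 'tree' is just a word likely to occur in it.
with open(filelist[0]) as handle:
    story_tokens = nltk.word_tokenize(handle.read())
story = IndexedText(porter, story_tokens)
story.concordance('tree')  # lookup is stem-based, so 'trees' matches too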