# Notebook setup. The line starting with `!` is an IPython shell escape, so
# this section only runs inside IPython/Jupyter, not as a plain Python script.
import nltk
!python -m nltk.downloader gutenberg

from nltk.corpus import gutenberg
# Show characters 1 through 49 of the raw text of 'carroll-alice.txt'.
gutenberg.raw('carroll-alice.txt')[1:50]

from nltk.tokenize import sent_tokenize, word_tokenize
# Split the raw text into sentences, take the second sentence (index 1),
# split it into word tokens and show the first ten of them.
word_tokenize(sent_tokenize(gutenberg.raw('carroll-alice.txt'))[1])[:10]

import re

def tokenize(words):
    """Split raw text into a list of upper-cased tokens.

    Hyphens and newlines are treated as word separators: they are replaced
    by spaces before the text is split on single spaces. Empty strings
    produced by consecutive spaces are dropped.

    Args:
        words: The raw text to tokenize.

    Returns:
        A list of upper-cased, non-empty tokens in their original order.
    """
    # Strings are immutable: re.sub returns a new string, so the result has
    # to be kept, otherwise the normalization has no effect at all.
    normalized = re.sub(r'[-\n]', " ", words)  # standardize whitespace
    return [w.upper() for w in normalized.split(' ') if w != ""]

# Try the tokenizer on characters 1 through 49 of the corpus text.
tokenize(gutenberg.raw('carroll-alice.txt')[1:50])

from functools import reduce

def tokenize(words):
    """Convert raw text into a flat list of upper-cased tokens.

    Punctuation is not simply thrown away: it is rewritten into bracketed
    marker tokens (``[NUMBER]``, ``[ELLIPSIS]``, ``[COMMA]``,
    ``[SEMICOLON]``, ``[COLON]``, ``[SENTENCE-BREAK]``) so that later stages
    can still see it. The text is then split on whitespace.

    Args:
        words: The raw text to tokenize.

    Returns:
        A list of upper-cased tokens. The list always ends with
        ``"[SENTENCE-BREAK]"``, added in case the corpus cuts off
        mid-sentence.
    """
    # (pattern, replacement) pairs, applied strictly in this order:
    #   * hyphens are converted first and brackets are stripped second, both
    #     before any marker is inserted, because the markers themselves
    #     contain "[", "]" and "-";
    #   * the ellipsis rule runs before the single-period rule so "..." is
    #     not turned into three sentence breaks.
    replacements = [
        (r'[-\n]', " "),            # Hyphens and newlines to whitespace
        (r'[][(){}#$%"]', ""),      # Strip unwanted characters
        # Standardize numbers. Lookarounds are used instead of consuming the
        # surrounding whitespace, so adjacent numbers ("1 2 3") and numbers
        # at the very start or end of the text are all recognized. The
        # pattern accepts the same number shapes as "([./-]?\d+)+[./-]?" but
        # without the nested quantifier, which could backtrack badly.
        (r'(?<!\S)[./-]?\d+(?:[./-]\d+)*[./-]?(?!\S)', " [NUMBER] "),
        (r'\.{3,}', " [ELLIPSIS] "),
        (r',', " [COMMA] "),
        (r';', " [SEMICOLON] "),
        (r':', " [COLON] "),
        (r'[.!?]', " [SENTENCE-BREAK] "),
    ]

    # Apply each replacement in order before splitting the text into tokens.
    text = words
    for pattern, replacement in replacements:
        text = re.sub(pattern, replacement, text)

    # split() with no argument splits on any run of whitespace, so tabs and
    # other whitespace never end up inside a token and no empty tokens occur.
    tokens = [w.upper() for w in text.split()]
    # Add a sentence break in case the corpus cuts off mid-sentence.
    return tokens + ["[SENTENCE-BREAK]"]

# Exercise the tokenizer on a sample containing a number, a question mark,
# commas, a semicolon and a trailing ellipsis.
tokenize("This is 1 line of Text? How, does, it look; I wonder...")

from collections import defaultdict
from functools import partial
from itertools import dropwhile

def make_model(n, words):
    """Build an n-gram frequency model out of nested dictionaries.

    Every position in ``words`` that has ``n - 1`` tokens before it
    contributes one count: the preceding tokens form the context key and the
    token at that position is the counted follower. If the context window
    contains a "[SENTENCE-BREAK]" token, everything before the first such
    token in the window is discarded, so the key starts at the break.

    Args:
        n: Order of the model; contexts hold up to ``n - 1`` tokens.
        words: Sequence of tokens to count.

    Returns:
        A defaultdict mapping context tuples to defaultdict(int) follower
        counts.
    """
    context_len = n - 1
    # Outer dict creates a fresh int-valued counter for every unseen context.
    model = defaultdict(partial(defaultdict, int))

    for position in range(context_len, len(words)):
        context = words[position - context_len:position]
        if "[SENTENCE-BREAK]" in context:
            # Keep only the part of the window from the break onwards.
            context = dropwhile(
                lambda token: token != "[SENTENCE-BREAK]", context)
        # Lists cannot be dictionary keys, tuples can.
        key = tuple(context)
        followers = model[key]
        followers[words[position]] += 1

    return model

from nltk.probability import ConditionalFreqDist

def make_model(n, words):
    """Build an n-gram frequency model as an NLTK ConditionalFreqDist.

    Every position in ``words`` that has ``n - 1`` tokens before it
    contributes one count: the preceding tokens form the condition and the
    token at that position is the counted sample. If the context window
    contains a "[SENTENCE-BREAK]" token, everything before the first such
    token in the window is discarded.

    Args:
        n: Order of the model; conditions hold up to ``n - 1`` tokens.
        words: Sequence of tokens to count.

    Returns:
        A ConditionalFreqDist keyed by context tuples.
    """
    prior_n = n-1 # n-1 words in the prior tuple
    model = ConditionalFreqDist()

    for index in range(prior_n,len(words)):
        prior = words[index-prior_n:index]
        if "[SENTENCE-BREAK]" in prior:
            # Discard unneeded context
            prior = dropwhile(lambda x: x != "[SENTENCE-BREAK]", prior)
        # Note: tuples are hashable
        # FreqDist.inc() was removed in NLTK 3; item assignment works with
        # both the old and the new FreqDist implementations.
        model[tuple(prior)][words[index]] += 1

    return model

from nltk.probability import ConditionalFreqDist as cfr

def make_model(n, ws):
    """Build an n-gram ConditionalFreqDist from (context, token) pairs.

    For every position that has ``n - 1`` tokens before it, the preceding
    ``n - 1`` tokens (as a tuple) are paired with the token at that position.
    Unlike the longer variants, the context is not shortened at sentence
    breaks.

    Args:
        n: Order of the model.
        ws: Sequence of tokens to count.

    Returns:
        The ConditionalFreqDist built from those pairs.
    """
    context_len = n - 1
    pairs = (
        (tuple(ws[pos - context_len:pos]), ws[pos])
        for pos in range(context_len, len(ws))
    )
    return cfr(pairs)

from nltk.model.ngram import NgramModel

# Train a bigram model on a two-sentence sample.
model = NgramModel(2, tokenize("This is a sentence. So is this."))
# tokenize() upper-cases every token, so the context has to be upper-case as
# well; a lower-case "this" can never match a context seen during training.
model.choose_random_word(["THIS"])

class NgramModel(ModelI):
    """N-gram language model built by counting (context, token) pairs.

    Only the constructor is shown here. ``ModelI``, ``_estimator``,
    ``compat``, ``ngrams``, ``chain``, ``ConditionalFreqDist`` and
    ``ConditionalProbDist`` are expected to be provided by the enclosing
    module.
    """

    def __init__(self, n, train, pad_left=True, pad_right=False,
                 estimator=None, *estimator_args, **estimator_kwargs):
        """Count the n-grams of ``train`` and build the probability model.

        Args:
            n: Order of the model.
            train: Training text, either a list of sentences (each a list of
                string tokens) or a single flat list of string tokens.
                ``None`` or an empty list is treated as an empty corpus.
            pad_left: If true, each sentence is prefixed with ``n - 1``
                empty-string tokens.
            pad_right: If true, each sentence is suffixed with ``n - 1``
                empty-string tokens.
            estimator: Callable handed to ``ConditionalProbDist``; defaults
                to the module-level ``_estimator``.
            *estimator_args: Extra positional arguments for the estimator.
            **estimator_kwargs: Extra keyword arguments for the estimator.
        """
        self._n = n
        self._lpad = ('',) * (n - 1) if pad_left else ()
        self._rpad = ('',) * (n - 1) if pad_right else ()

        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()
        self._ngrams = set()

        if train is None:
            # Nothing to train on; an empty corpus keeps the loop below (and
            # the recursive backoff construction) well-defined.
            train = []
        elif len(train) > 0 and isinstance(train[0], compat.string_types):
            # If given a list of strings instead of a list of lists, create
            # the enclosing list. The length check avoids an IndexError on
            # an empty corpus.
            train = [train]

        for sent in train:
            for ngram in ngrams(chain(self._lpad, sent, self._rpad), n):
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                # FreqDist.inc() was removed in NLTK 3; item assignment
                # works with both the old and the new FreqDist.
                cfd[context][token] += 1

        if not estimator_args and not estimator_kwargs:
            # No estimator arguments supplied: pass the number of observed
            # contexts as the single extra argument.
            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
        else:
            self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs)

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NgramModel(n-1, train, pad_left, pad_right,
                                       estimator, *estimator_args, **estimator_kwargs)


# Excerpt of the counting loop from NgramModel.__init__, repeated for
# discussion. It refers to `self`, `train`, `n` and `cfd`, so it is not
# runnable on its own at module level.
# NOTE(review): `.inc()` does not exist on FreqDist in NLTK 3 and later;
# there the equivalent is `cfd[context][token] += 1`.
for sent in train:
    # Pad the sentence, then walk over every n-gram it contains.
    for ngram in ngrams(chain(self._lpad, sent, self._rpad), n):
        self._ngrams.add(ngram)
        # All but the last element form the context; the last is the token.
        context = tuple(ngram[:-1])
        token = ngram[-1]
        cfd[context].inc(token)

def _ngrams(words, n):
    return (tuple(words[i-(n-1):i+1]) for i in range(n-1, len(words)))

# Show the trigrams of a short tokenized sentence.
list(_ngrams(tokenize("This is a sentence"), 3))

def ngrams(sequence, n, pad_left=False, pad_right=False, pad_symbol=None):
    """Lazily yield the n-grams of ``sequence`` as tuples.

    Works on any iterable, including one-shot iterators, because the input
    is consumed strictly left to right and never indexed.

    Args:
        sequence: Iterable of items.
        n: Number of items per n-gram.
        pad_left: If true, ``n - 1`` copies of ``pad_symbol`` are placed in
            front of the sequence.
        pad_right: If true, ``n - 1`` copies of ``pad_symbol`` are placed
            after the sequence.
        pad_symbol: The padding item used by ``pad_left`` / ``pad_right``.

    Yields:
        Tuples of ``n`` consecutive items. Nothing is yielded when the
        (padded) sequence holds fewer than ``n`` items.
    """
    sequence = iter(sequence)
    if pad_left:
        sequence = chain((pad_symbol,) * (n-1), sequence)
    if pad_right:
        sequence = chain(sequence, (pad_symbol,) * (n-1))

    # Pre-fill the window with the first n-1 items.
    history = []
    for _ in range(n - 1):
        try:
            history.append(next(sequence))
        except StopIteration:
            # The sequence is too short to form a single n-gram. Under
            # PEP 479 a StopIteration escaping a generator body becomes a
            # RuntimeError, so end the generator explicitly instead.
            return

    # Slide the window: append the new item, emit, drop the oldest item.
    for item in sequence:
        history.append(item)
        yield tuple(history)
        del history[0]

from nltk.util import ngrams
# Show the 4-grams of a short tokenized sentence, using the `ngrams` name
# imported from nltk.util on the line above.
list(ngrams(tokenize("This is a slightly longer sentence"), 4))

# Benchmark the two n-gram implementations on a full tokenized corpus text
# ('austen-emma.txt', despite the variable being called token_poetry).
# The `%` lines are IPython magics and only run inside IPython/Jupyter.
# NOTE(review): %memit is presumably provided by the memory_profiler
# extension, which is not loaded anywhere in the visible code - confirm.
token_poetry = tokenize(gutenberg.raw("austen-emma.txt"))
# Time and memory for the library `ngrams` ...
%timeit -n 5 list(ngrams(token_poetry, 4))
%memit list(ngrams(token_poetry, 4))
# ... versus the slice-based `_ngrams` defined in this file.
%timeit -n 5 list(_ngrams(token_poetry, 4))
%memit list(_ngrams(token_poetry, 4))

# Excerpt of the tail of NgramModel.__init__, repeated for discussion. It
# refers to `self`, `cfd`, `estimator` and the constructor arguments, so it
# is not runnable on its own at module level.
if not estimator_args and not estimator_kwargs:
    # No estimator arguments supplied: the number of observed contexts,
    # len(cfd), is passed as the single extra argument.
    self._model = ConditionalProbDist(cfd, estimator, len(cfd))
else:
    # Otherwise the caller's estimator arguments are forwarded unchanged.
    self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs)

# recursively construct the lower-order models
# (each model of order n > 1 owns a model of order n-1 as `_backoff`)
if n > 1:
    self._backoff = NgramModel(n-1, train, pad_left, pad_right,
                               estimator, *estimator_args, **estimator_kwargs)


# Decode a base64-encoded byte string. base64_decode returns a tuple, and
# index 0 selects its first element (presumably the decoded bytes, the
# second being the consumed input length - confirm against the codec docs).
from encodings import base64_codec
base64_codec.base64_decode(b"VGhlcmUncyBub3RoaW5nIGhlcmUuLi4=")[0]