import nltk !python -m nltk.downloader gutenberg from nltk.corpus import gutenberg gutenberg.raw('carroll-alice.txt')[1:50] from nltk.tokenize import sent_tokenize, word_tokenize word_tokenize(sent_tokenize(gutenberg.raw('carroll-alice.txt'))[1])[:10] import re def tokenize(words): re.sub(r'[-\n]', " ", words) # standardize whitespace return [w.upper() for w in words.split(' ') if w != ""] tokenize(gutenberg.raw('carroll-alice.txt')[1:50]) from functools import reduce def tokenize(words): # The list of regular expressions and replacements to be applied replacements = [ ["[-\n]", " "] # Hyphens to whitespace ,[r'[][(){}#$%"]', ""] # Strip unwanted characters ,[r'\s([./-]?\d+)+[./-]?\s'," [NUMBER] "] # Standardize numbers ,[r'\.{3,}', " [ELLIPSIS] "] ,[r',', " [COMMA] "] ,[r';', " [SEMICOLON] "] ,[r':', " [COLON] "] ,[r'[.!?]', " [SENTENCE-BREAK] "] ] # This is a function that applies a single replacement from the list resub = lambda words, repls: re.sub(repls[0], repls[1], words) # Applies each replacement in order before splitting the text into tokens tokens = [w.upper() for w in reduce(resub, replacements, words).split(' ') if w != ''] return tokens + ["[SENTENCE-BREAK]"] # Add a sentence break in case the corpus cuts off mid-sentence tokenize("This is 1 line of Text? How, does, it look; I wonder...") from collections import defaultdict from functools import partial from itertools import dropwhile def make_model(n, words): prior_n = n-1 # n-1 words in the prior tuple freq_dist = partial(defaultdict, int) # frequency distribution constructor model = defaultdict(freq_dist) for index in range(prior_n,len(words)): prior = words[index-prior_n:index] if "[SENTENCE-BREAK]" in prior: # Discard unneeded context prior = dropwhile(lambda x: x != "[SENTENCE-BREAK]", prior) # Note: tuples are hashable model[tuple(prior)][words[index]] += 1 return model from nltk.probability import ConditionalFreqDist def make_model(n, words): prior_n = n-1 # n-1 words in the prior tuple model = ConditionalFreqDist() for index in range(prior_n,len(words)): prior = words[index-prior_n:index] if "[SENTENCE-BREAK]" in prior: # Discard unneeded context prior = dropwhile(lambda x: x != "[SENTENCE-BREAK]", prior) # Note: tuples are hashable model[tuple(prior)].inc(words[index]) return model from nltk.probability import ConditionalFreqDist as cfr def make_model(n, ws): return cfr((tuple(ws[i-n+1:i]), ws[i]) for i in range(n-1,len(ws))) from nltk.model.ngram import NgramModel model = NgramModel(2, tokenize("This is a sentence. So is this.")) model.choose_random_word(["this"]) class NgramModel(ModelI): def __init__(self, n, train, pad_left=True, pad_right=False, estimator=None, *estimator_args, **estimator_kwargs): self._n = n self._lpad = ('',) * (n - 1) if pad_left else () self._rpad = ('',) * (n - 1) if pad_right else () if estimator is None: estimator = _estimator cfd = ConditionalFreqDist() self._ngrams = set() # If given a list of strings instead of a list of lists, create enclosing list if (train is not None) and isinstance(train[0], compat.string_types): train = [train] for sent in train: for ngram in ngrams(chain(self._lpad, sent, self._rpad), n): self._ngrams.add(ngram) context = tuple(ngram[:-1]) token = ngram[-1] cfd[context].inc(token) if not estimator_args and not estimator_kwargs: self._model = ConditionalProbDist(cfd, estimator, len(cfd)) else: self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs) # recursively construct the lower-order models if n > 1: self._backoff = NgramModel(n-1, train, pad_left, pad_right, estimator, *estimator_args, **estimator_kwargs) for sent in train: for ngram in ngrams(chain(self._lpad, sent, self._rpad), n): self._ngrams.add(ngram) context = tuple(ngram[:-1]) token = ngram[-1] cfd[context].inc(token) def _ngrams(words, n): return (tuple(words[i-(n-1):i+1]) for i in range(n-1, len(words))) list(_ngrams(tokenize("This is a sentence"), 3)) def ngrams(sequence, n, pad_left=False, pad_right=False, pad_symbol=None): sequence = iter(sequence) if pad_left: sequence = chain((pad_symbol,) * (n-1), sequence) if pad_right: sequence = chain(sequence, (pad_symbol,) * (n-1)) history = [] while n > 1: history.append(next(sequence)) n -= 1 for item in sequence: history.append(item) yield tuple(history) del history[0] from nltk.util import ngrams list(ngrams(tokenize("This is a slightly longer sentence"), 4)) token_poetry = tokenize(gutenberg.raw("austen-emma.txt")) %timeit -n 5 list(ngrams(token_poetry, 4)) %memit list(ngrams(token_poetry, 4)) %timeit -n 5 list(_ngrams(token_poetry, 4)) %memit list(_ngrams(token_poetry, 4)) if not estimator_args and not estimator_kwargs: self._model = ConditionalProbDist(cfd, estimator, len(cfd)) else: self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs) # recursively construct the lower-order models if n > 1: self._backoff = NgramModel(n-1, train, pad_left, pad_right, estimator, *estimator_args, **estimator_kwargs) from encodings import base64_codec base64_codec.base64_decode(b"VGhlcmUncyBub3RoaW5nIGhlcmUuLi4=")[0]