This notebook introduces the Natural Language Toolkit (NLTK).
The first step is to import the nltk library and to load some example texts.
import nltk
from nltk.book import *
text1.dispersion_plot(['Ahab', 'whale', 'Ishmael', 'Queequeg', 'Moby', 'dive'])
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])
text1.concordance('Ishmael')
text1.similar('Ishmael')
text1.similar('whale')
text1.collocations()
text1.concordance("monstrous")
text2.concordance("monstrous")
text1.similar("monstrous")
text2.similar("monstrous")
def lexical_diversity(text):
    return len(text) / len(set(text))

def percentage(count, total):
    return 100 * count / total
lexical_diversity(text3)
lexical_diversity(text4)
lexical_diversity(text5)
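The percentage helper goes unused above; following the example in chapter 1 of [BKL], it can report how much of a text a single word accounts for:
percentage(text4.count('a'), len(text4))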
fdist1 = FreqDist(text1)
vocabulary1 = fdist1.keys()
fdist1['whale']
fdist1['monstrous']
fdist1.plot(50, cumulative=True)
hapaxes1 = fdist1.hapaxes()
print(len(hapaxes1))
print(hapaxes1[1000:1010])
thursday_sents = nltk.corpus.gutenberg.sents('chesterton-thursday.txt')
sent22 = thursday_sents[22]
' '.join(sent22)
list(nltk.bigrams(w for w in sent22 if w.isalpha()))  # list() needed: nltk.bigrams returns a generator
import networkx as nx
G = nx.Graph()
begin_sent = 22
end_sent = 24
sents = thursday_sents[begin_sent:end_sent+1]
for sent in sents:
    G.add_edges_from(nltk.bigrams(w for w in sent if w.isalpha()))
nx.draw(G)
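The networkx drawing relies on matplotlib; if the figure does not appear in the notebook, a minimal sketch of forcing the display:
import matplotlib.pyplot as plt
nx.draw(G, with_labels=True)  # with_labels shows the words at each node
plt.show()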
Below [BKL] refers to "Natural Language Processing with Python" by Bird, Klein and Loper, [MAR] refers to "Mining the Social Web" by Matthew A. Russell, and [MM] refers to "Corpus Stylistics and Dickens's Fiction" by Michaela Mahlberg.
See page 95 of [BKL].
import codecs, nltk, pprint
hard_times_path = "/home/matthew/workspace/resources/C/Corpus Stylistics/Dickens, Charles/786-0.txt"
david_copperfield_path = "/home/matthew/workspace/resources/C/Corpus Stylistics/Dickens, Charles/pg766.txt"
f = codecs.open(hard_times_path, encoding='utf-8')
david_copperfield_file = codecs.open(david_copperfield_path, encoding='utf-8')
hard_times_raw_text = f.read()
len(hard_times_raw_text)
david_copperfield_raw_text = david_copperfield_file.read()
len(david_copperfield_raw_text)
See page 112 of [BKL].
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sents = sent_tokenizer.tokenize(hard_times_raw_text)
print(sents[171])
len(sents)
An alternative approach based on [MAR].
sents = nltk.tokenize.sent_tokenize(hard_times_raw_text)
print(sents[171])
DC_sents = nltk.tokenize.sent_tokenize(david_copperfield_raw_text)
print(DC_sents[171])
tokens = [nltk.tokenize.word_tokenize(s) for s in sents]
len(tokens)
print(tokens[171])
DC_tokens = [nltk.tokenize.word_tokenize(s) for s in DC_sents]
print(DC_tokens[171])
Warning: Very slow
# pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]
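If the whole book genuinely needs tagging, nltk.pos_tag_sents should be noticeably faster than calling nltk.pos_tag once per sentence, since it loads the tagger only once; a sketch, left commented out to match the warning above:
# pos_tagged_tokens = nltk.pos_tag_sents(tokens)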
' '.join(tokens[171])
bigram = nltk.bigrams(w for w in tokens[171] if w.isalpha())
import networkx as nx
G = nx.Graph()
G.add_edges_from(bigram)
nx.draw(G)
print(list(nltk.ngrams((w for w in tokens[171] if w.isalpha()), 5)))  # list() needed: nltk.ngrams returns a generator
Now consider the full text of "Hard Times", here represented by the list tokens.
fivegrams = []
for t in tokens:
    fivegrams += nltk.ngrams((w for w in t if w.isalpha()), 5)
print(' '.join(fivegrams[2000]))
Create a dictionary to count occurrences of specific 5-grams.
D = {}
for gram in fivegrams:
    if not D.get(gram):
        D[gram] = 1
    else:
        D[gram] += 1
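The same counts can be produced with collections.Counter from the standard library; a minimal equivalent (D_alt is a new name introduced here):
from collections import Counter
D_alt = Counter(fivegrams)  # maps each 5-gram to its frequency
assert D_alt == D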
Iterate through the keys of the dictionary and print out any 5-grams that occur more than 3 times in the text.
for gram in D.keys():
    if D[gram] > 3:
        print(' '.join(gram), D[gram])
Now we can try to find the occurrences of one of the 5-grams, 'venison with a gold spoon', in the original text.
hard_times_raw_text.find('venison with a gold spoon')
print(hard_times_raw_text[250010:250082])
hard_times_raw_text.find('venison with a gold spoon', 250057)
print(hard_times_raw_text[250084:250182])
We only find 2, instead of the 4 we counted, though. The 5-grams were built from tokens with punctuation stripped out, while str.find requires an exact character-for-character match, so occurrences interrupted by punctuation or a line break are missed.
hard_times_raw_text.find('venison with a gold spoon', 250157)
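Since str.find needs an exact match, a whitespace-tolerant regular expression search should recover the occurrences that are split across line breaks; a minimal sketch:
import re
len(re.findall(r'venison\s+with\s+a\s+gold\s+spoon', hard_times_raw_text))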
five_gram_2 = 'his hands in his pockets'
hard_times_raw_text.find(five_gram_2)
hard_times_raw_text.find(five_gram_2, 39378)
hard_times_raw_text.find(five_gram_2, 41789)
hard_times_raw_text.find(five_gram_2, 58760)
hard_times_raw_text.find(five_gram_2, 476236)
hard_times_raw_text.find(five_gram_2, 514355)
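Rather than chaining find calls by hand, a small loop collects every offset in one pass; a sketch using the same variables:
offsets = []
start = hard_times_raw_text.find(five_gram_2)
while start != -1:
    offsets.append(start)
    start = hard_times_raw_text.find(five_gram_2, start + 1)
print(offsets)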
Now we will try to reproduce Table 3.3 on p.47 of [MM].
DC_bigrams = []
for t in DC_tokens:
    DC_bigrams += nltk.ngrams((w for w in t if w.isalpha()), 2)
DC_D = {}
for gram in DC_bigrams:
    if not DC_D.get(gram):
        DC_D[gram] = 1
    else:
        DC_D[gram] += 1
for gram in DC_D.keys():
    if DC_D[gram] > 500:
        print(' '.join(gram), DC_D[gram])
DC_trigrams = []
for t in DC_tokens:
    DC_trigrams += nltk.ngrams((w for w in t if w.isalpha()), 3)
DC_D_3 = {}
for gram in DC_trigrams:
    if not DC_D_3.get(gram):
        DC_D_3[gram] = 1
    else:
        DC_D_3[gram] += 1
for gram in DC_D_3.keys():
    if DC_D_3[gram] > 60:
        print(' '.join(gram), DC_D_3[gram])
DC_fourgrams = []
for t in DC_tokens:
    DC_fourgrams += nltk.ngrams((w for w in t if w.isalpha()), 4)
DC_D_4 = {}
for gram in DC_fourgrams:
    if not DC_D_4.get(gram):
        DC_D_4[gram] = 1
    else:
        DC_D_4[gram] += 1
for gram in DC_D_4.keys():
    if DC_D_4[gram] > 18:
        print(' '.join(gram), DC_D_4[gram])
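The three cells above repeat the same pattern; a small helper makes the n-gram length a parameter (count_ngrams is a new name introduced here):
def count_ngrams(sent_tokens, n):
    counts = {}
    for t in sent_tokens:
        for gram in nltk.ngrams((w for w in t if w.isalpha()), n):
            counts[gram] = counts.get(gram, 0) + 1
    return counts

for gram, c in count_ngrams(DC_tokens, 4).items():
    if c > 18:
        print(' '.join(gram), c)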
By comparison with [MM] there is a problem in how I am handling punctuation. For example, I record the most frequent four-gram as "I do know what", but in [MM] it is "I don't know what". The tokenizer splits "don't" into "do" and "n't", and since "n't" fails the isalpha() test it is silently dropped.
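A quick check of the tokenizer confirms this:
nltk.tokenize.word_tokenize("I don't know what")  # ['I', 'do', "n't", 'know', 'what']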
david_copperfield_raw_text.find("I do know what")
david_copperfield_raw_text.find("I don't know what")
print(david_copperfield_raw_text[19264:19296])
At the very least I ought to reduce all text to lowercase, as done in [MM].
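Lowercasing is a one-line change at the token level; a sketch (DC_tokens_lower is a new name introduced here):
DC_tokens_lower = [[w.lower() for w in t] for t in DC_tokens]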
We begin with an extract.
test_extract = sents[1024:1037]
print(' '.join(test_extract))
import codecs, nltk
little_dorrit_path = "/home/matthew/workspace/resources/C/Corpus Stylistics/Dickens, Charles/pg963.txt"
little_dorrit_file = codecs.open(little_dorrit_path, encoding='utf-8')
little_dorrit_raw = little_dorrit_file.read()
len(little_dorrit_raw)
little_dorrit_raw.find(u'At the close of this recital')
end_phrase = 'producing the money.'
little_dorrit_raw.find(end_phrase)
task_string = little_dorrit_raw[1725461:1727185 + len(end_phrase)]
print(task_string)
import re
re_1 = r"'[^']+'"
re_2 = r"'[a-zA-Z0-9_,!? ]+(?:[-'][a-zA-Z0-9_,!? ]+)*'"
re_3 = r"'[^']+[\.,!?]'"
nltk.re_show(re_1, task_string[423:])
re.findall(re_1, task_string)
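For comparison, the stricter pattern re_3 can be applied to the same extract:
re.findall(re_3, task_string)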
from nltk.corpus import PlaintextCorpusReader
corpus_root = "/home/matthew/workspace/resources/C/Corpus Stylistics/Dickens, Charles"
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()
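The variable hard_times_sents_raw used below is never defined above; presumably it comes from the corpus reader's sents() method. A guess at the missing cell, assuming '786-0.txt' is the Hard Times fileid listed above:
hard_times_sents_raw = wordlists.sents('786-0.txt')  # assumption: sentence-split, tokenized Hard Times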
len(hard_times_sents_raw)
a = 0
for sentence in hard_times_sents_raw:
    a += len(sentence)
print(a)
import re
s = hard_times_sents_raw[1000]
# re.findall(r'\W+', ' '.join(s))
print(s)
In this section I show how to remove extraneous text from the raw string for "Hard Times". This is done by hand, by first identifying the first and last sentences of the text. There are some issues about Unicode that I haven't yet resolved.
hard_times_first_sentence = '\u2018 NOW , what I want is , Facts .'  # \u2018 is the left single quotation mark
hard_times_first_sentence.split() in hard_times_sents_raw
first_sentence_index = hard_times_sents_raw.index(hard_times_first_sentence.split())
' '.join(hard_times_sents_raw[first_sentence_index])
hard_times_last_sentence = 'We shall sit with lighter bosoms on the hearth , to see the ashes of our fires turn gray and cold .'
hard_times_last_sentence.split() in hard_times_sents_raw
last_sentence_index = hard_times_sents_raw.index(hard_times_last_sentence.split())
' '.join(hard_times_sents_raw[last_sentence_index])
hard_times_sents = hard_times_sents_raw[first_sentence_index:last_sentence_index + 1]
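A quick sanity check that the slice keeps both endpoint sentences:
print(' '.join(hard_times_sents[0]))
print(' '.join(hard_times_sents[-1]))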