NLTK is a leading platform for building Python programs to work with human language data. It provides easy-to-use interfaces to over 50 corpora and lexical resources such as WordNet, along with a suite of text processing libraries for classification, tokenization, stemming, tagging, parsing, and semantic reasoning, and an active discussion forum.
Library documentation: http://www.nltk.org/
# needed to display the graphs
%matplotlib inline
# import the library and download sample texts
import nltk
nltk.download()
showing info http://nltk.github.com/nltk_data/
True
from nltk.book import *
*** Introductory Examples for the NLTK Book *** Loading text1, ..., text9 and sent1, ..., sent9 Type the name of the text or sentence to view it. Type: 'texts()' or 'sents()' to list the materials. text1: Moby Dick by Herman Melville 1851 text2: Sense and Sensibility by Jane Austen 1811 text3: The Book of Genesis text4: Inaugural Address Corpus text5: Chat Corpus text6: Monty Python and the Holy Grail text7: Wall Street Journal text8: Personals Corpus text9: The Man Who Was Thursday by G . K . Chesterton 1908
# examine concordances (word + context)
text1.concordance("monstrous")
Displaying 11 of 11 matches: ong the former , one was of a most monstrous size . ... This came towards us , ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r ll over with a heathenish array of monstrous clubs and spears . Some were thick d as you gazed , and wondered what monstrous cannibal and savage could ever hav that has survived the flood ; most monstrous and most mountainous ! That Himmal they might scout at Moby Dick as a monstrous fable , or still worse and more de th of Radney .'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l ing Scenes . In connexion with the monstrous pictures of whales , I am strongly ere to enter upon those still more monstrous stories of them which are to be fo ght have been rummaged out of this monstrous cabinet there is no telling . But of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u
text1.similar("monstrous")
imperial subtly impalpable pitiable curious abundant perilous trustworthy untoward singular lamentable few determined maddens horrible tyrannical lazy mystifying christian exasperate
text2.common_contexts(["monstrous", "very"])
a_pretty is_pretty a_lucky am_glad be_glad
# see where in a text certain words are found to occur
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])
# count of all tokens (including punctuation)
len(text3)
44764
# number of distinct tokens
len(set(text3))
2789
# the texts are just lists of strings
text2[141525:]
[u'among', u'the', u'merits', u'and', u'the', u'happiness', u'of', u'Elinor', u'and', u'Marianne', u',', u'let', u'it', u'not', u'be', u'ranked', u'as', u'the', u'least', u'considerable', u',', u'that', u'though', u'sisters', u',', u'and', u'living', u'almost', u'within', u'sight', u'of', u'each', u'other', u',', u'they', u'could', u'live', u'without', u'disagreement', u'between', u'themselves', u',', u'or', u'producing', u'coolness', u'between', u'their', u'husbands', u'.', u'THE', u'END']
# build a frequency distribution
fdist1 = FreqDist(text1)
fdist1
FreqDist({u',': 18713, u'the': 13721, u'.': 6862, u'of': 6536, u'and': 6024, u'a': 4569, u'to': 4542, u';': 4072, u'in': 3916, u'that': 2982, ...})
fdist1.most_common(20)
[(u',', 18713), (u'the', 13721), (u'.', 6862), (u'of', 6536), (u'and', 6024), (u'a', 4569), (u'to', 4542), (u';', 4072), (u'in', 3916), (u'that', 2982), (u"'", 2684), (u'-', 2552), (u'his', 2459), (u'it', 2209), (u'I', 2124), (u's', 1739), (u'is', 1695), (u'he', 1661), (u'with', 1659), (u'was', 1632)]
fdist1['whale']
906
fdist1.plot(20, cumulative=True)
# apply a list comprehension to get words over 15 characters
V = set(text1)
long_words = [w for w in V if len(w) > 15]
sorted(long_words)
[u'CIRCUMNAVIGATION', u'Physiognomically', u'apprehensiveness', u'cannibalistically', u'characteristically', u'circumnavigating', u'circumnavigation', u'circumnavigations', u'comprehensiveness', u'hermaphroditical', u'indiscriminately', u'indispensableness', u'irresistibleness', u'physiognomically', u'preternaturalness', u'responsibilities', u'simultaneousness', u'subterraneousness', u'supernaturalness', u'superstitiousness', u'uncomfortableness', u'uncompromisedness', u'undiscriminating', u'uninterpenetratingly']
fdist2 = FreqDist(text5)
sorted(w for w in set(text5) if len(w) > 7 and fdist2[w] > 7)
[u'#14-19teens', u'#talkcity_adults', u'((((((((((', u'........', u'Question', u'actually', u'anything', u'computer', u'cute.-ass', u'everyone', u'football', u'innocent', u'listening', u'remember', u'seriously', u'something', u'together', u'tomorrow', u'watching']
# word sequences that appear together unusually often
text4.collocations()
United States; fellow citizens; four years; years ago; Federal Government; General Government; American people; Vice President; Old World; Almighty God; Fellow citizens; Chief Magistrate; Chief Justice; God bless; every citizen; Indian tribes; public debt; one another; foreign nations; political parties
# download raw text from an online repository
import urllib2
url = "http://www.gutenberg.org/files/2554/2554.txt"
response = urllib2.urlopen(url)
raw = response.read().decode('utf8')
len(raw)
1176896
raw[:75]
u'The Project Gutenberg EBook of Crime and Punishment, by Fyodor Dostoevsky\r\n'
# tokenize the raw text
from nltk import word_tokenize
tokens = word_tokenize(raw)
len(tokens)
254352
tokens[:10]
[u'The', u'Project', u'Gutenberg', u'EBook', u'of', u'Crime', u'and', u'Punishment', u',', u'by']
text = nltk.Text(tokens)
text[1024:1062]
[u'CHAPTER', u'I', u'On', u'an', u'exceptionally', u'hot', u'evening', u'early', u'in', u'July', u'a', u'young', u'man', u'came', u'out', u'of', u'the', u'garret', u'in', u'which', u'he', u'lodged', u'in', u'S.', u'Place', u'and', u'walked', u'slowly', u',', u'as', u'though', u'in', u'hesitation', u',', u'towards', u'K.', u'bridge', u'.']
text.collocations()
Katerina Ivanovna; Pyotr Petrovitch; Pulcheria Alexandrovna; Avdotya Romanovna; Rodion Romanovitch; Marfa Petrovna; Sofya Semyonovna; old woman; Project Gutenberg-tm; Porfiry Petrovitch; Amalia Ivanovna; great deal; Nikodim Fomitch; young man; Ilya Petrovitch; n't know; Project Gutenberg; Dmitri Prokofitch; Andrey Semyonovitch; Hay Market
raw.find("PART I")
5338
# HTML parsing using the Beautiful Soup library
from bs4 import BeautifulSoup
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = urllib2.urlopen(url).read().decode('utf8')
raw = BeautifulSoup(html).get_text()
tokens = word_tokenize(raw)
tokens[0:10]
[u'BBC', u'NEWS', u'|', u'Health', u'|', u'Blondes', u"'to", u'die', u'out', u'in']
# isolate just the article text
tokens = tokens[110:390]
text = nltk.Text(tokens)
text.concordance('gene')
Displaying 5 of 5 matches: hey say too few people now carry the gene for blondes to last beyond the next blonde hair is caused by a recessive gene . In order for a child to have blond have blonde hair , it must have the gene on both sides of the family in the g ere is a disadvantage of having that gene or by chance . They do n't disappear des would disappear is if having the gene was a disadvantage and I do not thin
# regular expression library
import re
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
# match the end of a word
[w for w in wordlist if re.search('ed$', w)][0:10]
[u'abaissed', u'abandoned', u'abased', u'abashed', u'abatised', u'abed', u'aborted', u'abridged', u'abscessed', u'absconded']
# wildcard matches any single character
[w for w in wordlist if re.search('^..j..t..$', w)][0:10]
[u'abjectly', u'adjuster', u'dejected', u'dejectly', u'injector', u'majestic', u'objectee', u'objector', u'rejecter', u'rejector']
# combination of caret (start of word) and sets
[w for w in wordlist if re.search('^[ghi][mno][jlk][def]$', w)]
[u'gold', u'golf', u'hold', u'hole']
chat_words = sorted(set(w for w in nltk.corpus.nps_chat.words()))
# plus symbol matches any number of times repeating
[w for w in chat_words if re.search('^m+i+n+e+$', w)]
[u'miiiiiiiiiiiiinnnnnnnnnnneeeeeeeeee', u'miiiiiinnnnnnnnnneeeeeeee', u'mine', u'mmmmmmmmiiiiiiiiinnnnnnnnneeeeeeee']
wsj = sorted(set(nltk.corpus.treebank.words()))
# more advanced regex example
[w for w in wsj if re.search('^[0-9]+\.[0-9]+$', w)][0:10]
[u'0.0085', u'0.05', u'0.1', u'0.16', u'0.2', u'0.25', u'0.28', u'0.3', u'0.4', u'0.5']
[w for w in wsj if re.search('^[A-Z]+\$$', w)]
[u'C$', u'US$']
[w for w in wsj if re.search('^[0-9]{4}$', w)][0:10]
[u'1614', u'1637', u'1787', u'1901', u'1903', u'1917', u'1925', u'1929', u'1933', u'1934']
[w for w in wsj if re.search('^[0-9]+-[a-z]{3,5}$', w)][0:10]
[u'10-day', u'10-lap', u'10-year', u'100-share', u'12-point', u'12-year', u'14-hour', u'15-day', u'150-point', u'190-point']
[w for w in wsj if re.search('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$', w)][0:10]
[u'black-and-white', u'bread-and-butter', u'father-in-law', u'machine-gun-toting', u'savings-and-loan']
[w for w in wsj if re.search('(ed|ing)$', w)][0:10]
[u'62%-owned', u'Absorbed', u'According', u'Adopting', u'Advanced', u'Advancing', u'Alfred', u'Allied', u'Annualized', u'Anything']
# using "findall" to extract partial matches from words
fd = nltk.FreqDist(vs for word in wsj
for vs in re.findall(r'[aeiou]{2,}', word))
fd.most_common(12)
[(u'io', 549), (u'ea', 476), (u'ie', 331), (u'ou', 329), (u'ai', 261), (u'ia', 253), (u'ee', 217), (u'oo', 174), (u'ua', 109), (u'au', 106), (u'ue', 105), (u'ui', 95)]
# NLTK has several word stemmers built in
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
[porter.stem(t) for t in tokens][0:10]
[u'UK', u'Blond', u"'to", u'die', u'out', u'in', u'200', u"years'", u'Scientist', u'believ']
[lancaster.stem(t) for t in tokens][0:10]
[u'uk', u'blond', u"'to", u'die', u'out', u'in', u'200', u"years'", u'sci', u'believ']
wnl = nltk.WordNetLemmatizer()
[wnl.lemmatize(t) for t in tokens][0:10]
[u'UK', u'Blondes', u"'to", u'die', u'out', u'in', u'200', u"years'", u'Scientists', u'believe']
# also has a tokenizer that takes a regular expression as a parameter
text = 'That U.S.A. poster-print costs $12.40...'
pattern = r'''(?x) # set flag to allow verbose regexps
([A-Z]\.)+ # abbreviations, e.g. U.S.A.
| \w+(-\w+)* # words with optional internal hyphens
| \$?\d+(\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
| \.\.\. # ellipsis
| [][.,;"'?():-_`] # these are separate tokens; includes ], [
'''
nltk.regexp_tokenize(text, pattern)
['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', '...']
# Use a built-in tokenizer and tagger
text = word_tokenize("They refuse to permit us to obtain the refuse permit")
nltk.pos_tag(text)
[('They', 'PRP'), ('refuse', 'VBP'), ('to', 'TO'), ('permit', 'VB'), ('us', 'PRP'), ('to', 'TO'), ('obtain', 'VB'), ('the', 'DT'), ('refuse', 'NN'), ('permit', 'NN')]
# Word similarity using a pre-tagged text
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
text.similar('woman')
man time day year car moment world family house country child boy state job way war girl place word work
# Tagged words are saved as tuples
nltk.corpus.brown.tagged_words()[0:10]
[(u'The', u'AT'), (u'Fulton', u'NP-TL'), (u'County', u'NN-TL'), (u'Grand', u'JJ-TL'), (u'Jury', u'NN-TL'), (u'said', u'VBD'), (u'Friday', u'NR'), (u'an', u'AT'), (u'investigation', u'NN'), (u'of', u'IN')]
nltk.corpus.brown.tagged_words(tagset='universal')[0:10]
[(u'The', u'DET'), (u'Fulton', u'NOUN'), (u'County', u'NOUN'), (u'Grand', u'ADJ'), (u'Jury', u'NOUN'), (u'said', u'VERB'), (u'Friday', u'NOUN'), (u'an', u'DET'), (u'investigation', u'NOUN'), (u'of', u'ADP')]
from nltk.corpus import brown
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
tag_fd.most_common()
[(u'NOUN', 30640), (u'VERB', 14399), (u'ADP', 12355), (u'.', 11928), (u'DET', 11389), (u'ADJ', 6706), (u'ADV', 3349), (u'CONJ', 2717), (u'PRON', 2535), (u'PRT', 2264), (u'NUM', 2166), (u'X', 106)]
# Part of speech tag count for words following "often" in a text
brown_lrnd_tagged = brown.tagged_words(categories='learned', tagset='universal')
tags = [b[1] for (a, b) in nltk.bigrams(brown_lrnd_tagged) if a[0] == 'often']
fd = nltk.FreqDist(tags)
fd.tabulate()
VERB ADV ADP ADJ . PRT 37 8 7 6 4 2
# Load some raw sentences to tag
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
# Default tagger (assigns same tag to each token)
tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
nltk.FreqDist(tags).max()
raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
tokens = word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')
default_tagger.tag(tokens)
[('I', 'NN'), ('do', 'NN'), ('not', 'NN'), ('like', 'NN'), ('green', 'NN'), ('eggs', 'NN'), ('and', 'NN'), ('ham', 'NN'), (',', 'NN'), ('I', 'NN'), ('do', 'NN'), ('not', 'NN'), ('like', 'NN'), ('them', 'NN'), ('Sam', 'NN'), ('I', 'NN'), ('am', 'NN'), ('!', 'NN')]
# Evaluate the performance against a tagged corpus
default_tagger.evaluate(brown_tagged_sents)
0.13089484257215028
# Training a unigram tagger
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
unigram_tagger.tag(brown_sents[2007])
[(u'Various', u'JJ'), (u'of', u'IN'), (u'the', u'AT'), (u'apartments', u'NNS'), (u'are', u'BER'), (u'of', u'IN'), (u'the', u'AT'), (u'terrace', u'NN'), (u'type', u'NN'), (u',', u','), (u'being', u'BEG'), (u'on', u'IN'), (u'the', u'AT'), (u'ground', u'NN'), (u'floor', u'NN'), (u'so', u'QL'), (u'that', u'CS'), (u'entrance', u'NN'), (u'is', u'BEZ'), (u'direct', u'JJ'), (u'.', u'.')]
# Now evalute it
unigram_tagger.evaluate(brown_tagged_sents)
0.9349006503968017
# Combining taggers
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(brown_tagged_sents, backoff=t0)
t2 = nltk.BigramTagger(brown_tagged_sents, backoff=t1)
t2.evaluate(brown_tagged_sents)
0.9730592517453309
# Define a feature extractor
def gender_features(word):
return {'last_letter': word[-1]}
gender_features('Shrek')
{'last_letter': 'k'}
# Prepare a list of examples
from nltk.corpus import names
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
[(name, 'female') for name in names.words('female.txt')])
import random
random.shuffle(labeled_names)
# Process the names data
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.classify(gender_features('Neo'))
'male'
classifier.classify(gender_features('Trinity'))
'female'
print(nltk.classify.accuracy(classifier, test_set))
0.752
classifier.show_most_informative_features(5)
Most Informative Features last_letter = u'a' female : male = 35.4 : 1.0 last_letter = u'k' male : female = 31.9 : 1.0 last_letter = u'f' male : female = 17.4 : 1.0 last_letter = u'p' male : female = 11.3 : 1.0 last_letter = u'm' male : female = 10.2 : 1.0
# Document classification
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = all_words.keys()[:2000]
def document_features(document):
document_words = set(document)
features = {}
for word in word_features:
features['contains(%s)' % word] = (word in document_words)
return features
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
0.64
classifier.show_most_informative_features(5)
Most Informative Features contains(sans) = True neg : pos = 8.4 : 1.0 contains(uplifting) = True pos : neg = 8.2 : 1.0 contains(mediocrity) = True neg : pos = 7.7 : 1.0 contains(dismissed) = True pos : neg = 7.0 : 1.0 contains(overwhelmed) = True pos : neg = 6.3 : 1.0