#!/usr/bin/env python
# coding: utf-8

# # NLTK
#
# NLTK is a leading platform for building Python programs to work with human
# language data. It provides easy-to-use interfaces to over 50 corpora and
# lexical resources such as WordNet, along with a suite of text processing
# libraries for classification, tokenization, stemming, tagging, parsing, and
# semantic reasoning, as well as an active discussion forum.
#
# Library documentation: http://www.nltk.org/

# In[1]:

# needed to display the graphs
get_ipython().run_line_magic('matplotlib', 'inline')

# In[2]:

# import the library and download sample texts
import nltk
nltk.download()

# In[3]:

from nltk.book import *

# In[4]:

# examine concordances (word + context)
text1.concordance("monstrous")

# In[5]:

text1.similar("monstrous")

# In[6]:

text2.common_contexts(["monstrous", "very"])

# In[7]:

# see where in a text certain words occur
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])

# In[8]:

# count of all tokens (including punctuation)
len(text3)

# In[9]:

# number of distinct tokens
len(set(text3))

# In[10]:

# the texts are just lists of strings
text2[141525:]

# In[11]:

# build a frequency distribution
fdist1 = FreqDist(text1)
fdist1

# In[12]:

fdist1.most_common(20)

# In[13]:

fdist1['whale']

# In[14]:

fdist1.plot(20, cumulative=True)

# In[15]:

# use a list comprehension to get words over 15 characters
V = set(text1)
long_words = [w for w in V if len(w) > 15]
sorted(long_words)

# In[16]:

fdist2 = FreqDist(text5)
sorted(w for w in set(text5) if len(w) > 7 and fdist2[w] > 7)

# In[17]:

# word sequences that appear together unusually often
text4.collocations()

# ## Raw Text Processing

# In[18]:

# download raw text from an online repository
from urllib import request
url = "http://www.gutenberg.org/files/2554/2554.txt"
response = request.urlopen(url)
raw = response.read().decode('utf8')
len(raw)

# In[19]:

raw[:75]

# In[20]:

# tokenize the raw text
from nltk import word_tokenize
tokens = word_tokenize(raw)
len(tokens)

# In[21]:

tokens[:10]

# In[22]:

text = nltk.Text(tokens)
text[1024:1062]

# In[23]:

text.collocations()

# In[24]:

raw.find("PART I")

# In[25]:

# HTML parsing using the Beautiful Soup library
from bs4 import BeautifulSoup
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = request.urlopen(url).read().decode('utf8')
raw = BeautifulSoup(html, 'html.parser').get_text()
tokens = word_tokenize(raw)
tokens[0:10]

# In[26]:

# isolate just the article text
tokens = tokens[110:390]
text = nltk.Text(tokens)
text.concordance('gene')

# ## Regular Expressions

# In[27]:

# regular expression library
import re
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]

# In[28]:

# match the end of a word
[w for w in wordlist if re.search('ed$', w)][0:10]

# In[29]:

# the wildcard "." matches any single character
[w for w in wordlist if re.search('^..j..t..$', w)][0:10]

# In[30]:

# combination of caret (start of word) and character sets
[w for w in wordlist if re.search('^[ghi][mno][jlk][def]$', w)]

# In[31]:

chat_words = sorted(set(w for w in nltk.corpus.nps_chat.words()))
# the plus symbol matches one or more repetitions of the preceding character
[w for w in chat_words if re.search('^m+i+n+e+$', w)]

# In[32]:

wsj = sorted(set(nltk.corpus.treebank.words()))
# more advanced regex example
[w for w in wsj if re.search(r'^[0-9]+\.[0-9]+$', w)][0:10]

# In[33]:

[w for w in wsj if re.search(r'^[A-Z]+\$$', w)]

# In[34]:

[w for w in wsj if re.search('^[0-9]{4}$', w)][0:10]

# In[35]:

[w for w in wsj if re.search('^[0-9]+-[a-z]{3,5}$', w)][0:10]
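
# In[ ]:

# A quick variation on the searches above (an illustrative sketch): the same
# list-comprehension-plus-re.search idiom also works for counting matches.
# Here we look for Treebank tokens shaped like four-digit years; the name
# year_like is arbitrary.
year_like = [w for w in wsj if re.search(r'^[12][0-9]{3}$', w)]
len(year_like), year_like[:5]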

# In[36]:

[w for w in wsj if re.search('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$', w)][0:10]

# In[37]:

[w for w in wsj if re.search('(ed|ing)$', w)][0:10]

# In[38]:

# using "findall" to extract partial matches from words
fd = nltk.FreqDist(vs for word in wsj for vs in re.findall(r'[aeiou]{2,}', word))
fd.most_common(12)

# ## Normalizing Text

# In[39]:

# NLTK has several word stemmers built in
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()

# In[40]:

[porter.stem(t) for t in tokens][0:10]

# In[41]:

[lancaster.stem(t) for t in tokens][0:10]

# In[42]:

wnl = nltk.WordNetLemmatizer()
[wnl.lemmatize(t) for t in tokens][0:10]

# In[43]:

# NLTK also has a tokenizer that takes a regular expression as a parameter
text = 'That U.S.A. poster-print costs $12.40...'
pattern = r'''(?x)          # set flag to allow verbose regexps
    (?:[A-Z]\.)+            # abbreviations, e.g. U.S.A.
  | \w+(?:-\w+)*            # words with optional internal hyphens
  | \$?\d+(?:\.\d+)?%?      # currency and percentages, e.g. $12.40, 82%
  | \.\.\.                  # ellipsis
  | [][.,;"'?():_`-]        # these are separate tokens; includes ], [
'''
nltk.regexp_tokenize(text, pattern)

# ## Tagging

# In[44]:

# Use a built-in tokenizer and tagger
text = word_tokenize("They refuse to permit us to obtain the refuse permit")
nltk.pos_tag(text)

# In[45]:

# Word similarity using a pre-tagged text
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
text.similar('woman')

# In[46]:

# Tagged words are stored as (word, tag) tuples
nltk.corpus.brown.tagged_words()[0:10]

# In[47]:

nltk.corpus.brown.tagged_words(tagset='universal')[0:10]

# In[48]:

from nltk.corpus import brown
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
tag_fd.most_common()

# In[49]:

# Part-of-speech tag counts for words following "often" in a text
brown_lrnd_tagged = brown.tagged_words(categories='learned', tagset='universal')
tags = [b[1] for (a, b) in nltk.bigrams(brown_lrnd_tagged) if a[0] == 'often']
fd = nltk.FreqDist(tags)
fd.tabulate()

# In[50]:

# Load tagged and raw sentences to work with
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
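
# In[ ]:

# An illustrative sketch building on the tagged words loaded above: a
# ConditionalFreqDist keyed by word gives the tag distribution for any
# individual word; the words queried here are arbitrary choices.
cfd = nltk.ConditionalFreqDist(brown_news_tagged)
cfd['the'].most_common(), cfd['cut'].most_common()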

# In[51]:

# Default tagger (assigns the same tag to every token)
tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
nltk.FreqDist(tags).max()
raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
tokens = word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')
default_tagger.tag(tokens)

# In[52]:

# Evaluate the performance against a tagged corpus
default_tagger.evaluate(brown_tagged_sents)

# In[53]:

# Training a unigram tagger
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
unigram_tagger.tag(brown_sents[2007])

# In[54]:

# Now evaluate it
unigram_tagger.evaluate(brown_tagged_sents)

# In[55]:

# Combining taggers with backoff
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(brown_tagged_sents, backoff=t0)
t2 = nltk.BigramTagger(brown_tagged_sents, backoff=t1)
t2.evaluate(brown_tagged_sents)

# ## Classifying Text

# In[56]:

# Define a feature extractor
def gender_features(word):
    return {'last_letter': word[-1]}

gender_features('Shrek')

# In[57]:

# Prepare a list of labeled examples
from nltk.corpus import names
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                 [(name, 'female') for name in names.words('female.txt')])
import random
random.shuffle(labeled_names)

# In[58]:

# Process the names data and train a classifier
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# In[59]:

classifier.classify(gender_features('Neo'))

# In[60]:

classifier.classify(gender_features('Trinity'))

# In[61]:

print(nltk.classify.accuracy(classifier, test_set))

# In[62]:

classifier.show_most_informative_features(5)

# In[63]:

# Document classification
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

# In[64]:

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

# In[65]:

featuresets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# In[66]:

print(nltk.classify.accuracy(classifier, test_set))

# In[67]:

classifier.show_most_informative_features(5)
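
# In[ ]:

# An illustrative follow-up sketch: the trained classifier can label a single
# review the same way. documents[0] is simply whichever review happens to come
# first after shuffling, not a hand-picked example.
test_doc, test_label = documents[0]
classifier.classify(document_features(test_doc)), test_label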