#!/usr/bin/env python
# coding: utf-8

# # Natural Language Processing

# Sources:
#
# * [Natural Language Processing with Python – Analyzing Text with the Natural Language Toolkit](http://www.nltk.org/book/) by Steven Bird, Ewan Klein, and Edward Loper
# * [NLP Tutorial Using Python NLTK (Simple Examples)](https://likegeeks.com/nlp-tutorial-using-python-nltk/)
# * [NLTK: Natural Language Toolkit Documentation](http://www.nltk.org/)
# * [Demos by the Cognitive Computation Group](http://cogcomp.cs.illinois.edu/page/demos) at the University of Illinois, Urbana-Champaign
#
# The NLTK package can be installed with
#
#     conda install nltk

# In[ ]:

import numpy as np
import matplotlib.pyplot as plt

# The first step in any natural language work is to get some text. The book provides several large text samples that you can download with the following commands. A window will pop up from which you can select various [corpora](https://en.wikipedia.org/wiki/Text_corpus). Select "book" and download it.

# In[ ]:

import nltk
nltk.download()   # download text examples from "book"

# Let's assume this has been done. Now the material for the book can be used by doing

# In[ ]:

import nltk.book as bk

# Cool! All of *Moby Dick*! Let's take a look at the text. The text objects can be treated as lists, so

# In[ ]:

bk.text1[:8]

# In[ ]:

bk.text1[1000:1010]

# A very common way to characterize some text is by a count of the number of times each word appears in it. The text objects have a number of useful methods, such as

# In[ ]:

bk.text1.count('whale')

# We could now write a Python function to count the frequency of occurrence of a number of words, but NLTK already provides a way to do this for all of the words in a text.

# In[ ]:

wordCounts = bk.FreqDist(bk.text1)
len(wordCounts)

# In[ ]:

type(wordCounts)

# In[ ]:

wordCounts.most_common()[:10]

# Now we need a plot! We can do it ourselves, or use the `plot` method already defined for the `FreqDist` object.

# In[ ]:

plt.figure(figsize=(10, 10))
wordCounts.plot(50);

# But where are 'Ishmael' and 'Starbuck'?

# In[ ]:

wordCounts['Ishmael'], wordCounts.freq('Ishmael')

# In[ ]:

wordCounts['Starbuck'], wordCounts.freq('Starbuck')

# Who is mentioned more in *Moby Dick*?
#
# Usually we will want to ignore short words, including punctuation, and words that occur a very small number of times. We can use a list comprehension to build this list, but first we need a way to create a list of words without repetitions. Python's `set` to the rescue.

# In[ ]:

len(bk.text1)

# In[ ]:

len(set(bk.text1))

# So each word is used, on average, ...

# In[ ]:

len(bk.text1) / len(set(bk.text1))

# ... about 13 times. Now let's step through each word in the set, keep only the words that are longer than 8 characters and appear more than 20 times, and sort the result.

# In[ ]:

sigWords = sorted([word for word in set(bk.text1) if len(word) > 8 and wordCounts[word] > 20])
len(sigWords)

# In[ ]:

print(*sigWords)   # prints all on one line; easier to read than just evaluating sigWords

# Another common technique is to look for *collocations*, or pairs of words that appear together more often than would be expected from the number of times each word appears on its own. Yep, NLTK has a method for that.
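# Before calling the built-in method below, here is a rough sketch of how bigram collocations can be found by hand with NLTK's collocation utilities. This is only a sketch: it scores pairs with pointwise mutual information (PMI), while the built-in `collocations()` method applies its own stopword filtering and scoring, so the two lists will not match exactly.

# In[ ]:

# Sketch: find candidate collocations by hand (assumes bk.text1 is loaded as above).
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(bk.text1)
finder.apply_freq_filter(20)             # keep only pairs that occur at least 20 times
finder.nbest(bigram_measures.pmi, 10)    # the ten highest-scoring pairs by PMI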
# In[ ]:

bk.text1.collocations()

# # Using text from a URL

# In[ ]:

import urllib.request

# In[ ]:

response = urllib.request.urlopen('https://www.gutenberg.org/files/46/46-h/46-h.htm')
html = response.read()
# print(html)

# To extract the text from the HTML, we will use [Beautiful Soup](https://medium.freecodecamp.org/how-to-scrape-websites-with-python-and-beautifulsoup-5946935d93fe).

# In[ ]:

from bs4 import BeautifulSoup

# In[ ]:

soup = BeautifulSoup(html, 'html.parser')

# In[ ]:

soup(['script', 'style'])   # find all <script> and <style> elements

# In[ ]:

for script in soup(["script", "style"]):
    script.extract()        # remove them so only the readable text remains

# In[ ]:

text = soup.get_text(strip=True)

# In[ ]:

text[:100]

# In[ ]:

tokens = [t for t in text.split()]

# In[ ]:

len(tokens)

# In[ ]:

tokens[:20]

# In[ ]:

freq = nltk.FreqDist(tokens)
freq.plot(20, cumulative=False)

# In[ ]:

stopwords = nltk.corpus.stopwords.words('english')
stopwords[:10]

# In[ ]:

tokens_no_stopwords = [token for token in tokens if token not in stopwords]

# In[ ]:

len(tokens), len(tokens_no_stopwords)

# In[ ]:

freq = nltk.FreqDist(tokens_no_stopwords)
freq.plot(20, cumulative=False)

# ## Tokenize into sentences and words

# In[ ]:

sentences = nltk.tokenize.sent_tokenize(text)
print(len(sentences))
sentences[:10]

# Let's get rid of those \n's and \r's.

# In[ ]:

sentences = nltk.tokenize.sent_tokenize(text.replace('\n', '').replace('\r', ''))
print(len(sentences))
sentences[:10]

# We can also tokenize into words, in a better way than what we did above.

# In[ ]:

words = nltk.tokenize.word_tokenize(text.replace('\n', ''))
print(len(words))
words[:20]

# We often want the part-of-speech (POS) tag for each word in order to analyze the structure of sentences.

# In[ ]:

nltk.pos_tag(words[1000:1010])

# Let's get rid of stopwords again.

# In[ ]:

words_no_stopwords = [word for word in words if word not in stopwords]

# In[ ]:

len(words_no_stopwords)

# In[ ]:

freq = nltk.FreqDist(words_no_stopwords)
freq.plot(20);

# And let's remove words that are single characters.

# In[ ]:

words_no_stopwords = [word for word in words if word not in stopwords and len(word) > 1]

# In[ ]:

freq = nltk.FreqDist(words_no_stopwords)
freq.plot(20);

# In[ ]:

freq.most_common()[:20]

# # Compare texts
#
# Let's come up with a very simple way to compare texts and apply it to see how it rates the similarities between two Dickens books and two Wilde books.

# In[ ]:

import urllib.request
from bs4 import BeautifulSoup

# In[ ]:

def load_text(url):
    response = urllib.request.urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text(strip=True)
    words = nltk.tokenize.word_tokenize(text)
    stopwords = nltk.corpus.stopwords.words('english')
    words_no_stopwords = [word for word in words if word not in stopwords]
    freq = nltk.FreqDist(words_no_stopwords)
    commonWordsCounts = freq.most_common()[:500]
    return [word for (word, count) in commonWordsCounts if len(word) > 1]

# In[ ]:

dickens1 = load_text('http://www.gutenberg.org/cache/epub/730/pg730.txt')
dickens2 = load_text('http://www.gutenberg.org/files/786/786-0.txt')

# In[ ]:

wilde1 = load_text('http://www.gutenberg.org/files/790/790-0.txt')
wilde2 = load_text('http://www.gutenberg.org/cache/epub/14522/pg14522.txt')

# How many of the 500 most common words do the two Dickens books have in common?

# In[ ]:

len(set(dickens1) & set(dickens2))

# and the two Wilde books?

# In[ ]:

len(set(wilde1) & set(wilde2))
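# These raw intersection counts are easy to compare here because every list was built from the 500 most common words, but a normalized score between 0 and 1 is often more convenient. Here is a minimal sketch of such a score; the helper name `jaccard` is ours, not part of NLTK.

# In[ ]:

def jaccard(words_a, words_b):
    # fraction of distinct words the two lists share (intersection over union)
    a, b = set(words_a), set(words_b)
    return len(a & b) / len(a | b)

jaccard(dickens1, dickens2), jaccard(wilde1, wilde2)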
# How about the first Dickens book compared to the two Wilde books?

# In[ ]:

len(set(dickens1) & set(wilde1)), len(set(dickens1) & set(wilde2))

# and the second Dickens book compared to the two Wilde books?

# In[ ]:

len(set(dickens2) & set(wilde1)), len(set(dickens2) & set(wilde2))

# In[ ]:

print(*dickens1)
print(*dickens2)

# In[ ]:

print(*wilde1)
print(*wilde2)

# ## Better Pre-processing

# In[ ]:

import re

wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def simplify(doc):
    # remove non-letter characters, lower-case, and strip leading/trailing whitespace
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

# In[ ]:

simplify("Hello, Mary, wouldn't you like to skip this class?")

# In[ ]:

def load_text(url):
    response = urllib.request.urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text(strip=True)
    text = simplify(text)
    words = nltk.tokenize.word_tokenize(text)
    stopwords = nltk.corpus.stopwords.words('english')
    words_no_stopwords = [word for word in words if word not in stopwords]
    freq = nltk.FreqDist(words_no_stopwords)
    commonWordsCounts = freq.most_common()[:500]
    return [word for (word, count) in commonWordsCounts if len(word) > 1]

# In[ ]:

print(*wilde1)

# In[ ]:

wilde1 = load_text('http://www.gutenberg.org/cache/epub/301/pg301.txt')

# In[ ]:

print(*wilde1)

# In[ ]:

dickens1 = load_text('http://www.gutenberg.org/cache/epub/730/pg730.txt')
dickens2 = load_text('http://www.gutenberg.org/files/786/786-0.txt')

# In[ ]:

wilde1 = load_text('http://www.gutenberg.org/files/790/790-0.txt')
wilde2 = load_text('http://www.gutenberg.org/cache/epub/14522/pg14522.txt')

# In[ ]:

n_in_common = []
for txt1 in [dickens1, dickens2, wilde1, wilde2]:
    for txt2 in [dickens1, dickens2, wilde1, wilde2]:
        n_in_common.append(len(set(txt1) & set(txt2)))

# In[ ]:

n_in_common

# In[ ]:

np.array(n_in_common).reshape(4, 4)

# In[ ]:

import pandas as pd

pd.DataFrame(np.array(n_in_common).reshape(4, 4),
             index=['dickens1', 'dickens2', 'wilde1', 'wilde2'],
             columns=['dickens1', 'dickens2', 'wilde1', 'wilde2'])
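# The overlap counts above still treat inflected forms such as "looked" and "looking" as different words. A possible refinement, sketched below rather than taken from the sources, is to stem each word with NLTK's `PorterStemmer` before intersecting; the helper `stem_set` is hypothetical, and merging inflected forms this way may shift the counts.

# In[ ]:

# Sketch: compare the common-word lists after stemming (assumes the lists above are loaded).
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def stem_set(words):
    # hypothetical helper: collapse a word list to a set of Porter stems
    return {stemmer.stem(word) for word in words}

n_in_common_stemmed = [[len(stem_set(txt1) & stem_set(txt2))
                        for txt2 in [dickens1, dickens2, wilde1, wilde2]]
                       for txt1 in [dickens1, dickens2, wilde1, wilde2]]

pd.DataFrame(n_in_common_stemmed,
             index=['dickens1', 'dickens2', 'wilde1', 'wilde2'],
             columns=['dickens1', 'dickens2', 'wilde1', 'wilde2'])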