#!/usr/bin/env python
# coding: utf-8

# # Natural Language Processing

# Sources:
#
# * [Natural Language Processing with Python – Analyzing Text with the Natural Language Toolkit](http://www.nltk.org/book/) by Steven Bird, Ewan Klein, and Edward Loper
# * [NLP Tutorial Using Python NLTK (Simple Examples)](https://likegeeks.com/nlp-tutorial-using-python-nltk/)
# * [NLTK: Natural Language Toolkit Documentation](http://www.nltk.org/)
# * [Demos by the Cognitive Computation Group](http://cogcomp.cs.illinois.edu/page/demos) at the University of Illinois, Urbana-Champaign
#
# The NLTK package can be installed with
#
#     conda install nltk

# In[ ]:

import numpy as np
import matplotlib.pyplot as plt

# The first step in any natural language work is to get some text. The book provides several large text samples that you can download with the following commands. A window will pop up from which you can select various [corpora](https://en.wikipedia.org/wiki/Text_corpus). Select "book" and download it.

# In[ ]:

import nltk
nltk.download()   # download text examples from "book"

# Let's assume this has been done. Now the material for the book can be used by doing

# In[ ]:

import nltk.book as bk

# Cool! All of *Moby Dick*! Let's take a look at the text. The text objects can be treated as lists, so

# In[ ]:

bk.text1[:8]

# In[ ]:

bk.text1[1000:1010]

# A very common way to characterize some text is by a count of the number of times each word appears in it. The text objects have a number of useful methods, such as

# In[ ]:

bk.text1.count('whale')

# We could now write a Python function to count the frequency of occurrence of a number of words, but NLTK already provides a way to do this for all of the words in a text.

# In[ ]:

wordCounts = bk.FreqDist(bk.text1)
len(wordCounts)

# In[ ]:

type(wordCounts)

# In[ ]:

wordCounts.most_common()[:10]

# Now we need a plot! We can do it ourselves, or use the `plot` method already defined for the `FreqDist` object.

# In[ ]:

plt.figure(figsize=(10, 10))
wordCounts.plot(50);

# But where are 'Ishmael' and 'Starbuck'?

# In[ ]:

wordCounts['Ishmael'], wordCounts.freq('Ishmael')

# In[ ]:

wordCounts['Starbuck'], wordCounts.freq('Starbuck')

# Who is mentioned more in *Moby Dick*?
#
# Usually we will want to ignore short words, including punctuation, and words that occur a very small number of times. We can use a list comprehension to build this list, but first we need a way to create a list of words without repetitions. Python's `set` to the rescue.

# In[ ]:

len(bk.text1)

# In[ ]:

len(set(bk.text1))

# So each word is used, on average, ...

# In[ ]:

len(bk.text1) / len(set(bk.text1))

# ... about 13 times. Now let's step through each word in the set, keep only the words that are longer than 8 characters and appear more than 20 times, and sort the result.

# In[ ]:

sigWords = sorted([word for word in set(bk.text1) if len(word) > 8 and wordCounts[word] > 20])
len(sigWords)

# In[ ]:

print(*sigWords)   # prints all on one line; easier to read than just evaluating sigWords

# Another common technique is to look for *collocations*, or pairs of words that appear together more often than would be expected from the number of times each word appears on its own. Yep, NLTK has a method for that.
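# Before calling the built-in method below, here is a rough sketch of how bigram collocations can be found by hand with NLTK's collocation utilities. This is only a sketch: it scores pairs with pointwise mutual information (PMI), while the built-in `collocations()` method applies its own stopword filtering and scoring, so the two lists will not match exactly.

# In[ ]:

# Sketch: find candidate collocations by hand (assumes bk.text1 is loaded as above).
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(bk.text1)
finder.apply_freq_filter(20)             # keep only pairs that occur at least 20 times
finder.nbest(bigram_measures.pmi, 10)    # the ten highest-scoring pairs by PMI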
# In[ ]:

bk.text1.collocations()

# # Using text from a URL

# In[ ]:

import urllib.request

# In[ ]:

response = urllib.request.urlopen('https://www.gutenberg.org/files/46/46-h/46-h.htm')
html = response.read()
# print(html)

# To extract the text from the HTML, we will use [Beautiful Soup](https://medium.freecodecamp.org/how-to-scrape-websites-with-python-and-beautifulsoup-5946935d93fe).

# In[ ]:

from bs4 import BeautifulSoup

# In[ ]:

soup = BeautifulSoup(html, 'html.parser')

# In[ ]:

soup(['script', 'style'])   # find all <script> and <style> elements

# In[ ]:

for script in soup(["script", "style"]):
    script.extract()        # remove them so only the readable text remains

# In[ ]:

text = soup.get_text(strip=True)

# In[ ]:

text[:100]

# In[ ]:

tokens = [t for t in text.split()]

# In[ ]:

len(tokens)

# In[ ]:

tokens[:20]

# In[ ]:

freq = nltk.FreqDist(tokens)
freq.plot(20, cumulative=False)

# In[ ]:

stopwords = nltk.corpus.stopwords.words('english')
stopwords[:10]

# In[ ]:

tokens_no_stopwords = [token for token in tokens if token not in stopwords]

# In[ ]:

len(tokens), len(tokens_no_stopwords)

# In[ ]:

freq = nltk.FreqDist(tokens_no_stopwords)
freq.plot(20, cumulative=False)

# ## Tokenize into sentences and words

# In[ ]:

sentences = nltk.tokenize.sent_tokenize(text)
print(len(sentences))
sentences[:10]

# Let's get rid of those \n's and \r's.

# In[ ]:

sentences = nltk.tokenize.sent_tokenize(text.replace('\n', '').replace('\r', ''))
print(len(sentences))
sentences[:10]

# We can also tokenize into words, in a better way than what we did above.

# In[ ]:

words = nltk.tokenize.word_tokenize(text.replace('\n', ''))
print(len(words))
words[:20]

# We often want the part-of-speech (POS) tag for each word in order to analyze the structure of sentences.

# In[ ]:

nltk.pos_tag(words[1000:1010])

# Let's get rid of stopwords again.

# In[ ]:

words_no_stopwords = [word for word in words if word not in stopwords]

# In[ ]:

len(words_no_stopwords)

# In[ ]:

freq = nltk.FreqDist(words_no_stopwords)
freq.plot(20);

# And let's remove words that are single characters.

# In[ ]:

words_no_stopwords = [word for word in words if word not in stopwords and len(word) > 1]

# In[ ]:

freq = nltk.FreqDist(words_no_stopwords)
freq.plot(20);

# In[ ]:

freq.most_common()[:20]

# # Compare texts
#
# Let's come up with a very simple way to compare texts and apply it to see how it rates the similarities between two Dickens books and two Wilde books.

# In[ ]:

import urllib.request
from bs4 import BeautifulSoup

# In[ ]:

def load_text(url):
    response = urllib.request.urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text(strip=True)
    words = nltk.tokenize.word_tokenize(text)
    stopwords = nltk.corpus.stopwords.words('english')
    words_no_stopwords = [word for word in words if word not in stopwords]
    freq = nltk.FreqDist(words_no_stopwords)
    commonWordsCounts = freq.most_common()[:500]
    return [word for (word, count) in commonWordsCounts if len(word) > 1]

# In[ ]:

dickens1 = load_text('http://www.gutenberg.org/cache/epub/730/pg730.txt')
dickens2 = load_text('http://www.gutenberg.org/files/786/786-0.txt')

# In[ ]:

wilde1 = load_text('http://www.gutenberg.org/files/790/790-0.txt')
wilde2 = load_text('http://www.gutenberg.org/cache/epub/14522/pg14522.txt')

# How many of the 500 most common words do the two Dickens books have in common?

# In[ ]:

len(set(dickens1) & set(dickens2))

# and the two Wilde books?

# In[ ]:

len(set(wilde1) & set(wilde2))
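# These raw intersection counts are easy to compare here because every list was built from the 500 most common words, but a normalized score between 0 and 1 is often more convenient. Here is a minimal sketch of such a score; the helper name `jaccard` is ours, not part of NLTK.

# In[ ]:

def jaccard(words_a, words_b):
    # fraction of distinct words the two lists share (intersection over union)
    a, b = set(words_a), set(words_b)
    return len(a & b) / len(a | b)

jaccard(dickens1, dickens2), jaccard(wilde1, wilde2)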
# How about the first Dickens book compared to the two Wilde books?

# In[ ]:

len(set(dickens1) & set(wilde1)), len(set(dickens1) & set(wilde2))

# and the second Dickens book compared to the two Wilde books?

# In[ ]:

len(set(dickens2) & set(wilde1)), len(set(dickens2) & set(wilde2))

# In[ ]:

print(*dickens1)
print(*dickens2)

# In[ ]:

print(*wilde1)
print(*wilde2)

# ## Better Pre-processing

# In[ ]:

import re

wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def simplify(doc):
    # remove non-letter characters, lower-case, and strip leading/trailing whitespace
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

# In[ ]:

simplify("Hello, Mary, wouldn't you like to skip this class?")

# In[ ]:

def load_text(url):
    response = urllib.request.urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text(strip=True)
    text = simplify(text)
    words = nltk.tokenize.word_tokenize(text)
    stopwords = nltk.corpus.stopwords.words('english')
    words_no_stopwords = [word for word in words if word not in stopwords]
    freq = nltk.FreqDist(words_no_stopwords)
    commonWordsCounts = freq.most_common()[:500]
    return [word for (word, count) in commonWordsCounts if len(word) > 1]

# In[ ]:

print(*wilde1)

# In[ ]:

wilde1 = load_text('http://www.gutenberg.org/cache/epub/301/pg301.txt')

# In[ ]:

print(*wilde1)

# In[ ]:

dickens1 = load_text('http://www.gutenberg.org/cache/epub/730/pg730.txt')
dickens2 = load_text('http://www.gutenberg.org/files/786/786-0.txt')

# In[ ]:

wilde1 = load_text('http://www.gutenberg.org/files/790/790-0.txt')
wilde2 = load_text('http://www.gutenberg.org/cache/epub/14522/pg14522.txt')

# In[ ]:

n_in_common = []
for txt1 in [dickens1, dickens2, wilde1, wilde2]:
    for txt2 in [dickens1, dickens2, wilde1, wilde2]:
        n_in_common.append(len(set(txt1) & set(txt2)))

# In[ ]:

n_in_common

# In[ ]:

np.array(n_in_common).reshape(4, 4)

# In[ ]:

import pandas as pd

pd.DataFrame(np.array(n_in_common).reshape(4, 4),
             index=['dickens1', 'dickens2', 'wilde1', 'wilde2'],
             columns=['dickens1', 'dickens2', 'wilde1', 'wilde2'])
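# The overlap counts above still treat inflected forms such as "looked" and "looking" as different words. A possible refinement, sketched below rather than taken from the sources, is to stem each word with NLTK's `PorterStemmer` before intersecting; the helper `stem_set` is hypothetical, and merging inflected forms this way may shift the counts.

# In[ ]:

# Sketch: compare the common-word lists after stemming (assumes the lists above are loaded).
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def stem_set(words):
    # hypothetical helper: collapse a word list to a set of Porter stems
    return {stemmer.stem(word) for word in words}

n_in_common_stemmed = [[len(stem_set(txt1) & stem_set(txt2))
                        for txt2 in [dickens1, dickens2, wilde1, wilde2]]
                       for txt1 in [dickens1, dickens2, wilde1, wilde2]]

pd.DataFrame(n_in_common_stemmed,
             index=['dickens1', 'dickens2', 'wilde1', 'wilde2'],
             columns=['dickens1', 'dickens2', 'wilde1', 'wilde2'])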