Full repo here: https://github.com/arnicas/NLP-in-Python
This notebook requires installation of Andreas Mueller's excellent package: https://github.com/amueller/word_cloud
%matplotlib inline
# Wordcloud stuff comes from here:
# https://github.com/amueller/word_cloud
import nltk
import matplotlib.pyplot as plt
import wordcloud
from nltk.corpus import stopwords
# Load NLTK's English stopword list into a set; the bare name on the next
# line makes the notebook display the set.
english_stops = set(stopwords.words('english'))
english_stops
{u'a', u'about', u'above', u'after', u'again', u'against', u'all', u'am', u'an', u'and', u'any', u'are', u'as', u'at', u'be', u'because', u'been', u'before', u'being', u'below', u'between', u'both', u'but', u'by', u'can', u'did', u'do', u'does', u'doing', u'don', u'down', u'during', u'each', u'few', u'for', u'from', u'further', u'had', u'has', u'have', u'having', u'he', u'her', u'here', u'hers', u'herself', u'him', u'himself', u'his', u'how', u'i', u'if', u'in', u'into', u'is', u'it', u'its', u'itself', u'just', u'me', u'more', u'most', u'my', u'myself', u'no', u'nor', u'not', u'now', u'of', u'off', u'on', u'once', u'only', u'or', u'other', u'our', u'ours', u'ourselves', u'out', u'over', u'own', u's', u'same', u'she', u'should', u'so', u'some', u'such', u't', u'than', u'that', u'the', u'their', u'theirs', u'them', u'themselves', u'then', u'there', u'these', u'they', u'this', u'those', u'through', u'to', u'too', u'under', u'until', u'up', u'very', u'was', u'we', u'were', u'what', u'when', u'where', u'which', u'while', u'who', u'whom', u'why', u'will', u'with', u'you', u'your', u'yours', u'yourself', u'yourselves'}
# IPython shell capture: run `ls data/books` and keep its output lines in
# booklist (this syntax only works inside IPython/Jupyter).
booklist = !ls data/books
booklist
['anderson.txt', 'grimms.txt', 'irishfairy.txt', 'lovecraft.txt', 'mrjames.txt', 'poe.txt']
def read_file(name, path):
    """Return the full contents of the file at ``path + name``.

    Args:
        name: File name, e.g. ``'grimms.txt'``.
        path: Directory prefix. It is concatenated directly with ``name``
            (no separator is inserted), so it must already end with a
            path separator, e.g. ``'data/books/'``.

    Returns:
        The entire file contents as a single string.
    """
    # ``file()`` exists only in Python 2; ``open()`` behaves the same there
    # and also works on Python 3.
    with open(path + name, 'r') as handle:
        return handle.read()
# booklist[1] is 'grimms.txt' in the directory listing shown above.
raw = read_file(booklist[1], 'data/books/')
# Enlarge the default matplotlib figure so the word cloud is drawn big.
from pylab import rcParams
rcParams['figure.figsize'] = (15,15)
def make_word_cloud(text, stops=None,
                    font_path="/Users/lynn/Library/Fonts/Lato-Medium.ttf"):
    """Generate a word cloud from ``text`` and display it with matplotlib.

    Args:
        text: The raw text to build the cloud from.
        stops: Collection of stopwords, passed straight through as
            ``WordCloud(stopwords=...)``. ``None`` is passed through
            unchanged.
        font_path: Font file passed as ``WordCloud(font_path=...)``. The
            default is the notebook author's local font, so on another
            machine pass a path to a font that exists there.
    """
    import wordcloud

    cloud = wordcloud.WordCloud(font_path=font_path,
                                stopwords=stops).generate(text)
    plt.imshow(cloud)
    plt.axis("off")  # hide the axes; only the image is of interest
    plt.show()
# Draw the cloud for the text loaded into `raw`, leaving `stops` at its
# default of None.
make_word_cloud(raw)
from IPython.html.widgets import interact, interactive, fixed
from IPython.html import widgets
from IPython.display import clear_output, display, HTML
def make_word_cloud_topics(files=None, path='data/books/'):
    """Show an interactive word cloud with user-extensible stopwords.

    The NLTK English stopwords are always applied; any extra words typed
    into the ``stops`` control (separated by whitespace) are added on top
    for the current redraw only.

    Args:
        files: Value handed to ``interact`` for the ``files`` control; the
            selected entry is read with ``read_file(selected, path)``.
        path: Directory prefix passed to ``read_file``.
    """
    from nltk.corpus import stopwords
    english_stops = set(stopwords.words('english'))

    def make_pic(files, stops):
        # ``stops`` arrives as one whitespace-separated string.
        extra = stops.split()
        # Build a fresh set on every call. Updating ``english_stops`` in
        # place would make every word ever typed (including partial words
        # from earlier redraws) a permanent stopword for the session.
        newstops = english_stops.union(extra)
        text = read_file(files, path)
        make_word_cloud(text, stops=newstops)

    interact(make_pic, files=files, stops='')
# Type extra stopwords, separated by spaces, into the widget to filter them
# out live; the regular English stopwords are already filtered out.
make_word_cloud_topics(files=booklist)
# IPython shell capture: run `ls data/stories/` and keep its output lines in
# stories (this syntax only works inside IPython/Jupyter).
stories = !ls data/stories/
stories
['A_THE BELL.txt', 'A_THE DREAM OF LITTLE TUK.txt', 'A_THE ELDERBUSH.txt', "A_THE EMPEROR'S NEW CLOTHES.txt", 'A_THE FALSE COLLAR.txt', 'A_THE FIR TREE.txt', 'A_THE HAPPY FAMILY.txt', 'A_THE LEAP-FROG.txt', 'A_THE LITTLE MATCH GIRL.txt', 'A_THE NAUGHTY BOY.txt', 'A_THE OLD HOUSE.txt', 'A_THE REAL PRINCESS.txt', 'A_THE RED SHOES.txt', 'A_THE SHADOW.txt', 'A_THE SHOES OF FORTUNE.txt', 'A_THE SNOW QUEEN.txt', 'A_THE STORY OF A MOTHER.txt', 'A_THE SWINEHERD.txt', 'G_BEARSKIN.txt', 'G_BRIAR ROSE.txt', 'G_CATHERINE AND FREDERICK.txt', 'G_CINDERELLA.txt', 'G_DUMMLING AND THE THREE FEATHERS.txt', 'G_FAITHFUL JOHN.txt', 'G_HANSEL AND GRETHEL.txt', 'G_LITTLE ONE-EYE, TWO-EYES AND THREE-EYES.txt', 'G_LITTLE RED-CAP.txt', 'G_LITTLE SNOW-WHITE.txt', 'G_MOTHER HOLLE.txt', 'G_OH, IF I COULD BUT SHIVER!.txt', 'G_RAPUNZEL.txt', 'G_RUMPELSTILTSKIN.txt', 'G_SNOW-WHITE AND ROSE-RED.txt', 'G_THE FROG PRINCE.txt', 'G_THE GOLDEN GOOSE.txt', 'G_THE GOOSE-GIRL.txt', 'G_THE LITTLE BROTHER AND SISTER.txt', 'G_THE SIX SWANS.txt', 'G_THE THREE LITTLE MEN IN THE WOOD.txt', 'G_THE TRAVELS OF TOM THUMB.txt', 'G_THE VALIANT LITTLE TAILOR.txt', 'G_THE WATER OF LIFE.txt', 'G_THUMBLING.txt']
# Same interactive cloud, this time choosing among the files in data/stories/.
make_word_cloud_topics(files=stories, path='data/stories/')