Full repo here: https://github.com/arnicas/NLP-in-Python
%matplotlib inline
import itertools
import nltk
import string
Read in a file to use for practice.
with file("data/stories/A_THE FIR TREE.txt",'r') as handle:
text = handle.read()
text[0:100]
'THE FIR TREE\r\n\r\nOut in the woods stood a nice little Fir Tree. The place he had was a\r\nvery good one'
tokens = nltk.word_tokenize(text)
tokens[60:110]
['so', 'very', 'much', 'to', 'be', 'a', 'grown-up', 'tree', '.', 'He', 'did', 'not', 'think', 'of', 'the', 'warm', 'sun', 'and', 'of', 'the', 'fresh', 'air', ';', 'he', 'did', 'not', 'care', 'for', 'the', 'little', 'cottage', 'children', 'that', 'ran', 'about', 'and', 'prattled', 'when', 'they', 'were', 'in', 'the', 'woods', 'looking', 'for', 'wild-strawberries', '.', 'The', 'children', 'often']
Notice the difference below - look for "wild-strawberries":
nltk.wordpunct_tokenize(text)[60:120]
['Fir', 'wanted', 'so', 'very', 'much', 'to', 'be', 'a', 'grown', '-', 'up', 'tree', '.', 'He', 'did', 'not', 'think', 'of', 'the', 'warm', 'sun', 'and', 'of', 'the', 'fresh', 'air', ';', 'he', 'did', 'not', 'care', 'for', 'the', 'little', 'cottage', 'children', 'that', 'ran', 'about', 'and', 'prattled', 'when', 'they', 'were', 'in', 'the', 'woods', 'looking', 'for', 'wild', '-', 'strawberries', '.', 'The', 'children', 'often', 'came', 'with', 'a', 'whole']
nltk.sent_tokenize(text)[0:10]
['THE FIR TREE\r\n\r\nOut in the woods stood a nice little Fir Tree.', 'The place he had was a\r\nvery good one: the sun shone on him: as to fresh air, there was enough\r\nof that, and round him grew many large-sized comrades, pines as well as\r\nfirs.', 'But the little Fir wanted so very much to be a grown-up tree.', 'He did not think of the warm sun and of the fresh air; he did not care\r\nfor the little cottage children that ran about and prattled when they\r\nwere in the woods looking for wild-strawberries.', 'The children often came\r\nwith a whole pitcher full of berries, or a long row of them threaded on\r\na straw, and sat down near the young tree and said, "Oh, how pretty he\r\nis!', 'What a nice little fir!"', 'But this was what the Tree could not bear\r\nto hear.', 'At the end of a year he had shot up a good deal, and after another year\r\nhe was another long bit taller; for with fir trees one can always tell\r\nby the shoots how many years old they are.', '"Oh!', 'Were I but such a high tree as the others are," sighed he.']
You'll want to choose your word tokenization carefully - and sometimes you have to use regex patterns instead. Note especially what happens with contractions here:http://text-processing.com/demo/tokenize/
"Stopwords" are words that are usually excluded because they are common connectors (or determiners) that are not considered to carry meaning. BEWARE hidden stopword filtering in libraries you use and always check stopword lists to see if you agree with their contents!
from nltk.corpus import stopwords
english_stops = stopwords.words('english')
english_stops
[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u'not', u'only', u'own', u'same', u'so', u'than', u'too', u'very', u's', u't', u'can', u'will', u'just', u'don', u'should', u'now']
tokens = nltk.word_tokenize(text)
tokens[0:15]
['THE', 'FIR', 'TREE', 'Out', 'in', 'the', 'woods', 'stood', 'a', 'nice', 'little', 'Fir', 'Tree', '.', 'The']
len(tokens)
3925
We want to strip out stopwords - use a list comprehension. Notice you need to lower case the words before you check for membership!
# try this without .lower in the if-statement and check the size!
tokens = [token.lower() for token in tokens if token.lower() not in english_stops]
len(tokens)
2158
tokens[0:15]
['fir', 'tree', 'woods', 'stood', 'nice', 'little', 'fir', 'tree', '.', 'place', 'good', 'one', ':', 'sun', 'shone']
Let's get rid of punctuation too, which isn't used in most bag-of-words analyses.
import string
string.punctuation
'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
tokens = [token for token in tokens if token not in string.punctuation]
len(tokens)
1647
# There's some awful stuff still in here:
sorted(tokens)[0:20]
["''", "''", "''", "''", "''", "''", "''", "''", "''", "''", "''", "''", "''", "''", "''", "''", "''", "''", "''", "''"]
The ugliness of some of those tokens! You have some possibilities now - add to your stopwords list the ones you want removed; or remove all very short words, which will get rid of our puntuation problem too.
[token for token in tokens if len(token) <= 2][0:20]
['``', 'oh', "''", '``', 'oh', "''", '``', "''", 'oh', 'go', '``', "''", '--', "''", "''", 'go', "''", "''", '``', "''"]
def clean_tokens(tokens):
""" Lowercases, takes out punct and stopwords and short strings """
return [token.lower() for token in tokens if (token not in string.punctuation) and
(token.lower() not in english_stops) and len(token) > 2]
clean = clean_tokens(tokens)
clean[0:20]
['fir', 'tree', 'woods', 'stood', 'nice', 'little', 'fir', 'tree', 'place', 'good', 'one', 'sun', 'shone', 'fresh', 'air', 'enough', 'round', 'grew', 'many', 'large-sized']
len(clean)
1445
The obvious thing you want to do next is count frequencies of words in texts - NLTK has you covered.
from nltk import Text
text = Text(clean)
text.vocab().most_common()[0:20]
[('tree', 66), ('fir', 27), ('said', 25), ('little', 24), ('one', 23), ('came', 15), ('thought', 14), ('know', 14), ('branches', 12), ('mice', 12), ('still', 10), ('trees', 10), ('children', 9), ('large', 9), ('would', 9), ('story', 9), ('humpy-dumpy', 9), ('come', 9), ('old', 9), ('like', 8)]
def makeTextCollection(files):
from nltk import Text
from nltk import TextCollection
textlist = [open(filen).read() for filen in files]
texts= [Text(clean_tokens(nltk.word_tokenize(text))) for text in textlist]
collection = TextCollection(texts)
return collection
filelist = !ls data/stories/*
filelist
['data/stories/A_THE BELL.txt', 'data/stories/A_THE DREAM OF LITTLE TUK.txt', 'data/stories/A_THE ELDERBUSH.txt', "data/stories/A_THE EMPEROR'S NEW CLOTHES.txt", 'data/stories/A_THE FALSE COLLAR.txt', 'data/stories/A_THE FIR TREE.txt', 'data/stories/A_THE HAPPY FAMILY.txt', 'data/stories/A_THE LEAP-FROG.txt', 'data/stories/A_THE LITTLE MATCH GIRL.txt', 'data/stories/A_THE NAUGHTY BOY.txt', 'data/stories/A_THE OLD HOUSE.txt', 'data/stories/A_THE REAL PRINCESS.txt', 'data/stories/A_THE RED SHOES.txt', 'data/stories/A_THE SHADOW.txt', 'data/stories/A_THE SHOES OF FORTUNE.txt', 'data/stories/A_THE SNOW QUEEN.txt', 'data/stories/A_THE STORY OF A MOTHER.txt', 'data/stories/A_THE SWINEHERD.txt', 'data/stories/G_BEARSKIN.txt', 'data/stories/G_BRIAR ROSE.txt', 'data/stories/G_CATHERINE AND FREDERICK.txt', 'data/stories/G_CINDERELLA.txt', 'data/stories/G_DUMMLING AND THE THREE FEATHERS.txt', 'data/stories/G_FAITHFUL JOHN.txt', 'data/stories/G_HANSEL AND GRETHEL.txt', 'data/stories/G_LITTLE ONE-EYE, TWO-EYES AND THREE-EYES.txt', 'data/stories/G_LITTLE RED-CAP.txt', 'data/stories/G_LITTLE SNOW-WHITE.txt', 'data/stories/G_MOTHER HOLLE.txt', 'data/stories/G_OH, IF I COULD BUT SHIVER!.txt', 'data/stories/G_RAPUNZEL.txt', 'data/stories/G_RUMPELSTILTSKIN.txt', 'data/stories/G_SNOW-WHITE AND ROSE-RED.txt', 'data/stories/G_THE FROG PRINCE.txt', 'data/stories/G_THE GOLDEN GOOSE.txt', 'data/stories/G_THE GOOSE-GIRL.txt', 'data/stories/G_THE LITTLE BROTHER AND SISTER.txt', 'data/stories/G_THE SIX SWANS.txt', 'data/stories/G_THE THREE LITTLE MEN IN THE WOOD.txt', 'data/stories/G_THE TRAVELS OF TOM THUMB.txt', 'data/stories/G_THE VALIANT LITTLE TAILOR.txt', 'data/stories/G_THE WATER OF LIFE.txt', 'data/stories/G_THUMBLING.txt']
fairyCol = makeTextCollection(filelist)
fairyCol.concordance('witch')
# Note that a concordance view should really retain original format; see below for how to do this
# with an IndexedText object.
Displaying 14 of 14 matches: woman behaved kindly reality wicked witch waylaid children built bread-house o grethel came next shaking till awoke witch said get lazy thing fetch water cook ll eat grethel began cry useless old witch made wished nice meal cooked hansel nothing crab claw every morning old witch came cage said hansel stretch finger n wild beasts wood died together old witch called leave noise help bit early mo n flames burning fiercely creep said witch see hot enough put bread intended gr howled grethel ran away left ungodly witch burn ashes ran hansel opening door c opening door called hansel saved old witch dead sprang like bird cage door open l upon neck kissed nothing fear went witch house every corner caskets full pear de called hal called rumpelstiltskin witch told witch told shrieked little man al called rumpelstiltskin witch told witch told shrieked little man stamped rig look brook wicked stepmother however witch witnessed departure two children sne little boy happened king hunting old witch took form chambermaid got room queen utiful charming ever told king fraud witch daughter practised upon tried senten
fairyCol.similar('king')
man father woman went home however replied queen eldest toads still prince westwards dwarfs folks lord giants wished garden get
fairyCol.similar('queen')
man country rats love palace people share gerda son see another church dear sees yes prince dost blow takes would
fairyCol.collocations(num=25, window_size=3)
faithful john; old woman; little boy; learned man; old man; red shoes; pewter soldier; snow queen; tom thumb; next morning; thou art; little girl; little gerda; robber maiden; fir tree; take care; king son; king daughter; weg weg; could see; little maiden; pick pick; shivering means; east street; fairy tale
fairyCol.dispersion_plot(['queen','king','girl', 'boy', 'daughter','son'])
fairyCol.plot(50)
fairyCol.vocab().most_common()[0:15]
[('said', 911), ('one', 571), ('little', 557), ('old', 368), ('came', 321), ('could', 304), ('went', 299), ('king', 289), ('would', 255), ('man', 237), ('must', 204), ('see', 194), ('thought', 192), ('away', 184), ('like', 182)]
fairyCol.common_contexts(['queen'])
snow_passed form_cradle day_heard snow_kissed snow_playfellow day_walking bed_place king_fragrant real_far snow_said body_hearts young_bore kissed_going snow_first hid_chamber shall_child king_great year_brought enemy_dressing snow_home
len(fairyCol.vocab())
6698
To see more details about the NLTK Text methods, read the code/doc here: http://www.nltk.org/_modules/nltk/text.html
To do this, you need to download the MaxEnt Treebank POS tagger with nltk.download() (on the models tab) - it may have downloaded with the full book collection, if you already got that.
nltk.download()
showing info http://nltk.github.com/nltk_data/
True
text = nltk.word_tokenize("And now for something completely different")
tagged = nltk.pos_tag(text) # there are a few options for taggers, details in NLTK books
tagged
[('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'), ('completely', 'RB'), ('different', 'JJ')]
nltk.untag(tagged)
['And', 'now', 'for', 'something', 'completely', 'different']
source: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
Parts of speech are used in anaysis that's "deeper" than bags-of-words approaches. For instance, chunking (parsing for structure) may be used for entity identification and semantics. See http://www.nltk.org/book/ch07.html for a little more info, and the 2 Perkins NLTK books.
The goal is to merge data items that are the same at some "root" meaning level, and reduce the number of features in your data set. "Cats" and "Cat" might want to be treated as the same thing, from a topic or summarization perspective.
# stemming removes affixes. This is the default choice for stemming although other algs exist
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmer.stem('believes')
u'believ'
# lemmatizing transforms to root words using grammar rules. It is slower. Stemming is more common.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('said', pos='v') # if you don't specify POS, you get zilch.
u'say'
lemmatizer.lemmatize('cookbooks')
u'cookbook'
stemmer.stem('cookbooks')
u'cookbook'
# an apparently recommended compression recipe in Perkins Python 3 NLTK book? Not sure I agree.
stemmer.stem(lemmatizer.lemmatize('buses'))
u'bu'
def tokenize_and_stem(text):
tokens = clean_tokens(nltk.word_tokenize(text))
#return no_stops
stems = [stemmer.stem(t) for t in tokens]
return stems
def makeStemTextCollection(files):
from nltk import Text
from nltk import TextCollection
textlist = [open(filen).read() for filen in files]
tokens = [tokenize_and_stem(text) for text in textlist]
texts= [Text(text) for text in tokens]
collection = TextCollection(texts)
return collection
fairyStems = makeStemTextCollection(filelist)
fairyStems.concordance('witch')
Displaying 16 of 16 matches: old woman behav kindli realiti wick witch waylaid children built bread-hous or kill cook ate made great festiv day witch red eye see far fine sens smell like se grethel came next shake till awok witch said get lazi thing fetch water cook ll eat grethel began cri useless old witch made wish nice meal cook hansel gret el got noth crab claw everi morn old witch came cage said hansel stretch finger eaten wild beast wood die togeth old witch call leav nois help bit earli morn g hel oven flame burn fierc creep said witch see hot enough put bread intend gret l howl grethel ran away left ungodli witch burn ash ran hansel open door call h ansel open door call hansel save old witch dead sprang like bird cage door open d fell upon neck kiss noth fear went witch hous everi corner casket full pearl conrad call hal call rumpelstiltskin witch told witch told shriek littl man sta hal call rumpelstiltskin witch told witch told shriek littl man stamp right fo went look brook wick stepmoth howev witch wit departur two children sneak secr ur two children sneak secretli habit witch enchant spring forest present found eauti littl boy happen king hunt old witch took form chambermaid got room queen ci beauti charm ever told king fraud witch daughter practis upon tri sentenc pr
fairyStems.collocations(num=25, window_size=2)
faith john; old woman; littl boy; old man; pewter soldier; learn man; littl girl; red shoe; snow queen; tom thumb; next morn; littl gerda; fir tree; fairi tale; open door; king son; robber maiden; king daughter; could see; thou art; east street; littl kay; mother holl; turn round; take care
# reminder of above
fairyCol.collocations(num=25, window_size=2)
faithful john; old woman; little boy; old man; learned man; pewter soldier; red shoes; snow queen; little girl; tom thumb; next morning; little gerda; robber maiden; fir tree; king son; king daughter; could see; thou art; shivering means; east street; take care; fairy tale; little kay; mother holle; kid milk
fairyStems.vocab().most_common()[0:10]
[(u'said', 911), (u'one', 585), (u'littl', 557), (u'old', 368), (u'came', 321), (u'could', 304), (u'went', 299), (u'king', 293), (u'look', 270), (u'would', 255)]
len(fairyStems.vocab()) # a huge savings over the vocab size of the unstemmed one!
4861
len(fairyCol.vocab())
6698
fairyStems.common_contexts(['witch'])
awok_said ungodli_burn day_red wick_waylaid howev_wit old_dead old_came old_took went_hous said_see old_made habit_enchant rumpelstiltskin_told old_call fraud_daughter told_told
fairyCol.common_contexts(['witch'])
awoke_said old_called wicked_waylaid old_dead went_house old_came old_took ungodly_burn said_see however_witnessed old_made rumpelstiltskin_told fraud_daughter told_told
fairyStems.similar('witch')
woman man said old hous peasant-woman ladi eldest chivalri poet hansel hen emperor
This code example is from the NLTK book, http://www.nltk.org/book/ch03.html, ex 3.6 This is a search method that saves original context, but allows indexing into it by stemmed words.
Also requires downloading from nltk.download() if you want to run it.
nltk.download()
showing info http://nltk.github.com/nltk_data/
True
porter = nltk.PorterStemmer()
grail = nltk.corpus.webtext.words('grail.txt')
class IndexedText(object):
def __init__(self, stemmer, text):
self._text = text
self._stemmer = stemmer
self._index = nltk.Index((self._stem(word), i)
for (i, word) in enumerate(text))
def concordance(self, word, width=40):
key = self._stem(word)
wc = int(width/4) # words of context
for i in self._index[key]:
lcontext = ' '.join(self._text[i-wc:i])
rcontext = ' '.join(self._text[i:i+wc])
ldisplay = '{:>{width}}'.format(lcontext[-width:], width=width) # right justify
rdisplay = '{:{width}}'.format(rcontext[:width], width=width) # left justify
print(ldisplay, rdisplay)
def _stem(self, word):
return self._stemmer.stem(word).lower()
text = IndexedText(porter, grail)
text.concordance('lie')
('r king ! DENNIS : Listen , strange women', 'lying in ponds distributing swords is no') (' beat a very brave retreat . ROBIN : All', 'lies ! MINSTREL : [ singing ] Bravest of') (' Nay . Nay . Come . Come . You may', 'lie here . Oh , but you are wounded ! ') ('doctors immediately ! No , no , please !', 'Lie down . [ clap clap ] PIGLET : Well ') ('ere is much danger , for beyond the cave', 'lies the Gorge of Eternal Peril , which ') (' you . Oh ... TIM : To the north there', 'lies a cave -- the cave of Caerbannog --') ('h it and lived ! Bones of full fifty men', 'lie strewn about its lair . So , brave k') ("not stop our fight ' til each one of you", 'lies dead , and the Holy Grail returns t')
text.concordance('say')
(" dead ! CART - MASTER : ' Ere . He", "says he ' s not dead ! CUSTOMER : Yes ") (" you ' Man '. DENNIS : Well , you could", "say ' Dennis '. ARTHUR : Well , I didn ") (' find out , did you ? ARTHUR : I did', "say ' sorry ' about the ' old woman ', ") (" DENNIS : I mean , if I went ' round", 'saying I was an emperor just because som') (' you see ? ARTHUR : What ? GALAHAD : He', "says they ' ve already got one ! ARTHUR ") ('u ? HEAD KNIGHT : We are the Knights Who', "Say ... ' Ni '! RANDOM : Ni ! ARTHUR ") (' Ni ! ARTHUR : No ! Not the Knights Who', "Say ' Ni '! HEAD KNIGHT : The same ! ") ('the tale ! HEAD KNIGHT : The Knights Who', "Say ' Ni ' demand a sacrifice ! ARTHUR :") (' ! Ow ! Agh ! HEAD KNIGHT : We shall', "say ' ni ' again to you if you do ") (' Who sent you ? ARTHUR : The Knights Who', "Say ' Ni '. CRONE : Aggh ! No ! ") ('n buy a shrubbery , my friend and I will', "say ... we will say ... ' ni '. CRONE ") (' , my friend and I will say ... we will', "say ... ' ni '. CRONE : Agh ! Do ") ('VERE : Ni ! ROGER THE SHRUBBER : Are you', "saying ' ni ' to that old woman ? ARTHUR") ('his period in history . ARTHUR : Did you', "say ' shrubberies '? ROGER : Yes . Shrub") ('We are now ... no longer the Knights Who', "Say ' Ni '. KNIGHTS OF NI : Ni ! ") ('NIGHT : Shh ! We are now the Knights Who', "Say ' Ecky - ecky - ecky - ecky - ") (" HEAD KNIGHT : Augh ! Ohh ! Don ' t", 'say that word . ARTHUR : What word ? HEA') ('HEAD KNIGHT : I cannot tell , suffice to', 'say is one of the words the Knights of N') ('Ni cannot hear . ARTHUR : How can we not', "say the word if you don ' t tell us ") (" You wouldn ' t get vary far in life not", "saying ' is '. KNIGHTS OF NI : No , ") (': Aaaaugh ! HEAD KNIGHT : Aaaaugh ! Stop', 'saying the word ! The word ... ARTHUR : ') ('ila raised the hand grenade up on high ,', "saying ,' O Lord , bless this thy hand g") (' . SECOND BROTHER : And the Lord spake ,', "saying , ' First shalt thou take out the") (' There ! Look ! LAUNCELOT : What does it', 'say ? GALAHAD : What language is that ? ') ("CELOT : ' Course ! ARTHUR : What does it", "say ? MAYNARD : It reads , ' Here may ") (" ! MAYNARD : Well , that ' s what it", 'says . ARTHUR : Look , if he was dying ') ("bother to carve ' aaggggh '. He ' d just", "say it ! MAYNARD : Well , that ' s ") (' : Oh , shut up . Well , does it', 'say anything else ? MAYNARD : No . Just ') ('ided us ! FRENCH GUARD : How you English', "say , ' I one more time , mac , ")
Take a detour now to RiTa.js (https://github.com/dhowe/RiTaJS and http://rednoise.org/rita/) for related functionality in Javascript! And my own little toy project using it and POS: http://www.ghostweather.com/files/image_replacement/