import os
import subprocess
import urllib
from contextlib import closing

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2, which this notebook was originally written for
    from urllib import urlopen

# Plain-text Project Gutenberg editions used as source material.
poeUrl = "http://www.gutenberg.org/cache/epub/2147/pg2147.txt"
grimmsUrl = "https://www.gutenberg.org/cache/epub/11027/pg11027.txt"
andersonsUrl = "https://www.gutenberg.org/cache/epub/1597/pg1597.txt"
irishFairyUrl = "https://www.gutenberg.org/cache/epub/32202/pg32202.txt"
eliotPoemsUrl = "https://www.gutenberg.org/cache/epub/1567/pg1567.txt"
rosettiPoemsUrl = "https://www.gutenberg.org/cache/epub/19188/pg19188.txt"
lovecraftUrl = "https://www.gutenberg.org/cache/epub/31469/pg31469.txt"
mrjamesUrl = "https://www.gutenberg.org/cache/epub/8486/pg8486.txt"
## add your own here - go to https://www.gutenberg.org/ and navigate to a .txt file page!


# Make it easier to download books with this in one function:
def downloadGut(urlstring, filename):
    """Download a text file from a URL and save an ASCII-only copy of it.

    Args:
        urlstring: URL of a plain-text (.txt) file, e.g. one on gutenberg.org.
        filename: Path of the output file; it is created or overwritten.

    The payload is decoded as UTF-8 (undecodable bytes are ignored) and every
    non-ASCII character is dropped before the result is written. A reminder
    to strip the Gutenberg boilerplate is printed afterwards.
    """
    # closing() guarantees the connection is released on both Python 2 and 3.
    with closing(urlopen(urlstring)) as response:
        text = response.read().decode('utf-8', 'ignore')
    # Written as bytes so the output is identical on every Python version.
    with open(filename, 'wb') as handle:
        handle.write(text.encode('ascii', 'ignore'))
    print('Made file  %s  -- now strip the boilerplate by hand or with utils/stripgutenberg.pl'
          % filename)


if __name__ == "__main__":
    # The shell steps below were IPython "!" / automagic commands in the
    # original notebook; they are spelled out with subprocess/os here so that
    # this file is valid Python.
    subprocess.call(['ls', '-al', 'data/books'])

    filename = 'mrjames.txt'
    newfile = 'data/books/' + filename

    # download the book
    downloadGut(mrjamesUrl, filename)

    # Equivalent of: perl utils/stripgutenberg.pl < $filename > $newfile
    with open(filename, 'rb') as raw, open(newfile, 'wb') as stripped:
        subprocess.call(['perl', 'utils/stripgutenberg.pl'], stdin=raw, stdout=stripped)
    # Equivalent of: rm $filename
    os.remove(filename)

    import nltk
    nltk.download()  #- find the popup window, go to the corpora tab


# Titles of the stories in the Andersen collection, one per line.
storiesString = """
The Emperor's New Clothes
The Swineherd
The Real Princess
The Shoes of Fortune
The Fir Tree
The Snow Queen
The Leap-Frog
The Elderbush
The Bell
The Old House
The Happy Family
The Story of a Mother
The False Collar
The Shadow
The Little Match Girl
The Dream of Little Tuk
The Naughty Boy
The Red Shoes
"""

stories = storiesString.split('\n')
stories  # notebook-style display; has no effect when run as a script

# want to match the strings in the body of the file, to search for story boundaries...
# Split each collection into one file per story by locating the story titles
# inside the text of the book.
import io
import itertools
import subprocess


def pairwise(iterable):
    """Yield overlapping pairs: s -> (s0, s1), (s1, s2), (s2, s3), ...

    Borrowed from the itertools documentation's recipes. Used to turn the
    story list into (story1, story2), (story2, story3), ... pairs. The result
    is a lazy iterator, so wrap it in list() if it must be reused.
    """
    a, b = itertools.tee(iterable)
    next(b, None)
    # izip exists only on Python 2; on Python 3 the builtin zip is already lazy.
    lazy_zip = getattr(itertools, 'izip', zip)
    return lazy_zip(a, b)


def get_story(storypair, text):
    """Return the part of *text* that belongs to one story.

    Args:
        storypair: ``(title, next_title)``. ``next_title`` is ``''`` for the
            last story of a collection.
        text: Full text of the collection.

    Returns:
        The text from the first occurrence of ``title`` up to, but not
        including, the first occurrence of ``next_title`` after it. The slice
        runs to the end of *text* when ``next_title`` is ``''`` or cannot be
        found. An empty string is returned when ``title`` itself is missing.
    """
    title, next_title = storypair[0], storypair[1]
    print("%s , %s" % (title, next_title))

    start = text.find(title)
    if start == -1:
        print("WARNING: title not found in text: %s" % title)
        return text[:0]  # empty, but of the same string type as text

    if next_title == '':  # last pair for last story
        return text[start:]  # special case for last file

    # Search only after this story's own title, so an earlier occurrence of
    # the next title cannot produce an empty or inverted slice.
    end = text.find(next_title, start + len(title))
    print("%s %s" % (start, end))
    if end == -1:
        print("WARNING: next title not found, keeping the rest of the text: %s" % next_title)
        return text[start:]
    return text[start:end]


# Titles of the stories in the Grimm collection, one per line.
stories2 = """
THE GOOSE-GIRL
THE LITTLE BROTHER AND SISTER
HANSEL AND GRETHEL
OH, IF I COULD BUT SHIVER!
DUMMLING AND THE THREE FEATHERS
LITTLE SNOW-WHITE
CATHERINE AND FREDERICK
THE VALIANT LITTLE TAILOR
LITTLE RED-CAP
THE GOLDEN GOOSE
BEARSKIN
CINDERELLA
FAITHFUL JOHN
THE WATER OF LIFE
THUMBLING
BRIAR ROSE
THE SIX SWANS
RAPUNZEL
MOTHER HOLLE
THE FROG PRINCE
THE TRAVELS OF TOM THUMB
SNOW-WHITE AND ROSE-RED
THE THREE LITTLE MEN IN THE WOOD
RUMPELSTILTSKIN
LITTLE ONE-EYE, TWO-EYES AND THREE-EYES"""


if __name__ == "__main__":
    # Bare names / expressions below are notebook-style displays; they have no
    # effect when this file is run as a script.

    # Beware, in Grimm's the TOC didn't match the in-file strings exactly :(
    stories = [story.strip().upper() for story in stories if story.strip()]
    stories

    storypairs = list(pairwise(stories))  # it returns an iterator, so you have to make it a list
    storypairs.append((stories[-1], ''))  # add a special last story pair
    storypairs

    # read in the text file for the whole collection:
    # (newline='' keeps the book's original line endings untouched)
    with io.open('data/books/anderson.txt', encoding='utf-8', errors='ignore',
                 newline='') as handle:
        anderson = handle.read()
    len(anderson)

    # Illustration of it working:
    get_story(storypairs[0], anderson)

    # Write story out with a prepended A so we know the source:
    for storypair in storypairs:
        string = get_story(storypair, anderson)
        with io.open('data/stories/A_' + storypair[0] + '.txt', 'w',
                     encoding='utf-8', newline='') as handle:
            handle.write(string)

    # Was the notebook shell command: ls -al data/stories
    subprocess.call(['ls', '-al', 'data/stories'])

    stories2 = stories2.split('\n')
    # Filter on the stripped line so whitespace-only lines don't become titles.
    stories2 = [story.strip() for story in stories2 if story.strip()]
    stories2

    storypairs2 = list(pairwise(stories2))
    storypairs2
    storypairs2.append((stories2[-1], ''))  # special pair for the last story

    # read in the file
    with io.open('data/books/grimms.txt', encoding='utf-8', errors='ignore',
                 newline='') as handle:
        grimm = handle.read()
    len(grimm)

    # Write story out with a prepended G so we know the source:
    for storypair in storypairs2:
        string = get_story(storypair, grimm)
        with io.open('data/stories/G_' + storypair[0] + '.txt', 'w',
                     encoding='utf-8', newline='') as handle:
            handle.write(string)

    # Was the notebook shell command: ls -al data/stories
    subprocess.call(['ls', '-al', 'data/stories'])