import urllib

# Project Gutenberg plain-text URLs, one per book.
# Every entry uses https so all downloads go over the same (encrypted) scheme.
poeUrl = "https://www.gutenberg.org/cache/epub/2147/pg2147.txt"
grimmsUrl = "https://www.gutenberg.org/cache/epub/11027/pg11027.txt"
andersonsUrl = "https://www.gutenberg.org/cache/epub/1597/pg1597.txt"
irishFairyUrl = "https://www.gutenberg.org/cache/epub/32202/pg32202.txt"
eliotPoemsUrl = "https://www.gutenberg.org/cache/epub/1567/pg1567.txt"
rosettiPoemsUrl = "https://www.gutenberg.org/cache/epub/19188/pg19188.txt"
lovecraftUrl = "https://www.gutenberg.org/cache/epub/31469/pg31469.txt"
mrjamesUrl = "https://www.gutenberg.org/cache/epub/8486/pg8486.txt"
## add your own here - go to https://www.gutenberg.org/ and navigate to a .txt file page!

# Notebook shell command (not plain Python): list the contents of data/books.
ls -al data/books

# Make it easier to download books with this in one function:
def downloadGut(urlstring, filename):
    """Download a plain-text book and save an ASCII-only copy of it.

    Args:
        urlstring: URL of a .txt file, e.g. one on gutenberg.org.
        filename: path of the output file; it is created or overwritten.

    The response body is decoded as UTF-8 and then re-encoded as ASCII.
    Anything that does not survive either step is silently dropped, so the
    saved file contains ASCII characters only.
    """
    import contextlib
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    # closing() releases the connection even if read() or decode() fails.
    with contextlib.closing(urlopen(urlstring)) as response:
        fileString = response.read().decode('utf-8', 'ignore')

    # encode() yields bytes, so write in binary mode; this behaves the same on
    # Python 2 and Python 3 and avoids any implicit re-encoding.
    with open(filename, 'wb') as handle:
        handle.write(fileString.encode('ascii', 'ignore'))

    print('Made file  %s  -- now strip the boilerplate by hand or with utils/stripgutenberg.pl' % (filename,))

# Name of the raw download, and the path under data/books/ where the
# stripped copy will be written.
filename = 'mrjames.txt'
newfile = 'data/books/' + filename

# download the book
downloadGut(mrjamesUrl, filename)

# Shell escape: feed the raw download to utils/stripgutenberg.pl on stdin and
# redirect its output to $newfile.
!perl utils/stripgutenberg.pl < $filename > $newfile

# Shell escape: remove the raw download; only the copy at $newfile is kept.
!rm $filename

import nltk

nltk.download() # find the popup window, then go to the corpora tab

storiesString = """
     The Emperor's New Clothes
     The Swineherd
     The Real Princess
     The Shoes of Fortune
     The Fir Tree
     The Snow Queen
     The Leap-Frog
     The Elderbush
     The Bell
     The Old House
     The Happy Family
     The Story of a Mother
     The False Collar
     The Shadow
     The Little Match Girl
     The Dream of Little Tuk
     The Naughty Boy
     The Red Shoes
     """

# One entry per line of the pasted table of contents; the empty first line
# and the indentation-only last line are still present at this point.
stories = storiesString.split('\n')

stories

# The titles are used as search strings to locate story boundaries in the
# body of the file, so trim them, upper-case them and drop the empty lines.
# Beware: in Grimm's the TOC didn't match the in-file strings exactly :(

stories = [title.upper() for title in (line.strip() for line in stories) if title]

stories

def pairwise(iterable):
    """Return an iterator of overlapping pairs: s -> (s0,s1), (s1,s2), (s2,s3), ...

    Borrowed from the itertools documentation's page of recipes. Used to turn
    the story list into (story1, story2), (story2, story3)... pairs.

    Args:
        iterable: any iterable; it is consumed lazily.

    Returns:
        An iterator of 2-tuples. An input with fewer than two items yields
        no pairs.
    """
    import itertools
    a, b = itertools.tee(iterable)
    # Advance the second copy by one so the two iterators are offset.
    next(b, None)
    # itertools.izip exists only on Python 2; on Python 3 the built-in zip
    # is already lazy, so fall back to it there.
    lazy_zip = getattr(itertools, 'izip', zip)
    return lazy_zip(a, b)

# pairwise() returns a lazy iterator, so materialise it as a list
storypairs = list(pairwise(stories))

# add a special last story pair: the final story has no successor, so it is
# paired with ''. The title is taken from the list itself so it cannot drift
# out of sync if the list of stories is edited.
if stories:
    storypairs.append((stories[-1], ''))

storypairs

def get_story(storypair, text):
    """Return the part of *text* that belongs to the first title in *storypair*.

    Args:
        storypair: (title, next_title). An empty next_title marks the last
            story of the collection.
        text: the full text of the collection.

    Returns:
        The slice of text starting at the first occurrence of title and
        ending just before the first occurrence of next_title that follows
        it. If next_title is '' or does not occur after the title, the slice
        runs to the end of text. If title does not occur at all, '' is
        returned.

    The titles (and, for non-final stories, the offsets found) are printed
    as a progress/diagnostic aid.
    """
    title, next_title = storypair[0], storypair[1]
    print("%s , %s" % (title, next_title))

    start = text.find(title)
    if start == -1:
        # find() reports a miss as -1; used as a slice index that would
        # silently return the tail of the text instead of a story.
        print("Title not found: %s" % (title,))
        return ''

    if next_title == '':  # last pair for last story
        return text[start:]

    # Search only after the current title, so an earlier occurrence of the
    # next title (e.g. in a table of contents) cannot end up before start.
    end = text.find(next_title, start + len(title))
    print("%d %d" % (start, end))
    if end == -1:
        # Next title missing: keep everything to the end rather than
        # slicing with -1, which would drop the final character.
        return text[start:]
    return text[start:end]

# read in the text file for the whole collection:
# (open() is used instead of file(), which only exists on Python 2)

with open('data/books/anderson.txt') as handle:
    anderson = handle.read()

len(anderson)

# Illustration of it working:
get_story(storypairs[0], anderson)

# Write each story to its own file under data/stories/, named after its
# title with an A_ prefix.
for storypair in storypairs:
    string = get_story(storypair, anderson)
    with open('data/stories/A_' + storypair[0] + '.txt', 'w') as handle:
        handle.write(string)

# Notebook shell command (not plain Python): list the contents of data/stories.
ls -al data/stories

# Story titles for the Grimm collection, one per line, separated by blank lines.
stories2 = """
THE GOOSE-GIRL

THE LITTLE BROTHER AND SISTER

HANSEL AND GRETHEL

OH, IF I COULD BUT SHIVER!

DUMMLING AND THE THREE FEATHERS

LITTLE SNOW-WHITE

CATHERINE AND FREDERICK

THE VALIANT LITTLE TAILOR

LITTLE RED-CAP

THE GOLDEN GOOSE

BEARSKIN

CINDERELLA

FAITHFUL JOHN

THE WATER OF LIFE

THUMBLING

BRIAR ROSE

THE SIX SWANS

RAPUNZEL

MOTHER HOLLE

THE FROG PRINCE

THE TRAVELS OF TOM THUMB

SNOW-WHITE AND ROSE-RED

THE THREE LITTLE MEN IN THE WOOD

RUMPELSTILTSKIN

LITTLE ONE-EYE, TWO-EYES AND THREE-EYES"""

stories2 = stories2.split('\n')

# Filter on the stripped line, not the raw one: a line holding only
# whitespace is truthy and would otherwise survive as an empty title.
stories2 = [story.strip() for story in stories2 if story.strip()]

stories2

# (title, next_title) pairs for consecutive stories
storypairs2 = list(pairwise(stories2))

storypairs2

# The final story has no successor, so it is paired with ''. The title is
# taken from the list itself so it cannot drift out of sync with the titles.
if stories2:
    storypairs2.append((stories2[-1], ''))

# read in the file
# (open() is used instead of file(), which only exists on Python 2)
with open('data/books/grimms.txt') as handle:
    grimm = handle.read()

len(grimm)

# Write story out with a prepended G so we know the source:

for storypair in storypairs2:
    string = get_story(storypair, grimm)
    with open('data/stories/G_' + storypair[0] + '.txt', 'w') as handle:
        handle.write(string)

# Notebook shell command (not plain Python): list the contents of data/stories.
ls -al data/stories