#!/usr/bin/env python
# coding: utf-8

# ##A Demonstration of how one can use Python to select chapters of an Ebook based on the POV in their title.
#
# - This is a simple demonstration; it only catches the basic case
# - You'll likely need to customise it on a per-book basis
# - For demonstration, I am using a particular ebook containing the first four books of George R.R. Martin's Song of Ice and Fire
# - It is your responsibility to ensure the legality of this in your locale
#
# ----------------------
#
# ####The Libraries
# We are using python3 today, but this code should work almost without change in python2. Two libraries are required.
#
# - [ebooklib](https://github.com/aerkalov/ebooklib) is for reading and writing the epubs as a whole -- they are basically Zip Archives
# - [BeautifulSoup](http://www.crummy.com/software/BeautifulSoup/bs4/doc/) is for reading the HTML files within them
#
# Both can be installed with `pip`.
#
# We are also going to use the standard library component:
#
# - [re](https://docs.python.org/3/library/re.html) for regular expressions

# In[22]:

import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
import re


# ##What to Keep:
# ebooklib.epub breaks the epub up into items. These are files within the zip archive.
# Generally most books have one item (i.e. file) per chapter. That is the case for our book.
#
# Of these items, there are three categories of item we want to keep:
#
# - items that are not chapters at all -- these could be pictures, or metadata or something else. We don't know.
# - chapters that are universal, e.g. the prologue, the dedication or the appendix.
# - chapters that are about the character we are interested in

# In[23]:

def is_not_chapter(item):
    """Return True when *item* is not a document item of the epub.

    The item's own ``get_type()`` result is compared against
    ``ebooklib.ITEM_DOCUMENT``; anything with a different type is reported
    as "not a chapter".
    """
    item_kind = item.get_type()
    return item_kind != ebooklib.ITEM_DOCUMENT


# #####Recognising universal chapters
# All the normal chapters in our case are named along the lines of: `b01-c01` for book 1 (as it is a compilation) chapter 1.
# Special chapters like the appendix don't follow the `b01-c01` naming pattern. We can check for it with a regex

# In[24]:

def is_univeral_chapter(chapter):
    """Return True when the chapter id does not look like a normal chapter id.

    A normal id starts with ``bNN?cNN`` (two digits each, any single
    separator character in between, e.g. ``b01-c01``).  The match is anchored
    at the start of the id only, so trailing text after the pattern is
    ignored.

    Args:
        chapter: object exposing ``get_id()`` returning the item id string.

    Returns:
        bool: True for ids that do not start with the normal-chapter pattern.
    """
    # Raw string: "\d" in a plain string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.match(r"(b\d\d.c\d\d)", chapter.get_id()) is None


# ##Is it about our character?
#
# In this particular book all the character names are in the chapter headings.
# However it does represent them in two different ways. In some sections it is with a `<h1>`
# element, in others it is in an element with `class="ct"`. We'll check for both.
#
# Notice this function is a higher order function that returns a function. That makes it work nicely with filter -- useful for testing, if you've already stripped down to just the normal chapters.
#
# `filter(is_character("JON"), chapters)`

# In[25]:

def is_character(name):
    """Build a predicate telling whether a chapter's heading equals *name*.

    Args:
        name: heading text to look for; compared exactly (case-sensitive)
            against the whitespace-stripped heading text.

    Returns:
        A one-argument function taking a chapter (an object exposing
        ``get_content()``) and returning a bool.  It returns False when the
        chapter has no recognised heading at all.

    Raises:
        AssertionError: (from the returned predicate) when the first matcher
            that finds anything finds more than one heading.
    """
    def inner(chapter):
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed (and no warning is emitted).
        soup = BeautifulSoup(chapter.get_content(), "html.parser")
        # Tried in order; the first lookup that yields anything wins.
        heading_finders = (
            lambda: soup.find_all('h1'),
            lambda: soup.find_all(class_='ct'),
        )
        for find_headings in heading_finders:
            headings = find_headings()
            if headings:
                break
        else:
            # No matcher found a heading: this chapter is nobody's POV.
            return False

        assert len(headings) == 1, (
            "expected exactly one chapter heading, found %d" % len(headings)
        )
        chapter_character_name = headings[0].text.strip()
        return chapter_character_name == name
    return inner


# ###Bring our conditions together
# Another higher order function, again to make it work with `filter`.
# In this case it is a closure.

# In[26]:

def keep_item(character_name):
    """Build the predicate deciding whether an epub item is kept.

    An item is kept when it is not a chapter, when it is a universal
    chapter, or when it is a chapter headed by *character_name*.  The checks
    short-circuit in that order.

    Args:
        character_name: heading text identifying the character of interest.

    Returns:
        A one-argument function taking an item and returning a truthy value
        when the item should be kept.
    """
    # Built once and captured by the closure below.
    is_our_character = is_character(character_name)

    def inner(item):
        return (is_not_chapter(item)
                or is_univeral_chapter(item)
                or is_our_character(item))
    return inner


# ###Combine it all, with a read and a write
# Also we'll modify the title, don't want to get them confused.
# There is also a helper function below to work out the new filename

# In[27]:

def rewrite_book_by_character(filename, character):
    """Write a copy of an epub keeping only one character's chapters.

    The book is read from *filename*, its items are filtered with
    ``keep_item(character)``, the title gets a suffix naming the character,
    and the result is written next to the original under the name returned
    by ``get_new_filename``.

    Args:
        filename: path of the epub to read.
        character: chapter-heading text of the character to keep, e.g. "JON".

    Returns:
        The path of the newly written epub.
    """
    book = epub.read_epub(filename)
    keep = keep_item(character)
    book.items = [item for item in book.items if keep(item)]
    # Space before "POVs_ONLY" so the title reads "...: JON POVs_ONLY"
    # rather than running the name and suffix together.
    book.title += ": " + character + " POVs_ONLY"
    new_filename = get_new_filename(filename, character)
    epub.write_epub(new_filename, book, {})
    return new_filename


def get_new_filename(filename, character):
    """Return *filename* with ``_<character>_ONLY`` inserted before its extension.

    Args:
        filename: original path; only the last extension is split off.
        character: text inserted into the new name.

    Returns:
        The derived path, e.g. ``book.epub`` -> ``book_JON_ONLY.epub``.
    """
    import os.path
    filename_base, ext = os.path.splitext(filename)
    return filename_base + "_" + character + "_ONLY" + ext


# ##Give it a go

# In[28]:

# Guarded so that importing this module to reuse the functions does not try
# to rewrite a hard-coded ebook; running it as a script or notebook cell
# still does.
if __name__ == "__main__":
    from IPython.display import FileLink
    filename = rewrite_book_by_character('asoiaf01-04.epub', "JON")
    FileLink(filename)


# In[ ]:


# ####The MIT License (MIT)
#
# Copyright (c) 2015 Lyndon White
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

# In[ ]: