In [9]:
from pybloom import BloomFilter
import os
import re
In [10]:
# Read all my posts, closing each file as soon as its contents have been read.
# NOTE(review): files are opened with the platform default encoding -- confirm
# that this matches how the posts are stored.
posts = {}
for post_name in os.listdir(POST_DIR):
    # os.path.join works whether or not POST_DIR ends with a path separator.
    with open(os.path.join(POST_DIR, post_name)) as post_file:
        posts[post_name] = post_file.read()
# Create a dictionary of {"post name": "lowercase word set"}.
# The pattern is a raw string so the backslash in \W reaches the regex engine
# untouched. re.split may yield an empty string when the text starts or ends
# with a non-word character; that empty token is kept in the set.
split_posts = {name: set(re.split(r"\W+", contents.lower())) for name, contents in posts.items()}
In [11]:
# Build one Bloom filter per post, sized to that post's distinct word count.
filters = {}
for post_name, vocabulary in split_posts.items():
    post_filter = BloomFilter(capacity=len(vocabulary), error_rate=0.1)
    for token in vocabulary:
        post_filter.add(token)
    filters[post_name] = post_filter
In [12]:
def search(search_string):
    """Return the names of posts whose Bloom filter reports every search term.

    The query is lowercased before lookup because the per-post filters are
    built from lowercased post contents. It is split on runs of non-word
    characters, and empty pieces (produced by leading or trailing separators)
    are discarded so they are not treated as required terms.

    Membership is tested against Bloom filters, so the result may contain
    false positives. A query with no terms at all places no constraint on the
    posts and therefore returns every post name.
    """
    # Raw string: \W must reach the regex engine as-is.
    search_terms = [term for term in re.split(r"\W+", search_string.lower()) if term]
    return [
        name
        for name, post_filter in filters.items()
        if all(term in post_filter for term in search_terms)
    ]
In [13]:
# Example query: list the posts whose filter reports both "android" and "raspberry".
search("android raspberry")
Out[13]:
['2013-12-07 - bloom-filter-search-engine.md',
 '2013-06-19 - how-remote-control-rf-devices-raspberry-pi.md',
 '2013-06-24 - writing-my-first-android-app-control-your-raspberr.md']
In [29]:
# Mean size of one per-post filter's bit array, in bytes.
sum(len(post_filter.bitarray.tobytes()) for post_filter in filters.values()) / len(filters)
Out[29]:
295
In [23]:
# A single shared filter, sized for the total number of (post, word) pairs.
big_filter = BloomFilter(
    capacity=sum(map(len, split_posts.values())),
    error_rate=0.1,
)
# Each entry is keyed by the post name immediately followed by the word.
for post_name, vocabulary in split_posts.items():
    for token in vocabulary:
        big_filter.add(post_name + token)
In [24]:
def search_big(search_string):
    """Return the names of posts for which the shared filter reports every term.

    The shared filter stores keys of the form ``post name + word`` built from
    lowercased post contents, so the query is lowercased before lookup. It is
    split on runs of non-word characters, and empty pieces (produced by
    leading or trailing separators) are discarded so they are not treated as
    required terms.

    Membership is tested against a Bloom filter, so the result may contain
    false positives. A query with no terms at all places no constraint on the
    posts and therefore returns every post name.
    """
    # Raw string: \W must reach the regex engine as-is.
    search_terms = [term for term in re.split(r"\W+", search_string.lower()) if term]
    return [
        name
        for name in split_posts
        if all((name + term) in big_filter for term in search_terms)
    ]
In [25]:
# Same example query, this time answered from the single shared filter.
search_big("android raspberry")
Out[25]:
['2013-12-07 - bloom-filter-search-engine.md',
 '2013-06-19 - how-remote-control-rf-devices-raspberry-pi.md',
 '2013-06-24 - writing-my-first-android-app-control-your-raspberr.md']
In [28]:
# Size of the shared filter's bit array in bytes, divided by the number of posts.
len(big_filter.bitarray.tobytes()) / len(split_posts)
Out[28]:
294