from pybloom import BloomFilter
import os
import re
def _read_post(path):
    """Return the full text of the file at *path*, closing it afterwards."""
    # A context manager releases the handle immediately instead of leaving
    # one open descriptor per post for the garbage collector to clean up.
    with open(path) as post_file:
        return post_file.read()


# Read all my posts into a dictionary of {"post name": "raw contents"}.
# os.path.join gives the same path as plain concatenation when POST_DIR ends
# with a separator, and also works when it does not.
posts = {post_name: _read_post(os.path.join(POST_DIR, post_name)) for post_name in os.listdir(POST_DIR)}
# Create a dictionary of {"post name": "lowercase word set"}.
# The pattern is a raw string so that \W reaches the regex engine untouched
# (a plain "\W" is an invalid string escape that newer Pythons warn about).
# Note that re.split yields an empty string when the text starts or ends with
# a non-word character; that empty string is kept in the set.
split_posts = {name: set(re.split(r"\W+", contents.lower())) for name, contents in posts.items()}
# Build one Bloom filter per post, sized to that post's distinct word count
# and configured with a 10% error rate.
filters = {}
for name, words in split_posts.items():
    post_filter = BloomFilter(capacity=len(words), error_rate=0.1)
    for word in words:
        post_filter.add(word)
    # Register the filter only once it has been fully populated.
    filters[name] = post_filter
def search(search_string):
    """Return the names of the posts whose filter contains every query term.

    The query is lowercased and split on runs of non-word characters, the
    same way the post contents were tokenized when the filters were built,
    so matching is case-insensitive.

    Args:
        search_string: Free-text query, e.g. "android raspberry".

    Returns:
        A list of post names (keys of ``filters``) for which every term is
        reported as present by that post's filter. A query with no word
        characters has no terms and therefore matches every post.
    """
    # Lowercase to agree with the lowercased index, and drop the empty
    # strings re.split produces for leading/trailing separators so that
    # stray whitespace or punctuation does not become a required term.
    search_terms = [term for term in re.split(r"\W+", search_string.lower()) if term]
    return [
        name
        for name, post_filter in filters.items()
        if all(term in post_filter for term in search_terms)
    ]
# Example query: list the posts whose filters report both "android" and
# "raspberry" as present.
search("android raspberry")
# The list below is presumably the result recorded from an interactive
# session; as a statement in a script it is evaluated and discarded.
['2013-12-07 - bloom-filter-search-engine.md', '2013-06-19 - how-remote-control-rf-devices-raspberry-pi.md', '2013-06-24 - writing-my-first-android-app-control-your-raspberr.md']
# Average filter length per post, in bytes.
# Floor division keeps the result an integer on Python 3 as well, in line
# with the recorded value below; `/` between ints only floors on Python 2.
sum(len(post_filter.bitarray.tobytes()) for post_filter in filters.values()) // len(filters)
# Recorded result of the expression above, presumably from an interactive
# session; as a statement in a script it is evaluated and discarded.
295
# Build a single shared Bloom filter for all posts. Its capacity is the sum
# of the per-post distinct word counts, and each entry is keyed by the post
# name concatenated with the word.
total_words = sum(map(len, split_posts.values()))
big_filter = BloomFilter(capacity=total_words, error_rate=0.1)
for name, words in split_posts.items():
    for word in words:
        key = name + word
        big_filter.add(key)
def search_big(search_string):
    """Return the names of the posts matching every query term via ``big_filter``.

    The query is lowercased and split on runs of non-word characters, the
    same way the post contents were tokenized, so matching is
    case-insensitive. Each term is looked up under the key
    ``post name + term``, which is how entries were added to ``big_filter``.

    Args:
        search_string: Free-text query, e.g. "android raspberry".

    Returns:
        A list of post names (keys of ``split_posts``) for which every
        ``name + term`` key is reported as present by ``big_filter``. A
        query with no word characters has no terms and therefore matches
        every post.
    """
    # Lowercase to agree with the lowercased index, and drop the empty
    # strings re.split produces for leading/trailing separators so that
    # stray whitespace or punctuation does not become a required term.
    search_terms = [term for term in re.split(r"\W+", search_string.lower()) if term]
    return [
        name
        for name in split_posts
        if all((name + term) in big_filter for term in search_terms)
    ]
# Example query against the single shared filter: list the posts for which
# both "android" and "raspberry" are reported as present.
search_big("android raspberry")
# The list below is presumably the result recorded from an interactive
# session; as a statement in a script it is evaluated and discarded.
['2013-12-07 - bloom-filter-search-engine.md', '2013-06-19 - how-remote-control-rf-devices-raspberry-pi.md', '2013-06-24 - writing-my-first-android-app-control-your-raspberr.md']
# Size of the shared filter in bytes, divided by the number of posts.
# Floor division keeps the result an integer on Python 3 as well, in line
# with the recorded value below; `/` between ints only floors on Python 2.
len(big_filter.bitarray.tobytes()) // len(split_posts)
# Recorded result of the expression above, presumably from an interactive
# session; as a statement in a script it is evaluated and discarded.
294