from pybloom import BloomFilter
import os
import re

# Splits text on runs of non-word characters. Raw string so that "\W" is a
# regex escape rather than an (invalid) Python string escape.
_WORD_SPLIT = re.compile(r"\W+")

# Error rate passed to every Bloom filter built below.
_ERROR_RATE = 0.1


def _read_post(post_name):
    """Return the full text of the post file `post_name` inside POST_DIR."""
    # POST_DIR is expected to be defined before this code runs; it is not
    # defined in this snippet.
    # os.path.join works whether or not POST_DIR ends with a path separator.
    with open(os.path.join(POST_DIR, post_name)) as post_file:
        return post_file.read()


def _tokenize(text):
    """Return the set of lowercase words in `text`.

    Empty strings, which re.split produces when the text starts or ends with
    a non-word character, are dropped so they never count as a "word".
    """
    return {word for word in _WORD_SPLIT.split(text.lower()) if word}


# Read all my posts.
posts = {post_name: _read_post(post_name) for post_name in os.listdir(POST_DIR)}

# Create a dictionary of {"post name": "lowercase word set"}.
split_posts = {name: _tokenize(contents) for name, contents in posts.items()}

# One Bloom filter per post, holding that post's words.
filters = {}
for name, words in split_posts.items():
    # A post without any words would otherwise request a zero-capacity
    # filter, so the capacity is clamped to at least 1.
    bloom = BloomFilter(capacity=max(len(words), 1), error_rate=_ERROR_RATE)
    for word in words:
        bloom.add(word)
    filters[name] = bloom


def search(search_string):
    """Return the names of posts whose filter contains every search term.

    The query is tokenized exactly like the posts (lowercased, split on
    non-word characters, empty tokens dropped), so matching is
    case-insensitive. A query with no terms returns an empty list.

    Membership is tested against Bloom filters, so the result may include
    posts that do not actually contain every term.
    """
    search_terms = _tokenize(search_string)
    if not search_terms:
        return []
    return [
        name
        for name, bloom in filters.items()
        if all(term in bloom for term in search_terms)
    ]


search("android raspberry")

# Average filter length per post, in bytes.
(
    sum(len(bloom.bitarray.tobytes()) for bloom in filters.values()) / len(filters)
    if filters
    else 0
)

# A single Bloom filter for all posts, keyed by post name + word.
big_filter = BloomFilter(
    capacity=max(sum(len(words) for words in split_posts.values()), 1),
    error_rate=_ERROR_RATE,
)
for name, words in split_posts.items():
    for word in words:
        # NOTE(review): plain concatenation means different (name, word)
        # pairs can produce the same key (e.g. "ab" + "c" and "a" + "bc").
        # The key format is left unchanged so existing users of big_filter
        # keep working.
        big_filter.add(name + word)


def search_big(search_string):
    """Return the names of posts for which big_filter holds every term.

    Each term is looked up under the key `name + term`, matching how
    big_filter was populated. The query is tokenized like the posts
    (lowercased, empty tokens dropped); a query with no terms returns an
    empty list. Results may include false positives from the Bloom filter.
    """
    search_terms = _tokenize(search_string)
    if not search_terms:
        return []
    return [
        name
        for name in split_posts
        if all((name + term) in big_filter for term in search_terms)
    ]


search_big("android raspberry")

# Size of the shared filter divided by the number of posts, in bytes.
len(big_filter.bitarray.tobytes()) / len(split_posts) if split_posts else 0