def char_count(text): histogram = {} for ch in text: histogram[ch] = histogram.get(ch, 0) + 1 return histogram letter_count = char_count("live and let live") letter_count def by_value(tup): return tup[1] def build_huffman_tree(letter_count): """ recieves dictionary with char:count entries generates a list of letters representing the binary Huffman encoding tree""" queue = list(letter_count.items()) while len(queue) >= 2: queue.sort(key=by_value) # sort by the count # combine two least frequent elements elem1, freq1 = queue.pop(0) elem2, freq2 = queue.pop(0) elems = (elem1, elem2) freqs = freq1 + freq2 queue.append((elems,freqs)) return queue[0][0] tree = build_huffman_tree(letter_count) tree def build_codebook(huff_tree, prefix=''): """ receives a Huffman tree with embedded encoding and a prefix of encodings. returns a dictionary where characters are keys and associated binary strings are values.""" if isinstance(huff_tree, str): # a leaf return {huff_tree: prefix} else: left, right = huff_tree codebook = {} codebook.update( build_codebook(left, prefix + '0')) codebook.update( build_codebook(right, prefix + '1')) return codebook codebook = build_codebook(tree) for ch,freq in sorted(letter_count.items(), key=by_value, reverse=True): print(ch, freq, codebook[ch]) def compress(text, codebook): ''' compress text using codebook dictionary ''' return ''.join(codebook[ch] for ch in text if ord(ch) <= 128) ch,ord(ch),'{:08b}'.format(ord(ch)) bits_ascii = ''.join(['{:07b}'.format(ord(ch)) for ch in "live and let live" if ord(ch) <= 128]) huff_bits = compress("live and let live", codebook) print(len(bits_ascii),bits_ascii) print(len(huff_bits),huff_bits) def build_decoding_dict(codebook): """build the "reverse" of encoding dictionary""" return {v:k for k,v in codebook.items()} decodebook = build_decoding_dict(codebook) decodebook def decompress(bits, decodebook): prefix = '' result = [] for bit in bits: prefix += bit if prefix not in decodebook: continue result.append(decodebook[prefix]) prefix = '' assert prefix == '' # must finish last codeword return ''.join(result) # converts list of chars to a string decompress(huff_bits, decodebook) def huffman_code(corpus): counts = char_count(corpus) tree = build_huffman_tree(counts) return build_codebook(tree) huffman_code('aaabbc') huffman_code('aaabbcc') codebook = huffman_code('qwertuioplkjhgfdsazxcvbnm') codebook mean([len(v) for v in codebook.values()]) corpus = ''.join([ch*(i+1) for i,ch in enumerate('qwertuioplkjhgfdsazxcvbnm') ]) corpus huffman_code(corpus) print(''.join([chr(c) for c in range(128)])) def compression_ratio(text, corpus): codebook = huffman_code(corpus) len_compress = len(compress(text, codebook)) len_ascii = len(text) * 7 return len_compress / len_ascii from urllib.request import urlopen with urlopen("http://www.gutenberg.org/cache/epub/42745/pg42745.txt") as f: gauss = f.read().decode('utf-8') print(gauss[:gauss.index('\n\r')]) with urlopen("http://www.gutenberg.org/files/25447/25447-0.txt") as f: russell = f.read().decode('utf-8') print(russell[:russell.index('\n\r')]) with urlopen("http://www.gutenberg.org/files/42743/42743-0.txt") as f: table_ronde = f.read().decode('utf-8') print(table_ronde[:table_ronde.index('\n\r')]) with urlopen("http://www.gutenberg.org/cache/epub/97/pg97.txt") as f: flatland_book = f.read().decode('utf-8') print(flatland_book[:flatland_book.index('\n\r')]) print("self",compression_ratio(flatland_book, flatland_book)) print("en",compression_ratio(flatland_book, russell)) print("de",compression_ratio(flatland_book, gauss)) print("fr",compression_ratio(flatland_book, table_ronde)) import pydot # https://github.com/nlhepler/pydot-py3, http://pyparsing.wikispaces.com/Download+and+Installation def tree_to_str(tree): if isinstance(tree, str): return tree return ''.join([tree_to_str(item) for item in tree]) def add_to_node(tree, node, graph): left, right = tree left_node, right_node = pydot.Node(tree_to_str(left)), pydot.Node(tree_to_str(right)) graph.add_node(left_node) graph.add_node(right_node) graph.add_edge(pydot.Edge(node, left_node)) graph.add_edge(pydot.Edge(node, right_node)) if not isinstance(left, str): add_to_node(left, left_node, graph) if not isinstance(right, str): add_to_node(right, right_node, graph) def tree_to_graph(tree): graph = pydot.Dot(graph_type='digraph',strict=True) root = pydot.Node(tree_to_str(tree)) graph.add_node(root) add_to_node(tree, root, graph) return graph tree_to_str(tree) graph = tree_to_graph(tree) with open("tmp.dot","w") as f: f.write(graph.to_string()) !dot tmp.dot -Tpng -ohuffman_tree.png !huffman_tree.png for c in flatland_book: if ord(c) >= 128: print(ord(c)) flatland_book_clean = flatland_book.replace(chr(65279), '').replace('"','').replace("'",'').replace(",",'').replace(";",'').replace(":",'').replace(".",'') flatland_tree = build_huffman_tree(char_count(flatland_book_clean.lower())) tree_to_str(flatland_tree) graph = tree_to_graph(flatland_tree) with open("tmp.dot","w") as f: f.write(graph.to_string()) !dot tmp.dot -Tpng -oflatland_tree.png !flatland_tree.png