HyperLogLog counter

The HyperLogLog counter uses bit patterns in hash functions to estimate

Read:

http://blog.aggregateknowledge.com/2012/10/25/sketch-of-the-day-hyperloglog-cornerstone-of-a-big-data-infrastructure/

The code was mostly taken from

https://github.com/svpcom/hyperloglog

which is Vasily Evseenko's rewrite of Nelson Gonçalves's Log-Log-Sketch repo (https://github.com/goncalvesnelson/). I'm much indebted to them!

In [148]:
# derived constants etc.  These are just utility functions that you can ignore.

def _get_alpha(b):
    if not (4 <= b <= 16):
        raise ValueError("b=%d should be in range [4 : 16]" % b)

    if b == 4:
        return 0.673

    if b == 5:
        return 0.697

    if b == 6:
        return 0.709

    return 0.7213 / (1.0 + 1.079 / (1 << b))

def estimate_cardinality(alpha, bits, bins):
    # harmonic mean
    E = alpha * float(len(bins) ** 2) / sum(math.pow(2.0, -x) for x in bins)
                                       
    if E <= 2.5 * bits:             # Small range correction              
        V = bins.count(0)           #count number or registers equal to 0
        return bits * math.log(bins/ float(V)) if V > 0 else E
    elif E <= float(1L << 160) / 30.0:
        return E
    else:
        return -(1L << 160) * math.log(1.0 - E / (1L << 160))
In [149]:
# choose the precision by choosing how many estimators to track.
bits = 8
alpha = _get_alpha(bits)
num_bins = 1 << bits
bit_bins = [ 1L << i for i in range(160 - bits + 1) ]

print 'num bins:', num_bins
num bins: 256

In [150]:
# 'rho' function to calculate the bit pattern to watch (string of 0s)
import bisect

# here, 'rho' is the number of 0s to the left of the first 'accuracy' bits.
def rho(w):
    r = len(bit_bins) - bisect.bisect_right(bit_bins, w)
    return r
In [151]:
print 1, len(bit_bins) - rho(1)
print 2, len(bit_bins) - rho(2)
print 3, len(bit_bins) - rho(3)
print 4, len(bit_bins) - rho(4)
print 8, len(bit_bins) - rho(8)
print 2**152, len(bit_bins) - rho(2**152)
1 1
2 2
3 2
4 3
8 4
5708990770823839524233143877797980545530986496 153

In [152]:
print 'initializing', num_bins, 'estimators'
estimators = [0]*num_bins
initializing 256 estimators

In [153]:
from hashlib import sha1

# to add a number into the counter:
def add(num):
    # take the hash of 'num'
    num = str(num)
    hash = long(sha1(num).hexdigest(), 16)
    
    # here, 'bin' is determined by the first 'bits' bits of hash
    bin = hash & ((1 << bits) - 1)
    
    # now count the number of 0s in the remaining bits
    remaining_bits = hash >> bits
    count = rho(remaining_bits)
    
    # take max of currently stored estimation & this one
    estimators[bin] = max(estimators[bin], count)
    
import random
for i in range(100000):
    num = random.randint(0, int(1e9))
    add(num)
In [154]:
print 'estimate cardinality as', estimate_cardinality(alpha, bits, estimators)
estimate cardinality as 98837.2382097

In [155]:
import random
for i in range(100000):
    num = random.randint(0, int(1e9))
    add(num)
In [156]:
print 'estimate cardinality as', estimate_cardinality(alpha, bits, estimators)
estimate cardinality as 187751.540573

In []:
 
Back to top