HyperLogLog counter¶

The HyperLogLog counter uses bit patterns in hash functions to estimate

http://blog.aggregateknowledge.com/2012/10/25/sketch-of-the-day-hyperloglog-cornerstone-of-a-big-data-infrastructure/

The code was mostly taken from

https://github.com/svpcom/hyperloglog

which is Vasily Evseenko's rewrite of Nelson Gonçalves's Log-Log-Sketch repo (https://github.com/goncalvesnelson/). I'm much indebted to them!

In :
# derived constants etc.  These are just utility functions that you can ignore.

def _get_alpha(b):
if not (4 <= b <= 16):
raise ValueError("b=%d should be in range [4 : 16]" % b)

if b == 4:
return 0.673

if b == 5:
return 0.697

if b == 6:
return 0.709

return 0.7213 / (1.0 + 1.079 / (1 << b))

def estimate_cardinality(alpha, bits, bins):
# harmonic mean
E = alpha * float(len(bins) ** 2) / sum(math.pow(2.0, -x) for x in bins)

if E <= 2.5 * bits:             # Small range correction
V = bins.count(0)           #count number or registers equal to 0
return bits * math.log(bins/ float(V)) if V > 0 else E
elif E <= float(1L << 160) / 30.0:
return E
else:
return -(1L << 160) * math.log(1.0 - E / (1L << 160))
In :
# choose the precision by choosing how many estimators to track.
bits = 8
alpha = _get_alpha(bits)
num_bins = 1 << bits
bit_bins = [ 1L << i for i in range(160 - bits + 1) ]

print 'num bins:', num_bins
num bins: 256
In :
# 'rho' function to calculate the bit pattern to watch (string of 0s)
import bisect

# here, 'rho' is the number of 0s to the left of the first 'accuracy' bits.
def rho(w):
r = len(bit_bins) - bisect.bisect_right(bit_bins, w)
return r
In :
print 1, len(bit_bins) - rho(1)
print 2, len(bit_bins) - rho(2)
print 3, len(bit_bins) - rho(3)
print 4, len(bit_bins) - rho(4)
print 8, len(bit_bins) - rho(8)
print 2**152, len(bit_bins) - rho(2**152)
1 1
2 2
3 2
4 3
8 4
5708990770823839524233143877797980545530986496 153
In :
print 'initializing', num_bins, 'estimators'
estimators = *num_bins
initializing 256 estimators
In :
from hashlib import sha1

# to add a number into the counter:
# take the hash of 'num'
num = str(num)
hash = long(sha1(num).hexdigest(), 16)

# here, 'bin' is determined by the first 'bits' bits of hash
bin = hash & ((1 << bits) - 1)

# now count the number of 0s in the remaining bits
remaining_bits = hash >> bits
count = rho(remaining_bits)

# take max of currently stored estimation & this one
estimators[bin] = max(estimators[bin], count)

import random
for i in range(100000):
num = random.randint(0, int(1e9))