The HyperLogLog counter uses bit patterns in hash functions to estimate
Read:
The code was mostly taken from
https://github.com/svpcom/hyperloglog
which is Vasily Evseenko's rewrite of Nelson Gonçalves's Log-Log-Sketch repo (https://github.com/goncalvesnelson/). I'm much indebted to them!
# derived constants etc. These are just utility functions that you can ignore.
def _get_alpha(b):
if not (4 <= b <= 16):
raise ValueError("b=%d should be in range [4 : 16]" % b)
if b == 4:
return 0.673
if b == 5:
return 0.697
if b == 6:
return 0.709
return 0.7213 / (1.0 + 1.079 / (1 << b))
def estimate_cardinality(alpha, bits, bins):
# harmonic mean
E = alpha * float(len(bins) ** 2) / sum(math.pow(2.0, -x) for x in bins)
if E <= 2.5 * bits: # Small range correction
V = bins.count(0) #count number or registers equal to 0
return bits * math.log(bins/ float(V)) if V > 0 else E
elif E <= float(1L << 160) / 30.0:
return E
else:
return -(1L << 160) * math.log(1.0 - E / (1L << 160))
# choose the precision by choosing how many estimators to track.
bits = 8
alpha = _get_alpha(bits)
num_bins = 1 << bits
bit_bins = [ 1L << i for i in range(160 - bits + 1) ]
print 'num bins:', num_bins
num bins: 256
# 'rho' function to calculate the bit pattern to watch (string of 0s)
import bisect
# here, 'rho' is the number of 0s to the left of the first 'accuracy' bits.
def rho(w):
r = len(bit_bins) - bisect.bisect_right(bit_bins, w)
return r
print 1, len(bit_bins) - rho(1)
print 2, len(bit_bins) - rho(2)
print 3, len(bit_bins) - rho(3)
print 4, len(bit_bins) - rho(4)
print 8, len(bit_bins) - rho(8)
print 2**152, len(bit_bins) - rho(2**152)
1 1 2 2 3 2 4 3 8 4 5708990770823839524233143877797980545530986496 153
print 'initializing', num_bins, 'estimators'
estimators = [0]*num_bins
initializing 256 estimators
from hashlib import sha1
# to add a number into the counter:
def add(num):
# take the hash of 'num'
num = str(num)
hash = long(sha1(num).hexdigest(), 16)
# here, 'bin' is determined by the first 'bits' bits of hash
bin = hash & ((1 << bits) - 1)
# now count the number of 0s in the remaining bits
remaining_bits = hash >> bits
count = rho(remaining_bits)
# take max of currently stored estimation & this one
estimators[bin] = max(estimators[bin], count)
import random
for i in range(100000):
num = random.randint(0, int(1e9))
add(num)
print 'estimate cardinality as', estimate_cardinality(alpha, bits, estimators)
estimate cardinality as 98837.2382097
import random
for i in range(100000):
num = random.randint(0, int(1e9))
add(num)
print 'estimate cardinality as', estimate_cardinality(alpha, bits, estimators)
estimate cardinality as 187751.540573