import os
os.environ['NUMBA_ENABLE_AVX'] = '1'
import numpy as np
import numba
import math
def find_alignment(x):
    """Return the largest power of two that divides the data address of *x*.

    Parameters
    ----------
    x : object exposing ``x.ctypes.data`` as an integer address
        (for example a NumPy ndarray).

    Returns
    -------
    int
        The value of the lowest set bit of the address, e.g. 16 for an
        address that is a multiple of 16 but not of 32.  An address of 0
        has no set bit and yields 1.
    """
    address = x.ctypes.data
    # In two's complement, ``n & -n`` isolates the lowest set bit of n,
    # which is exactly the largest power of two dividing the address.
    # A zero address would give 0 here; fall back to 1 in that case.
    return (address & -address) or 1
def get_empty(shape, dtype, align16=False):
    """Return an uninitialised array whose data address has a chosen alignment.

    Instead of allocating repeatedly and hoping the allocator hands back a
    suitably placed block (which can never succeed when the allocator always
    returns the same alignment class), this over-allocates a byte buffer and
    returns a view that starts at the right offset inside it.

    Parameters
    ----------
    shape : int or tuple of int
        Shape of the returned array.
    dtype : data-type
        Anything accepted by ``np.dtype``.
    align16 : bool, optional
        If True the data address is a multiple of 16 but *not* of 32
        (alignment of exactly 16 bytes).  If False (default) the address is
        a multiple of 32, i.e. the alignment is strictly greater than 16.

    Returns
    -------
    numpy.ndarray
        C-contiguous array of the requested shape and dtype.  It is a view
        into a larger private buffer, which stays alive as long as the
        returned array does.
    """
    dtype = np.dtype(dtype)
    nbytes = int(np.prod(shape, dtype=np.intp)) * dtype.itemsize

    # One extra 32-byte period guarantees that an address with either
    # residue (0 or 16 modulo 32) exists inside the buffer.
    raw = np.empty(nbytes + 32, dtype=np.uint8)

    wanted_residue = 16 if align16 else 0
    offset = (wanted_residue - raw.ctypes.data) % 32

    return raw[offset:offset + nbytes].view(dtype).reshape(shape)
# Work arrays whose data addresses are aligned to more than 16 bytes
# (align16=False): two inputs holding 0..9999 as float32 and one output.
a_align = get_empty(10000, np.float32, align16=False)
a_align[:] = np.arange(10000).astype(np.float32)
b_align = get_empty(10000, np.float32, align16=False)
b_align[:] = a_align
out_align = get_empty(10000, np.float32, align16=False)
# Parenthesised print with an explicit list produces the same text on
# Python 2 and also runs on Python 3 (where map() is lazy).
print(list(map(find_alignment, (a_align, b_align, out_align))))
[32, 64, 32]
# Same three work arrays, but with data addresses aligned to exactly
# 16 bytes (align16=True), for comparison against the arrays above.
a_nonalign = get_empty(10000, np.float32, align16=True)
a_nonalign[:] = np.arange(10000).astype(np.float32)
b_nonalign = get_empty(10000, np.float32, align16=True)
b_nonalign[:] = a_nonalign
out_nonalign = get_empty(10000, np.float32, align16=True)
# Parenthesised print with an explicit list produces the same text on
# Python 2 and also runs on Python 3 (where map() is lazy).
print(list(map(find_alignment, (a_nonalign, b_nonalign, out_nonalign))))
[16, 16, 16]
@numba.jit
def do_math(a, b, out):
    """Store ``a[i] + b[i] * |a[i]|`` into ``out[i]`` for each index of *a*.

    The loop runs over ``a.shape[0]`` elements; *b* and *out* are indexed
    with the same indices and nothing is returned (the result is written
    into *out* in place).
    """
    n = a.shape[0]
    for idx in range(n):
        a_val = a[idx]
        out[idx] = a_val + b[idx] * math.fabs(a_val)
# Call once before timing -- presumably so that numba's JIT compilation
# happens here and is not included in the measurements below.
do_math(a_align, b_align, out_align)
# IPython magics: time the kernel on the 16-byte-aligned arrays first,
# then on the arrays aligned to more than 16 bytes.
%timeit do_math(a_nonalign, b_nonalign, out_nonalign)
%timeit do_math(a_align, b_align, out_align)
100000 loops, best of 3: 3.59 µs per loop
100000 loops, best of 3: 2.46 µs per loop