In [1]:

import os
os.environ['NUMBA_ENABLE_AVX'] = '1'
import numpy as np
import numba
import math

In [2]:

def find_alignment(x):
    """Return the largest power of two that divides the data address of ``x``.

    Parameters
    ----------
    x : object exposing ``x.ctypes.data`` as an integer address
        (the cells in this notebook pass numpy arrays).

    Returns
    -------
    int
        The value of the lowest set bit of the address, i.e. the byte
        alignment of the buffer.  An address of 0 yields 1.
    """
    address = x.ctypes.data
    if address == 0:
        # No bit is set, so there is nothing to isolate; report 2**0.
        return 1
    # In two's complement, ``n & -n`` keeps only the lowest set bit of n.
    return address & -address

def get_empty(shape, dtype, align16=False):
    """Return an uninitialised array whose data address has a chosen alignment.

    Instead of allocating repeatedly and hoping the allocator hands back a
    suitable address, this over-allocates a raw byte buffer and returns a
    view that starts at the first address with the requested alignment, so
    the result is deterministic and no retry limit is needed.

    Parameters
    ----------
    shape : int or sequence of ints
        Shape of the returned array.
    dtype : anything accepted by ``np.dtype``
        Element type of the returned array.
    align16 : bool, optional
        If True, the data address is a multiple of 16 but *not* of 32
        (alignment exactly 16).  If False (default), the data address is a
        multiple of 32 (alignment greater than 16).

    Returns
    -------
    numpy.ndarray
        C-contiguous array of the requested shape and dtype.  It is a view
        onto a slightly larger ``uint8`` buffer, which it keeps alive.

    Notes
    -----
    NOTE(review): dtypes that hold Python object references presumably cannot
    be produced by reinterpreting raw bytes -- confirm if such dtypes matter.
    """
    boundary = 32  # smallest power of two strictly greater than 16

    dtype = np.dtype(dtype)
    dims = (shape,) if np.isscalar(shape) else tuple(shape)
    n_items = 1
    for dim in dims:
        n_items *= int(dim)
    nbytes = n_items * dtype.itemsize

    # Extra ``boundary`` bytes guarantee that a suitably aligned start
    # address exists inside the buffer whatever address it was given.
    raw = np.empty(nbytes + boundary, dtype=np.uint8)

    # Desired address modulo ``boundary``: 16 -> exactly 16-byte aligned,
    # 0 -> aligned to at least ``boundary`` bytes.
    wanted_remainder = 16 if align16 else 0
    offset = (wanted_remainder - raw.ctypes.data) % boundary

    return raw[offset:offset + nbytes].view(dtype).reshape(dims)

In [3]:

# Input and output buffers whose data addresses are aligned to more than
# 16 bytes (align16=False).
a_align = get_empty(10000, np.float32, align16=False)
a_align[:] = np.arange(10000).astype(np.float32)
b_align = get_empty(10000, np.float32, align16=False)
b_align[:] = a_align
out_align = get_empty(10000, np.float32, align16=False)

# Report the alignment actually obtained for each buffer.  Printing an
# explicit list produces the same output on Python 2 and Python 3.
print([find_alignment(arr) for arr in (a_align, b_align, out_align)])

[32, 64, 32]

In [4]:

# Same data as the cell above, but in buffers whose data addresses are
# aligned to exactly 16 bytes (align16=True) -- "nonalign" here means
# "not aligned beyond 16 bytes".
a_nonalign = get_empty(10000, np.float32, align16=True)
a_nonalign[:] = np.arange(10000).astype(np.float32)
b_nonalign = get_empty(10000, np.float32, align16=True)
b_nonalign[:] = a_nonalign
out_nonalign = get_empty(10000, np.float32, align16=True)

# Report the alignment actually obtained for each buffer.  Printing an
# explicit list produces the same output on Python 2 and Python 3.
print([find_alignment(arr) for arr in (a_nonalign, b_nonalign, out_nonalign)])

[16, 16, 16]

In [5]:

@numba.jit
def do_math(a, b, out):
    """Store ``a[i] + b[i] * |a[i]|`` into ``out[i]`` for each index of ``a``.

    The loop runs over ``a.shape[0]`` elements and writes the results into
    ``out`` in place; nothing is returned.

    NOTE(review): assumes ``b`` and ``out`` have at least ``a.shape[0]``
    elements -- no length check is performed here.
    """
    n = a.shape[0]
    for idx in range(n):
        a_val = a[idx]
        out[idx] = a_val + b[idx] * math.fabs(a_val)

In [6]:

# First call of do_math with these argument types, made before any timing.
# Presumably this triggers numba's JIT compilation so that the %timeit runs
# below do not include compile time -- TODO confirm.
do_math(a_align, b_align, out_align)

In [7]:

# Time do_math on the buffers aligned to exactly 16 bytes, then on the
# buffers aligned to more than 16 bytes, to compare the two cases.
%timeit do_math(a_nonalign, b_nonalign, out_nonalign)
%timeit do_math(a_align, b_align, out_align)

100000 loops, best of 3: 3.59 µs per loop
100000 loops, best of 3: 2.46 µs per loop

In [7]: