%load_ext cythonmagic
%%cython
import numpy as np
cimport numpy as np
cimport cython
@cython.wraparound(False)
@cython.boundscheck(False)
cdef inline double inner_func(double[:, ::1] X):
    """Return element (0, 0) of the C-contiguous 2-D memoryview ``X``.

    Declared ``inline`` so the C compiler may expand it at the call site.
    Bounds checking and negative-index wraparound are switched off for
    the element access.
    """
    return X[0, 0]
def loop_1(int N, switch=True):
    """Call ``inner_func`` ``N`` times on a 100x100 array of zeros.

    ``switch`` is accepted but never read in this function; presumably it
    is kept so the signature matches ``loop_2`` -- confirm before removing.
    The value returned by ``inner_func`` is discarded on every iteration
    and this function returns ``None``.
    """
    cdef int i
    cdef double[:, ::1] X = np.zeros((100, 100))

    for i in range(N):
        # this should be inlined by the compiler
        inner_func(X)
missing cimport: /home/vanderplas/.config/ipython/cython/_cython_magic_aad7a6b46ed24a8baf2373c6b8784ac3.pyx cython
timeit loop_1(1E6)
100000 loops, best of 3: 10.1 us per loop
Now we'll repeat the experiment, but add a second, dummy function and choose between the two through a function pointer at run time, so that we can guarantee that inner_func will not be inlined:
%%cython
import numpy as np
cimport numpy as np
cimport cython
# C function-pointer type: a function taking a C-contiguous 2-D double
# memoryview and returning a double.
ctypedef double (*inner_func_ptr)(double[:, ::1])
@cython.wraparound(False)
@cython.boundscheck(False)
cdef double inner_func_1(double[:, ::1] X):
    """Return element (0, 0) of the C-contiguous 2-D memoryview ``X``.

    Not declared ``inline``; its signature matches ``inner_func_ptr`` so it
    can be called through a function pointer.
    """
    return X[0, 0]
@cython.wraparound(False)
@cython.boundscheck(False)
cdef double inner_func_2(double[:, ::1] X):
    """Return element (0, 0) of the C-contiguous 2-D memoryview ``X``.

    Same body as ``inner_func_1``; it exists as the alternative target for
    the function pointer chosen in ``loop_2``.
    """
    return X[0, 0]
def loop_2(int N, switch=True):
    """Call one of two identical functions ``N`` times via a function pointer.

    A truthy ``switch`` selects ``inner_func_1``, otherwise ``inner_func_2``.
    The call operates on a 100x100 array of zeros; its result is discarded
    and this function returns ``None``.
    """
    cdef int i
    cdef inner_func_ptr func
    cdef double[:, ::1] X = np.zeros((100, 100))

    # use a switch to ensure that inlining can't happen: compilers
    # are usually smart enough to figure it out otherwise.
    func = inner_func_1
    if not switch:
        func = inner_func_2

    for i in range(N):
        func(X)
missing cimport: /home/vanderplas/.config/ipython/cython/_cython_magic_ac4b8923a5786ceee8b7f92131a08998.pyx cython
%timeit loop_2(1E6)
10 loops, best of 3: 22.9 ms per loop
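In this case, inlining improves the computation speed by a factor of roughly 2000! (Note that 10.1 us for 1E6 iterations is about 10 ps per iteration, far less than a single clock cycle. Because the return value of inner_func is never used, the compiler is most likely removing the inlined loop body altogether rather than just saving the call overhead.)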
Here we'll replicate the fast method above, but with a typed numpy array rather than a typed memoryview:
%%cython
import numpy as np
cimport numpy as np
cimport cython
@cython.wraparound(False)
@cython.boundscheck(False)
cdef inline double inner_func(np.ndarray[double, ndim=2, mode='c'] X):
    """Return element (0, 0) of the C-contiguous 2-D double array ``X``.

    Same as the memoryview version, except that the argument uses the
    numpy buffer syntax instead of a typed memoryview.
    """
    return X[0, 0]
def loop_3(int N, switch=True):
    """Call the ndarray-typed ``inner_func`` ``N`` times on a 100x100 zero array.

    ``switch`` is accepted but never read in this function; presumably it
    is kept so the signature matches ``loop_2`` -- confirm before removing.
    The value returned by ``inner_func`` is discarded and this function
    returns ``None``.
    """
    cdef int i
    cdef np.ndarray[double, ndim=2, mode='c'] X = np.zeros((100, 100))

    for i in range(N):
        inner_func(X)
missing cimport: /home/vanderplas/.config/ipython/cython/_cython_magic_0523a248c0a6af0427879e879eda0095.pyx cython
warning: /home/vanderplas/.config/ipython/cython/_cython_magic_0523a248c0a6af0427879e879eda0095.pyx:8:30: Buffer unpacking not optimized away. warning: /home/vanderplas/.config/ipython/cython/_cython_magic_0523a248c0a6af0427879e879eda0095.pyx:8:30: Buffer unpacking not optimized away.
These warnings indicate a problem: the buffer unpacking for the typed numpy array argument of inner_func cannot be optimized away. Let's see how this compares to the other implementations:
print "inlined memview:"
%timeit loop_1(1E6)
print "non-inlined memview:"
%timeit loop_2(1E6)
print "inlined ndarray:"
%timeit loop_3(1E6)
inlined memview:
100000 loops, best of 3: 10.1 us per loop
non-inlined memview:
10 loops, best of 3: 22.9 ms per loop
inlined ndarray:
1 loops, best of 3: 617 ms per loop
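The result for the ndarray is far slower than either the inlined or the non-inlined example above: roughly 27 times slower than the non-inlined memoryview version, and about 60,000 times slower than the inlined one!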