Date: May 26, 2015
tl;dr: Fortran is fastest; numba and Cython are within a factor of two, with numba somewhat faster than Cython; plain numpy is much slower.
Update: In the original version of this notebook (posted yesterday), my Cython code omitted a type declaration for the loop counters, which made it catastrophically slow. Many thanks to my tweeps for pointing that out. With the added type declarations, Cython speed is similar to Numba and Fortran. Also, some of you pointed out that I was needlessly re-allocating memory every time the function was called, so I've now move the memory allocation outside of the functions. This significantly sped up the numba and Fortran calls, with the result that Cython is still about 2x slower than those (but much faster than plain numpy).
In this notebook, I test the speed of naive Python, vectorized numpy, numba, Cython, and Fortran for solving the heat equation on a regular Cartesian grid in 1- and 2-D with centered differences in space and the explicit Euler method in time. You can view this as an update to Jake Vanderplas' blog post from two years ago, but with a different kernel.
Note that this operation is just a sparse matrix-vector multiply, but I'm not interested in implementing it that way because my real algorithms of interest are solvers for nonlinear hyperbolic PDEs. There are better algorithms for this problem involving e.g. FFTs or implicit methods, but for the same reason I'm not primarlily interested in those approaches.
I've run this on two machines and gotten some significantly different results, so YMMV. The results below were computed on a Mac Pro with 2 quad-core 2.66 GHz Intel Xeon processors and 16GB of RAM.
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams.update({'font.size': 15})
import numpy as np
from numba import jit
import numba
numba.__version__
'0.18.2'
def centered_difference_1D(u,D,dx=1):
for i in range(1,len(D)-1):
D[i] = (u[i+1] - 2*u[i] + u[i-1])/dx**2
return D
N = 100
u = np.random.rand(N)
D = np.empty_like(u)
u[0] = 0; u[-1] = 0
times = {}
y = %timeit -o centered_difference_1D(u,D)
times['Python'] = y.best
10000 loops, best of 3: 120 µs per loop
Obviously, it's slow. How much can we speed it up with numba?
@jit
def centered_difference_1D_numba(u,D,dx=1):
for i in range(1,len(D)-1):
D[i] = (u[i+1] - 2*u[i] + u[i-1])/dx**2
return D
y = %timeit -o centered_difference_1D_numba(u,D)
times['numba'] = y.best
The slowest run took 147324.47 times longer than the fastest. This could mean that an intermediate result is being cached 1000000 loops, best of 3: 665 ns per loop
Not bad: a 100x speedup.
Now let's actually solve the heat equation.
def solve_heat_equation_1D(u,D,dx,dt,T):
"""Advance solution u by time T using the explicit Euler method."""
n_steps = int(T/dt)
for j in range(n_steps):
u = u + dt*centered_difference_1D(u,D,dx)
return u
@jit # jit only makes a very small improvement here
def solve_heat_equation_1D_numba(u,D,dx,dt,T):
"""Advance solution u by time T using the explicit Euler method."""
n_steps = int(T/dt)
for j in range(n_steps):
u = u + dt*centered_difference_1D_numba(u,D,dx)
return u
x = np.linspace(0,1,N)
u = np.exp(-100*(x-0.5)**2)
dx = x[1]-x[0]
dt = 0.4 * dx**2
times = {}
print 'Python:'
y = %timeit -o solve_heat_equation_1D(u,D,dx,dt,T=1)
times['Python'] = y.best
print 'Numba:'
y = %timeit -o solve_heat_equation_1D_numba(u,D,dx,dt,T=1)
times['numba'] = y.best
Python: 1 loops, best of 3: 3.27 s per loop Numba: The slowest run took 4.46 times longer than the fastest. This could mean that an intermediate result is being cached 1 loops, best of 3: 86.7 ms per loop
From here on, I won't time the basic Python implementation because it's so slow. But I show the code for reference.
def centered_difference_2D(u,D,dx=1.):
D = np.zeros_like(u)
for i in range(1,u.shape[0]-1):
for j in range(1,u.shape[1]-1):
D[i,j] = (u[i+1,j] + u[i,j+1] + u[i-1,j] + u[i,j-1] - 4*u[i,j])/dx**2
return D
@jit
def centered_difference_2D_numba(u,D,dx=1.):
for i in range(1,u.shape[0]-1):
for j in range(1,u.shape[1]-1):
D[i,j] = (u[i+1,j] + u[i,j+1] + u[i-1,j] + u[i,j-1] - 4*u[i,j])/dx**2
return D
def centered_difference_2D_vectorized(u,D,dx=1.):
D = np.zeros_like(u)
D[1:-1,1:-1] = (u[2:,1:-1]+u[1:-1,2:]+u[:-2,1:-1]+u[1:-1,:-2]-4*u[1:-1,1:-1])/dx**2
return D
I essentially borrowed this code from Jake Vanderplas' blog.
%load_ext Cython
%%cython
cimport cython
import numpy as np
@cython.boundscheck(False)
@cython.wraparound(False)
def centered_difference_2D_cython(double[:, :] u, double[:, :] D, double dx):
cdef int i, j
cdef int M = u.shape[0]
for i in range(1,M-1):
for j in range(1,M-1):
D[i,j] = (u[i+1,j] + u[i,j+1] + u[i-1,j] + u[i,j-1] - 4*u[i,j])/dx**2
return np.asarray(D)
I can call Fortran by using f2py. Note that my compiler is gfortran, which is certainly not the best. Also, I'm unsure of what optimization flags are being thrown by f2py. When this routine is called below, I'm careful to give it Fortran-ordered arrays.
%%file cdiff_fort.f90
subroutine centered_diff_fort(u,D,dx,m)
integer :: m
double precision, intent(in) :: u(m,m), dx
double precision, intent(inout) :: D(m,m)
integer :: i,j
do j = 2,m-1
do i = 2,m-1
D(i,j) = (u(i+1,j) + u(i,j+1) + u(i-1,j) + u(i,j-1)
& - 4*u(i,j))/dx**2
end do
end do
end subroutine centered_diff_fort
Overwriting cdiff_fort.f90
# Compile the Fortran with f2py.
# We'll direct the output into /dev/null so it doesn't fill the screen
!f2py -c cdiff_fort.f90 -m centered_diff_fort > /dev/null
# For some reason this fails in the notebook but works fine from a terminal
N=1000
dx = 1.
u = np.random.rand(N,N)
D = np.zeros_like(u)
times = {}
#print 'Python'
#y = %timeit -o centered_difference_2D(u,D,dx)
#times['Python'] = y.best
print 'numpy vectorized'
y = %timeit -o centered_difference_2D_vectorized(u,D,dx)
times['numpy'] = y.best
print 'numba'
y = %timeit -o centered_difference_2D_numba(u,D,dx)
times['numba'] = y.best
print 'Cython'
y = %timeit -o centered_difference_2D_cython(u,D,dx)
times['Cython'] = y.best
import centered_diff_fort
uF = np.zeros((N,N),order='F')
DF = np.zeros((N,N),order='F')
uF[:,:] = np.random.rand(N,N)
#print u.flags['F_CONTIGUOUS']
dx = 1.
print 'Fortran'
y = %timeit -o centered_diff_fort.centered_diff_fort(uF,DF,dx,N)
times['Fortran'] = y.best
plt.bar(range(len(times)), times.values(), align='center')
plt.xticks(range(len(times)), times.keys());
numpy vectorized 10 loops, best of 3: 32.1 ms per loop numba The slowest run took 40.73 times longer than the fastest. This could mean that an intermediate result is being cached 100 loops, best of 3: 3.24 ms per loop Cython 100 loops, best of 3: 5.18 ms per loop Fortran 100 loops, best of 3: 2.15 ms per loop
grids = (50, 200, 500, 1000, 2000, 5000, 10000)
dx = 1.
times = {}
for method in ['Cython','numba','numpy','Fortran']:
times[method] = []
for N in grids:
u = np.random.rand(N,N)
D = np.zeros((N,N))
uF = np.zeros((N,N),order='F')
DF = np.zeros((N,N),order='F')
uF[:,:] = np.random.rand(N,N)
y = %timeit -o centered_difference_2D_vectorized(u,D,dx);
times['numpy'].append(y.best)
y = %timeit -o centered_difference_2D_numba(u,D,dx);
times['numba'].append(y.best)
y = %timeit -o centered_difference_2D_cython(u,D,dx);
times['Cython'].append(y.best)
y = %timeit -o centered_diff_fort.centered_diff_fort(uF,DF,dx,N);
times['Fortran'].append(y.best)
plt.loglog(grids,times['numpy'],
grids,times['numba'],
grids,times['Cython'],
grids,times['Fortran'])
plt.legend(['numpy','numba','Cython','Fortran'],loc='best');
plt.xlabel('N')
plt.ylabel('Seconds');
The slowest run took 4.38 times longer than the fastest. This could mean that an intermediate result is being cached 10000 loops, best of 3: 66.6 µs per loop 100000 loops, best of 3: 6.98 µs per loop 100000 loops, best of 3: 18 µs per loop 100000 loops, best of 3: 4.14 µs per loop 1000 loops, best of 3: 508 µs per loop 10000 loops, best of 3: 119 µs per loop 1000 loops, best of 3: 205 µs per loop The slowest run took 4.66 times longer than the fastest. This could mean that an intermediate result is being cached 10000 loops, best of 3: 55.8 µs per loop 100 loops, best of 3: 3.26 ms per loop 1000 loops, best of 3: 744 µs per loop 1000 loops, best of 3: 1.23 ms per loop The slowest run took 4.50 times longer than the fastest. This could mean that an intermediate result is being cached 1000 loops, best of 3: 365 µs per loop 10 loops, best of 3: 34.3 ms per loop 100 loops, best of 3: 3.22 ms per loop 100 loops, best of 3: 5.16 ms per loop 100 loops, best of 3: 2.13 ms per loop 10 loops, best of 3: 131 ms per loop 100 loops, best of 3: 12.9 ms per loop 10 loops, best of 3: 20.7 ms per loop 100 loops, best of 3: 8.74 ms per loop 1 loops, best of 3: 1.27 s per loop 1 loops, best of 3: 102 ms per loop 10 loops, best of 3: 143 ms per loop 10 loops, best of 3: 54.4 ms per loop 1 loops, best of 3: 4.76 s per loop 1 loops, best of 3: 330 ms per loop 1 loops, best of 3: 524 ms per loop 1 loops, best of 3: 246 ms per loop
Fortran and Numba are the obvious winners here. Obviously, I need to start using Numba more.
def solve_heat_equation_2D(u,D,dx,dt,T):
"""Advance solution u to time T using the explicit Euler method."""
n_steps = int(T/dt)
for j in range(n_steps):
u = u + dt*centered_difference_2D(u,D,dx)
return u
def solve_heat_equation_2D_vectorized(u,D,dx,dt,T):
n_steps = int(T/dt)
for j in range(n_steps):
u = u + dt*centered_difference_2D_vectorized(u,D,dx)
return u
@jit
def solve_heat_equation_2D_numba(u,D,dx,dt,T):
n_steps = int(T/dt)
for j in range(n_steps):
u = u + dt*centered_difference_2D_numba(u,D,dx)
return u
def solve_heat_equation_2D_cython(u,D,dx,dt,T):
n_steps = int(T/dt)
for j in range(n_steps):
u = u + dt*centered_difference_2D_cython(u,D,dx)
return u
def solve_heat_equation_2D_fortran(u,D,dx,dt,T):
N = u.shape[0]
n_steps = int(T/dt)
for j in range(n_steps):
centered_diff_fort.centered_diff_fort(u,D,dx,N)
u = u + dt*D
return u
N = 200
x = np.linspace(0,1,N)
X, Y = np.meshgrid(x,x)
u = np.exp(-100*((X-0.5)**2+(Y-0.5)**2))
D = np.zeros_like(u)
dx = x[1]-x[0]
dt = 0.2 * dx**2
T = 0.01
times = {}
#print 'Python'
#y = %timeit -o solve_heat_equation_2D(u,D,dx,dt,T=T)
#times['Python'] = y.best
print 'numpy vectorized'
y = %timeit -o solve_heat_equation_2D_vectorized(u,D,dx,dt,T=T)
times['numpy'] = y.best
print 'numba'
y = %timeit -o solve_heat_equation_2D_numba(u,D,dx,dt,T=T)
times['numba'] = y.best
print 'Cython'
y = %timeit -o solve_heat_equation_2D_cython(u,D,dx,dt,T=T)
times['Cython'] = y.best
print 'Fortran'
uF = np.zeros((N,N),order='F')
DF = np.zeros((N,N),order='F')
uF[:,:] = np.exp(-100*((X-0.5)**2+(Y-0.5)**2))
y = %timeit -o solve_heat_equation_2D_fortran(uF,DF,dx,dt,T=T)
times['Fortran'] = y.best
plt.bar(range(len(times)), times.values(), align='center')
plt.xticks(range(len(times)), times.keys());
numpy vectorized 1 loops, best of 3: 1.39 s per loop numba 1 loops, best of 3: 728 ms per loop Cython 1 loops, best of 3: 758 ms per loop Fortran 1 loops, best of 3: 449 ms per loop