#!/usr/bin/env python
# coding: utf-8

# # Playing with Numba and finite differences
#
# **David I. Ketcheson**
#
# Date: **May 26, 2015**
#
# **tl;dr: Fortran is fastest; numba and Cython are within a factor of two,
# with numba somewhat faster than Cython; plain numpy is much slower.**
#
# **Update**: *In the original version of this notebook (posted yesterday),
# my Cython code omitted a type declaration for the loop counters, which
# made it catastrophically slow. Many thanks to my tweeps for pointing that
# out. With the added type declarations, Cython speed is similar to Numba and Fortran.
# Also, some of you pointed out that I was needlessly re-allocating memory
# every time the function was called, so I've now moved the memory allocation
# outside of the functions. This significantly sped up the numba and Fortran calls,
# with the result that Cython is still about 2x slower than those (but much faster
# than plain numpy).*
#
# In this notebook, I test the speed of naive Python, vectorized
# numpy, numba, Cython, and Fortran for solving the heat equation
# on a regular Cartesian grid in 1- and 2-D with centered differences
# in space and the explicit Euler method in time.
# You can view this as an update to
# [Jake Vanderplas' blog post](https://jakevdp.github.io/blog/2013/06/15/numba-vs-cython-take-2/)
# from two years ago, but with a different kernel.
#
# Note that this operation is just a sparse matrix-vector multiply
# (see the short scipy sketch after the 1D timings below), but
# I'm not interested in implementing it that way because my real algorithms
# of interest are solvers for nonlinear hyperbolic PDEs. There are
# better algorithms for this problem involving e.g. FFTs or implicit methods,
# but for the same reason I'm not primarily interested in those approaches.
#
# I've run this on two machines and gotten some significantly different
# results, so YMMV. The results below were computed on a Mac Pro with
# two quad-core 2.66 GHz Intel Xeon processors and 16 GB of RAM.

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams.update({'font.size': 15})
import numpy as np
from numba import jit
import numba
numba.__version__

# # 1D heat equation
#
# # Naive
# A naive implementation of the second-order centered difference:

# In[2]:

def centered_difference_1D(u, D, dx=1):
    """Second-order centered difference of u at interior points, stored in D."""
    for i in range(1, len(D)-1):
        D[i] = (u[i+1] - 2*u[i] + u[i-1])/dx**2
    return D

N = 100
u = np.random.rand(N)
D = np.empty_like(u)
u[0] = 0; u[-1] = 0  # homogeneous Dirichlet boundary values

times = {}
y = get_ipython().run_line_magic('timeit', '-o centered_difference_1D(u,D)')
times['Python'] = y.best

# # Numba jit
# Obviously, it's slow. How much can we speed it up with numba?

# In[3]:

@jit
def centered_difference_1D_numba(u, D, dx=1):
    for i in range(1, len(D)-1):
        D[i] = (u[i+1] - 2*u[i] + u[i-1])/dx**2
    return D

# In[4]:

y = get_ipython().run_line_magic('timeit', '-o centered_difference_1D_numba(u,D)')
times['numba'] = y.best

# Not bad: a 100x speedup.
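# As an aside, the claim above that this kernel is just a sparse
# matrix-vector multiply can be made concrete with scipy. This is a
# sketch, not part of the original benchmark; it assumes scipy is
# available (it isn't used elsewhere in this notebook) and compares
# only interior points, since the loop kernel leaves the boundary
# entries of D untouched.

# In[ ]:

from scipy.sparse import diags

L = diags([1., -2., 1.], [-1, 0, 1], shape=(N, N))  # tridiagonal 1D Laplacian stencil
D_sparse = L.dot(u)  # dx = 1 here, so no scaling is needed
print(np.allclose(D_sparse[1:-1],
                  centered_difference_1D(u, np.empty_like(u))[1:-1]))  # should print True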
# # Time to solve the 1D heat equation
# Now let's actually solve the heat equation.

# In[5]:

def solve_heat_equation_1D(u, D, dx, dt, T):
    """Advance the solution u to time T using the explicit Euler method."""
    n_steps = int(T/dt)
    for j in range(n_steps):
        u = u + dt*centered_difference_1D(u, D, dx)
    return u

@jit  # jit only makes a very small improvement here
def solve_heat_equation_1D_numba(u, D, dx, dt, T):
    """Advance the solution u to time T using the explicit Euler method."""
    n_steps = int(T/dt)
    for j in range(n_steps):
        u = u + dt*centered_difference_1D_numba(u, D, dx)
    return u

x = np.linspace(0, 1, N)
u = np.exp(-100*(x-0.5)**2)
dx = x[1] - x[0]
dt = 0.4 * dx**2  # satisfies the explicit stability limit dt <= dx**2/2

times = {}
print('Python:')
y = get_ipython().run_line_magic('timeit', '-o solve_heat_equation_1D(u,D,dx,dt,T=1)')
times['Python'] = y.best
print('Numba:')
y = get_ipython().run_line_magic('timeit', '-o solve_heat_equation_1D_numba(u,D,dx,dt,T=1)')
times['numba'] = y.best

# # 2D centered difference kernel
# From here on, I won't time the basic Python implementation because
# it's so slow, but I show the code for reference.

# In[6]:

def centered_difference_2D(u, D, dx=1.):
    D = np.zeros_like(u)
    for i in range(1, u.shape[0]-1):
        for j in range(1, u.shape[1]-1):
            D[i,j] = (u[i+1,j] + u[i,j+1] + u[i-1,j] + u[i,j-1] - 4*u[i,j])/dx**2
    return D

@jit
def centered_difference_2D_numba(u, D, dx=1.):
    for i in range(1, u.shape[0]-1):
        for j in range(1, u.shape[1]-1):
            D[i,j] = (u[i+1,j] + u[i,j+1] + u[i-1,j] + u[i,j-1] - 4*u[i,j])/dx**2
    return D

def centered_difference_2D_vectorized(u, D, dx=1.):
    D = np.zeros_like(u)
    D[1:-1,1:-1] = (u[2:,1:-1] + u[1:-1,2:] + u[:-2,1:-1] + u[1:-1,:-2] - 4*u[1:-1,1:-1])/dx**2
    return D

# # Cython
# I essentially borrowed this code from Jake Vanderplas' blog.

# In[7]:

get_ipython().run_line_magic('load_ext', 'Cython')

# In[8]:

get_ipython().run_cell_magic('cython', '', 'cimport cython\nimport numpy as np\n\n@cython.boundscheck(False)\n@cython.wraparound(False)\ndef centered_difference_2D_cython(double[:, :] u, double[:, :] D, double dx):\n    cdef int i, j\n    cdef int M = u.shape[0]\n    for i in range(1,M-1):\n        for j in range(1,M-1):\n            D[i,j] = (u[i+1,j] + u[i,j+1] + u[i-1,j] + u[i,j-1] - 4*u[i,j])/dx**2\n    return np.asarray(D)\n')

# # Fortran
# I can call Fortran by using f2py. Note that my compiler is gfortran,
# which is certainly not the best. Also, I'm unsure which optimization
# flags f2py passes to it. When this routine is called below,
# I'm careful to give it Fortran-ordered arrays.

# In[9]:

get_ipython().run_cell_magic('file', 'cdiff_fort.f90', '\nsubroutine centered_diff_fort(u,D,dx,m)\n    integer :: m\n    double precision, intent(in) :: u(m,m), dx\n    double precision, intent(inout) :: D(m,m)\n    integer :: i,j\n    do j = 2,m-1\n        do i = 2,m-1\n            D(i,j) = (u(i+1,j) + u(i,j+1) + u(i-1,j) + u(i,j-1) &\n                      - 4*u(i,j))/dx**2\n        end do\n    end do\nend subroutine centered_diff_fort\n')

# In[10]:

# Compile the Fortran with f2py.
# We'll direct the output into /dev/null so it doesn't fill the screen.
get_ipython().system('f2py -c cdiff_fort.f90 -m centered_diff_fort > /dev/null')
# For some reason this fails in the notebook but works fine from a terminal
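# A quick way to confirm what f2py actually generated (a sanity check I'm
# adding here; it's not part of the original post) is to print the wrapped
# routine's docstring, which shows the argument order and types f2py inferred:

# In[ ]:

import centered_diff_fort
print(centered_diff_fort.centered_diff_fort.__doc__)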
# # Time to compute the 2D centered difference

# In[11]:

N = 1000
dx = 1.
u = np.random.rand(N, N)
D = np.zeros_like(u)

times = {}
#print('Python')
#y = %timeit -o centered_difference_2D(u,D,dx)
#times['Python'] = y.best
print('numpy vectorized')
y = get_ipython().run_line_magic('timeit', '-o centered_difference_2D_vectorized(u,D,dx)')
times['numpy'] = y.best
print('numba')
y = get_ipython().run_line_magic('timeit', '-o centered_difference_2D_numba(u,D,dx)')
times['numba'] = y.best
print('Cython')
y = get_ipython().run_line_magic('timeit', '-o centered_difference_2D_cython(u,D,dx)')
times['Cython'] = y.best

import centered_diff_fort
uF = np.zeros((N,N), order='F')  # Fortran-ordered arrays for the f2py routine
DF = np.zeros((N,N), order='F')
uF[:,:] = np.random.rand(N,N)
#print(u.flags['F_CONTIGUOUS'])
dx = 1.
print('Fortran')
y = get_ipython().run_line_magic('timeit', '-o centered_diff_fort.centered_diff_fort(uF,DF,dx,N)')
times['Fortran'] = y.best

plt.bar(range(len(times)), list(times.values()), align='center')
plt.xticks(range(len(times)), list(times.keys()));

# # Timing versus N

# In[12]:

grids = (50, 200, 500, 1000, 2000, 5000, 10000)
dx = 1.
times = {}
for method in ['Cython', 'numba', 'numpy', 'Fortran']:
    times[method] = []

for N in grids:
    u = np.random.rand(N, N)
    D = np.zeros((N, N))
    uF = np.zeros((N, N), order='F')
    DF = np.zeros((N, N), order='F')
    uF[:,:] = np.random.rand(N, N)
    y = get_ipython().run_line_magic('timeit', '-o centered_difference_2D_vectorized(u,D,dx);')
    times['numpy'].append(y.best)
    y = get_ipython().run_line_magic('timeit', '-o centered_difference_2D_numba(u,D,dx);')
    times['numba'].append(y.best)
    y = get_ipython().run_line_magic('timeit', '-o centered_difference_2D_cython(u,D,dx);')
    times['Cython'].append(y.best)
    y = get_ipython().run_line_magic('timeit', '-o centered_diff_fort.centered_diff_fort(uF,DF,dx,N);')
    times['Fortran'].append(y.best)

plt.loglog(grids, times['numpy'],
           grids, times['numba'],
           grids, times['Cython'],
           grids, times['Fortran'])
plt.legend(['numpy','numba','Cython','Fortran'], loc='best')
plt.xlabel('N')
plt.ylabel('Seconds');

# Fortran and Numba are the clear winners here. Evidently, I
# need to start using Numba more.
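# One caveat worth noting before the final comparison: plain @jit silently
# falls back to slow object mode if numba's type inference fails, which can
# make a benchmark misleading. Here is a sketch (my addition, not from the
# original post) that forces nopython mode, so any such fallback raises an
# error instead of quietly degrading performance:

# In[ ]:

@jit(nopython=True)
def centered_difference_2D_nopython(u, D, dx=1.):
    # Same kernel as centered_difference_2D_numba, but compilation in
    # nopython mode is guaranteed (or an error is raised).
    for i in range(1, u.shape[0]-1):
        for j in range(1, u.shape[1]-1):
            D[i,j] = (u[i+1,j] + u[i,j+1] + u[i-1,j] + u[i,j-1] - 4*u[i,j])/dx**2
    return D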
# # Time to solve the 2D heat equation

# In[13]:

def solve_heat_equation_2D(u, D, dx, dt, T):
    """Advance the solution u to time T using the explicit Euler method."""
    n_steps = int(T/dt)
    for j in range(n_steps):
        u = u + dt*centered_difference_2D(u, D, dx)
    return u

def solve_heat_equation_2D_vectorized(u, D, dx, dt, T):
    n_steps = int(T/dt)
    for j in range(n_steps):
        u = u + dt*centered_difference_2D_vectorized(u, D, dx)
    return u

@jit
def solve_heat_equation_2D_numba(u, D, dx, dt, T):
    n_steps = int(T/dt)
    for j in range(n_steps):
        u = u + dt*centered_difference_2D_numba(u, D, dx)
    return u

def solve_heat_equation_2D_cython(u, D, dx, dt, T):
    n_steps = int(T/dt)
    for j in range(n_steps):
        u = u + dt*centered_difference_2D_cython(u, D, dx)
    return u

def solve_heat_equation_2D_fortran(u, D, dx, dt, T):
    N = u.shape[0]
    n_steps = int(T/dt)
    for j in range(n_steps):
        centered_diff_fort.centered_diff_fort(u, D, dx, N)
        u = u + dt*D
    return u

N = 200
x = np.linspace(0, 1, N)
X, Y = np.meshgrid(x, x)
u = np.exp(-100*((X-0.5)**2 + (Y-0.5)**2))
D = np.zeros_like(u)
dx = x[1] - x[0]
dt = 0.2 * dx**2  # satisfies the 2D explicit stability limit dt <= dx**2/4
T = 0.01

times = {}
#print('Python')
#y = %timeit -o solve_heat_equation_2D(u,D,dx,dt,T=T)
#times['Python'] = y.best
print('numpy vectorized')
y = get_ipython().run_line_magic('timeit', '-o solve_heat_equation_2D_vectorized(u,D,dx,dt,T=T)')
times['numpy'] = y.best
print('numba')
y = get_ipython().run_line_magic('timeit', '-o solve_heat_equation_2D_numba(u,D,dx,dt,T=T)')
times['numba'] = y.best
print('Cython')
y = get_ipython().run_line_magic('timeit', '-o solve_heat_equation_2D_cython(u,D,dx,dt,T=T)')
times['Cython'] = y.best
print('Fortran')
uF = np.zeros((N,N), order='F')
DF = np.zeros((N,N), order='F')
uF[:,:] = np.exp(-100*((X-0.5)**2 + (Y-0.5)**2))
y = get_ipython().run_line_magic('timeit', '-o solve_heat_equation_2D_fortran(uF,DF,dx,dt,T=T)')
times['Fortran'] = y.best

plt.bar(range(len(times)), list(times.values()), align='center')
plt.xticks(range(len(times)), list(times.keys()));
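# As a closing sanity check (my addition, not in the original post), the
# backends should agree on the computed solution, not just compete on speed.
# Comparing the numpy and numba results, which should match to rounding:

# In[ ]:

u0 = np.exp(-100*((X-0.5)**2 + (Y-0.5)**2))
u_np = solve_heat_equation_2D_vectorized(u0.copy(), np.zeros_like(u0), dx, dt, T=T)
u_nb = solve_heat_equation_2D_numba(u0.copy(), np.zeros_like(u0), dx, dt, T=T)
print(np.allclose(u_np, u_nb))  # should print True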