#!/usr/bin/env python
# coding: utf-8

# # Playing with Numba and finite differences
#
# **David I. Ketcheson**
#
# Date: **May 26, 2015**
#
# **tl;dr: Fortran is fastest; numba and Cython are within a factor of two,
# with numba somewhat faster than Cython; plain numpy is much slower.**
#
# **Update**: *In the original version of this notebook (posted yesterday),
# my Cython code omitted a type declaration for the loop counters, which
# made it catastrophically slow. Many thanks to my tweeps for pointing that
# out. With the added type declarations, Cython speed is similar to Numba and Fortran.
# Also, some of you pointed out that I was needlessly re-allocating memory
# every time the function was called, so I've now moved the memory allocation
# outside of the functions. This significantly sped up the numba and Fortran calls,
# with the result that Cython is still about 2x slower than those (but much faster
# than plain numpy).*
#
# In this notebook, I test the speed of naive Python, vectorized
# numpy, numba, Cython, and Fortran for solving the heat equation
# on a regular Cartesian grid in 1- and 2-D with centered differences
# in space and the explicit Euler method in time.
# You can view this as an update to
# [Jake Vanderplas' blog post](https://jakevdp.github.io/blog/2013/06/15/numba-vs-cython-take-2/)
# from two years ago, but with a different kernel.
#
# Note that this operation is just a sparse matrix-vector multiply
# (see the short scipy sketch after the 1D timings below), but
# I'm not interested in implementing it that way because my real algorithms
# of interest are solvers for nonlinear hyperbolic PDEs. There are
# better algorithms for this problem involving e.g. FFTs or implicit methods,
# but for the same reason I'm not primarily interested in those approaches.
#
# I've run this on two machines and gotten some significantly different
# results, so YMMV. The results below were computed on a Mac Pro with
# two quad-core 2.66 GHz Intel Xeon processors and 16 GB of RAM.

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams.update({'font.size': 15})
import numpy as np
from numba import jit
import numba
numba.__version__

# # 1D heat equation
#
# # Naive
# A naive implementation of the second-order centered difference:

# In[2]:

def centered_difference_1D(u, D, dx=1):
    """Second-order centered difference of u at interior points, stored in D."""
    for i in range(1, len(D)-1):
        D[i] = (u[i+1] - 2*u[i] + u[i-1])/dx**2
    return D

N = 100
u = np.random.rand(N)
D = np.empty_like(u)
u[0] = 0; u[-1] = 0  # homogeneous Dirichlet boundary values

times = {}
y = get_ipython().run_line_magic('timeit', '-o centered_difference_1D(u,D)')
times['Python'] = y.best

# # Numba jit
# Obviously, it's slow. How much can we speed it up with numba?

# In[3]:

@jit
def centered_difference_1D_numba(u, D, dx=1):
    for i in range(1, len(D)-1):
        D[i] = (u[i+1] - 2*u[i] + u[i-1])/dx**2
    return D

# In[4]:

y = get_ipython().run_line_magic('timeit', '-o centered_difference_1D_numba(u,D)')
times['numba'] = y.best

# Not bad: a 100x speedup.
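# As an aside, the claim above that this kernel is just a sparse
# matrix-vector multiply can be made concrete with scipy. This is a
# sketch, not part of the original benchmark; it assumes scipy is
# available (it isn't used elsewhere in this notebook) and compares
# only interior points, since the loop kernel leaves the boundary
# entries of D untouched.

# In[ ]:

from scipy.sparse import diags

L = diags([1., -2., 1.], [-1, 0, 1], shape=(N, N))  # tridiagonal 1D Laplacian stencil
D_sparse = L.dot(u)  # dx = 1 here, so no scaling is needed
print(np.allclose(D_sparse[1:-1],
                  centered_difference_1D(u, np.empty_like(u))[1:-1]))  # should print True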
# # Time to solve the 1D heat equation
# Now let's actually solve the heat equation.

# In[5]:

def solve_heat_equation_1D(u, D, dx, dt, T):
    """Advance the solution u to time T using the explicit Euler method."""
    n_steps = int(T/dt)
    for j in range(n_steps):
        u = u + dt*centered_difference_1D(u, D, dx)
    return u

@jit  # jit only makes a very small improvement here
def solve_heat_equation_1D_numba(u, D, dx, dt, T):
    """Advance the solution u to time T using the explicit Euler method."""
    n_steps = int(T/dt)
    for j in range(n_steps):
        u = u + dt*centered_difference_1D_numba(u, D, dx)
    return u

x = np.linspace(0, 1, N)
u = np.exp(-100*(x-0.5)**2)
dx = x[1] - x[0]
dt = 0.4 * dx**2  # satisfies the explicit stability limit dt <= dx**2/2

times = {}
print('Python:')
y = get_ipython().run_line_magic('timeit', '-o solve_heat_equation_1D(u,D,dx,dt,T=1)')
times['Python'] = y.best
print('Numba:')
y = get_ipython().run_line_magic('timeit', '-o solve_heat_equation_1D_numba(u,D,dx,dt,T=1)')
times['numba'] = y.best

# # 2D centered difference kernel
# From here on, I won't time the basic Python implementation because
# it's so slow, but I show the code for reference.

# In[6]:

def centered_difference_2D(u, D, dx=1.):
    D = np.zeros_like(u)
    for i in range(1, u.shape[0]-1):
        for j in range(1, u.shape[1]-1):
            D[i,j] = (u[i+1,j] + u[i,j+1] + u[i-1,j] + u[i,j-1] - 4*u[i,j])/dx**2
    return D

@jit
def centered_difference_2D_numba(u, D, dx=1.):
    for i in range(1, u.shape[0]-1):
        for j in range(1, u.shape[1]-1):
            D[i,j] = (u[i+1,j] + u[i,j+1] + u[i-1,j] + u[i,j-1] - 4*u[i,j])/dx**2
    return D

def centered_difference_2D_vectorized(u, D, dx=1.):
    D = np.zeros_like(u)
    D[1:-1,1:-1] = (u[2:,1:-1] + u[1:-1,2:] + u[:-2,1:-1] + u[1:-1,:-2] - 4*u[1:-1,1:-1])/dx**2
    return D

# # Cython
# I essentially borrowed this code from Jake Vanderplas' blog.

# In[7]:

get_ipython().run_line_magic('load_ext', 'Cython')

# In[8]:

get_ipython().run_cell_magic('cython', '', 'cimport cython\nimport numpy as np\n\n@cython.boundscheck(False)\n@cython.wraparound(False)\ndef centered_difference_2D_cython(double[:, :] u, double[:, :] D, double dx):\n    cdef int i, j\n    cdef int M = u.shape[0]\n    for i in range(1,M-1):\n        for j in range(1,M-1):\n            D[i,j] = (u[i+1,j] + u[i,j+1] + u[i-1,j] + u[i,j-1] - 4*u[i,j])/dx**2\n    return np.asarray(D)\n')

# # Fortran
# I can call Fortran by using f2py. Note that my compiler is gfortran,
# which is certainly not the best. Also, I'm unsure which optimization
# flags f2py passes to it. When this routine is called below,
# I'm careful to give it Fortran-ordered arrays.

# In[9]:

get_ipython().run_cell_magic('file', 'cdiff_fort.f90', '\nsubroutine centered_diff_fort(u,D,dx,m)\n    integer :: m\n    double precision, intent(in) :: u(m,m), dx\n    double precision, intent(inout) :: D(m,m)\n    integer :: i,j\n    do j = 2,m-1\n        do i = 2,m-1\n            D(i,j) = (u(i+1,j) + u(i,j+1) + u(i-1,j) + u(i,j-1) &\n                      - 4*u(i,j))/dx**2\n        end do\n    end do\nend subroutine centered_diff_fort\n')

# In[10]:

# Compile the Fortran with f2py.
# We'll direct the output into /dev/null so it doesn't fill the screen.
get_ipython().system('f2py -c cdiff_fort.f90 -m centered_diff_fort > /dev/null')
# For some reason this fails in the notebook but works fine from a terminal
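# A quick way to confirm what f2py actually generated (a sanity check I'm
# adding here; it's not part of the original post) is to print the wrapped
# routine's docstring, which shows the argument order and types f2py inferred:

# In[ ]:

import centered_diff_fort
print(centered_diff_fort.centered_diff_fort.__doc__)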
# # Time to compute the 2D centered difference

# In[11]:

N = 1000
dx = 1.
u = np.random.rand(N, N)
D = np.zeros_like(u)

times = {}
#print('Python')
#y = %timeit -o centered_difference_2D(u,D,dx)
#times['Python'] = y.best
print('numpy vectorized')
y = get_ipython().run_line_magic('timeit', '-o centered_difference_2D_vectorized(u,D,dx)')
times['numpy'] = y.best
print('numba')
y = get_ipython().run_line_magic('timeit', '-o centered_difference_2D_numba(u,D,dx)')
times['numba'] = y.best
print('Cython')
y = get_ipython().run_line_magic('timeit', '-o centered_difference_2D_cython(u,D,dx)')
times['Cython'] = y.best

import centered_diff_fort
uF = np.zeros((N,N), order='F')  # Fortran-ordered arrays for the f2py routine
DF = np.zeros((N,N), order='F')
uF[:,:] = np.random.rand(N,N)
#print(u.flags['F_CONTIGUOUS'])
dx = 1.
print('Fortran')
y = get_ipython().run_line_magic('timeit', '-o centered_diff_fort.centered_diff_fort(uF,DF,dx,N)')
times['Fortran'] = y.best

plt.bar(range(len(times)), list(times.values()), align='center')
plt.xticks(range(len(times)), list(times.keys()));

# # Timing versus N

# In[12]:

grids = (50, 200, 500, 1000, 2000, 5000, 10000)
dx = 1.
times = {}
for method in ['Cython', 'numba', 'numpy', 'Fortran']:
    times[method] = []

for N in grids:
    u = np.random.rand(N, N)
    D = np.zeros((N, N))
    uF = np.zeros((N, N), order='F')
    DF = np.zeros((N, N), order='F')
    uF[:,:] = np.random.rand(N, N)
    y = get_ipython().run_line_magic('timeit', '-o centered_difference_2D_vectorized(u,D,dx);')
    times['numpy'].append(y.best)
    y = get_ipython().run_line_magic('timeit', '-o centered_difference_2D_numba(u,D,dx);')
    times['numba'].append(y.best)
    y = get_ipython().run_line_magic('timeit', '-o centered_difference_2D_cython(u,D,dx);')
    times['Cython'].append(y.best)
    y = get_ipython().run_line_magic('timeit', '-o centered_diff_fort.centered_diff_fort(uF,DF,dx,N);')
    times['Fortran'].append(y.best)

plt.loglog(grids, times['numpy'],
           grids, times['numba'],
           grids, times['Cython'],
           grids, times['Fortran'])
plt.legend(['numpy','numba','Cython','Fortran'], loc='best')
plt.xlabel('N')
plt.ylabel('Seconds');

# Fortran and Numba are the clear winners here. Evidently, I
# need to start using Numba more.
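# One caveat worth noting before the final comparison: plain @jit silently
# falls back to slow object mode if numba's type inference fails, which can
# make a benchmark misleading. Here is a sketch (my addition, not from the
# original post) that forces nopython mode, so any such fallback raises an
# error instead of quietly degrading performance:

# In[ ]:

@jit(nopython=True)
def centered_difference_2D_nopython(u, D, dx=1.):
    # Same kernel as centered_difference_2D_numba, but compilation in
    # nopython mode is guaranteed (or an error is raised).
    for i in range(1, u.shape[0]-1):
        for j in range(1, u.shape[1]-1):
            D[i,j] = (u[i+1,j] + u[i,j+1] + u[i-1,j] + u[i,j-1] - 4*u[i,j])/dx**2
    return D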
# # Time to solve the 2D heat equation

# In[13]:

def solve_heat_equation_2D(u, D, dx, dt, T):
    """Advance the solution u to time T using the explicit Euler method."""
    n_steps = int(T/dt)
    for j in range(n_steps):
        u = u + dt*centered_difference_2D(u, D, dx)
    return u

def solve_heat_equation_2D_vectorized(u, D, dx, dt, T):
    n_steps = int(T/dt)
    for j in range(n_steps):
        u = u + dt*centered_difference_2D_vectorized(u, D, dx)
    return u

@jit
def solve_heat_equation_2D_numba(u, D, dx, dt, T):
    n_steps = int(T/dt)
    for j in range(n_steps):
        u = u + dt*centered_difference_2D_numba(u, D, dx)
    return u

def solve_heat_equation_2D_cython(u, D, dx, dt, T):
    n_steps = int(T/dt)
    for j in range(n_steps):
        u = u + dt*centered_difference_2D_cython(u, D, dx)
    return u

def solve_heat_equation_2D_fortran(u, D, dx, dt, T):
    N = u.shape[0]
    n_steps = int(T/dt)
    for j in range(n_steps):
        centered_diff_fort.centered_diff_fort(u, D, dx, N)
        u = u + dt*D
    return u

N = 200
x = np.linspace(0, 1, N)
X, Y = np.meshgrid(x, x)
u = np.exp(-100*((X-0.5)**2 + (Y-0.5)**2))
D = np.zeros_like(u)
dx = x[1] - x[0]
dt = 0.2 * dx**2  # satisfies the 2D explicit stability limit dt <= dx**2/4
T = 0.01

times = {}
#print('Python')
#y = %timeit -o solve_heat_equation_2D(u,D,dx,dt,T=T)
#times['Python'] = y.best
print('numpy vectorized')
y = get_ipython().run_line_magic('timeit', '-o solve_heat_equation_2D_vectorized(u,D,dx,dt,T=T)')
times['numpy'] = y.best
print('numba')
y = get_ipython().run_line_magic('timeit', '-o solve_heat_equation_2D_numba(u,D,dx,dt,T=T)')
times['numba'] = y.best
print('Cython')
y = get_ipython().run_line_magic('timeit', '-o solve_heat_equation_2D_cython(u,D,dx,dt,T=T)')
times['Cython'] = y.best
print('Fortran')
uF = np.zeros((N,N), order='F')
DF = np.zeros((N,N), order='F')
uF[:,:] = np.exp(-100*((X-0.5)**2 + (Y-0.5)**2))
y = get_ipython().run_line_magic('timeit', '-o solve_heat_equation_2D_fortran(uF,DF,dx,dt,T=T)')
times['Fortran'] = y.best

plt.bar(range(len(times)), list(times.values()), align='center')
plt.xticks(range(len(times)), list(times.keys()));
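# As a closing sanity check (my addition, not in the original post), the
# backends should agree on the computed solution, not just compete on speed.
# Comparing the numpy and numba results, which should match to rounding:

# In[ ]:

u0 = np.exp(-100*((X-0.5)**2 + (Y-0.5)**2))
u_np = solve_heat_equation_2D_vectorized(u0.copy(), np.zeros_like(u0), dx, dt, T=T)
u_nb = solve_heat_equation_2D_numba(u0.copy(), np.zeros_like(u0), dx, dt, T=T)
print(np.allclose(u_np, u_nb))  # should print True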