#!/usr/bin/env python
# coding: utf-8

# > This is one of the 100 recipes of the
# > [IPython Cookbook](http://ipython-books.github.io/), the definitive guide
# > to high-performance scientific computing and data science in Python.
#
# 5.7. Writing massively parallel code for NVIDIA graphics cards (GPUs)
# with CUDA.

# Let's import PyCUDA.
import pycuda.driver as cuda
import pycuda.autoinit  # noqa: F401 -- importing this initializes the CUDA context
from pycuda.compiler import SourceModule
import numpy as np

# Image side length in pixels, and the cap on the escape-time iteration count.
size = 200
iterations = 100

# Host-side buffer that will receive the per-pixel escape counts.
# np.empty is safe here: the kernel writes every pixel (the n = 0
# iteration always passes the escape test, since z starts at 0).
col = np.empty((size, size), dtype=np.int32)

# Allocate a device-side buffer of the same byte size.
col_gpu = cuda.mem_alloc(col.nbytes)

# The CUDA kernel, one thread per pixel. `mandelbrot` takes the figure
# size, the iteration cap, and a pointer to the device buffer; it stores
# in `col` the last iteration at which the pixel's orbit was still bounded.
code = """
__global__ void mandelbrot(int size, int iterations, int *col)
{
    // Row and column index of the pixel handled by this thread.
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;

    // Bounds guard: if the launch grid over-covers the image (size not a
    // multiple of the block size), surplus threads must not touch memory.
    if (i >= size || j >= size)
        return;

    int index = i * size + j;

    // Declare and initialize the variables.
    double cx, cy;
    double z0, z1, z0_tmp, z0_2, z1_2;
    // Map the pixel to the complex plane: x in [-2, 1], y in [-1.5, 1.5].
    cx = -2.0 + (double)j / size * 3;
    cy = -1.5 + (double)i / size * 3;

    // Main escape-time loop: z <- z^2 + c until |z|^2 exceeds the
    // escape radius (100 here, which smooths the log-scaled rendering).
    z0 = z1 = 0.0;
    for (int n = 0; n < iterations; n++)
    {
        z0_2 = z0 * z0;
        z1_2 = z1 * z1;
        if (z0_2 + z1_2 <= 100)
        {
            // Need to update z0 and z1 in parallel, hence the temporary.
            z0_tmp = z0_2 - z1_2 + cx;
            z1 = 2 * z0 * z1 + cy;
            z0 = z0_tmp;
            col[index] = n;
        }
        else break;
    }
}
"""

# Compile the CUDA program and fetch a handle to the kernel.
prg = SourceModule(code)
mandelbrot = prg.get_function("mandelbrot")
# Block and grid geometry: threads are grouped into 10x10 blocks, and the
# grid is rounded UP (ceiling division) so the whole size x size image is
# covered. For 200 // 10 this equals the plain quotient; for sizes that do
# not divide evenly it avoids silently dropping the edge pixels.
# NOTE(review): with a rounded-up grid the kernel must bounds-check its
# pixel indices against `size` for non-divisible sizes.
block_size = 10
block = (block_size, block_size, 1)
grid = ((size + block_size - 1) // block_size,
        (size + block_size - 1) // block_size,
        1)

# Call the compiled function. Scalar arguments are passed as NumPy scalars
# so PyCUDA knows their exact C types (int here).
mandelbrot(np.int32(size), np.int32(iterations), col_gpu,
           block=block, grid=grid)

# Once the kernel has completed, copy the device buffer back into the
# host NumPy array `col`.
cuda.memcpy_dtoh(col, col_gpu)

# Display the fractal; the log scale brings out the escape-time gradient
# (every pixel ends up >= 1 with this domain and escape radius, so the
# log is finite everywhere).
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
plt.imshow(np.log(col), cmap=plt.cm.hot,);
plt.xticks([]);
plt.yticks([]);

# Benchmark: the %%timeit setup re-allocates and uploads the buffer; the
# timed body is the kernel launch plus the device-to-host copy.
get_ipython().run_cell_magic('timeit', 'col_gpu = cuda.mem_alloc(col.nbytes); cuda.memcpy_htod(col_gpu, col)', 'mandelbrot(np.int32(size), np.int32(iterations), col_gpu,\n block=block, grid=grid)\ncuda.memcpy_dtoh(col, col_gpu)\n')

# > You'll find all the explanations, figures, references, and much more
# > in the book (to be released later this summer).
#
# > [IPython Cookbook](http://ipython-books.github.io/), by
# > [Cyrille Rossant](http://cyrille.rossant.net), Packt Publishing,
# > 2014 (500 pages).