#!/usr/bin/env python
# coding: utf-8

# > This is one of the 100 recipes of the
# > [IPython Cookbook](http://ipython-books.github.io/), the definitive guide
# > to high-performance scientific computing and data science in Python.
#
# 5.7. Writing massively parallel code for NVIDIA graphics cards (GPUs)
# with CUDA.

# Let's import PyCUDA.
import pycuda.driver as cuda
import pycuda.autoinit  # noqa: F401 -- importing this initializes the CUDA context
from pycuda.compiler import SourceModule
import numpy as np

# Image side length in pixels, and the cap on the escape-time iteration count.
size = 200
iterations = 100

# Host-side buffer that will receive the per-pixel escape counts.
# np.empty is safe here: the kernel writes every pixel (the n = 0
# iteration always passes the escape test, since z starts at 0).
col = np.empty((size, size), dtype=np.int32)

# Allocate a device-side buffer of the same byte size.
col_gpu = cuda.mem_alloc(col.nbytes)

# The CUDA kernel, one thread per pixel. `mandelbrot` takes the figure
# size, the iteration cap, and a pointer to the device buffer; it stores
# in `col` the last iteration at which the pixel's orbit was still bounded.
code = """
__global__ void mandelbrot(int size, int iterations, int *col)
{
    // Row and column index of the pixel handled by this thread.
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;

    // Bounds guard: if the launch grid over-covers the image (size not a
    // multiple of the block size), surplus threads must not touch memory.
    if (i >= size || j >= size)
        return;

    int index = i * size + j;

    // Declare and initialize the variables.
    double cx, cy;
    double z0, z1, z0_tmp, z0_2, z1_2;
    // Map the pixel to the complex plane: x in [-2, 1], y in [-1.5, 1.5].
    cx = -2.0 + (double)j / size * 3;
    cy = -1.5 + (double)i / size * 3;

    // Main escape-time loop: z <- z^2 + c until |z|^2 exceeds the
    // escape radius (100 here, which smooths the log-scaled rendering).
    z0 = z1 = 0.0;
    for (int n = 0; n < iterations; n++)
    {
        z0_2 = z0 * z0;
        z1_2 = z1 * z1;
        if (z0_2 + z1_2 <= 100)
        {
            // Need to update z0 and z1 in parallel, hence the temporary.
            z0_tmp = z0_2 - z1_2 + cx;
            z1 = 2 * z0 * z1 + cy;
            z0 = z0_tmp;
            col[index] = n;
        }
        else break;
    }
}
"""

# Compile the CUDA program and fetch a handle to the kernel.
prg = SourceModule(code)
mandelbrot = prg.get_function("mandelbrot")
# Block and grid geometry: threads are grouped into 10x10 blocks, and the
# grid is rounded UP (ceiling division) so the whole size x size image is
# covered. For 200 // 10 this equals the plain quotient; for sizes that do
# not divide evenly it avoids silently dropping the edge pixels.
# NOTE(review): with a rounded-up grid the kernel must bounds-check its
# pixel indices against `size` for non-divisible sizes.
block_size = 10
block = (block_size, block_size, 1)
grid = ((size + block_size - 1) // block_size,
        (size + block_size - 1) // block_size,
        1)

# Call the compiled function. Scalar arguments are passed as NumPy scalars
# so PyCUDA knows their exact C types (int here).
mandelbrot(np.int32(size), np.int32(iterations), col_gpu,
           block=block, grid=grid)

# Once the kernel has completed, copy the device buffer back into the
# host NumPy array `col`.
cuda.memcpy_dtoh(col, col_gpu)

# Display the fractal; the log scale brings out the escape-time gradient
# (every pixel ends up >= 1 with this domain and escape radius, so the
# log is finite everywhere).
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
plt.imshow(np.log(col), cmap=plt.cm.hot,);
plt.xticks([]);
plt.yticks([]);

# Benchmark: the %%timeit setup re-allocates and uploads the buffer; the
# timed body is the kernel launch plus the device-to-host copy.
get_ipython().run_cell_magic('timeit', 'col_gpu = cuda.mem_alloc(col.nbytes); cuda.memcpy_htod(col_gpu, col)', 'mandelbrot(np.int32(size), np.int32(iterations), col_gpu,\n block=block, grid=grid)\ncuda.memcpy_dtoh(col, col_gpu)\n')

# > You'll find all the explanations, figures, references, and much more
# > in the book (to be released later this summer).
#
# > [IPython Cookbook](http://ipython-books.github.io/), by
# > [Cyrille Rossant](http://cyrille.rossant.net), Packt Publishing,
# > 2014 (500 pages).