cd ./PyCUDA/ import os os.listdir('.') %load pycuda_montecarlo_pi.py import pycuda.driver as cuda import pycuda.autoinit from pycuda.compiler import SourceModule import numpy import time C_cuda_code = SourceModule(""" __global__ void MC_Pi(float *U1, float *U2, float *counter) { const int i = blockIdx.x*blockDim.x + threadIdx.x; counter[i] = 0; if( (pow(U1[i],2) + pow(U2[i],2)) <= 1.0 ) counter[i] = 1; } """) n = 16*1024*1204 U1 = numpy.random.rand(n).astype('f') U2 = numpy.random.rand(n).astype('f') counter = numpy.zeros(n).astype('f') start_time = time.time() func = C_cuda_code.get_function("MC_Pi") size_block = 1024 size_grid = int((n-1)/size_block + 1) func(cuda.In(U1), cuda.In(U2), cuda.Out(counter), block=(size_block,1,1), grid=(size_grid,1)) counter = numpy.sum(counter) print "PI_gpu = ", 4.0*counter/n print "Time elapsed GPU: ", time.time() - start_time, "s" start_time = time.time() counter_cpu = numpy.sum( (numpy.power(U1,2) + numpy.power(U2,2)) <= 1.0 ) print "PI_cpu = ", 4.0*counter_cpu/n print "Time elapsed CPU: ", time.time() - start_time, "s" %load pycuda_montecarlo_pi_GPUArrays.py import pycuda.driver as cuda import pycuda.autoinit import numpy import time import pycuda.gpuarray as gpuarray import pycuda.curandom as curandom n = 16*1024*1204 U1 = curandom.rand(n) U2 = curandom.rand(n) counter = gpuarray.zeros(n, dtype='f') start_time = time.time() counter = gpuarray.sum( (U1*U1 + U2*U2) <= 1.0 ) print "PI_gpu = ", 4.0*counter/n print "Time elapsed GPUArrays: ", time.time() - start_time, "s" # Sequential part U1 = numpy.random.rand(n).astype('f') U2 = numpy.random.rand(n).astype('f') start_time = time.time() counter_cpu = numpy.sum( (numpy.power(U1,2) + numpy.power(U2,2)) <= 1.0 ) print "PI_cpu = ", 4.0*counter_cpu/n print "Time elapsed CPU: ", time.time() - start_time, "s" %load pycuda_sumarrays.py import pycuda.driver as cuda import pycuda.autoinit from pycuda.compiler import SourceModule import numpy import time C_cuda_code = SourceModule(""" __global__ void sum_arrays(float *a, float *b, float *c) { const int i = blockIdx.x*blockDim.x + threadIdx.x; c[i] = a[i] + b[i]; } """) n = 16*1024*1024 a = numpy.random.randn(n).astype('f') b = numpy.random.randn(n).astype('f') c = numpy.zeros(n).astype('f') a_gpu = cuda.mem_alloc(a.nbytes) b_gpu = cuda.mem_alloc(b.nbytes) c_gpu = cuda.mem_alloc(c.nbytes) start_time = time.time() cuda.memcpy_htod(a_gpu, a) cuda.memcpy_htod(b_gpu, b) func = C_cuda_code.get_function("sum_arrays") size_block = 1024 size_grid = int((n-1)/size_block + 1) func(a_gpu, b_gpu, c_gpu, block=(size_block,1,1), grid=(size_grid,1)) cuda.memcpy_dtoh(c, c_gpu) print "Time elapsed GPU: ", time.time() - start_time, "s" start_time = time.time() c_cpu = a + b print "Time elapsed CPU: ", time.time() - start_time, "s" print numpy.alltrue(abs(c-c_cpu) < 1e-5) %load_ext version_information %version_information numpy, pycuda from IPython.core.display import HTML def css_styling(): styles = open("./styles/custom.css", "r").read() return HTML(styles) css_styling()