# PyCUDA walkthrough: four small, independent demos that run top to bottom.
#   1. Doubling an array with a hand-written kernel (explicit and InOut copies).
#   2. The same doubling expressed with pycuda.gpuarray.
#   3. Calling printf from device code.
#   4. Combining blockIdx/blockDim/threadIdx into a global index.
import numpy as np

import pycuda.autoinit  # noqa: F401  (imported for its side effect: creates the CUDA context)
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule

# ---------------------------------------------------------------------------
# 1. Double every element with a custom kernel.
# ---------------------------------------------------------------------------
# 16 float32 values: 1.0, 2.0, ..., 16.0
a = np.linspace(1, 16, 16).astype(np.float32)

# Allocate device memory of the same size and copy the host array into it.
a_gpu = cuda.mem_alloc(a.nbytes)
cuda.memcpy_htod(a_gpu, a)

mod = SourceModule("""
__global__ void dubluj(float *a)
{
    int idx = threadIdx.x;
    a[idx] *= 2;
}
""")

func = mod.get_function("dubluj")

# One block of 16 threads: threadIdx.x covers indices 0..15, one per element.
func(a_gpu, block=(16, 1, 1))

# The host array has not been copied back yet, so this prints the old values.
print(a)
cuda.memcpy_dtoh(a, a_gpu)
print(a)

# cuda.InOut copies `a` to the device before the launch and back afterwards.
# NOTE(review): with block=(4, 4, 1) the kernel still indexes by threadIdx.x
# only, so just a[0..3] are touched, each by the four threads that share that
# x coordinate (unsynchronised read-modify-write). Confirm this is the
# intended demonstration rather than a leftover from block=(16, 1, 1).
func(cuda.InOut(a), block=(4, 4, 1))
print(a)

# ---------------------------------------------------------------------------
# 2. The same doubling via gpuarray, no explicit kernel needed.
# ---------------------------------------------------------------------------
a = np.linspace(1, 16, 16).astype(np.float32)
# Fixed: the original referenced `numpy.float32`, but only `np` is imported.
a_gpu = gpuarray.to_gpu(a.astype(np.float32))
a_doubled = (a_gpu * 2).get()
print(a_gpu)
print(a_doubled)

# ---------------------------------------------------------------------------
# 3. printf from device code.
# ---------------------------------------------------------------------------
# Fixed: the original had a bare `#include` with no header name; device-side
# printf needs <stdio.h>.
mod = SourceModule("""
#include <stdio.h>

__global__ void printf_test()
{
    printf("GONZO %d.%d.%d\\n", threadIdx.x, threadIdx.y, threadIdx.z);
}
""")

func = mod.get_function("printf_test")
# 2 x 2 x 1 = 4 threads, each printing its own thread coordinates.
func(block=(2, 2, 1))

# ---------------------------------------------------------------------------
# 4. Global index from block and thread coordinates.
# ---------------------------------------------------------------------------
mod = SourceModule("""
__global__ void kernel(float *a)
{
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (threadIdx.x == 0)
        a[idx] = 1.0f;
}
""")

a = np.zeros(10).astype(np.float32)
func = mod.get_function("kernel")

print(np.linspace(0, 9, 10))
print("----------------")
print(a)
# 2 blocks of 5 threads cover indices 0..9; only the first thread of each
# block writes, so a[0] and a[5] become 1.0.
func(cuda.InOut(a), block=(5, 1, 1), grid=(2, 1, 1))
print(a)