%load https://raw.github.com/lebedov/scikits.cuda/master/demos/fft_demo.py
%%time
"""
Demonstrates how to use the PyCUDA interface to CUFFT to compute 1D FFTs.
"""
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import scikits.cuda.fft as cu_fft
print 'Testing fft/ifft..'
N = 4096*16000
x = np.asarray(np.random.rand(N), np.float32)
xf = np.fft.fft(x)
y = np.real(np.fft.ifft(xf))
x_gpu = gpuarray.to_gpu(x)
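# A real-to-complex FFT of N real samples yields only the N/2+1
# non-redundant coefficients (the remainder follow from Hermitian
# symmetry), hence the size of the output array: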
xf_gpu = gpuarray.empty(N/2+1, np.complex64)
plan_forward = cu_fft.Plan(x_gpu.shape, np.float32, np.complex64)
cu_fft.fft(x_gpu, xf_gpu, plan_forward)
y_gpu = gpuarray.empty_like(x_gpu)
plan_inverse = cu_fft.Plan(x_gpu.shape, np.complex64, np.float32)
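# CUFFT transforms are unnormalized, so pass scale=True to have
# scikits.cuda divide the inverse transform by N: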
cu_fft.ifft(xf_gpu, y_gpu, plan_inverse, True)
print 'Success status: ', np.allclose(y, y_gpu.get(), atol=1e-6)
print 'Testing in-place fft..'
x = np.asarray(np.random.rand(N)+1j*np.random.rand(N), np.complex64)
x_gpu = gpuarray.to_gpu(x)
plan = cu_fft.Plan(x_gpu.shape, np.complex64, np.complex64)
cu_fft.fft(x_gpu, x_gpu, plan)
cu_fft.ifft(x_gpu, x_gpu, plan, True)
print 'Success status: ', np.allclose(x, x_gpu.get(), atol=1e-6)
Testing fft/ifft..
Success status: True
Testing in-place fft..
Success status: True
CPU times: user 14.9 s, sys: 3.21 s, total: 18.2 s
Wall time: 18.4 s
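Because the forward transform above is real-to-complex, its output can also be checked directly against the non-redundant half of numpy's full spectrum. A minimal sketch, run in a fresh cell after the one above (the loose tolerance is an assumption, chosen to absorb single-precision error):

print 'R2C matches numpy: ', np.allclose(xf[:N/2+1], xf_gpu.get(), rtol=1e-3)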
%%time
cu_fft.fft(x_gpu, x_gpu, plan)
x_gpu.get()[1234]
CPU times: user 132 ms, sys: 148 ms, total: 280 ms
Wall time: 280 ms
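One caveat about this measurement: CUDA kernel launches are asynchronous, so the trailing x_gpu.get() is what forces %%time to wait for (and include) both the transform and the device-to-host copy. A sketch of timing the device work alone with PyCUDA's CUDA events (timings will differ from the figures above):

import pycuda.driver as drv
start, stop = drv.Event(), drv.Event()
start.record()
cu_fft.fft(x_gpu, x_gpu, plan)
stop.record()
stop.synchronize()
print 'FFT kernel time: %.1f ms' % start.time_till(stop)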
%%time
xfft = np.fft.fft(x)
CPU times: user 4.01 s, sys: 176 ms, total: 4.19 s
Wall time: 4.19 s
x.shape[0]/1024**2
62
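So the transform above ran over 62 Mi (about 65 million) complex64 samples, roughly 500 MB of data; even including the full copy back to the host, the GPU took ~0.28 s against ~4.2 s for numpy, around a 15x speedup on this machine.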
x_gpu.get()[1234]
(nan+nan*j)
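The nan deserves a note: every unnormalized forward pass multiplies the data's overall scale by roughly N, so re-running the timing cell above a few times is enough to push the float32 buffer past overflow, which is the likely culprit here. A quick check:

print 'max |x_gpu|: ', np.abs(x_gpu.get()).max()  # inf or nan signals overflow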
%load https://github.com/lebedov/scikits.cuda/raw/master/demos/dot_demo.py
%%time
"""
Demonstrates multiplication of two matrices on the GPU.
"""
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import pycuda.driver as drv
import numpy as np
import scikits.cuda.linalg as culinalg
import scikits.cuda.misc as cumisc
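# Initialize the backend libraries (CUBLAS, and CULA if available)
# that scikits.cuda.linalg relies on: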
culinalg.init()
# Double precision is only supported by devices with compute
# capability >= 1.3:
demo_types = [np.float32, np.complex64]
if cumisc.get_compute_capability(pycuda.autoinit.device) >= 1.3:
    demo_types.extend([np.float64, np.complex128])
for t in demo_types:
    print 'Testing matrix multiplication for type ' + str(np.dtype(t))
    if np.iscomplexobj(t()):
        a = np.asarray(np.random.rand(10, 5)+1j*np.random.rand(10, 5), t)
        b = np.asarray(np.random.rand(5, 5)+1j*np.random.rand(5, 5), t)
        c = np.asarray(np.random.rand(5, 5)+1j*np.random.rand(5, 5), t)
    else:
        a = np.asarray(np.random.rand(10, 5), t)
        b = np.asarray(np.random.rand(5, 5), t)
        c = np.asarray(np.random.rand(5, 5), t)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = gpuarray.to_gpu(c)
    temp_gpu = culinalg.dot(a_gpu, b_gpu)
    d_gpu = culinalg.dot(temp_gpu, c_gpu)
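    # Free the intermediate product explicitly rather than waiting
    # for the garbage collector to release the device memory: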
    temp_gpu.gpudata.free()
    del temp_gpu
    print 'Success status: ', np.allclose(np.dot(np.dot(a, b), c), d_gpu.get())
    print 'Testing vector multiplication for type ' + str(np.dtype(t))
    if np.iscomplexobj(t()):
        d = np.asarray(np.random.rand(5)+1j*np.random.rand(5), t)
        e = np.asarray(np.random.rand(5)+1j*np.random.rand(5), t)
    else:
        d = np.asarray(np.random.rand(5), t)
        e = np.asarray(np.random.rand(5), t)
    d_gpu = gpuarray.to_gpu(d)
    e_gpu = gpuarray.to_gpu(e)
    temp = culinalg.dot(d_gpu, e_gpu)
    print 'Success status: ', np.allclose(np.dot(d, e), temp)
Testing matrix multiplication for type float32
Success status: True
Testing vector multiplication for type float32
Success status: True
Testing matrix multiplication for type complex64
Success status: True
Testing vector multiplication for type complex64
Success status: True
Testing matrix multiplication for type float64
Success status: True
Testing vector multiplication for type float64
Success status: True
Testing matrix multiplication for type complex128
Success status: True
Testing vector multiplication for type complex128
Success status: True
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 3.7 ms
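The 10x5 operands above only exercise correctness, not speed. To see the CUBLAS-backed dot pay off, the same call can be timed on larger matrices; a sketch (the 2048x2048 size is an arbitrary choice, and the final get() forces the product back to the host so the measurement includes the transfer):

%%time
a = np.asarray(np.random.rand(2048, 2048), np.float32)
a_gpu = gpuarray.to_gpu(a)
d_gpu = culinalg.dot(a_gpu, a_gpu)
d_gpu.get()[0, 0]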