The purpose of this code is to convert a color image to greyscale using both the GPU and the CPU. We compare the performance of each method using the system timer.
import matplotlib.pyplot
import numpy
import PIL
import PIL.Image
# Load the source image and build an RGBA copy of it: split out the R, G, B
# channels, synthesize a fully-opaque alpha channel, and re-stack the four.
image = PIL.Image.open("CinqueTerre.jpg")
image_array_rgb = numpy.array(image)
r, g, b = numpy.split(image_array_rgb, 3, axis=2)
# Alpha channel: same shape and dtype as r (uint8), every pixel fully opaque.
a = numpy.ones_like(r) * 255
# .copy() forces a contiguous buffer, safe to hand to the GPU upload later.
image_array_rgba = numpy.concatenate((r, g, b, a), axis=2).copy()
# Was pylab's bare figsize(); use plain matplotlib so this runs outside %pylab.
matplotlib.pyplot.figure(figsize=(6, 4))
matplotlib.pyplot.imshow(image_array_rgba)
matplotlib.pyplot.title("image_array_rgba")
Note that we convert each color pixel $p_{\text{color}} = (r,g,b)$ into a greyscale pixel $p_{\text{greyscale}}$ by averaging the color channels:
$$p_{\text{greyscale}} = \frac{r+g+b}{3}$$
image_array_greyscale_cpu_result = \
(image_array_rgba[:,:,0].astype(numpy.uint32) +
 image_array_rgba[:,:,1].astype(numpy.uint32) +
 image_array_rgba[:,:,2].astype(numpy.uint32)) / 3
# The uint32 cast avoids uint8 overflow when summing three channels; the
# integer division floors, matching the GPU kernel's integer "/ 3" below.
# Was pylab's bare figsize(); use plain matplotlib so this runs outside %pylab.
matplotlib.pyplot.figure(figsize=(6, 4))
matplotlib.pyplot.imshow(image_array_greyscale_cpu_result, cmap="gray", vmin=0, vmax=255)
matplotlib.pyplot.title("image_array_greyscale_cpu_result")
# GPU path. NOTE: importing pycuda.autoinit has a side effect -- it creates
# and activates a CUDA context on the default device.
import pycuda.autoinit
import pycuda.driver
import pycuda.compiler
# Host-side destination buffer for the kernel output: same height/width as
# the image, dtype inherited from the alpha channel `a` (uint8); squeeze()
# drops the trailing length-1 channel axis so the result is 2-D (H, W).
image_array_greyscale_gpu_result = numpy.zeros_like(a).squeeze()
# CUDA kernel source: one thread per pixel. Each thread computes its 2-D
# pixel coordinate from its block/thread indices, bounds-checks against the
# image extents, averages the R, G, B bytes of its uchar4 pixel, and writes
# the single greyscale byte to the output array.
_color_to_greyscale_source = """
__global__ void color_to_greyscale(
unsigned char* d_greyscale,
uchar4* d_color,
int num_pixels_y,
int num_pixels_x )
{
int ny = num_pixels_y;
int nx = num_pixels_x;
int2 image_index_2d = make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y );
int image_index_1d = ( nx * image_index_2d.y ) + image_index_2d.x;
if ( image_index_2d.x < nx && image_index_2d.y < ny )
{
uchar4 color = d_color[ image_index_1d ];
unsigned int greyscale = ( color.x + color.y + color.z ) / 3;
d_greyscale[ image_index_1d ] = (unsigned char)greyscale;
}
}
"""
source_module = pycuda.compiler.SourceModule(_color_to_greyscale_source)
color_to_greyscale_function = source_module.get_function("color_to_greyscale")

# CUDA timing events -- currently unused; the benchmarks below use the
# system timer instead. Kept for interface compatibility.
start_timer = pycuda.driver.Event()
end_timer = pycuda.driver.Event()

num_pixels_y = numpy.int32(image_array_rgba.shape[0])
num_pixels_x = numpy.int32(image_array_rgba.shape[1])

# 32x16 threads per block; round the grid size up (integer ceil-division)
# so every pixel is covered. The original used bare `ceil`, a pylab-only
# name that raises NameError in a plain script.
color_to_greyscale_function_block = (32, 16, 1)
num_blocks_y = int((num_pixels_y + color_to_greyscale_function_block[1] - 1) // color_to_greyscale_function_block[1])
num_blocks_x = int((num_pixels_x + color_to_greyscale_function_block[0] - 1) // color_to_greyscale_function_block[0])
color_to_greyscale_function_grid = (num_blocks_x, num_blocks_y)

# Allocate device buffers sized from the host arrays, then upload the RGBA
# image and the zeroed greyscale output.
image_array_rgba_device = pycuda.driver.mem_alloc(image_array_rgba.nbytes)
image_array_greyscale_device = pycuda.driver.mem_alloc(image_array_greyscale_gpu_result.nbytes)
pycuda.driver.memcpy_htod(image_array_rgba_device, image_array_rgba)
pycuda.driver.memcpy_htod(image_array_greyscale_device, image_array_greyscale_gpu_result)
# Launch the kernel (one thread per pixel), then copy the greyscale result
# back to the host buffer for display.
color_to_greyscale_function(
    image_array_greyscale_device,
    image_array_rgba_device,
    num_pixels_y,
    num_pixels_x,
    block=color_to_greyscale_function_block,
    grid=color_to_greyscale_function_grid)
pycuda.driver.memcpy_dtoh(image_array_greyscale_gpu_result, image_array_greyscale_device)

# Was pylab's bare figsize(); use plain matplotlib so this runs outside %pylab.
matplotlib.pyplot.figure(figsize=(6, 4))
matplotlib.pyplot.imshow(image_array_greyscale_gpu_result, cmap="gray", vmin=0, vmax=255)
matplotlib.pyplot.title("image_array_greyscale_gpu_result")
diff = numpy.abs(image_array_greyscale_cpu_result.astype(float32) - image_array_greyscale_gpu_result.astype(float32))
max_diff = numpy.ones_like(diff, dtype=numpy.float32) * 255
print \
"Difference between GPU and CPU results as a percentage of the maximum possible difference (should be 0%%): %0.2f%%" % \
(100 * (numpy.linalg.norm(diff) / numpy.linalg.norm(max_diff)))
print
figsize(20,4)
matplotlib.pyplot.subplot(131)
matplotlib.pyplot.imshow(image_array_greyscale_cpu_result, cmap="gray", vmin=0, vmax=255);
matplotlib.pyplot.title("image_array_greyscale_cpu_result")
matplotlib.pyplot.subplot(132)
matplotlib.pyplot.imshow(image_array_greyscale_gpu_result, cmap="gray", vmin=0, vmax=255);
matplotlib.pyplot.title("image_array_greyscale_gpu_result")
matplotlib.pyplot.subplot(133)
matplotlib.pyplot.imshow(diff, cmap="gray");
matplotlib.pyplot.title("diff")
matplotlib.pyplot.colorbar();
Difference between GPU and CPU results as a percentage of the maximum possible difference (should be 0%): 0.00%
import sys
import time
if sys.platform == "win32":
print "Using time.clock for benchmarking...\n"
system_timer_function = time.clock
else:
print "Using time.time for benchmarking...\n"
system_timer_function = time.time
num_timing_iterations = 100
print "num_timing_iterations = %d" % num_timing_iterations
Using time.time for benchmarking... num_timing_iterations = 100
total_time_seconds = 0
image_array_rgba_uint32 = image_array_rgba.astype(numpy.uint32)
for i in range(num_timing_iterations):
start_time_seconds = system_timer_function()
image_array_greyscale_cpu_result = \
(image_array_rgba_uint32[:,:,0] + image_array_rgba_uint32[:,:,1] + image_array_rgba_uint32[:,:,2]) / 3
end_time_seconds = system_timer_function()
elapsed_time_seconds = (end_time_seconds - start_time_seconds)
total_time_seconds = total_time_seconds + elapsed_time_seconds
average_time_seconds_cpu = total_time_seconds / num_timing_iterations
print "Using system timer for benchmarking (see above)..."
print "Average time elapsed executing color to greyscale conversion on the CPU over %d runs: %f s" % (num_timing_iterations,average_time_seconds_cpu)
print
figsize(6,4)
matplotlib.pyplot.imshow(image_array_greyscale_cpu_result, cmap="gray", vmin=0, vmax=255);
matplotlib.pyplot.title("image_array_greyscale_cpu_result");
Using system timer for benchmarking (see above)... Average time elapsed executing color to greyscale conversion on the CPU over 100 runs: 0.263890 s
total_time_seconds = 0
image_array_greyscale_gpu_result = numpy.zeros_like(a).squeeze()
for i in range(num_timing_iterations):
pycuda.driver.memcpy_htod(image_array_greyscale_device, image_array_greyscale_gpu_result)
pycuda.driver.Context.synchronize()
start_time_seconds = system_timer_function()
color_to_greyscale_function(
image_array_greyscale_device,
image_array_rgba_device,
num_pixels_y,
num_pixels_x,
block=color_to_greyscale_function_block,
grid=color_to_greyscale_function_grid)
pycuda.driver.Context.synchronize()
end_time_seconds = system_timer_function()
elapsed_time_seconds = end_time_seconds - start_time_seconds
total_time_seconds = total_time_seconds + elapsed_time_seconds
pycuda.driver.memcpy_dtoh(image_array_greyscale_gpu_result, image_array_greyscale_device)
average_time_seconds_gpu = total_time_seconds / num_timing_iterations
print "Using system timer for benchmarking (see above)..."
print "Average time elapsed executing color_to_greyscale GPU kernel over %d runs: %f s" % (num_timing_iterations,average_time_seconds_gpu)
print
figsize(6,4)
matplotlib.pyplot.imshow(image_array_greyscale_gpu_result, cmap="gray", vmin=0, vmax=255);
matplotlib.pyplot.title("image_array_greyscale_gpu_result");
Using system timer for benchmarking (see above)... Average time elapsed executing color_to_greyscale GPU kernel over 100 runs: 0.028072 s
gpu_speedup = average_time_seconds_cpu / average_time_seconds_gpu
print "Average CPU time: %f s" % average_time_seconds_cpu
print "Average GPU time: %f s" % average_time_seconds_gpu
print "GPU speedup: %f x" % gpu_speedup
Average CPU time: 0.263890 s Average GPU time: 0.028072 s GPU speedup: 9.400502 x