import PIL import PIL.Image image = PIL.Image.open("RedEye.jpg") image_template = PIL.Image.open("RedEyeTemplate.jpg") image_array_rgb = numpy.array(image) image_template_array_rgb = numpy.array(image_template) r,g,b = numpy.split(image_array_rgb, 3, axis=2) r_template,g_template,b_template = numpy.split(image_template_array_rgb, 3, axis=2) a = numpy.ones_like(r) * 255 a_template = numpy.ones_like(r_template) * 255 rgba = numpy.concatenate((r,g,b,a), axis=2).copy() rgba_template = numpy.concatenate((r_template,g_template,b_template,a_template), axis=2).copy() r = r.squeeze().copy() g = g.squeeze().copy() b = b.squeeze().copy() a = a.squeeze().copy() r_template = r_template.squeeze().copy() g_template = g_template.squeeze().copy() b_template = b_template.squeeze().copy() figsize(9,4) matplotlib.pyplot.subplot(121) matplotlib.pyplot.imshow(rgba); matplotlib.pyplot.title("rgba"); matplotlib.pyplot.subplot(122) matplotlib.pyplot.imshow(rgba_template, interpolation="nearest"); matplotlib.pyplot.title("rgba_template"); import cv2 r_response_opencv = cv2.matchTemplate(r.astype(numpy.float32), r_template.astype(numpy.float32), cv2.TM_CCOEFF_NORMED) g_response_opencv = cv2.matchTemplate(g.astype(numpy.float32), g_template.astype(numpy.float32), cv2.TM_CCOEFF_NORMED) b_response_opencv = cv2.matchTemplate(b.astype(numpy.float32), b_template.astype(numpy.float32), cv2.TM_CCOEFF_NORMED) response_opencv = r_response_opencv * g_response_opencv * b_response_opencv figsize(20,4) matplotlib.pyplot.subplot(141); matplotlib.pyplot.imshow(r_response_opencv, cmap="gray"); matplotlib.pyplot.title("r_response_opencv"); matplotlib.pyplot.colorbar(); matplotlib.pyplot.subplot(142); matplotlib.pyplot.imshow(g_response_opencv, cmap="gray"); matplotlib.pyplot.title("g_response_opencv"); matplotlib.pyplot.colorbar(); matplotlib.pyplot.subplot(143); matplotlib.pyplot.imshow(b_response_opencv, cmap="gray"); matplotlib.pyplot.title("b_response_opencv"); matplotlib.pyplot.colorbar(); matplotlib.pyplot.subplot(144); matplotlib.pyplot.imshow(response_opencv, cmap="gray"); matplotlib.pyplot.title("response_opencv"); matplotlib.pyplot.colorbar(); figsize(4,4) matplotlib.pyplot.imshow(response_opencv > 0.15, cmap="gray"); matplotlib.pyplot.title("response_opencv > 0.15"); matplotlib.pyplot.colorbar(); import pycuda.autoinit import pycuda.driver import pycuda.compiler import radixsort num_pixels_y = numpy.int32(r.shape[0]) num_pixels_x = numpy.int32(r.shape[1]) r_response_gpu_naive = numpy.zeros_like(r, dtype=numpy.float32) g_response_gpu_naive = numpy.zeros_like(g, dtype=numpy.float32) b_response_gpu_naive = numpy.zeros_like(b, dtype=numpy.float32) response_gpu_naive = numpy.zeros_like(r, dtype=numpy.float32) r_response_gpu_shared_memory = numpy.zeros_like(r, dtype=numpy.float32) g_response_gpu_shared_memory = numpy.zeros_like(g, dtype=numpy.float32) b_response_gpu_shared_memory = numpy.zeros_like(b, dtype=numpy.float32) response_gpu_shared_memory = numpy.zeros_like(r, dtype=numpy.float32) positive_response_gpu = numpy.zeros_like(r, dtype=numpy.float32) coordinates_gpu = numpy.zeros_like(r, dtype=numpy.uint32) sorted_positive_response_gpu = numpy.zeros_like(r, dtype=numpy.float32) sorted_coordinates_gpu = numpy.zeros_like(r, dtype=numpy.uint32) r_output_gpu = numpy.zeros_like(r) r_device = pycuda.driver.mem_alloc(r.nbytes) g_device = pycuda.driver.mem_alloc(g.nbytes) b_device = pycuda.driver.mem_alloc(b.nbytes) r_template_device = pycuda.driver.mem_alloc(r_template.nbytes) g_template_device = pycuda.driver.mem_alloc(g_template.nbytes) b_template_device = pycuda.driver.mem_alloc(b_template.nbytes) r_response_device = pycuda.driver.mem_alloc(r_response_gpu_naive.nbytes) g_response_device = pycuda.driver.mem_alloc(g_response_gpu_naive.nbytes) b_response_device = pycuda.driver.mem_alloc(b_response_gpu_naive.nbytes) response_device = pycuda.driver.mem_alloc(response_gpu_naive.nbytes) positive_response_device = pycuda.driver.mem_alloc(positive_response_gpu.nbytes) coordinates_device = pycuda.driver.mem_alloc(coordinates_gpu.nbytes) sorted_positive_response_device = pycuda.driver.mem_alloc(sorted_positive_response_gpu.nbytes) sorted_coordinates_device = pycuda.driver.mem_alloc(sorted_coordinates_gpu.nbytes) r_output_device = pycuda.driver.mem_alloc(r.nbytes) radix_sort_manager = radixsort.RadixSortManager(512*512) import pycuda.autoinit import pycuda.driver import pycuda.compiler source_module = pycuda.compiler.SourceModule \ ( """ __global__ void naive_normalized_cross_correlation( float* d_response, unsigned char* d_original, unsigned char* d_template, int num_pixels_y, int num_pixels_x, int template_half_height, int template_height, int template_half_width, int template_width, int template_size, float template_mean ) { int ny = num_pixels_y; int nx = num_pixels_x; int knx = template_width; int2 image_index_2d = make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y ); int image_index_1d = ( nx * image_index_2d.y ) + image_index_2d.x; if ( image_index_2d.x < nx && image_index_2d.y < ny ) { // // compute image mean // float image_sum = 0.0f; for ( int y = -template_half_height; y <= template_half_height; y++ ) { for ( int x = -template_half_width; x <= template_half_width; x++ ) { int2 image_offset_index_2d = make_int2( image_index_2d.x + x, image_index_2d.y + y ); int2 image_offset_index_2d_clamped = make_int2( min( nx - 1, max( 0, image_offset_index_2d.x ) ), min( ny - 1, max( 0, image_offset_index_2d.y ) ) ); int image_offset_index_1d_clamped = ( nx * image_offset_index_2d_clamped.y ) + image_offset_index_2d_clamped.x; unsigned char image_offset_value = d_original[ image_offset_index_1d_clamped ]; image_sum += (float)image_offset_value; } } float image_mean = image_sum / (float)template_size; // // compute sums // float sum_of_image_template_diff_products = 0.0f; float sum_of_squared_image_diffs = 0.0f; float sum_of_squared_template_diffs = 0.0f; for ( int y = -template_half_height; y <= template_half_height; y++ ) { for ( int x = -template_half_width; x <= template_half_width; x++ ) { int2 image_offset_index_2d = make_int2( image_index_2d.x + x, image_index_2d.y + y ); int2 image_offset_index_2d_clamped = make_int2( min( nx - 1, max( 0, image_offset_index_2d.x ) ), min( ny - 1, max( 0, image_offset_index_2d.y ) ) ); int image_offset_index_1d_clamped = ( nx * image_offset_index_2d_clamped.y ) + image_offset_index_2d_clamped.x; unsigned char image_offset_value = d_original[ image_offset_index_1d_clamped ]; float image_diff = (float)image_offset_value - image_mean; int2 template_index_2d = make_int2( x + template_half_width, y + template_half_height ); int template_index_1d = ( knx * template_index_2d.y ) + template_index_2d.x; unsigned char template_value = d_template[ template_index_1d ]; float template_diff = template_value - template_mean; float image_template_diff_product = image_offset_value * template_diff; float squared_image_diff = image_diff * image_diff; float squared_template_diff = template_diff * template_diff; sum_of_image_template_diff_products += image_template_diff_product; sum_of_squared_image_diffs += squared_image_diff; sum_of_squared_template_diffs += squared_template_diff; } } // // compute final result // float result_value = 0.0f; if ( sum_of_squared_image_diffs != 0 && sum_of_squared_template_diffs != 0 ) { result_value = sum_of_image_template_diff_products / sqrt( sum_of_squared_image_diffs * sum_of_squared_template_diffs ); } d_response[ image_index_1d ] = result_value; } } __global__ void combined_response( float* d_response, float* d_r_response, float* d_g_response, float* d_b_response, int num_pixels_y, int num_pixels_x ) { int ny = num_pixels_y; int nx = num_pixels_x; int2 image_index_2d = make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y ); int image_index_1d = ( nx * image_index_2d.y ) + image_index_2d.x; if ( image_index_2d.x < nx && image_index_2d.y < ny ) { d_response[ image_index_1d ] = d_r_response[ image_index_1d ] * d_g_response[ image_index_1d ] * d_b_response[ image_index_1d ]; } } """ ) naive_normalized_cross_correlation_function = source_module.get_function("naive_normalized_cross_correlation") template_height = numpy.int32(r_template.shape[0]) template_width = numpy.int32(r_template.shape[1]) template_half_height = numpy.int32((template_height - 1) / 2) template_half_width = numpy.int32((template_width - 1) / 2) template_size = numpy.int32(template_height * template_width) template_mean_r = numpy.float32(numpy.mean(r_template)) template_mean_g = numpy.float32(numpy.mean(g_template)) template_mean_b = numpy.float32(numpy.mean(b_template)) naive_normalized_cross_correlation_function_block = (16,16,1) num_blocks_y = int(ceil(float(num_pixels_y) / float(naive_normalized_cross_correlation_function_block[1]))) num_blocks_x = int(ceil(float(num_pixels_x) / float(naive_normalized_cross_correlation_function_block[0]))) naive_normalized_cross_correlation_function_grid = (num_blocks_x, num_blocks_y) combined_response_function = source_module.get_function("combined_response") combined_response_function_block = (32,16,1) num_blocks_y = int(ceil(float(num_pixels_y) / float(combined_response_function_block[1]))) num_blocks_x = int(ceil(float(num_pixels_x) / float(combined_response_function_block[0]))) combined_response_function_grid = (num_blocks_x, num_blocks_y) pycuda.driver.memcpy_htod(r_device, r) pycuda.driver.memcpy_htod(g_device, g) pycuda.driver.memcpy_htod(b_device, b) pycuda.driver.memcpy_htod(r_response_device, r_response_gpu_naive) pycuda.driver.memcpy_htod(g_response_device, g_response_gpu_naive) pycuda.driver.memcpy_htod(b_response_device, b_response_gpu_naive) pycuda.driver.memcpy_htod(response_device, response_gpu_naive) pycuda.driver.memcpy_htod(r_template_device, r_template) pycuda.driver.memcpy_htod(g_template_device, g_template) pycuda.driver.memcpy_htod(b_template_device, b_template) pycuda.driver.Context.synchronize() naive_normalized_cross_correlation_function( r_response_device, r_device, r_template_device, num_pixels_y, num_pixels_x, template_half_height, template_height, template_half_width, template_width, template_size, template_mean_r, block=naive_normalized_cross_correlation_function_block, grid=naive_normalized_cross_correlation_function_grid) pycuda.driver.Context.synchronize() naive_normalized_cross_correlation_function( g_response_device, g_device, g_template_device, num_pixels_y, num_pixels_x, template_half_height, template_height, template_half_width, template_width, template_size, template_mean_g, block=naive_normalized_cross_correlation_function_block, grid=naive_normalized_cross_correlation_function_grid) pycuda.driver.Context.synchronize() naive_normalized_cross_correlation_function( b_response_device, b_device, b_template_device, num_pixels_y, num_pixels_x, template_half_height, template_height, template_half_width, template_width, template_size, template_mean_b, block=naive_normalized_cross_correlation_function_block, grid=naive_normalized_cross_correlation_function_grid) pycuda.driver.Context.synchronize() combined_response_function( response_device, r_response_device, g_response_device, b_response_device, num_pixels_y, num_pixels_x, block=combined_response_function_block, grid=combined_response_function_grid) pycuda.driver.Context.synchronize() pycuda.driver.memcpy_dtoh(r_response_gpu_naive, r_response_device) pycuda.driver.memcpy_dtoh(g_response_gpu_naive, g_response_device) pycuda.driver.memcpy_dtoh(b_response_gpu_naive, b_response_device) pycuda.driver.memcpy_dtoh(response_gpu_naive, response_device) figsize(20,4) matplotlib.pyplot.subplot(141); matplotlib.pyplot.imshow(r_response_gpu_naive, cmap="gray"); matplotlib.pyplot.title("r_response_gpu_naive"); matplotlib.pyplot.colorbar(); matplotlib.pyplot.subplot(142); matplotlib.pyplot.imshow(g_response_gpu_naive, cmap="gray"); matplotlib.pyplot.title("g_response_gpu_naive"); matplotlib.pyplot.colorbar(); matplotlib.pyplot.subplot(143); matplotlib.pyplot.imshow(b_response_gpu_naive, cmap="gray"); matplotlib.pyplot.title("b_response_gpu_naive"); matplotlib.pyplot.colorbar(); matplotlib.pyplot.subplot(144); matplotlib.pyplot.imshow(response_gpu_naive, cmap="gray"); matplotlib.pyplot.title("response_gpu_naive"); matplotlib.pyplot.colorbar(); import pycuda.autoinit import pycuda.driver import pycuda.compiler source_module = pycuda.compiler.SourceModule \ ( """ #define BLOCK_SIZE_Y 16 #define BLOCK_SIZE_X 16 #define TEMPLATE_HALF_HEIGHT 16 #define TEMPLATE_HALF_WIDTH 16 #define SHARED_MEMORY_SIZE_Y BLOCK_SIZE_Y + ( 2 * TEMPLATE_HALF_HEIGHT ) #define SHARED_MEMORY_SIZE_X BLOCK_SIZE_X + ( 2 * TEMPLATE_HALF_WIDTH ) + 1 #define SHARED_MEMORY_OFFSET_Y TEMPLATE_HALF_HEIGHT #define SHARED_MEMORY_OFFSET_X TEMPLATE_HALF_WIDTH __global__ void shared_memory_normalized_cross_correlation( float* d_response, unsigned char* d_original, unsigned char* d_template, int num_pixels_y, int num_pixels_x, int template_half_height, int template_height, int template_half_width, int template_width, int template_size, float template_mean ) { __shared__ unsigned char s_original[ SHARED_MEMORY_SIZE_Y ][ SHARED_MEMORY_SIZE_X ]; int ny = num_pixels_y; int nx = num_pixels_x; int knx = template_width; int2 image_index_2d_global = make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y ); int2 image_index_2d_global_clamped = make_int2( min( nx - 1, max( 0, image_index_2d_global.x ) ), min( ny - 1, max( 0, image_index_2d_global.y ) ) ); int image_index_1d_global_clamped = ( nx * image_index_2d_global_clamped.y ) + image_index_2d_global_clamped.x; int2 image_index_2d_shared_memory = make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y ); // // load center of shared memory // s_original[ image_index_2d_shared_memory.y ][ image_index_2d_shared_memory.x ] = d_original[ image_index_1d_global_clamped ]; // // load y+1 halo into shared memory // if ( threadIdx.y < TEMPLATE_HALF_HEIGHT ) { int2 image_halo_index_2d_global = make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( ( blockIdx.y + 1 ) * blockDim.y ) + threadIdx.y ); int2 image_halo_index_2d_global_clamped = make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) ); int image_halo_index_1d_global_clamped = ( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x; int2 image_halo_index_2d_shared_memory = make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y + BLOCK_SIZE_Y ); s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] = d_original[ image_halo_index_1d_global_clamped ]; } // // load y-1 halo into shared memory // if ( threadIdx.y >= BLOCK_SIZE_Y - TEMPLATE_HALF_HEIGHT ) { int2 image_halo_index_2d_global = make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( ( blockIdx.y - 1 ) * blockDim.y ) + threadIdx.y ); int2 image_halo_index_2d_global_clamped = make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) ); int image_halo_index_1d_global_clamped = ( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x; int2 image_halo_index_2d_shared_memory = make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y - BLOCK_SIZE_Y ); s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] = d_original[ image_halo_index_1d_global_clamped ]; } // // load x+1 halo into shared memory // if ( threadIdx.x < TEMPLATE_HALF_WIDTH ) { int2 image_halo_index_2d_global = make_int2( ( ( blockIdx.x + 1 ) * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y ); int2 image_halo_index_2d_global_clamped = make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) ); int image_halo_index_1d_global_clamped = ( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x; int2 image_halo_index_2d_shared_memory = make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X + BLOCK_SIZE_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y ); s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] = d_original[ image_halo_index_1d_global_clamped ]; } // // load x-1 halo into shared memory // if ( threadIdx.x >= BLOCK_SIZE_X - TEMPLATE_HALF_WIDTH ) { int2 image_halo_index_2d_global = make_int2( ( ( blockIdx.x - 1 ) * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y ); int2 image_halo_index_2d_global_clamped = make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) ); int image_halo_index_1d_global_clamped = ( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x; int2 image_halo_index_2d_shared_memory = make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X - BLOCK_SIZE_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y ); s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] = d_original[ image_halo_index_1d_global_clamped ]; } // // load x+1,y+1 halo into shared memory // if ( threadIdx.x < TEMPLATE_HALF_WIDTH && threadIdx.y < TEMPLATE_HALF_HEIGHT ) { int2 image_halo_index_2d_global = make_int2( ( ( blockIdx.x + 1 ) * blockDim.x ) + threadIdx.x, ( ( blockIdx.y + 1 ) * blockDim.y ) + threadIdx.y ); int2 image_halo_index_2d_global_clamped = make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) ); int image_halo_index_1d_global_clamped = ( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x; int2 image_halo_index_2d_shared_memory = make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X + BLOCK_SIZE_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y + BLOCK_SIZE_Y ); s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] = d_original[ image_halo_index_1d_global_clamped ]; } // // load x+1,y-1 halo into shared memory // if ( threadIdx.x < TEMPLATE_HALF_WIDTH && threadIdx.y >= BLOCK_SIZE_Y - TEMPLATE_HALF_HEIGHT ) { int2 image_halo_index_2d_global = make_int2( ( ( blockIdx.x + 1 ) * blockDim.x ) + threadIdx.x, ( ( blockIdx.y - 1 ) * blockDim.y ) + threadIdx.y ); int2 image_halo_index_2d_global_clamped = make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) ); int image_halo_index_1d_global_clamped = ( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x; int2 image_halo_index_2d_shared_memory = make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X + BLOCK_SIZE_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y - BLOCK_SIZE_Y ); s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] = d_original[ image_halo_index_1d_global_clamped ]; } // // load x-1,y+1 halo into shared memory // if ( threadIdx.x >= BLOCK_SIZE_X - TEMPLATE_HALF_WIDTH && threadIdx.y < TEMPLATE_HALF_HEIGHT ) { int2 image_halo_index_2d_global = make_int2( ( ( blockIdx.x - 1 ) * blockDim.x ) + threadIdx.x, ( ( blockIdx.y + 1 ) * blockDim.y ) + threadIdx.y ); int2 image_halo_index_2d_global_clamped = make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) ); int image_halo_index_1d_global_clamped = ( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x; int2 image_halo_index_2d_shared_memory = make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X - BLOCK_SIZE_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y + BLOCK_SIZE_Y ); s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] = d_original[ image_halo_index_1d_global_clamped ]; } // // load x-1,y-1 halo into shared memory // if ( threadIdx.x >= BLOCK_SIZE_X - TEMPLATE_HALF_WIDTH && threadIdx.y >= BLOCK_SIZE_Y - TEMPLATE_HALF_HEIGHT ) { int2 image_halo_index_2d_global = make_int2( ( ( blockIdx.x - 1 ) * blockDim.x ) + threadIdx.x, ( ( blockIdx.y - 1 ) * blockDim.y ) + threadIdx.y ); int2 image_halo_index_2d_global_clamped = make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) ); int image_halo_index_1d_global_clamped = ( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x; int2 image_halo_index_2d_shared_memory = make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X - BLOCK_SIZE_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y - BLOCK_SIZE_Y ); s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] = d_original[ image_halo_index_1d_global_clamped ]; } // // wait until all threads in the thread block are finished loading the image chunk into shared memory // __syncthreads(); if ( image_index_2d_global.x < nx && image_index_2d_global.y < ny ) { // // compute image mean // float image_sum = 0.0f; for ( int y = -template_half_height; y <= template_half_height; y++ ) { for ( int x = -template_half_width; x <= template_half_width; x++ ) { int2 image_offset_index_2d = make_int2( image_index_2d_shared_memory.x + x, image_index_2d_shared_memory.y + y ); unsigned char image_offset_value = s_original[ image_offset_index_2d.y ][ image_offset_index_2d.x ]; image_sum += (float)image_offset_value; } } float image_mean = image_sum / (float)template_size; // // compute sums // float sum_of_image_template_diff_products = 0.0f; float sum_of_squared_image_diffs = 0.0f; float sum_of_squared_template_diffs = 0.0f; for ( int y = -template_half_height; y <= template_half_height; y++ ) { for ( int x = -template_half_width; x <= template_half_width; x++ ) { int2 image_offset_index_2d = make_int2( image_index_2d_shared_memory.x + x, image_index_2d_shared_memory.y + y ); unsigned char image_offset_value = s_original[ image_offset_index_2d.y ][ image_offset_index_2d.x ]; float image_diff = (float)image_offset_value - image_mean; int2 template_index_2d = make_int2( x + template_half_width, y + template_half_height ); int template_index_1d = ( knx * template_index_2d.y ) + template_index_2d.x; unsigned char template_value = d_template[ template_index_1d ]; float template_diff = template_value - template_mean; float image_template_diff_product = image_offset_value * template_diff; float squared_image_diff = image_diff * image_diff; float squared_template_diff = template_diff * template_diff; sum_of_image_template_diff_products += image_template_diff_product; sum_of_squared_image_diffs += squared_image_diff; sum_of_squared_template_diffs += squared_template_diff; } } // // compute final result // float result_value = 0.0f; if ( sum_of_squared_image_diffs != 0 && sum_of_squared_template_diffs != 0 ) { result_value = sum_of_image_template_diff_products / sqrt( sum_of_squared_image_diffs * sum_of_squared_template_diffs ); } d_response[ image_index_1d_global_clamped ] = result_value; } } __global__ void combined_response( float* d_response, float* d_r_response, float* d_g_response, float* d_b_response, int num_pixels_y, int num_pixels_x ) { int ny = num_pixels_y; int nx = num_pixels_x; int2 image_index_2d = make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y ); int image_index_1d = ( nx * image_index_2d.y ) + image_index_2d.x; if ( image_index_2d.x < nx && image_index_2d.y < ny ) { d_response[ image_index_1d ] = d_r_response[ image_index_1d ] * d_g_response[ image_index_1d ] * d_b_response[ image_index_1d ]; } } """ ) shared_memory_normalized_cross_correlation_function = source_module.get_function("shared_memory_normalized_cross_correlation") template_height = numpy.int32(r_template.shape[0]) template_width = numpy.int32(r_template.shape[1]) template_half_height = numpy.int32((template_height - 1) / 2) template_half_width = numpy.int32((template_width - 1) / 2) template_size = numpy.int32(template_height * template_width) template_mean_r = numpy.float32(numpy.mean(r_template)) template_mean_g = numpy.float32(numpy.mean(g_template)) template_mean_b = numpy.float32(numpy.mean(b_template)) shared_memory_normalized_cross_correlation_function_block = (16,16,1) num_blocks_y = \ int(ceil(float(num_pixels_y) / float(shared_memory_normalized_cross_correlation_function_block[1]))) num_blocks_x = \ int(ceil(float(num_pixels_x) / float(shared_memory_normalized_cross_correlation_function_block[0]))) shared_memory_normalized_cross_correlation_function_grid = (num_blocks_x, num_blocks_y) combined_response_function = source_module.get_function("combined_response") combined_response_function_block = (32,16,1) num_blocks_y = int(ceil(float(num_pixels_y) / float(combined_response_function_block[1]))) num_blocks_x = int(ceil(float(num_pixels_x) / float(combined_response_function_block[0]))) combined_response_function_grid = (num_blocks_x, num_blocks_y) pycuda.driver.memcpy_htod(r_device, r) pycuda.driver.memcpy_htod(g_device, g) pycuda.driver.memcpy_htod(b_device, b) pycuda.driver.memcpy_htod(r_response_device, r_response_gpu_shared_memory) pycuda.driver.memcpy_htod(g_response_device, g_response_gpu_shared_memory) pycuda.driver.memcpy_htod(b_response_device, b_response_gpu_shared_memory) pycuda.driver.memcpy_htod(response_device, response_gpu_shared_memory) pycuda.driver.memcpy_htod(r_template_device, r_template) pycuda.driver.memcpy_htod(g_template_device, g_template) pycuda.driver.memcpy_htod(b_template_device, b_template) pycuda.driver.Context.synchronize() shared_memory_normalized_cross_correlation_function( r_response_device, r_device, r_template_device, num_pixels_y, num_pixels_x, template_half_height, template_height, template_half_width, template_width, template_size, template_mean_r, block=naive_normalized_cross_correlation_function_block, grid=naive_normalized_cross_correlation_function_grid) pycuda.driver.Context.synchronize() shared_memory_normalized_cross_correlation_function( g_response_device, g_device, g_template_device, num_pixels_y, num_pixels_x, template_half_height, template_height, template_half_width, template_width, template_size, template_mean_g, block=naive_normalized_cross_correlation_function_block, grid=naive_normalized_cross_correlation_function_grid) pycuda.driver.Context.synchronize() shared_memory_normalized_cross_correlation_function( b_response_device, b_device, b_template_device, num_pixels_y, num_pixels_x, template_half_height, template_height, template_half_width, template_width, template_size, template_mean_b, block=naive_normalized_cross_correlation_function_block, grid=naive_normalized_cross_correlation_function_grid) pycuda.driver.Context.synchronize() combined_response_function( response_device, r_response_device, g_response_device, b_response_device, num_pixels_y, num_pixels_x, block=combined_response_function_block, grid=combined_response_function_grid) pycuda.driver.Context.synchronize() pycuda.driver.memcpy_dtoh(r_response_gpu_shared_memory, g_response_device) pycuda.driver.memcpy_dtoh(g_response_gpu_shared_memory, g_response_device) pycuda.driver.memcpy_dtoh(b_response_gpu_shared_memory, b_response_device) pycuda.driver.memcpy_dtoh(response_gpu_shared_memory, response_device) figsize(20,4) matplotlib.pyplot.subplot(141); matplotlib.pyplot.imshow(r_response_gpu_shared_memory, cmap="gray"); matplotlib.pyplot.title("r_response_gpu_shared_memory"); matplotlib.pyplot.colorbar(); matplotlib.pyplot.subplot(142); matplotlib.pyplot.imshow(g_response_gpu_shared_memory, cmap="gray"); matplotlib.pyplot.title("g_response_gpu_shared_memory"); matplotlib.pyplot.colorbar(); matplotlib.pyplot.subplot(143); matplotlib.pyplot.imshow(b_response_gpu_shared_memory, cmap="gray"); matplotlib.pyplot.title("b_response_gpu_shared_memory"); matplotlib.pyplot.colorbar(); matplotlib.pyplot.subplot(144); matplotlib.pyplot.imshow(response_gpu_shared_memory, cmap="gray"); matplotlib.pyplot.title("response_gpu_shared_memory"); matplotlib.pyplot.colorbar(); response_gpu_naive_cropped = \ response_gpu_naive[ template_half_height:num_pixels_y-template_half_height, template_half_width:num_pixels_x-template_half_width ] diff = numpy.abs(response_opencv - response_gpu_naive_cropped) max_diff = numpy.ones_like(diff, dtype=numpy.float32) print \ "Difference between OpenCV and GPU naive results as a percentage of the maximum possible difference (should be less than 1%%): %0.2f%%" % \ (100 * (numpy.linalg.norm(diff) / numpy.linalg.norm(max_diff))) print figsize(14,4) matplotlib.pyplot.subplot(131); matplotlib.pyplot.imshow(response_gpu_naive_cropped, cmap="gray"); matplotlib.pyplot.title("response_gpu_naive_cropped"); matplotlib.pyplot.colorbar(); matplotlib.pyplot.subplot(132); matplotlib.pyplot.imshow(response_opencv, cmap="gray"); matplotlib.pyplot.title("response_opencv"); matplotlib.pyplot.colorbar(); matplotlib.pyplot.subplot(133); matplotlib.pyplot.imshow(diff, cmap="gray"); matplotlib.pyplot.title("diff"); matplotlib.pyplot.colorbar(); diff = numpy.abs(response_gpu_shared_memory - response_gpu_naive) max_diff = numpy.ones_like(diff, dtype=numpy.float32) print \ "Difference between GPU naive and GPU shared memory results as a percentage of the maximum possible difference (should be 0%%): %0.2f%%" % \ (100 * (numpy.linalg.norm(diff) / numpy.linalg.norm(max_diff))) print figsize(14,4) matplotlib.pyplot.subplot(131); matplotlib.pyplot.imshow(response_gpu_shared_memory, cmap="gray"); matplotlib.pyplot.title("response_gpu_shared_memory"); matplotlib.pyplot.colorbar(); matplotlib.pyplot.subplot(132); matplotlib.pyplot.imshow(response_gpu_naive, cmap="gray"); matplotlib.pyplot.title("response_gpu_naive"); matplotlib.pyplot.colorbar(); matplotlib.pyplot.subplot(133); matplotlib.pyplot.imshow(diff, cmap="gray"); matplotlib.pyplot.title("diff"); matplotlib.pyplot.colorbar(); import pycuda.autoinit import pycuda.driver import pycuda.compiler source_module = pycuda.compiler.SourceModule \ ( """ __global__ void compute_coordinates( ushort2* d_packed_coordinates, int num_pixels_y, int num_pixels_x ) { int ny = num_pixels_y; int nx = num_pixels_x; int2 image_index_2d = make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y ); int image_index_1d = ( nx * image_index_2d.y ) + image_index_2d.x; if ( image_index_2d.x < nx && image_index_2d.y < ny ) { d_packed_coordinates[ image_index_1d ] = make_ushort2( image_index_2d.x, image_index_2d.y ); } } __global__ void add_constant( float* d_input, float* d_output, float constant, int num_pixels_y, int num_pixels_x ) { int ny = num_pixels_y; int nx = num_pixels_x; int2 image_index_2d = make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y ); int image_index_1d = ( nx * image_index_2d.y ) + image_index_2d.x; if ( image_index_2d.x < nx && image_index_2d.y < ny ) { d_output[ image_index_1d ] = d_input[ image_index_1d ] + constant; } } __global__ void remove_redness_from_coordinates( ushort2* d_coordinates, unsigned char* d_r, unsigned char* d_g, unsigned char* d_b, unsigned char* d_r_output, int num_coordinates, int num_pixels_y, int num_pixels_x, int template_half_height, int template_half_width ) { int ny = num_pixels_y; int nx = num_pixels_x; int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x; if ( global_index_1d < num_coordinates ) { ushort2 image_index_2d = d_coordinates[ global_index_1d ]; for ( int y = image_index_2d.y - template_half_height; y < image_index_2d.y + template_half_height; y++ ) { for ( int x = image_index_2d.x - template_half_width; x < image_index_2d.x + template_half_width; x++ ) { int2 image_offset_index_2d = make_int2( x, y ); int2 image_offset_index_2d_clamped = make_int2( min( nx - 1, max( 0, image_offset_index_2d.x ) ), min( ny - 1, max( 0, image_offset_index_2d.y ) ) ); int image_offset_index_1d_clamped = ( nx * image_offset_index_2d_clamped.y ) + image_offset_index_2d_clamped.x; unsigned char g_value = d_g[ image_offset_index_1d_clamped ]; unsigned char b_value = d_b[ image_offset_index_1d_clamped ]; unsigned int gb_average = ( g_value + b_value ) / 2; d_r_output[ image_offset_index_1d_clamped ] = (unsigned char)gb_average; } } } } """ ) compute_coordinates_function = source_module.get_function("compute_coordinates") add_constant_function = source_module.get_function("add_constant") remove_redness_from_coordinates_function = source_module.get_function("remove_redness_from_coordinates") compute_coordinates_function_block = (16,16,1) num_blocks_y = int(ceil(float(num_pixels_y) / float(compute_coordinates_function_block[1]))) num_blocks_x = int(ceil(float(num_pixels_x) / float(compute_coordinates_function_block[0]))) compute_coordinates_function_grid = (num_blocks_x, num_blocks_y) add_constant_function_block = (16,16,1) num_blocks_y = int(ceil(float(num_pixels_y) / float(add_constant_function_block[1]))) num_blocks_x = int(ceil(float(num_pixels_x) / float(add_constant_function_block[0]))) add_constant_function_grid = (num_blocks_x, num_blocks_y) remove_redness_from_coordinates_function_block = (512,1,1) num_blocks = \ int(ceil(float(num_coordinates_to_recolor) / float(remove_redness_from_coordinates_function_block[0]))) remove_redness_from_coordinates_function_grid = (num_blocks, 1) compute_coordinates_function( coordinates_device, num_pixels_y, num_pixels_x, block=compute_coordinates_function_block, grid=compute_coordinates_function_grid) pycuda.driver.memcpy_dtoh(coordinates_gpu, coordinates_device) print coordinates_gpu print figsize(4,4) matplotlib.pyplot.imshow(coordinates_gpu, cmap="gray"); matplotlib.pyplot.title("coordinates_gpu"); matplotlib.pyplot.colorbar(); add_constant_function( response_device, positive_response_device, numpy.float32(1.0), num_pixels_y, num_pixels_x, block=compute_coordinates_function_block, grid=compute_coordinates_function_grid) pycuda.driver.memcpy_dtoh(positive_response_gpu, positive_response_device) figsize(4,4) matplotlib.pyplot.imshow(positive_response_gpu, cmap="gray"); matplotlib.pyplot.title("positive_response_gpu"); matplotlib.pyplot.colorbar(); radix_sort_manager.radix_sort_key_value_descending_device( positive_response_device, coordinates_device, sorted_positive_response_device, sorted_coordinates_device, int(num_pixels_y * num_pixels_x)) pycuda.driver.memcpy_dtoh(sorted_positive_response_gpu, sorted_positive_response_device) pycuda.driver.memcpy_dtoh(sorted_coordinates_gpu, sorted_coordinates_device) figsize(9,4) matplotlib.pyplot.subplot(121); matplotlib.pyplot.plot(sorted_positive_response_gpu.ravel()); matplotlib.pyplot.title("sorted_positive_response_gpu"); matplotlib.pyplot.subplot(122); matplotlib.pyplot.imshow(sorted_coordinates_gpu, cmap="gray"); matplotlib.pyplot.title("sorted_coordinates_gpu"); matplotlib.pyplot.colorbar(); maximum_responses_image = zeros_like(sorted_positive_response_gpu) sorted_coordinates_view = sorted_coordinates_gpu.ravel().view(dtype="u2, u2") for i in range(40): coordinate = sorted_coordinates_view[i] maximum_responses_image[coordinate[1],coordinate[0]] = 1 figsize(9,4) matplotlib.pyplot.subplot(121) matplotlib.pyplot.imshow(maximum_responses_image, cmap="gray"); matplotlib.pyplot.title("sorted_positive_response_gpu"); matplotlib.pyplot.colorbar(); matplotlib.pyplot.subplot(122) matplotlib.pyplot.imshow(response_opencv > 0.15, cmap="gray"); matplotlib.pyplot.title("response_opencv > 0.15"); matplotlib.pyplot.colorbar(); pycuda.driver.memcpy_dtod(r_output_device, r_device, int(num_pixels_y * num_pixels_x)) num_coordinates_to_recolor = numpy.int32(40) recolor_template_half_height = numpy.int32(7) recolor_template_half_width = numpy.int32(7) remove_redness_from_coordinates_function( sorted_coordinates_device, r_device, g_device, b_device, r_output_device, num_coordinates_to_recolor, num_pixels_y, num_pixels_x, recolor_template_half_width, recolor_template_half_height, block=remove_redness_from_coordinates_function_block, grid=remove_redness_from_coordinates_function_grid) pycuda.driver.memcpy_dtoh(r_output_gpu, r_output_device) rgba_output_gpu = numpy.concatenate((r_output_gpu[:,:,newaxis],g[:,:,newaxis],b[:,:,newaxis],a[:,:,newaxis]), axis=2).copy() figsize(9,4) matplotlib.pyplot.subplot(121); matplotlib.pyplot.imshow(rgba); matplotlib.pyplot.title("rgba"); matplotlib.pyplot.subplot(122); matplotlib.pyplot.imshow(rgba_output_gpu); matplotlib.pyplot.title("rgba_output_gpu");