# The purpose of this code is to implement a simple histogram algorithm on the GPU.
# Generate normally-distributed test data and compute a reference histogram
# with numpy; plot the raw input, the numpy histogram, and matplotlib's own
# hist() of the same data side by side.
# NOTE(review): numpy and matplotlib.pyplot are used here but imported later
# in the file (notebook cell order) -- assumes a pylab-style environment that
# also provides figsize; confirm when converting to a standalone script.
n = 1024 * 100
nbins = 1024
input_data = numpy.random.normal(0, 1, n).astype(numpy.float32)
histogram_cpu, edges = numpy.histogram(input_data, nbins)

figsize(14,4)
matplotlib.pyplot.subplot(131);
matplotlib.pyplot.plot(input_data);
matplotlib.pyplot.title("input_data");
matplotlib.pyplot.subplot(132);
matplotlib.pyplot.plot(histogram_cpu);
matplotlib.pyplot.title("histogram_cpu");
matplotlib.pyplot.subplot(133);
matplotlib.pyplot.hist(input_data, nbins);
matplotlib.pyplot.title("matplotlib.pyplot.hist(input_data, nbins)");
input_max = numpy.max(input_data)
input_min = numpy.min(input_data)
input_range = input_max - input_min
histogram_cpu_loop = numpy.zeros_like(histogram_cpu)
for i in range(n):
input_value = input_data[i]
bin_index = min(nbins - 1, int((nbins * (input_value - input_min)) / input_range));
histogram_cpu_loop[bin_index] = histogram_cpu_loop[bin_index] + 1
diff = abs(histogram_cpu_loop - histogram_cpu)
print \
"Difference between numpy and hand-coded loop as a percentage of the maximum possible difference (should be less than 1.0%%): %0.2f%%" % \
(100 * (numpy.linalg.norm(diff) / numpy.linalg.norm(histogram_cpu)))
# Plot the numpy histogram, the hand-coded loop histogram, and their
# absolute difference side by side.
figsize(14,4)
matplotlib.pyplot.subplot(131);
matplotlib.pyplot.plot(histogram_cpu);
matplotlib.pyplot.title("histogram_cpu");
matplotlib.pyplot.subplot(132);
matplotlib.pyplot.plot(histogram_cpu_loop);
matplotlib.pyplot.title("histogram_cpu_loop");
matplotlib.pyplot.subplot(133);
matplotlib.pyplot.plot(diff);
matplotlib.pyplot.title("diff");
# Output: Difference between numpy and hand-coded loop as a percentage of the maximum possible difference (should be less than 1.0%): 0.06%
import pycuda.autoinit
import pycuda.driver
import pycuda.compiler
import reduce
import radixsort
import prefixsum
# CUDA kernels for the sorted, shared-memory histogram:
#
#   compute_bin_ids        - label each input value with its coarse bin id
#   compute_bin_offsets    - find where each coarse bin starts in the sorted data
#   compute_fine_histogram - one block per coarse bin accumulates its
#                            FINE_HISTOGRAM_NUM_VALUES fine bins in shared
#                            memory, then writes them to the global histogram
#
# Fixes relative to the previous revision:
#   - compute_fine_histogram read d_sorted_coarse_bin_ids[ start_data_index_1d ]
#     BEFORE checking start_data_index_1d against the -1 "empty bin" sentinel,
#     an out-of-bounds read whenever a coarse bin contains no samples (the host
#     pre-fills d_bin_offsets with 0xffffffff and empty bins are never written).
#   - typo: nex_bin_offset_index -> next_bin_offset_index.
#   - the final comment said "into shared memory" where the code writes the
#     global histogram.
source_module = pycuda.compiler.SourceModule \
(
"""
#define FINE_HISTOGRAM_NUM_VALUES 32

// Label each input value with the coarse bin that contains its fine bin.
__global__ void compute_bin_ids(
    float*        d_input_data,
    unsigned int* d_coarse_bin_ids,
    int           n,
    int           num_coarse_bins,
    int           num_fine_bins,
    float         min_value,
    float         max_value,
    float         range )
{
    int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;

    if ( global_index_1d < n )
    {
        float input_value = d_input_data[ global_index_1d ];

        // clamp so input_value == max_value falls in the last fine bin
        int fine_bin_index   = min( num_fine_bins - 1, int( ( num_fine_bins * ( input_value - min_value ) ) / range ) );
        int coarse_bin_index = fine_bin_index / FINE_HISTOGRAM_NUM_VALUES;

        d_coarse_bin_ids[ global_index_1d ] = coarse_bin_index;
    }
}

// Record, for each coarse bin that actually occurs in the sorted id array,
// the index of its first element. Offsets of empty bins are left untouched
// (the host pre-fills them with the sentinel 0xffffffff == -1).
__global__ void compute_bin_offsets(
    int*          d_bin_offsets,
    unsigned int* d_sorted_coarse_bin_ids,
    int           n )
{
    int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;

    if ( global_index_1d == 0 )
    {
        d_bin_offsets[ 0 ] = 0;
    }
    else if ( global_index_1d < n )
    {
        unsigned int left_bin_id  = d_sorted_coarse_bin_ids[ global_index_1d - 1 ];
        unsigned int right_bin_id = d_sorted_coarse_bin_ids[ global_index_1d ];

        // a change of id marks the first element of the right-hand bin
        if ( left_bin_id != right_bin_id )
        {
            unsigned int offset_index = right_bin_id;
            d_bin_offsets[ offset_index ] = global_index_1d;
        }
    }
}

// Shared-memory counter increment without hardware atomics: each thread tags
// the top 8 bits of the counter with its warp-local lane id and retries until
// its own tagged write survives, so exactly one colliding thread succeeds per
// pass. The tag is stripped before returning.
__device__ void increment_warp_histogram( volatile unsigned int* s_warp_histogram, unsigned int warp_histogram_index, unsigned int warp_local_thread_id )
{
    unsigned int warp_local_thread_tag = warp_local_thread_id << 24;
    unsigned int old_count, new_count, new_count_tagged;

    do
    {
        old_count        = s_warp_histogram[ warp_histogram_index ] & 0x00FFFFFF;
        new_count        = old_count + 1;
        new_count_tagged = warp_local_thread_tag | new_count;
        s_warp_histogram[ warp_histogram_index ] = new_count_tagged;
    }
    while( s_warp_histogram[ warp_histogram_index ] != new_count_tagged );

    // strip the thread tag, leaving the plain count
    new_count_tagged = s_warp_histogram[ warp_histogram_index ];
    s_warp_histogram[ warp_histogram_index ] = new_count_tagged & 0x00FFFFFF;
}

// One block per coarse bin; each block histograms its contiguous slice of the
// sorted input into FINE_HISTOGRAM_NUM_VALUES shared-memory bins and then
// copies that slice into the global fine histogram.
__global__ void compute_fine_histogram(
    unsigned int* d_histogram,
    unsigned int* d_sorted_coarse_bin_ids,
    float*        d_sorted_input_data,
    int*          d_bin_offsets,
    int           n,
    int           num_coarse_bins,
    int           num_fine_bins,
    float         min_value,
    float         max_value,
    float         range )
{
    __shared__ float        s_sorted_input_data[ FINE_HISTOGRAM_NUM_VALUES ];
    __shared__ unsigned int s_fine_histogram[ FINE_HISTOGRAM_NUM_VALUES ];

    //
    // figure out the start and end points in the sorted input data array for the current coarse bin
    //
    int start_data_index_1d = d_bin_offsets[ blockIdx.x ];
    int end_data_index_1d;

    unsigned int shared_memory_index_1d = threadIdx.x;

    // an offset of -1 means this coarse bin is empty; the check must happen
    // BEFORE the offset is used as an index (uniform per block, so the
    // __syncthreads() calls below are safe)
    if ( start_data_index_1d != -1 )
    {
        unsigned int coarse_bin_index_1d = d_sorted_coarse_bin_ids[ start_data_index_1d ];

        if ( coarse_bin_index_1d < num_coarse_bins - 1 )
        {
            // skip over empty bins until the next occupied bin's start offset
            int next_bin_offset_index   = 1;
            int next_bin_start_index_1d = d_bin_offsets[ coarse_bin_index_1d + next_bin_offset_index ];

            while ( next_bin_start_index_1d == -1 )
            {
                next_bin_offset_index++;
                next_bin_start_index_1d = d_bin_offsets[ coarse_bin_index_1d + next_bin_offset_index ];
            }

            end_data_index_1d = next_bin_start_index_1d - 1;
        }
        else if ( coarse_bin_index_1d == num_coarse_bins - 1 )
        {
            // last coarse bin extends to the end of the data
            end_data_index_1d = n - 1;
        }
        else
        {
            end_data_index_1d = -1;
        }

        //
        // initialize fine-grained histogram in shared memory
        //
        s_fine_histogram[ shared_memory_index_1d ] = 0;

        __syncthreads();

        //
        // loop through the sorted input data and iteratively compute the histogram in shared memory
        //
        unsigned int total_num_data_values = end_data_index_1d - start_data_index_1d + 1;
        int num_iterations =
            int( ceil( float( total_num_data_values ) / float( FINE_HISTOGRAM_NUM_VALUES ) ) );

        unsigned int current_iteration_data_base_index_1d = start_data_index_1d;

        for ( int i = 0; i < num_iterations; i++ )
        {
            //
            // load current chunk of data into shared memory
            //
            unsigned int current_iteration_data_thread_index_1d = current_iteration_data_base_index_1d + threadIdx.x;

            if ( current_iteration_data_thread_index_1d <= end_data_index_1d )
            {
                s_sorted_input_data[ shared_memory_index_1d ] = d_sorted_input_data[ current_iteration_data_thread_index_1d ];
            }

            __syncthreads();

            //
            // compute fine-grained histogram over the valid portion of the chunk
            //
            unsigned int current_max_num_data_values = end_data_index_1d - current_iteration_data_base_index_1d + 1;
            unsigned int current_num_data_values     = min( FINE_HISTOGRAM_NUM_VALUES, current_max_num_data_values );

            if ( shared_memory_index_1d < current_num_data_values )
            {
                float input_value = s_sorted_input_data[ shared_memory_index_1d ];

                unsigned int fine_bin_global_index_1d =
                    min( num_fine_bins - 1, int( ( num_fine_bins * ( input_value - min_value ) ) / range ) );
                unsigned int fine_bin_shared_memory_index_1d =
                    max( 0, min( FINE_HISTOGRAM_NUM_VALUES - 1, fine_bin_global_index_1d - ( coarse_bin_index_1d * FINE_HISTOGRAM_NUM_VALUES ) ) );

                increment_warp_histogram( s_fine_histogram, fine_bin_shared_memory_index_1d, shared_memory_index_1d );
            }

            current_iteration_data_base_index_1d += FINE_HISTOGRAM_NUM_VALUES;
        }

        __syncthreads();

        //
        // copy fine-grained histogram from shared memory into the global histogram
        //
        unsigned int histogram_base_index_1d   = coarse_bin_index_1d * FINE_HISTOGRAM_NUM_VALUES;
        unsigned int histogram_thread_index_1d = histogram_base_index_1d + threadIdx.x;

        if ( histogram_thread_index_1d < num_fine_bins )
        {
            d_histogram[ histogram_thread_index_1d ] = s_fine_histogram[ shared_memory_index_1d ];
        }
    }
}
"""
)
# Host-side configuration, device allocations, and the GPU min/max reduction
# that defines the overall bin range.
num_fine_bins_per_coarse_bin = 32
num_bytes_per_int32 = 4
num_coarse_bins = int(ceil(float(nbins) / float(num_fine_bins_per_coarse_bin)))

# The kernels accumulate 32-bit unsigned counts, so use an explicit uint32
# host array. numpy.zeros_like(histogram_cpu) would inherit numpy's
# platform-dependent intp dtype (8 bytes on 64-bit builds), and the
# memcpy_dtoh copy-back would then reinterpret pairs of 32-bit counts as
# single 64-bit values.
histogram_gpu = numpy.zeros(nbins, dtype=numpy.uint32)

# uint32 coarse-bin ids are the same size as the float32 inputs, so
# input_data.nbytes is the right allocation size for all four n-element buffers.
input_data_device = pycuda.driver.mem_alloc(input_data.nbytes)
coarse_bin_ids_device = pycuda.driver.mem_alloc(input_data.nbytes)
sorted_data_device = pycuda.driver.mem_alloc(input_data.nbytes)
sorted_coarse_bin_ids_device = pycuda.driver.mem_alloc(input_data.nbytes)
bin_offsets_device = pycuda.driver.mem_alloc(num_coarse_bins * num_bytes_per_int32)
histogram_device = pycuda.driver.mem_alloc(histogram_gpu.nbytes)

# One thread per input element for the binning kernel.
compute_bin_ids_function = source_module.get_function("compute_bin_ids")
compute_bin_ids_function_block = (512,1,1)
num_blocks = int(ceil(float(n) / float(compute_bin_ids_function_block[0])))
compute_bin_ids_function_grid = (num_blocks, 1)

# One thread per element of the sorted id array for the offset kernel.
compute_bin_offsets_function = source_module.get_function("compute_bin_offsets")
compute_bin_offsets_function_block = (512,1,1)
num_blocks = int(ceil(float(n) / float(compute_bin_offsets_function_block[0])))
compute_bin_offsets_function_grid = (num_blocks, 1)

# One block per coarse bin, one thread per fine bin within it.
compute_fine_histogram_function = source_module.get_function("compute_fine_histogram")
compute_fine_histogram_function_block = (num_fine_bins_per_coarse_bin,1,1)
num_blocks = num_coarse_bins
compute_fine_histogram_function_grid = (num_blocks, 1)

# Project-local helper managers for reduction, sorting, and prefix sums.
reduce_manager = reduce.ReduceManager(n)
radix_sort_manager = radixsort.RadixSortManager(n)
prefix_sum_manager = prefixsum.PrefixSumManager(n)

pycuda.driver.memcpy_htod(input_data_device, input_data)

# Compute the data range on the GPU; it defines the fine-bin boundaries.
input_min = reduce_manager.reduce_min_device(input_data_device, n)
input_max = reduce_manager.reduce_max_device(input_data_device, n)
input_range = input_max - input_min
# Step 1: label every input value with its coarse bin id.
compute_bin_ids_function(
    input_data_device,
    coarse_bin_ids_device,
    numpy.int32(n),
    numpy.int32(num_coarse_bins),
    numpy.int32(nbins),
    numpy.float32(input_min),
    numpy.float32(input_max),
    numpy.float32(input_range),
    block=compute_bin_ids_function_block,
    grid=compute_bin_ids_function_grid)

# Step 2: sort (key = coarse bin id, value = sample) so each coarse bin's
# samples become contiguous in sorted_data_device.
radix_sort_manager.radix_sort_key_value_ascending_device(coarse_bin_ids_device, input_data_device, sorted_coarse_bin_ids_device, sorted_data_device, n)

# Step 3: pre-fill every coarse-bin offset with the sentinel 0xffffffff (-1),
# then record where each occupied coarse bin starts in the sorted arrays.
pycuda.driver.memset_d32(bin_offsets_device, int(0xffffffff), num_coarse_bins)

compute_bin_offsets_function(
    bin_offsets_device,
    sorted_coarse_bin_ids_device,
    numpy.int32(n),
    block=compute_bin_offsets_function_block,
    grid=compute_bin_offsets_function_grid)

# Step 4: zero the histogram and let one block per coarse bin accumulate its
# fine bins in shared memory, then write them out.
pycuda.driver.memset_d32(histogram_device, int(0), nbins)

compute_fine_histogram_function(
    histogram_device,
    sorted_coarse_bin_ids_device,
    sorted_data_device,
    bin_offsets_device,
    numpy.int32(n),
    numpy.int32(num_coarse_bins),
    numpy.int32(nbins),
    numpy.float32(input_min),
    numpy.float32(input_max),
    numpy.float32(input_range),
    block=compute_fine_histogram_function_block,
    grid=compute_fine_histogram_function_grid)

pycuda.driver.memcpy_dtoh(histogram_gpu, histogram_device)

# Validate: totals should match and the normalized difference should be small
# (binning arithmetic differs slightly from numpy's edge computation).
diff = abs(histogram_gpu - histogram_cpu_loop)

print numpy.sum(histogram_gpu)
print numpy.sum(histogram_cpu_loop)
print \
"Difference between GPU and CPU as a percentage of the maximum possible difference (should be less than 1.0%%): %0.2f%%" % \
(100 * (numpy.linalg.norm(diff) / numpy.linalg.norm(histogram_cpu_loop)))

figsize(14,4)
matplotlib.pyplot.subplot(131);
matplotlib.pyplot.plot(histogram_gpu);
matplotlib.pyplot.title("histogram_gpu");
matplotlib.pyplot.subplot(132);
matplotlib.pyplot.plot(histogram_cpu_loop);
matplotlib.pyplot.title("histogram_cpu_loop");
matplotlib.pyplot.subplot(133);
matplotlib.pyplot.plot(diff);
matplotlib.pyplot.title("diff");
# Output: 102400 / 102400 / Difference between GPU and CPU as a percentage of the maximum possible difference (should be less than 1.0%): 0.05%
# Baseline: naive histogram with one global-memory atomicAdd per input value
# (no sorting, no shared memory). Used below to quantify the benefit of the
# shared-memory approach. Note this rebinds source_module; the earlier
# kernels were already fetched via get_function.
source_module = pycuda.compiler.SourceModule \
(
"""
__global__ void compute_fine_histogram_global_atomic(
unsigned int* d_histogram,
float* d_input_data,
int n,
int num_bins,
float min_value,
float max_value,
float range )
{
int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;
if ( global_index_1d < n )
{
float data_value = d_input_data[ global_index_1d ];
int bin_index = min( num_bins - 1, int( ( num_bins * ( data_value - min_value ) ) / range ) );
atomicAdd( d_histogram + bin_index, 1 );
}
}
"""
)
histogram_gpu_global_atomic = numpy.zeros_like(histogram_cpu)
compute_fine_histogram_global_atomic_function = source_module.get_function("compute_fine_histogram_global_atomic")
compute_fine_histogram_global_atomic_function_block = (512,1,1)
num_blocks = int(ceil(float(n) / float(compute_fine_histogram_global_atomic_function_block[0])))
compute_fine_histogram_global_atomic_function_grid = (num_blocks, 1)
pycuda.driver.memcpy_htod(input_data_device, input_data)
input_min = reduce_manager.reduce_min_device(input_data_device, n)
input_max = reduce_manager.reduce_max_device(input_data_device, n)
input_range = input_max - input_min
compute_bin_ids_function(
input_data_device,
coarse_bin_ids_device,
numpy.int32(n),
numpy.int32(num_coarse_bins),
numpy.int32(nbins),
numpy.float32(input_min),
numpy.float32(input_max),
numpy.float32(input_range),
block=compute_bin_ids_function_block,
grid=compute_bin_ids_function_grid)
radix_sort_manager.radix_sort_key_value_ascending_device(coarse_bin_ids_device, input_data_device, sorted_coarse_bin_ids_device, sorted_data_device, n)
pycuda.driver.memset_d32(histogram_device, int(0), nbins)
compute_fine_histogram_global_atomic_function(
histogram_device,
sorted_data_device,
numpy.int32(n),
numpy.int32(nbins),
numpy.float32(input_min),
numpy.float32(input_max),
numpy.float32(input_range),
block=compute_fine_histogram_global_atomic_function_block,
grid=compute_fine_histogram_global_atomic_function_grid)
pycuda.driver.memcpy_dtoh(histogram_gpu_global_atomic, histogram_device)
diff = abs(histogram_gpu_global_atomic - histogram_gpu)
print numpy.sum(histogram_gpu_global_atomic)
print numpy.sum(histogram_gpu)
print \
"Difference between GPU (global atomics) and GPU (shared memory) as a percentage of the maximum possible difference (should be 0.0%%): %0.2f%%" % \
(100 * (numpy.linalg.norm(diff) / numpy.linalg.norm(histogram_gpu)))
figsize(14,4)
matplotlib.pyplot.subplot(131);
matplotlib.pyplot.plot(histogram_gpu_global_atomic);
matplotlib.pyplot.title("histogram_gpu_global_atomic");
matplotlib.pyplot.subplot(132);
matplotlib.pyplot.plot(histogram_gpu);
matplotlib.pyplot.title("histogram_gpu");
matplotlib.pyplot.subplot(133);
matplotlib.pyplot.plot(diff);
matplotlib.pyplot.title("diff");
# Output: 102400 / 102400 / Difference between GPU (global atomics) and GPU (shared memory) as a percentage of the maximum possible difference (should be 0.0%): 0.00%
import sys
import time
if sys.platform == "win32":
print "Using time.clock for benchmarking...\n"
system_timer_function = time.clock
else:
print "Using time.time for benchmarking...\n"
system_timer_function = time.time
num_timing_iterations = 100
print "num_timing_iterations = %d" % num_timing_iterations
# Output: Using time.clock for benchmarking... / num_timing_iterations = 100
# Benchmark the shared-memory histogram pipeline. The host->device copy and
# the min/max reduction are excluded from the timed region; the sort is also
# excluded (see the comment inside the loop), so sorted_* buffers keep their
# contents from the earlier run.
total_time_seconds = 0

for i in range(num_timing_iterations):

    pycuda.driver.memcpy_htod(input_data_device, input_data)

    input_min = reduce_manager.reduce_min_device(input_data_device, n)
    input_max = reduce_manager.reduce_max_device(input_data_device, n)
    input_range = input_max - input_min

    # drain outstanding GPU work so the timer measures only the kernels below
    pycuda.driver.Context.synchronize()
    start_time_seconds = system_timer_function()

    compute_bin_ids_function(
        input_data_device,
        coarse_bin_ids_device,
        numpy.int32(n),
        numpy.int32(num_coarse_bins),
        numpy.int32(nbins),
        numpy.float32(input_min),
        numpy.float32(input_max),
        numpy.float32(input_range),
        block=compute_bin_ids_function_block,
        grid=compute_bin_ids_function_grid)

    #
    # I have excluded sorting in this performance benchmark, even though it is necessary for correctness.
    # In the C++ version, we should replace my unoptimized sort with an optimized sort (e.g. thrust::sort)
    # and include it in the performance benchmark.
    #
    #radix_sort_manager.radix_sort_key_value_ascending_device(coarse_bin_ids_device, input_data_device, sorted_coarse_bin_ids_device, sorted_data_device, n)

    # reset bin offsets to the -1 sentinel before recomputing them
    pycuda.driver.memset_d32(bin_offsets_device, int(0xffffffff), num_coarse_bins)

    compute_bin_offsets_function(
        bin_offsets_device,
        sorted_coarse_bin_ids_device,
        numpy.int32(n),
        block=compute_bin_offsets_function_block,
        grid=compute_bin_offsets_function_grid)

    pycuda.driver.memset_d32(histogram_device, int(0), nbins)

    compute_fine_histogram_function(
        histogram_device,
        sorted_coarse_bin_ids_device,
        sorted_data_device,
        bin_offsets_device,
        numpy.int32(n),
        numpy.int32(num_coarse_bins),
        numpy.int32(nbins),
        numpy.float32(input_min),
        numpy.float32(input_max),
        numpy.float32(input_range),
        block=compute_fine_histogram_function_block,
        grid=compute_fine_histogram_function_grid)

    # wait for the kernels to finish before stopping the timer
    pycuda.driver.Context.synchronize()
    end_time_seconds = system_timer_function()

    elapsed_time_seconds = end_time_seconds - start_time_seconds
    total_time_seconds = total_time_seconds + elapsed_time_seconds

pycuda.driver.memcpy_dtoh(histogram_gpu, histogram_device)

average_time_seconds_gpu_system = total_time_seconds / num_timing_iterations

print "Using system timer for benchmarking (see above)..."
print "Average time elapsed executing GPU (shared memory) over %d runs: %f s" % (num_timing_iterations,average_time_seconds_gpu_system)
print

figsize(4,4)
matplotlib.pyplot.plot(histogram_gpu);
matplotlib.pyplot.title("histogram_gpu");
# Output: Using system timer for benchmarking (see above)... / Average time elapsed executing GPU (shared memory) over 100 runs: 0.001150 s
# Benchmark the global-atomic baseline over the same number of iterations.
# As above, the host->device copy and the min/max reduction are excluded
# from the timed region.
total_time_seconds = 0

for i in range(num_timing_iterations):

    pycuda.driver.memcpy_htod(input_data_device, input_data)

    input_min = reduce_manager.reduce_min_device(input_data_device, n)
    input_max = reduce_manager.reduce_max_device(input_data_device, n)
    input_range = input_max - input_min

    # drain outstanding GPU work so the timer measures only the kernel below
    pycuda.driver.Context.synchronize()
    start_time_seconds = system_timer_function()

    pycuda.driver.memset_d32(histogram_device, int(0), nbins)

    compute_fine_histogram_global_atomic_function(
        histogram_device,
        sorted_data_device,
        numpy.int32(n),
        numpy.int32(nbins),
        numpy.float32(input_min),
        numpy.float32(input_max),
        numpy.float32(input_range),
        block=compute_fine_histogram_global_atomic_function_block,
        grid=compute_fine_histogram_global_atomic_function_grid)

    # wait for the kernel to finish before stopping the timer
    pycuda.driver.Context.synchronize()
    end_time_seconds = system_timer_function()

    elapsed_time_seconds = end_time_seconds - start_time_seconds
    total_time_seconds = total_time_seconds + elapsed_time_seconds

pycuda.driver.memcpy_dtoh(histogram_gpu_global_atomic, histogram_device)

average_time_seconds_gpu_global_atomic_system = total_time_seconds / num_timing_iterations

print "Using system timer for benchmarking (see above)..."
print "Average time elapsed executing GPU (global atomic) over %d runs: %f s" % (num_timing_iterations,average_time_seconds_gpu_global_atomic_system)
print

figsize(4,4)
matplotlib.pyplot.plot(histogram_gpu_global_atomic);
matplotlib.pyplot.title("histogram_gpu_global_atomic");
# Output: Using system timer for benchmarking (see above)... / Average time elapsed executing GPU (global atomic) over 100 runs: 0.012794 s
# Report the relative speedup of the shared-memory histogram over the
# global-atomic baseline.
speedup_gpu = average_time_seconds_gpu_global_atomic_system / average_time_seconds_gpu_system

print "Average GPU (global atomic) time: %f s" % average_time_seconds_gpu_global_atomic_system
print "Average GPU (shared memory) time: %f s" % average_time_seconds_gpu_system
print
print "GPU (shared memory) speedup: %f x" % speedup_gpu
# Output: Average GPU (global atomic) time: 0.012794 s / Average GPU (shared memory) time: 0.001150 s / GPU (shared memory) speedup: 11.121705 x