# The purpose of this code is to implement a simple histogram algorithm on the GPU.
# Generate normally-distributed test data and compute a reference histogram
# with numpy; plot the raw input, the numpy histogram, and matplotlib's own
# hist() of the same data side by side.
# NOTE(review): numpy and matplotlib.pyplot are used here but imported later
# in the file (notebook cell order) -- assumes a pylab-style environment that
# also provides figsize; confirm when converting to a standalone script.
n = 1024 * 100
nbins = 1024
input_data = numpy.random.normal(0, 1, n).astype(numpy.float32)
histogram_cpu, edges = numpy.histogram(input_data, nbins)

figsize(14,4)
matplotlib.pyplot.subplot(131);
matplotlib.pyplot.plot(input_data);
matplotlib.pyplot.title("input_data");
matplotlib.pyplot.subplot(132);
matplotlib.pyplot.plot(histogram_cpu);
matplotlib.pyplot.title("histogram_cpu");
matplotlib.pyplot.subplot(133);
matplotlib.pyplot.hist(input_data, nbins);
matplotlib.pyplot.title("matplotlib.pyplot.hist(input_data, nbins)");
input_max = numpy.max(input_data)
input_min = numpy.min(input_data)
input_range = input_max - input_min
histogram_cpu_loop = numpy.zeros_like(histogram_cpu)
for i in range(n):
input_value = input_data[i]
bin_index = min(nbins - 1, int((nbins * (input_value - input_min)) / input_range));
histogram_cpu_loop[bin_index] = histogram_cpu_loop[bin_index] + 1
diff = abs(histogram_cpu_loop - histogram_cpu)
print \
"Difference between numpy and hand-coded loop as a percentage of the maximum possible difference (should be less than 1.0%%): %0.2f%%" % \
(100 * (numpy.linalg.norm(diff) / numpy.linalg.norm(histogram_cpu)))
# Plot the numpy histogram, the hand-coded loop histogram, and their
# absolute difference side by side.
figsize(14,4)
matplotlib.pyplot.subplot(131);
matplotlib.pyplot.plot(histogram_cpu);
matplotlib.pyplot.title("histogram_cpu");
matplotlib.pyplot.subplot(132);
matplotlib.pyplot.plot(histogram_cpu_loop);
matplotlib.pyplot.title("histogram_cpu_loop");
matplotlib.pyplot.subplot(133);
matplotlib.pyplot.plot(diff);
matplotlib.pyplot.title("diff");
# Output: Difference between numpy and hand-coded loop as a percentage of the maximum possible difference (should be less than 1.0%): 0.06%
import pycuda.autoinit
import pycuda.driver
import pycuda.compiler
import reduce
import radixsort
import prefixsum
# CUDA kernels for the sorted, shared-memory histogram:
#
#   compute_bin_ids        - label each input value with its coarse bin id
#   compute_bin_offsets    - find where each coarse bin starts in the sorted data
#   compute_fine_histogram - one block per coarse bin accumulates its
#                            FINE_HISTOGRAM_NUM_VALUES fine bins in shared
#                            memory, then writes them to the global histogram
#
# Fixes relative to the previous revision:
#   - compute_fine_histogram read d_sorted_coarse_bin_ids[ start_data_index_1d ]
#     BEFORE checking start_data_index_1d against the -1 "empty bin" sentinel,
#     an out-of-bounds read whenever a coarse bin contains no samples (the host
#     pre-fills d_bin_offsets with 0xffffffff and empty bins are never written).
#   - typo: nex_bin_offset_index -> next_bin_offset_index.
#   - the final comment said "into shared memory" where the code writes the
#     global histogram.
source_module = pycuda.compiler.SourceModule \
(
"""
#define FINE_HISTOGRAM_NUM_VALUES 32

// Label each input value with the coarse bin that contains its fine bin.
__global__ void compute_bin_ids(
    float*        d_input_data,
    unsigned int* d_coarse_bin_ids,
    int           n,
    int           num_coarse_bins,
    int           num_fine_bins,
    float         min_value,
    float         max_value,
    float         range )
{
    int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;

    if ( global_index_1d < n )
    {
        float input_value = d_input_data[ global_index_1d ];

        // clamp so input_value == max_value falls in the last fine bin
        int fine_bin_index   = min( num_fine_bins - 1, int( ( num_fine_bins * ( input_value - min_value ) ) / range ) );
        int coarse_bin_index = fine_bin_index / FINE_HISTOGRAM_NUM_VALUES;

        d_coarse_bin_ids[ global_index_1d ] = coarse_bin_index;
    }
}

// Record, for each coarse bin that actually occurs in the sorted id array,
// the index of its first element. Offsets of empty bins are left untouched
// (the host pre-fills them with the sentinel 0xffffffff == -1).
__global__ void compute_bin_offsets(
    int*          d_bin_offsets,
    unsigned int* d_sorted_coarse_bin_ids,
    int           n )
{
    int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;

    if ( global_index_1d == 0 )
    {
        d_bin_offsets[ 0 ] = 0;
    }
    else if ( global_index_1d < n )
    {
        unsigned int left_bin_id  = d_sorted_coarse_bin_ids[ global_index_1d - 1 ];
        unsigned int right_bin_id = d_sorted_coarse_bin_ids[ global_index_1d ];

        // a change of id marks the first element of the right-hand bin
        if ( left_bin_id != right_bin_id )
        {
            unsigned int offset_index = right_bin_id;
            d_bin_offsets[ offset_index ] = global_index_1d;
        }
    }
}

// Shared-memory counter increment without hardware atomics: each thread tags
// the top 8 bits of the counter with its warp-local lane id and retries until
// its own tagged write survives, so exactly one colliding thread succeeds per
// pass. The tag is stripped before returning.
__device__ void increment_warp_histogram( volatile unsigned int* s_warp_histogram, unsigned int warp_histogram_index, unsigned int warp_local_thread_id )
{
    unsigned int warp_local_thread_tag = warp_local_thread_id << 24;
    unsigned int old_count, new_count, new_count_tagged;

    do
    {
        old_count        = s_warp_histogram[ warp_histogram_index ] & 0x00FFFFFF;
        new_count        = old_count + 1;
        new_count_tagged = warp_local_thread_tag | new_count;
        s_warp_histogram[ warp_histogram_index ] = new_count_tagged;
    }
    while( s_warp_histogram[ warp_histogram_index ] != new_count_tagged );

    // strip the thread tag, leaving the plain count
    new_count_tagged = s_warp_histogram[ warp_histogram_index ];
    s_warp_histogram[ warp_histogram_index ] = new_count_tagged & 0x00FFFFFF;
}

// One block per coarse bin; each block histograms its contiguous slice of the
// sorted input into FINE_HISTOGRAM_NUM_VALUES shared-memory bins and then
// copies that slice into the global fine histogram.
__global__ void compute_fine_histogram(
    unsigned int* d_histogram,
    unsigned int* d_sorted_coarse_bin_ids,
    float*        d_sorted_input_data,
    int*          d_bin_offsets,
    int           n,
    int           num_coarse_bins,
    int           num_fine_bins,
    float         min_value,
    float         max_value,
    float         range )
{
    __shared__ float        s_sorted_input_data[ FINE_HISTOGRAM_NUM_VALUES ];
    __shared__ unsigned int s_fine_histogram[ FINE_HISTOGRAM_NUM_VALUES ];

    //
    // figure out the start and end points in the sorted input data array for the current coarse bin
    //
    int start_data_index_1d = d_bin_offsets[ blockIdx.x ];
    int end_data_index_1d;

    unsigned int shared_memory_index_1d = threadIdx.x;

    // an offset of -1 means this coarse bin is empty; the check must happen
    // BEFORE the offset is used as an index (uniform per block, so the
    // __syncthreads() calls below are safe)
    if ( start_data_index_1d != -1 )
    {
        unsigned int coarse_bin_index_1d = d_sorted_coarse_bin_ids[ start_data_index_1d ];

        if ( coarse_bin_index_1d < num_coarse_bins - 1 )
        {
            // skip over empty bins until the next occupied bin's start offset
            int next_bin_offset_index   = 1;
            int next_bin_start_index_1d = d_bin_offsets[ coarse_bin_index_1d + next_bin_offset_index ];

            while ( next_bin_start_index_1d == -1 )
            {
                next_bin_offset_index++;
                next_bin_start_index_1d = d_bin_offsets[ coarse_bin_index_1d + next_bin_offset_index ];
            }

            end_data_index_1d = next_bin_start_index_1d - 1;
        }
        else if ( coarse_bin_index_1d == num_coarse_bins - 1 )
        {
            // last coarse bin extends to the end of the data
            end_data_index_1d = n - 1;
        }
        else
        {
            end_data_index_1d = -1;
        }

        //
        // initialize fine-grained histogram in shared memory
        //
        s_fine_histogram[ shared_memory_index_1d ] = 0;

        __syncthreads();

        //
        // loop through the sorted input data and iteratively compute the histogram in shared memory
        //
        unsigned int total_num_data_values = end_data_index_1d - start_data_index_1d + 1;
        int num_iterations =
            int( ceil( float( total_num_data_values ) / float( FINE_HISTOGRAM_NUM_VALUES ) ) );

        unsigned int current_iteration_data_base_index_1d = start_data_index_1d;

        for ( int i = 0; i < num_iterations; i++ )
        {
            //
            // load current chunk of data into shared memory
            //
            unsigned int current_iteration_data_thread_index_1d = current_iteration_data_base_index_1d + threadIdx.x;

            if ( current_iteration_data_thread_index_1d <= end_data_index_1d )
            {
                s_sorted_input_data[ shared_memory_index_1d ] = d_sorted_input_data[ current_iteration_data_thread_index_1d ];
            }

            __syncthreads();

            //
            // compute fine-grained histogram over the valid portion of the chunk
            //
            unsigned int current_max_num_data_values = end_data_index_1d - current_iteration_data_base_index_1d + 1;
            unsigned int current_num_data_values     = min( FINE_HISTOGRAM_NUM_VALUES, current_max_num_data_values );

            if ( shared_memory_index_1d < current_num_data_values )
            {
                float input_value = s_sorted_input_data[ shared_memory_index_1d ];

                unsigned int fine_bin_global_index_1d =
                    min( num_fine_bins - 1, int( ( num_fine_bins * ( input_value - min_value ) ) / range ) );
                unsigned int fine_bin_shared_memory_index_1d =
                    max( 0, min( FINE_HISTOGRAM_NUM_VALUES - 1, fine_bin_global_index_1d - ( coarse_bin_index_1d * FINE_HISTOGRAM_NUM_VALUES ) ) );

                increment_warp_histogram( s_fine_histogram, fine_bin_shared_memory_index_1d, shared_memory_index_1d );
            }

            current_iteration_data_base_index_1d += FINE_HISTOGRAM_NUM_VALUES;
        }

        __syncthreads();

        //
        // copy fine-grained histogram from shared memory into the global histogram
        //
        unsigned int histogram_base_index_1d   = coarse_bin_index_1d * FINE_HISTOGRAM_NUM_VALUES;
        unsigned int histogram_thread_index_1d = histogram_base_index_1d + threadIdx.x;

        if ( histogram_thread_index_1d < num_fine_bins )
        {
            d_histogram[ histogram_thread_index_1d ] = s_fine_histogram[ shared_memory_index_1d ];
        }
    }
}
"""
)
# Host-side configuration, device allocations, and the GPU min/max reduction
# that defines the overall bin range.
num_fine_bins_per_coarse_bin = 32
num_bytes_per_int32 = 4
num_coarse_bins = int(ceil(float(nbins) / float(num_fine_bins_per_coarse_bin)))

# The kernels accumulate 32-bit unsigned counts, so use an explicit uint32
# host array. numpy.zeros_like(histogram_cpu) would inherit numpy's
# platform-dependent intp dtype (8 bytes on 64-bit builds), and the
# memcpy_dtoh copy-back would then reinterpret pairs of 32-bit counts as
# single 64-bit values.
histogram_gpu = numpy.zeros(nbins, dtype=numpy.uint32)

# uint32 coarse-bin ids are the same size as the float32 inputs, so
# input_data.nbytes is the right allocation size for all four n-element buffers.
input_data_device = pycuda.driver.mem_alloc(input_data.nbytes)
coarse_bin_ids_device = pycuda.driver.mem_alloc(input_data.nbytes)
sorted_data_device = pycuda.driver.mem_alloc(input_data.nbytes)
sorted_coarse_bin_ids_device = pycuda.driver.mem_alloc(input_data.nbytes)
bin_offsets_device = pycuda.driver.mem_alloc(num_coarse_bins * num_bytes_per_int32)
histogram_device = pycuda.driver.mem_alloc(histogram_gpu.nbytes)

# One thread per input element for the binning kernel.
compute_bin_ids_function = source_module.get_function("compute_bin_ids")
compute_bin_ids_function_block = (512,1,1)
num_blocks = int(ceil(float(n) / float(compute_bin_ids_function_block[0])))
compute_bin_ids_function_grid = (num_blocks, 1)

# One thread per element of the sorted id array for the offset kernel.
compute_bin_offsets_function = source_module.get_function("compute_bin_offsets")
compute_bin_offsets_function_block = (512,1,1)
num_blocks = int(ceil(float(n) / float(compute_bin_offsets_function_block[0])))
compute_bin_offsets_function_grid = (num_blocks, 1)

# One block per coarse bin, one thread per fine bin within it.
compute_fine_histogram_function = source_module.get_function("compute_fine_histogram")
compute_fine_histogram_function_block = (num_fine_bins_per_coarse_bin,1,1)
num_blocks = num_coarse_bins
compute_fine_histogram_function_grid = (num_blocks, 1)

# Project-local helper managers for reduction, sorting, and prefix sums.
reduce_manager = reduce.ReduceManager(n)
radix_sort_manager = radixsort.RadixSortManager(n)
prefix_sum_manager = prefixsum.PrefixSumManager(n)

pycuda.driver.memcpy_htod(input_data_device, input_data)

# Compute the data range on the GPU; it defines the fine-bin boundaries.
input_min = reduce_manager.reduce_min_device(input_data_device, n)
input_max = reduce_manager.reduce_max_device(input_data_device, n)
input_range = input_max - input_min
# Step 1: label every input value with its coarse bin id.
compute_bin_ids_function(
    input_data_device,
    coarse_bin_ids_device,
    numpy.int32(n),
    numpy.int32(num_coarse_bins),
    numpy.int32(nbins),
    numpy.float32(input_min),
    numpy.float32(input_max),
    numpy.float32(input_range),
    block=compute_bin_ids_function_block,
    grid=compute_bin_ids_function_grid)

# Step 2: sort (key = coarse bin id, value = sample) so each coarse bin's
# samples become contiguous in sorted_data_device.
radix_sort_manager.radix_sort_key_value_ascending_device(coarse_bin_ids_device, input_data_device, sorted_coarse_bin_ids_device, sorted_data_device, n)

# Step 3: pre-fill every coarse-bin offset with the sentinel 0xffffffff (-1),
# then record where each occupied coarse bin starts in the sorted arrays.
pycuda.driver.memset_d32(bin_offsets_device, int(0xffffffff), num_coarse_bins)

compute_bin_offsets_function(
    bin_offsets_device,
    sorted_coarse_bin_ids_device,
    numpy.int32(n),
    block=compute_bin_offsets_function_block,
    grid=compute_bin_offsets_function_grid)

# Step 4: zero the histogram and let one block per coarse bin accumulate its
# fine bins in shared memory, then write them out.
pycuda.driver.memset_d32(histogram_device, int(0), nbins)

compute_fine_histogram_function(
    histogram_device,
    sorted_coarse_bin_ids_device,
    sorted_data_device,
    bin_offsets_device,
    numpy.int32(n),
    numpy.int32(num_coarse_bins),
    numpy.int32(nbins),
    numpy.float32(input_min),
    numpy.float32(input_max),
    numpy.float32(input_range),
    block=compute_fine_histogram_function_block,
    grid=compute_fine_histogram_function_grid)

pycuda.driver.memcpy_dtoh(histogram_gpu, histogram_device)

# Validate: totals should match and the normalized difference should be small
# (binning arithmetic differs slightly from numpy's edge computation).
diff = abs(histogram_gpu - histogram_cpu_loop)

print numpy.sum(histogram_gpu)
print numpy.sum(histogram_cpu_loop)
print \
"Difference between GPU and CPU as a percentage of the maximum possible difference (should be less than 1.0%%): %0.2f%%" % \
(100 * (numpy.linalg.norm(diff) / numpy.linalg.norm(histogram_cpu_loop)))

figsize(14,4)
matplotlib.pyplot.subplot(131);
matplotlib.pyplot.plot(histogram_gpu);
matplotlib.pyplot.title("histogram_gpu");
matplotlib.pyplot.subplot(132);
matplotlib.pyplot.plot(histogram_cpu_loop);
matplotlib.pyplot.title("histogram_cpu_loop");
matplotlib.pyplot.subplot(133);
matplotlib.pyplot.plot(diff);
matplotlib.pyplot.title("diff");
# Output: 102400 / 102400 / Difference between GPU and CPU as a percentage of the maximum possible difference (should be less than 1.0%): 0.05%
# Baseline: naive histogram with one global-memory atomicAdd per input value
# (no sorting, no shared memory). Used below to quantify the benefit of the
# shared-memory approach. Note this rebinds source_module; the earlier
# kernels were already fetched via get_function.
source_module = pycuda.compiler.SourceModule \
(
"""
__global__ void compute_fine_histogram_global_atomic(
unsigned int* d_histogram,
float* d_input_data,
int n,
int num_bins,
float min_value,
float max_value,
float range )
{
int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;
if ( global_index_1d < n )
{
float data_value = d_input_data[ global_index_1d ];
int bin_index = min( num_bins - 1, int( ( num_bins * ( data_value - min_value ) ) / range ) );
atomicAdd( d_histogram + bin_index, 1 );
}
}
"""
)
histogram_gpu_global_atomic = numpy.zeros_like(histogram_cpu)
compute_fine_histogram_global_atomic_function = source_module.get_function("compute_fine_histogram_global_atomic")
compute_fine_histogram_global_atomic_function_block = (512,1,1)
num_blocks = int(ceil(float(n) / float(compute_fine_histogram_global_atomic_function_block[0])))
compute_fine_histogram_global_atomic_function_grid = (num_blocks, 1)
pycuda.driver.memcpy_htod(input_data_device, input_data)
input_min = reduce_manager.reduce_min_device(input_data_device, n)
input_max = reduce_manager.reduce_max_device(input_data_device, n)
input_range = input_max - input_min
compute_bin_ids_function(
input_data_device,
coarse_bin_ids_device,
numpy.int32(n),
numpy.int32(num_coarse_bins),
numpy.int32(nbins),
numpy.float32(input_min),
numpy.float32(input_max),
numpy.float32(input_range),
block=compute_bin_ids_function_block,
grid=compute_bin_ids_function_grid)
radix_sort_manager.radix_sort_key_value_ascending_device(coarse_bin_ids_device, input_data_device, sorted_coarse_bin_ids_device, sorted_data_device, n)
pycuda.driver.memset_d32(histogram_device, int(0), nbins)
compute_fine_histogram_global_atomic_function(
histogram_device,
sorted_data_device,
numpy.int32(n),
numpy.int32(nbins),
numpy.float32(input_min),
numpy.float32(input_max),
numpy.float32(input_range),
block=compute_fine_histogram_global_atomic_function_block,
grid=compute_fine_histogram_global_atomic_function_grid)
pycuda.driver.memcpy_dtoh(histogram_gpu_global_atomic, histogram_device)
diff = abs(histogram_gpu_global_atomic - histogram_gpu)
print numpy.sum(histogram_gpu_global_atomic)
print numpy.sum(histogram_gpu)
print \
"Difference between GPU (global atomics) and GPU (shared memory) as a percentage of the maximum possible difference (should be 0.0%%): %0.2f%%" % \
(100 * (numpy.linalg.norm(diff) / numpy.linalg.norm(histogram_gpu)))
figsize(14,4)
matplotlib.pyplot.subplot(131);
matplotlib.pyplot.plot(histogram_gpu_global_atomic);
matplotlib.pyplot.title("histogram_gpu_global_atomic");
matplotlib.pyplot.subplot(132);
matplotlib.pyplot.plot(histogram_gpu);
matplotlib.pyplot.title("histogram_gpu");
matplotlib.pyplot.subplot(133);
matplotlib.pyplot.plot(diff);
matplotlib.pyplot.title("diff");
# Output: 102400 / 102400 / Difference between GPU (global atomics) and GPU (shared memory) as a percentage of the maximum possible difference (should be 0.0%): 0.00%
import sys
import time
if sys.platform == "win32":
print "Using time.clock for benchmarking...\n"
system_timer_function = time.clock
else:
print "Using time.time for benchmarking...\n"
system_timer_function = time.time
num_timing_iterations = 100
print "num_timing_iterations = %d" % num_timing_iterations
# Output: Using time.clock for benchmarking... / num_timing_iterations = 100
# Benchmark the shared-memory histogram pipeline. The host->device copy and
# the min/max reduction are excluded from the timed region; the sort is also
# excluded (see the comment inside the loop), so sorted_* buffers keep their
# contents from the earlier run.
total_time_seconds = 0

for i in range(num_timing_iterations):

    pycuda.driver.memcpy_htod(input_data_device, input_data)

    input_min = reduce_manager.reduce_min_device(input_data_device, n)
    input_max = reduce_manager.reduce_max_device(input_data_device, n)
    input_range = input_max - input_min

    # drain outstanding GPU work so the timer measures only the kernels below
    pycuda.driver.Context.synchronize()
    start_time_seconds = system_timer_function()

    compute_bin_ids_function(
        input_data_device,
        coarse_bin_ids_device,
        numpy.int32(n),
        numpy.int32(num_coarse_bins),
        numpy.int32(nbins),
        numpy.float32(input_min),
        numpy.float32(input_max),
        numpy.float32(input_range),
        block=compute_bin_ids_function_block,
        grid=compute_bin_ids_function_grid)

    #
    # I have excluded sorting in this performance benchmark, even though it is necessary for correctness.
    # In the C++ version, we should replace my unoptimized sort with an optimized sort (e.g. thrust::sort)
    # and include it in the performance benchmark.
    #
    #radix_sort_manager.radix_sort_key_value_ascending_device(coarse_bin_ids_device, input_data_device, sorted_coarse_bin_ids_device, sorted_data_device, n)

    # reset bin offsets to the -1 sentinel before recomputing them
    pycuda.driver.memset_d32(bin_offsets_device, int(0xffffffff), num_coarse_bins)

    compute_bin_offsets_function(
        bin_offsets_device,
        sorted_coarse_bin_ids_device,
        numpy.int32(n),
        block=compute_bin_offsets_function_block,
        grid=compute_bin_offsets_function_grid)

    pycuda.driver.memset_d32(histogram_device, int(0), nbins)

    compute_fine_histogram_function(
        histogram_device,
        sorted_coarse_bin_ids_device,
        sorted_data_device,
        bin_offsets_device,
        numpy.int32(n),
        numpy.int32(num_coarse_bins),
        numpy.int32(nbins),
        numpy.float32(input_min),
        numpy.float32(input_max),
        numpy.float32(input_range),
        block=compute_fine_histogram_function_block,
        grid=compute_fine_histogram_function_grid)

    # wait for the kernels to finish before stopping the timer
    pycuda.driver.Context.synchronize()
    end_time_seconds = system_timer_function()

    elapsed_time_seconds = end_time_seconds - start_time_seconds
    total_time_seconds = total_time_seconds + elapsed_time_seconds

pycuda.driver.memcpy_dtoh(histogram_gpu, histogram_device)

average_time_seconds_gpu_system = total_time_seconds / num_timing_iterations

print "Using system timer for benchmarking (see above)..."
print "Average time elapsed executing GPU (shared memory) over %d runs: %f s" % (num_timing_iterations,average_time_seconds_gpu_system)
print

figsize(4,4)
matplotlib.pyplot.plot(histogram_gpu);
matplotlib.pyplot.title("histogram_gpu");
# Output: Using system timer for benchmarking (see above)... / Average time elapsed executing GPU (shared memory) over 100 runs: 0.001150 s
# Benchmark the global-atomic baseline over the same number of iterations.
# As above, the host->device copy and the min/max reduction are excluded
# from the timed region.
total_time_seconds = 0

for i in range(num_timing_iterations):

    pycuda.driver.memcpy_htod(input_data_device, input_data)

    input_min = reduce_manager.reduce_min_device(input_data_device, n)
    input_max = reduce_manager.reduce_max_device(input_data_device, n)
    input_range = input_max - input_min

    # drain outstanding GPU work so the timer measures only the kernel below
    pycuda.driver.Context.synchronize()
    start_time_seconds = system_timer_function()

    pycuda.driver.memset_d32(histogram_device, int(0), nbins)

    compute_fine_histogram_global_atomic_function(
        histogram_device,
        sorted_data_device,
        numpy.int32(n),
        numpy.int32(nbins),
        numpy.float32(input_min),
        numpy.float32(input_max),
        numpy.float32(input_range),
        block=compute_fine_histogram_global_atomic_function_block,
        grid=compute_fine_histogram_global_atomic_function_grid)

    # wait for the kernel to finish before stopping the timer
    pycuda.driver.Context.synchronize()
    end_time_seconds = system_timer_function()

    elapsed_time_seconds = end_time_seconds - start_time_seconds
    total_time_seconds = total_time_seconds + elapsed_time_seconds

pycuda.driver.memcpy_dtoh(histogram_gpu_global_atomic, histogram_device)

average_time_seconds_gpu_global_atomic_system = total_time_seconds / num_timing_iterations

print "Using system timer for benchmarking (see above)..."
print "Average time elapsed executing GPU (global atomic) over %d runs: %f s" % (num_timing_iterations,average_time_seconds_gpu_global_atomic_system)
print

figsize(4,4)
matplotlib.pyplot.plot(histogram_gpu_global_atomic);
matplotlib.pyplot.title("histogram_gpu_global_atomic");
# Output: Using system timer for benchmarking (see above)... / Average time elapsed executing GPU (global atomic) over 100 runs: 0.012794 s
# Report the relative speedup of the shared-memory histogram over the
# global-atomic baseline.
speedup_gpu = average_time_seconds_gpu_global_atomic_system / average_time_seconds_gpu_system

print "Average GPU (global atomic) time: %f s" % average_time_seconds_gpu_global_atomic_system
print "Average GPU (shared memory) time: %f s" % average_time_seconds_gpu_system
print
print "GPU (shared memory) speedup: %f x" % speedup_gpu
# Output: Average GPU (global atomic) time: 0.012794 s / Average GPU (shared memory) time: 0.001150 s / GPU (shared memory) speedup: 11.121705 x