Notebook

Radix Sort¶

The purpose of this code is to implement a canonical radix sort algorithm on the GPU.

generate data¶

In [1]:

n                    = 10000
input_keys           = (numpy.random.rand(n) * n).astype(numpy.uint32)
input_values         = input_keys.astype(numpy.float32)

print input_keys
print input_values

[9875  764 6555 ..., 5036 7990 9689]
[ 9875.   764.  6555. ...,  5036.  7990.  9689.]

perform mostly CPU radix sort¶

In [2]:

import split

num_bits_per_element = 32
split_manager        = split.SplitManager(15000)

flag_data            = zeros_like(input_keys)
split_keys_old       = input_keys.copy()
split_values_old     = input_values.copy()
split_keys_new       = zeros_like(input_keys)
split_values_new     = zeros_like(input_values)

for b in range(num_bits_per_element):

    mask = 2**b

    for i in range(n):
        input_value  = split_keys_old[i]
        flag_data[i] = not (input_value & mask)

    split_manager.split_host(split_keys_old,   flag_data, split_keys_new)
    split_manager.split_host(split_values_old, flag_data, split_values_new)

    split_keys_old,   split_keys_new   = split_keys_new, split_keys_old
    split_values_old, split_values_new = split_values_new, split_values_old
    
    
    
print input_keys
print input_values
print split_keys_old
print split_values_old
print numpy.sort(input_keys)
print
    
print "Difference between GPU and CPU keys   (should be 0.0%%): %f" % numpy.linalg.norm(split_keys_new   - numpy.sort(input_keys))
print "Difference between GPU and CPU values (should be 0.0%%): %f" % numpy.linalg.norm(split_values_new - numpy.sort(input_keys).astype(float32))

[9875  764 6555 ..., 5036 7990 9689]
[ 9875.   764.  6555. ...,  5036.  7990.  9689.]
[   0    1    2 ..., 9999 9999 9999]
[  0.00000000e+00   1.00000000e+00   2.00000000e+00 ...,   9.99900000e+03
   9.99900000e+03   9.99900000e+03]
[   0    1    2 ..., 9999 9999 9999]

Difference between GPU and CPU keys   (should be 0.0%): 0.000000
Difference between GPU and CPU values (should be 0.0%): 0.000000

perform fully GPU radix sort¶

In [3]:

import pycuda.autoinit
import pycuda.driver
import pycuda.compiler
import split

source_module = pycuda.compiler.SourceModule \
(
"""
__global__ void radix_sort_compute_flags(
    unsigned int* d_input_data,
    unsigned int* d_output_data,
    int mask,
    int n )
{
    int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;

    if ( global_index_1d < n )
    {
        unsigned int input_value = d_input_data[ global_index_1d ];

        if ( input_value & mask )
        {
            d_output_data[ global_index_1d ] = 0;
        }
        else
        {
            d_output_data[ global_index_1d ] = 1;
        }
    }
}
"""
)

radix_sort_compute_flags_function = source_module.get_function("radix_sort_compute_flags")

size_of_element_bytes   = 4
size_of_element_bits    = 32
max_num_elements        = 15000
num_bytes               = max_num_elements * size_of_element_bytes

input_keys_device       = pycuda.driver.mem_alloc(num_bytes)
input_values_device     = pycuda.driver.mem_alloc(num_bytes)
flag_data_device        = pycuda.driver.mem_alloc(num_bytes)
split_keys_old_device   = pycuda.driver.mem_alloc(num_bytes)
split_values_old_device = pycuda.driver.mem_alloc(num_bytes)
split_keys_new_device   = pycuda.driver.mem_alloc(num_bytes)
split_values_new_device = pycuda.driver.mem_alloc(num_bytes)

split_manager           = split.SplitManager(max_num_elements)

In [4]:

pycuda.driver.memcpy_htod(input_keys_device,       input_keys)
pycuda.driver.memcpy_htod(input_values_device,     input_values)
pycuda.driver.memcpy_htod(split_keys_old_device,   input_keys)
pycuda.driver.memcpy_htod(split_values_old_device, input_values)

pycuda.driver.memset_d32(flag_data_device,             0, n)
pycuda.driver.memset_d32(split_keys_new_device,   0, n)
pycuda.driver.memset_d32(split_values_new_device, 0, n)

for b in range(num_bits_per_element):

    mask = numpy.int32(2**numpy.int8(b))

    radix_sort_compute_flags_funcion_block = (512,1,1)
    num_blocks                             = int(ceil(float(n) / float(radix_sort_compute_flags_funcion_block[0])))
    radix_sort_compute_flags_funcion_grid  = (num_blocks, 1)

    radix_sort_compute_flags_function(
        split_keys_old_device,
        flag_data_device,
        numpy.int32(mask),
        numpy.int32(n),
        block=radix_sort_compute_flags_funcion_block,
        grid=radix_sort_compute_flags_funcion_grid)

    split_manager.split_device(split_keys_old_device,   flag_data_device, split_keys_new_device,   n)
    split_manager.split_device(split_values_old_device, flag_data_device, split_values_new_device, n)

    split_keys_old_device,   split_keys_new_device   = split_keys_new_device, split_keys_old_device
    split_values_old_device, split_values_new_device = split_values_new_device, split_values_old_device
    
pycuda.driver.memcpy_dtoh(split_keys_new,   split_keys_old_device)
pycuda.driver.memcpy_dtoh(split_values_new, split_values_old_device)



print input_keys
print input_values
print split_keys_new
print split_values_new
print numpy.sort(input_keys)
print

print "Difference between GPU and CPU keys   (should be 0.0%%): %f" % numpy.linalg.norm(split_keys_new   - numpy.sort(input_keys))
print "Difference between GPU and CPU values (should be 0.0%%): %f" % numpy.linalg.norm(split_values_new - numpy.sort(input_keys).astype(float32))

[9875  764 6555 ..., 5036 7990 9689]
[ 9875.   764.  6555. ...,  5036.  7990.  9689.]
[   0    1    2 ..., 9999 9999 9999]
[  0.00000000e+00   1.00000000e+00   2.00000000e+00 ...,   9.99900000e+03
   9.99900000e+03   9.99900000e+03]
[   0    1    2 ..., 9999 9999 9999]

Difference between GPU and CPU keys   (should be 0.0%): 0.000000
Difference between GPU and CPU values (should be 0.0%): 0.000000

-c:12: RuntimeWarning: invalid value encountered in power

define GPU radix sort class¶

In [5]:

import math
import numpy
import pycuda.autoinit
import pycuda.driver
import pycuda.compiler
import split

class RadixSortManager:
    
    source_module = pycuda.compiler.SourceModule \
    (
    """
    __global__ void radix_sort_compute_flags_ascending(
        unsigned int* d_input_data,
        unsigned int* d_output_data,
        int mask,
        int n )
    {
        int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;

        if ( global_index_1d < n )
        {
            unsigned int input_value = d_input_data[ global_index_1d ];

            if ( input_value & mask )
            {
                d_output_data[ global_index_1d ] = 0;
            }
            else
            {
                d_output_data[ global_index_1d ] = 1;
            }
        }
    }

    __global__ void radix_sort_compute_flags_descending(
        unsigned int* d_input_data,
        unsigned int* d_output_data,
        int mask,
        int n )
    {
        int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;

        if ( global_index_1d < n )
        {
            unsigned int input_value = d_input_data[ global_index_1d ];

            if ( input_value & mask )
            {
                d_output_data[ global_index_1d ] = 1;
            }
            else
            {
                d_output_data[ global_index_1d ] = 0;
            }
        }
    }    
    """
    )

    _radix_sort_compute_flags_ascending_function  = source_module.get_function("radix_sort_compute_flags_ascending")
    _radix_sort_compute_flags_descending_function = source_module.get_function("radix_sort_compute_flags_descending")

    _size_of_element_bytes   = 4
    _size_of_element_bits    = 32
    
    _max_num_elements        = -1
    _num_bytes               = -1

    _input_keys_device       = -1
    _input_values_device     = -1
    _flag_data_device        = -1
    _split_keys_old_device   = -1
    _split_values_old_device = -1
    _split_keys_new_device   = -1
    _split_values_new_device = -1

    _split_manager           = -1
    
    def __init__(self, max_num_elements):
        
        self._max_num_elements        = max_num_elements
        self._num_bytes               = self._max_num_elements * self._size_of_element_bytes

        self._input_keys_device       = pycuda.driver.mem_alloc(self._num_bytes)
        self._input_values_device     = pycuda.driver.mem_alloc(self._num_bytes)
        self._flag_data_device        = pycuda.driver.mem_alloc(self._num_bytes)
        self._split_keys_old_device   = pycuda.driver.mem_alloc(self._num_bytes)
        self._split_values_old_device = pycuda.driver.mem_alloc(self._num_bytes)
        self._output_keys_device      = pycuda.driver.mem_alloc(self._num_bytes)
        self._output_values_device    = pycuda.driver.mem_alloc(self._num_bytes)

        self._split_manager           = split.SplitManager(max_num_elements)

    def __copy_input_htod_key(self, input_keys_host):

        assert input_keys_host.shape[0] <= self._max_num_elements

        assert \
            input_keys_host.dtype == numpy.uint32  or \
            input_keys_host.dtype == numpy.int32   or \
            input_keys_host.dtype == numpy.float32

        pycuda.driver.memcpy_htod(self._input_keys_device,   input_keys_host)

    def __copy_input_htod_key_value(self, input_keys_host, input_values_host):

        assert input_keys_host.shape[0] == input_values_host.shape[0]
        assert input_keys_host.shape[0] <= self._max_num_elements

        assert \
            input_keys_host.dtype == numpy.uint32  or \
            input_keys_host.dtype == numpy.int32   or \
            input_keys_host.dtype == numpy.float32

        assert \
            input_values_host.dtype == numpy.uint32  or \
            input_values_host.dtype == numpy.int32   or \
            input_values_host.dtype == numpy.float32

        pycuda.driver.memcpy_htod(self._input_keys_device,   input_keys_host)
        pycuda.driver.memcpy_htod(self._input_values_device, input_values_host)

    def __radix_sort_key(self, input_keys_device, output_keys_device, num_elements, compute_flags_function):

        assert num_elements <= self._max_num_elements

        self._n = num_elements

        pycuda.driver.memcpy_dtod(self._split_keys_old_device, input_keys_device, self._n * self._size_of_element_bytes)
        
        pycuda.driver.memset_d32(self._flag_data_device, 0, self._n)
        pycuda.driver.memset_d32(output_keys_device,     0, self._n)

        for b in range(self._size_of_element_bits):

            mask = numpy.int32(2**numpy.int8(b))

            radix_sort_compute_flags_funcion_block = (512,1,1)
            num_blocks                             = int(math.ceil(float(self._n) / float(radix_sort_compute_flags_funcion_block[0])))
            radix_sort_compute_flags_funcion_grid  = (num_blocks, 1)
            
            compute_flags_function(
                self._split_keys_old_device,
                self._flag_data_device,
                numpy.int32(mask),
                numpy.int32(self._n),
                block=radix_sort_compute_flags_funcion_block,
                grid=radix_sort_compute_flags_funcion_grid)

            self._split_manager.split_device(self._split_keys_old_device, self._flag_data_device, output_keys_device, self._n)

            self._split_keys_old_device, output_keys_device = output_keys_device, self._split_keys_old_device

        pycuda.driver.memcpy_dtod(output_keys_device, self._split_keys_old_device, self._n * self._size_of_element_bytes)
        
    def __radix_sort_key_value(self, input_keys_device, input_values_device, output_keys_device, output_values_device, num_elements, compute_flags_function):

        assert num_elements <= self._max_num_elements

        self._n = num_elements

        pycuda.driver.memcpy_dtod(self._split_keys_old_device,   input_keys_device,   self._n * self._size_of_element_bytes)
        pycuda.driver.memcpy_dtod(self._split_values_old_device, input_values_device, self._n * self._size_of_element_bytes)
        
        pycuda.driver.memset_d32(self._flag_data_device, 0, self._n)
        pycuda.driver.memset_d32(output_keys_device,     0, self._n)
        pycuda.driver.memset_d32(output_values_device,   0, self._n)

        for b in range(self._size_of_element_bits):

            mask = numpy.int32(2**numpy.int8(b))

            radix_sort_compute_flags_funcion_block = (512,1,1)
            num_blocks                             = int(math.ceil(float(self._n) / float(radix_sort_compute_flags_funcion_block[0])))
            radix_sort_compute_flags_funcion_grid  = (num_blocks, 1)
            
            compute_flags_function(
                self._split_keys_old_device,
                self._flag_data_device,
                numpy.int32(mask),
                numpy.int32(self._n),
                block=radix_sort_compute_flags_funcion_block,
                grid=radix_sort_compute_flags_funcion_grid)

            self._split_manager.split_device(self._split_keys_old_device,   self._flag_data_device, output_keys_device,   self._n)
            self._split_manager.split_device(self._split_values_old_device, self._flag_data_device, output_values_device, self._n)

            self._split_keys_old_device,   output_keys_device   = output_keys_device,   self._split_keys_old_device
            self._split_values_old_device, output_values_device = output_values_device, self._split_values_old_device

        pycuda.driver.memcpy_dtod(output_keys_device,   self._split_keys_old_device,   self._n * self._size_of_element_bytes)
        pycuda.driver.memcpy_dtod(output_values_device, self._split_values_old_device, self._n * self._size_of_element_bytes)

    def __copy_output_dtoh_key(self, output_keys_host):

        pycuda.driver.memcpy_dtoh(output_keys_host, self._output_keys_device)

    def __copy_output_dtoh_key_value(self, output_keys_host, output_values_host):

        pycuda.driver.memcpy_dtoh(output_keys_host,   self._output_keys_device)
        pycuda.driver.memcpy_dtoh(output_values_host, self._output_values_device)

    def radix_sort_key_ascending_device(self, input_keys_device, output_keys_device, num_elements):

        self.__radix_sort_key(
            input_keys_device,
            output_keys_device,
            num_elements,
            self._radix_sort_compute_flags_ascending_function)

    def radix_sort_key_descending_device(self, input_keys_device, output_keys_device, num_elements):

        self.__radix_sort_key(
            input_keys_device,
            output_keys_device,
            num_elements,
            self._radix_sort_compute_flags_descending_function)

    def radix_sort_key_ascending_host(self, input_keys_host, output_keys_host):

        num_elements = input_keys_host.shape[0]
        
        self.__copy_input_htod_key(input_keys_host)
        self.radix_sort_key_ascending_device(self._input_keys_device, self._output_keys_device, num_elements)
        self.__copy_output_dtoh_key(output_keys_host)

    def radix_sort_key_descending_host(self, input_keys_host, output_keys_host):

        num_elements = input_keys_host.shape[0]
        
        self.__copy_input_htod_key(input_keys_host)
        self.radix_sort_key_descending_device(self._input_keys_device, self._output_keys_device, num_elements)
        self.__copy_output_dtoh_key(output_keys_host)

    def radix_sort_key_value_ascending_device(self, input_keys_device, input_values_device, output_keys_device, output_values_device, num_elements):

        self.__radix_sort_key_value(
            input_keys_device,
            input_values_device,
            output_keys_device,
            output_values_device,
            num_elements,
            self._radix_sort_compute_flags_ascending_function)

    def radix_sort_key_value_descending_device(self, input_keys_device, input_values_device, output_keys_device, output_values_device, num_elements):

        self.__radix_sort_key_value(
            input_keys_device,
            input_values_device,
            output_keys_device,
            output_values_device,
            num_elements,
            self._radix_sort_compute_flags_descending_function)

    def radix_sort_key_value_ascending_host(self, input_keys_host, input_values_host, output_keys_host, output_values_host):

        num_elements = input_keys_host.shape[0]
        
        self.__copy_input_htod_key_value(input_keys_host, input_values_host)
        self.radix_sort_key_value_ascending_device(self._input_keys_device, self._input_values_device, self._output_keys_device, self._output_values_device, num_elements)
        self.__copy_output_dtoh_key_value(output_keys_host, output_values_host)

    def radix_sort_key_value_descending_host(self, input_keys_host, input_values_host, output_keys_host, output_values_host):

        num_elements = input_keys_host.shape[0]
        
        self.__copy_input_htod_key_value(input_keys_host, input_values_host)
        self.radix_sort_key_value_descending_device(self._input_keys_device, self._input_values_device, self._output_keys_device, self._output_values_device, num_elements)
        self.__copy_output_dtoh_key_value(output_keys_host, output_values_host)

invoke GPU radix sort class¶

In [6]:

n                    = 10000
input_keys           = (numpy.random.rand(n) * n).astype(numpy.uint32)
input_values         = input_keys.astype(numpy.float32)
output_keys          = numpy.zeros_like(input_keys)
output_values        = numpy.zeros_like(input_values)
radix_sort_manager   = RadixSortManager(15000)

radix_sort_manager.radix_sort_key_value_ascending_host(input_keys, input_values, output_keys, output_values)

print input_keys
print input_values
print output_keys
print output_values
print
print "Difference between GPU and CPU keys   (should be 0.0%%): %f" % numpy.linalg.norm(output_keys   - numpy.sort(input_keys))
print "Difference between GPU and CPU values (should be 0.0%%): %f" % numpy.linalg.norm(output_values - numpy.sort(input_keys).astype(float32))

[3229 8213 9674 ..., 5248 4543 1902]
[ 3229.  8213.  9674. ...,  5248.  4543.  1902.]
[   0    5    5 ..., 9997 9997 9999]
[  0.00000000e+00   5.00000000e+00   5.00000000e+00 ...,   9.99700000e+03
   9.99700000e+03   9.99900000e+03]

Difference between GPU and CPU keys   (should be 0.0%): 0.000000
Difference between GPU and CPU values (should be 0.0%): 0.000000

-c:172: RuntimeWarning: invalid value encountered in power

ascending vs. descending and key-value vs. key-only¶

In [7]:

n                    = 10
input_keys           = (numpy.random.rand(n) * n).astype(numpy.uint32)
input_values         = input_keys.astype(numpy.float32)
output_keys          = numpy.zeros_like(input_keys)
output_values        = numpy.zeros_like(input_values)
radix_sort_manager   = RadixSortManager(15000)

radix_sort_manager.radix_sort_key_value_ascending_host(input_keys, input_values, output_keys, output_values)

print input_keys
print input_values
print output_keys
print output_values
print



n                    = 10
input_keys           = (numpy.random.rand(n) * n).astype(numpy.uint32)
input_values         = input_keys.astype(numpy.float32)
output_keys          = numpy.zeros_like(input_keys)
output_values        = numpy.zeros_like(input_values)
radix_sort_manager   = RadixSortManager(15000)

radix_sort_manager.radix_sort_key_value_descending_host(input_keys, input_values, output_keys, output_values)

print input_keys
print input_values
print output_keys
print output_values
print



n                    = 10
input_keys           = (numpy.random.rand(n) * n).astype(numpy.uint32)
input_values         = input_keys.astype(numpy.float32)
output_keys          = numpy.zeros_like(input_keys)
output_values        = numpy.zeros_like(input_values)
radix_sort_manager   = RadixSortManager(15000)

radix_sort_manager.radix_sort_key_ascending_host(input_keys, output_keys)

print input_keys
print output_keys
print



n                    = 10
input_keys           = (numpy.random.rand(n) * n).astype(numpy.uint32)
input_values         = input_keys.astype(numpy.float32)
output_keys          = numpy.zeros_like(input_keys)
output_values        = numpy.zeros_like(input_values)
radix_sort_manager   = RadixSortManager(15000)

radix_sort_manager.radix_sort_key_descending_host(input_keys, output_keys)

print input_keys
print output_keys
print

[9 8 8 2 4 3 3 8 4 6]
[ 9.  8.  8.  2.  4.  3.  3.  8.  4.  6.]
[2 3 3 4 4 6 8 8 8 9]
[ 2.  3.  3.  4.  4.  6.  8.  8.  8.  9.]

[8 9 8 4 2 1 1 8 8 5]
[ 8.  9.  8.  4.  2.  1.  1.  8.  8.  5.]
[9 8 8 8 8 5 4 2 1 1]
[ 9.  8.  8.  8.  8.  5.  4.  2.  1.  1.]

[7 6 8 7 4 4 9 7 8 5]
[4 4 5 6 7 7 7 8 8 9]

[6 5 8 6 5 0 6 0 6 3]
[8 6 6 6 6 5 5 3 0 0]

-c:137: RuntimeWarning: invalid value encountered in power

floating point keys¶

In [8]:

n                    = 10000
input_keys           = (numpy.random.rand(n) * n).astype(numpy.float32)
output_keys          = numpy.zeros_like(input_keys)
radix_sort_manager   = RadixSortManager(15000)

radix_sort_manager.radix_sort_key_ascending_host(input_keys, output_keys)

print input_keys
print output_keys
print
print "Difference between GPU and CPU keys (should be 0.0%%): %f" % numpy.linalg.norm(output_keys   - numpy.sort(input_keys))

[ 4938.57958984  9086.15136719  9926.49511719 ...,  9953.21875
  5300.93457031  2960.62329102]
[  1.85716784e+00   2.97219419e+00   3.74738264e+00 ...,   9.99855371e+03
   9.99873047e+03   9.99973535e+03]

Difference between GPU and CPU keys (should be 0.0%): 0.000000