The purpose of this code is to implement a canonical radix sort algorithm on the GPU.
n = 10000
input_keys = (numpy.random.rand(n) * n).astype(numpy.uint32)
input_values = input_keys.astype(numpy.float32)
print input_keys
print input_values
[9875 764 6555 ..., 5036 7990 9689] [ 9875. 764. 6555. ..., 5036. 7990. 9689.]
import split
num_bits_per_element = 32
split_manager = split.SplitManager(15000)
flag_data = zeros_like(input_keys)
split_keys_old = input_keys.copy()
split_values_old = input_values.copy()
split_keys_new = zeros_like(input_keys)
split_values_new = zeros_like(input_values)
for b in range(num_bits_per_element):
mask = 2**b
for i in range(n):
input_value = split_keys_old[i]
flag_data[i] = not (input_value & mask)
split_manager.split_host(split_keys_old, flag_data, split_keys_new)
split_manager.split_host(split_values_old, flag_data, split_values_new)
split_keys_old, split_keys_new = split_keys_new, split_keys_old
split_values_old, split_values_new = split_values_new, split_values_old
print input_keys
print input_values
print split_keys_old
print split_values_old
print numpy.sort(input_keys)
print
print "Difference between GPU and CPU keys (should be 0.0%%): %f" % numpy.linalg.norm(split_keys_new - numpy.sort(input_keys))
print "Difference between GPU and CPU values (should be 0.0%%): %f" % numpy.linalg.norm(split_values_new - numpy.sort(input_keys).astype(float32))
[9875 764 6555 ..., 5036 7990 9689] [ 9875. 764. 6555. ..., 5036. 7990. 9689.] [ 0 1 2 ..., 9999 9999 9999] [ 0.00000000e+00 1.00000000e+00 2.00000000e+00 ..., 9.99900000e+03 9.99900000e+03 9.99900000e+03] [ 0 1 2 ..., 9999 9999 9999] Difference between GPU and CPU keys (should be 0.0%): 0.000000 Difference between GPU and CPU values (should be 0.0%): 0.000000
import pycuda.autoinit
import pycuda.driver
import pycuda.compiler
import split
source_module = pycuda.compiler.SourceModule \
(
"""
__global__ void radix_sort_compute_flags(
unsigned int* d_input_data,
unsigned int* d_output_data,
int mask,
int n )
{
int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;
if ( global_index_1d < n )
{
unsigned int input_value = d_input_data[ global_index_1d ];
if ( input_value & mask )
{
d_output_data[ global_index_1d ] = 0;
}
else
{
d_output_data[ global_index_1d ] = 1;
}
}
}
"""
)
radix_sort_compute_flags_function = source_module.get_function("radix_sort_compute_flags")
size_of_element_bytes = 4
size_of_element_bits = 32
max_num_elements = 15000
num_bytes = max_num_elements * size_of_element_bytes
input_keys_device = pycuda.driver.mem_alloc(num_bytes)
input_values_device = pycuda.driver.mem_alloc(num_bytes)
flag_data_device = pycuda.driver.mem_alloc(num_bytes)
split_keys_old_device = pycuda.driver.mem_alloc(num_bytes)
split_values_old_device = pycuda.driver.mem_alloc(num_bytes)
split_keys_new_device = pycuda.driver.mem_alloc(num_bytes)
split_values_new_device = pycuda.driver.mem_alloc(num_bytes)
split_manager = split.SplitManager(max_num_elements)
pycuda.driver.memcpy_htod(input_keys_device, input_keys)
pycuda.driver.memcpy_htod(input_values_device, input_values)
pycuda.driver.memcpy_htod(split_keys_old_device, input_keys)
pycuda.driver.memcpy_htod(split_values_old_device, input_values)
pycuda.driver.memset_d32(flag_data_device, 0, n)
pycuda.driver.memset_d32(split_keys_new_device, 0, n)
pycuda.driver.memset_d32(split_values_new_device, 0, n)
for b in range(num_bits_per_element):
mask = numpy.int32(2**numpy.int8(b))
radix_sort_compute_flags_funcion_block = (512,1,1)
num_blocks = int(ceil(float(n) / float(radix_sort_compute_flags_funcion_block[0])))
radix_sort_compute_flags_funcion_grid = (num_blocks, 1)
radix_sort_compute_flags_function(
split_keys_old_device,
flag_data_device,
numpy.int32(mask),
numpy.int32(n),
block=radix_sort_compute_flags_funcion_block,
grid=radix_sort_compute_flags_funcion_grid)
split_manager.split_device(split_keys_old_device, flag_data_device, split_keys_new_device, n)
split_manager.split_device(split_values_old_device, flag_data_device, split_values_new_device, n)
split_keys_old_device, split_keys_new_device = split_keys_new_device, split_keys_old_device
split_values_old_device, split_values_new_device = split_values_new_device, split_values_old_device
pycuda.driver.memcpy_dtoh(split_keys_new, split_keys_old_device)
pycuda.driver.memcpy_dtoh(split_values_new, split_values_old_device)
print input_keys
print input_values
print split_keys_new
print split_values_new
print numpy.sort(input_keys)
print
print "Difference between GPU and CPU keys (should be 0.0%%): %f" % numpy.linalg.norm(split_keys_new - numpy.sort(input_keys))
print "Difference between GPU and CPU values (should be 0.0%%): %f" % numpy.linalg.norm(split_values_new - numpy.sort(input_keys).astype(float32))
[9875 764 6555 ..., 5036 7990 9689] [ 9875. 764. 6555. ..., 5036. 7990. 9689.] [ 0 1 2 ..., 9999 9999 9999] [ 0.00000000e+00 1.00000000e+00 2.00000000e+00 ..., 9.99900000e+03 9.99900000e+03 9.99900000e+03] [ 0 1 2 ..., 9999 9999 9999] Difference between GPU and CPU keys (should be 0.0%): 0.000000 Difference between GPU and CPU values (should be 0.0%): 0.000000
-c:12: RuntimeWarning: invalid value encountered in power
import math
import numpy
import pycuda.autoinit
import pycuda.driver
import pycuda.compiler
import split
class RadixSortManager:
source_module = pycuda.compiler.SourceModule \
(
"""
__global__ void radix_sort_compute_flags_ascending(
unsigned int* d_input_data,
unsigned int* d_output_data,
int mask,
int n )
{
int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;
if ( global_index_1d < n )
{
unsigned int input_value = d_input_data[ global_index_1d ];
if ( input_value & mask )
{
d_output_data[ global_index_1d ] = 0;
}
else
{
d_output_data[ global_index_1d ] = 1;
}
}
}
__global__ void radix_sort_compute_flags_descending(
unsigned int* d_input_data,
unsigned int* d_output_data,
int mask,
int n )
{
int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;
if ( global_index_1d < n )
{
unsigned int input_value = d_input_data[ global_index_1d ];
if ( input_value & mask )
{
d_output_data[ global_index_1d ] = 1;
}
else
{
d_output_data[ global_index_1d ] = 0;
}
}
}
"""
)
_radix_sort_compute_flags_ascending_function = source_module.get_function("radix_sort_compute_flags_ascending")
_radix_sort_compute_flags_descending_function = source_module.get_function("radix_sort_compute_flags_descending")
_size_of_element_bytes = 4
_size_of_element_bits = 32
_max_num_elements = -1
_num_bytes = -1
_input_keys_device = -1
_input_values_device = -1
_flag_data_device = -1
_split_keys_old_device = -1
_split_values_old_device = -1
_split_keys_new_device = -1
_split_values_new_device = -1
_split_manager = -1
def __init__(self, max_num_elements):
self._max_num_elements = max_num_elements
self._num_bytes = self._max_num_elements * self._size_of_element_bytes
self._input_keys_device = pycuda.driver.mem_alloc(self._num_bytes)
self._input_values_device = pycuda.driver.mem_alloc(self._num_bytes)
self._flag_data_device = pycuda.driver.mem_alloc(self._num_bytes)
self._split_keys_old_device = pycuda.driver.mem_alloc(self._num_bytes)
self._split_values_old_device = pycuda.driver.mem_alloc(self._num_bytes)
self._output_keys_device = pycuda.driver.mem_alloc(self._num_bytes)
self._output_values_device = pycuda.driver.mem_alloc(self._num_bytes)
self._split_manager = split.SplitManager(max_num_elements)
def __copy_input_htod_key(self, input_keys_host):
assert input_keys_host.shape[0] <= self._max_num_elements
assert \
input_keys_host.dtype == numpy.uint32 or \
input_keys_host.dtype == numpy.int32 or \
input_keys_host.dtype == numpy.float32
pycuda.driver.memcpy_htod(self._input_keys_device, input_keys_host)
def __copy_input_htod_key_value(self, input_keys_host, input_values_host):
assert input_keys_host.shape[0] == input_values_host.shape[0]
assert input_keys_host.shape[0] <= self._max_num_elements
assert \
input_keys_host.dtype == numpy.uint32 or \
input_keys_host.dtype == numpy.int32 or \
input_keys_host.dtype == numpy.float32
assert \
input_values_host.dtype == numpy.uint32 or \
input_values_host.dtype == numpy.int32 or \
input_values_host.dtype == numpy.float32
pycuda.driver.memcpy_htod(self._input_keys_device, input_keys_host)
pycuda.driver.memcpy_htod(self._input_values_device, input_values_host)
def __radix_sort_key(self, input_keys_device, output_keys_device, num_elements, compute_flags_function):
assert num_elements <= self._max_num_elements
self._n = num_elements
pycuda.driver.memcpy_dtod(self._split_keys_old_device, input_keys_device, self._n * self._size_of_element_bytes)
pycuda.driver.memset_d32(self._flag_data_device, 0, self._n)
pycuda.driver.memset_d32(output_keys_device, 0, self._n)
for b in range(self._size_of_element_bits):
mask = numpy.int32(2**numpy.int8(b))
radix_sort_compute_flags_funcion_block = (512,1,1)
num_blocks = int(math.ceil(float(self._n) / float(radix_sort_compute_flags_funcion_block[0])))
radix_sort_compute_flags_funcion_grid = (num_blocks, 1)
compute_flags_function(
self._split_keys_old_device,
self._flag_data_device,
numpy.int32(mask),
numpy.int32(self._n),
block=radix_sort_compute_flags_funcion_block,
grid=radix_sort_compute_flags_funcion_grid)
self._split_manager.split_device(self._split_keys_old_device, self._flag_data_device, output_keys_device, self._n)
self._split_keys_old_device, output_keys_device = output_keys_device, self._split_keys_old_device
pycuda.driver.memcpy_dtod(output_keys_device, self._split_keys_old_device, self._n * self._size_of_element_bytes)
def __radix_sort_key_value(self, input_keys_device, input_values_device, output_keys_device, output_values_device, num_elements, compute_flags_function):
assert num_elements <= self._max_num_elements
self._n = num_elements
pycuda.driver.memcpy_dtod(self._split_keys_old_device, input_keys_device, self._n * self._size_of_element_bytes)
pycuda.driver.memcpy_dtod(self._split_values_old_device, input_values_device, self._n * self._size_of_element_bytes)
pycuda.driver.memset_d32(self._flag_data_device, 0, self._n)
pycuda.driver.memset_d32(output_keys_device, 0, self._n)
pycuda.driver.memset_d32(output_values_device, 0, self._n)
for b in range(self._size_of_element_bits):
mask = numpy.int32(2**numpy.int8(b))
radix_sort_compute_flags_funcion_block = (512,1,1)
num_blocks = int(math.ceil(float(self._n) / float(radix_sort_compute_flags_funcion_block[0])))
radix_sort_compute_flags_funcion_grid = (num_blocks, 1)
compute_flags_function(
self._split_keys_old_device,
self._flag_data_device,
numpy.int32(mask),
numpy.int32(self._n),
block=radix_sort_compute_flags_funcion_block,
grid=radix_sort_compute_flags_funcion_grid)
self._split_manager.split_device(self._split_keys_old_device, self._flag_data_device, output_keys_device, self._n)
self._split_manager.split_device(self._split_values_old_device, self._flag_data_device, output_values_device, self._n)
self._split_keys_old_device, output_keys_device = output_keys_device, self._split_keys_old_device
self._split_values_old_device, output_values_device = output_values_device, self._split_values_old_device
pycuda.driver.memcpy_dtod(output_keys_device, self._split_keys_old_device, self._n * self._size_of_element_bytes)
pycuda.driver.memcpy_dtod(output_values_device, self._split_values_old_device, self._n * self._size_of_element_bytes)
def __copy_output_dtoh_key(self, output_keys_host):
pycuda.driver.memcpy_dtoh(output_keys_host, self._output_keys_device)
def __copy_output_dtoh_key_value(self, output_keys_host, output_values_host):
pycuda.driver.memcpy_dtoh(output_keys_host, self._output_keys_device)
pycuda.driver.memcpy_dtoh(output_values_host, self._output_values_device)
def radix_sort_key_ascending_device(self, input_keys_device, output_keys_device, num_elements):
self.__radix_sort_key(
input_keys_device,
output_keys_device,
num_elements,
self._radix_sort_compute_flags_ascending_function)
def radix_sort_key_descending_device(self, input_keys_device, output_keys_device, num_elements):
self.__radix_sort_key(
input_keys_device,
output_keys_device,
num_elements,
self._radix_sort_compute_flags_descending_function)
def radix_sort_key_ascending_host(self, input_keys_host, output_keys_host):
num_elements = input_keys_host.shape[0]
self.__copy_input_htod_key(input_keys_host)
self.radix_sort_key_ascending_device(self._input_keys_device, self._output_keys_device, num_elements)
self.__copy_output_dtoh_key(output_keys_host)
def radix_sort_key_descending_host(self, input_keys_host, output_keys_host):
num_elements = input_keys_host.shape[0]
self.__copy_input_htod_key(input_keys_host)
self.radix_sort_key_descending_device(self._input_keys_device, self._output_keys_device, num_elements)
self.__copy_output_dtoh_key(output_keys_host)
def radix_sort_key_value_ascending_device(self, input_keys_device, input_values_device, output_keys_device, output_values_device, num_elements):
self.__radix_sort_key_value(
input_keys_device,
input_values_device,
output_keys_device,
output_values_device,
num_elements,
self._radix_sort_compute_flags_ascending_function)
def radix_sort_key_value_descending_device(self, input_keys_device, input_values_device, output_keys_device, output_values_device, num_elements):
self.__radix_sort_key_value(
input_keys_device,
input_values_device,
output_keys_device,
output_values_device,
num_elements,
self._radix_sort_compute_flags_descending_function)
def radix_sort_key_value_ascending_host(self, input_keys_host, input_values_host, output_keys_host, output_values_host):
num_elements = input_keys_host.shape[0]
self.__copy_input_htod_key_value(input_keys_host, input_values_host)
self.radix_sort_key_value_ascending_device(self._input_keys_device, self._input_values_device, self._output_keys_device, self._output_values_device, num_elements)
self.__copy_output_dtoh_key_value(output_keys_host, output_values_host)
def radix_sort_key_value_descending_host(self, input_keys_host, input_values_host, output_keys_host, output_values_host):
num_elements = input_keys_host.shape[0]
self.__copy_input_htod_key_value(input_keys_host, input_values_host)
self.radix_sort_key_value_descending_device(self._input_keys_device, self._input_values_device, self._output_keys_device, self._output_values_device, num_elements)
self.__copy_output_dtoh_key_value(output_keys_host, output_values_host)
n = 10000
input_keys = (numpy.random.rand(n) * n).astype(numpy.uint32)
input_values = input_keys.astype(numpy.float32)
output_keys = numpy.zeros_like(input_keys)
output_values = numpy.zeros_like(input_values)
radix_sort_manager = RadixSortManager(15000)
radix_sort_manager.radix_sort_key_value_ascending_host(input_keys, input_values, output_keys, output_values)
print input_keys
print input_values
print output_keys
print output_values
print
print "Difference between GPU and CPU keys (should be 0.0%%): %f" % numpy.linalg.norm(output_keys - numpy.sort(input_keys))
print "Difference between GPU and CPU values (should be 0.0%%): %f" % numpy.linalg.norm(output_values - numpy.sort(input_keys).astype(float32))
[3229 8213 9674 ..., 5248 4543 1902] [ 3229. 8213. 9674. ..., 5248. 4543. 1902.] [ 0 5 5 ..., 9997 9997 9999] [ 0.00000000e+00 5.00000000e+00 5.00000000e+00 ..., 9.99700000e+03 9.99700000e+03 9.99900000e+03] Difference between GPU and CPU keys (should be 0.0%): 0.000000 Difference between GPU and CPU values (should be 0.0%): 0.000000
-c:172: RuntimeWarning: invalid value encountered in power
n = 10
input_keys = (numpy.random.rand(n) * n).astype(numpy.uint32)
input_values = input_keys.astype(numpy.float32)
output_keys = numpy.zeros_like(input_keys)
output_values = numpy.zeros_like(input_values)
radix_sort_manager = RadixSortManager(15000)
radix_sort_manager.radix_sort_key_value_ascending_host(input_keys, input_values, output_keys, output_values)
print input_keys
print input_values
print output_keys
print output_values
print
n = 10
input_keys = (numpy.random.rand(n) * n).astype(numpy.uint32)
input_values = input_keys.astype(numpy.float32)
output_keys = numpy.zeros_like(input_keys)
output_values = numpy.zeros_like(input_values)
radix_sort_manager = RadixSortManager(15000)
radix_sort_manager.radix_sort_key_value_descending_host(input_keys, input_values, output_keys, output_values)
print input_keys
print input_values
print output_keys
print output_values
print
n = 10
input_keys = (numpy.random.rand(n) * n).astype(numpy.uint32)
input_values = input_keys.astype(numpy.float32)
output_keys = numpy.zeros_like(input_keys)
output_values = numpy.zeros_like(input_values)
radix_sort_manager = RadixSortManager(15000)
radix_sort_manager.radix_sort_key_ascending_host(input_keys, output_keys)
print input_keys
print output_keys
print
n = 10
input_keys = (numpy.random.rand(n) * n).astype(numpy.uint32)
input_values = input_keys.astype(numpy.float32)
output_keys = numpy.zeros_like(input_keys)
output_values = numpy.zeros_like(input_values)
radix_sort_manager = RadixSortManager(15000)
radix_sort_manager.radix_sort_key_descending_host(input_keys, output_keys)
print input_keys
print output_keys
print
[9 8 8 2 4 3 3 8 4 6] [ 9. 8. 8. 2. 4. 3. 3. 8. 4. 6.] [2 3 3 4 4 6 8 8 8 9] [ 2. 3. 3. 4. 4. 6. 8. 8. 8. 9.] [8 9 8 4 2 1 1 8 8 5] [ 8. 9. 8. 4. 2. 1. 1. 8. 8. 5.] [9 8 8 8 8 5 4 2 1 1] [ 9. 8. 8. 8. 8. 5. 4. 2. 1. 1.] [7 6 8 7 4 4 9 7 8 5] [4 4 5 6 7 7 7 8 8 9] [6 5 8 6 5 0 6 0 6 3] [8 6 6 6 6 5 5 3 0 0]
-c:137: RuntimeWarning: invalid value encountered in power
n = 10000
input_keys = (numpy.random.rand(n) * n).astype(numpy.float32)
output_keys = numpy.zeros_like(input_keys)
radix_sort_manager = RadixSortManager(15000)
radix_sort_manager.radix_sort_key_ascending_host(input_keys, output_keys)
print input_keys
print output_keys
print
print "Difference between GPU and CPU keys (should be 0.0%%): %f" % numpy.linalg.norm(output_keys - numpy.sort(input_keys))
[ 4938.57958984 9086.15136719 9926.49511719 ..., 9953.21875 5300.93457031 2960.62329102] [ 1.85716784e+00 2.97219419e+00 3.74738264e+00 ..., 9.99855371e+03 9.99873047e+03 9.99973535e+03] Difference between GPU and CPU keys (should be 0.0%): 0.000000