using PyPlot

import OpenCL
const cl = OpenCL

const BLOCK_SIZE = 16

naive_transpose = "
__kernel void transpose(__global float *a_t,
                        __global float *a,
                        unsigned a_width,
                        unsigned a_height)
{
    int read_idx  = get_global_id(0) + get_global_id(1) * a_width;
    int write_idx = get_global_id(1) + get_global_id(0) * a_height;
    a_t[write_idx] = a[read_idx];
}
"

block_transpose = "
#define BLOCK_SIZE $(BLOCK_SIZE)

__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, BLOCK_SIZE, 1)))
void transpose(__global float *a_t,
               __global float *a,
               unsigned a_width,
               unsigned a_height,
               __local float *a_local)
{
    int base_idx_a   = get_group_id(0) * BLOCK_SIZE +
                       get_group_id(1) * (BLOCK_SIZE * a_width);
    int base_idx_a_t = get_group_id(1) * BLOCK_SIZE +
                       get_group_id(0) * (BLOCK_SIZE * a_height);

    int glob_idx_a   = base_idx_a   + get_local_id(0) + a_width  * get_local_id(1);
    int glob_idx_a_t = base_idx_a_t + get_local_id(0) + a_height * get_local_id(1);

    // stage a BLOCK_SIZE x BLOCK_SIZE tile in local memory ...
    a_local[get_local_id(1) * BLOCK_SIZE + get_local_id(0)] = a[glob_idx_a];

    barrier(CLK_LOCAL_MEM_FENCE);

    // ... then write it back with row/column indices swapped
    a_t[glob_idx_a_t] = a_local[get_local_id(0) * BLOCK_SIZE + get_local_id(1)];
}
"

function enqueue_naive_kernel{T}(queue::cl.CmdQueue, k::cl.Kernel,
                                 dst::cl.Buffer{T}, src::cl.Buffer{T}, dims::Dims)
    h, w = dims
    @assert w % BLOCK_SIZE == 0
    @assert h % BLOCK_SIZE == 0
    cl.set_args!(k, dst, src, uint32(h), uint32(w))
    cl.enqueue_kernel(queue, k, (h, w), (BLOCK_SIZE, BLOCK_SIZE))
end

function enqueue_block_kernel{T}(queue::cl.CmdQueue, k::cl.Kernel,
                                 dst::cl.Buffer{T}, src::cl.Buffer{T}, dims::Dims)
    h, w = dims
    @assert w % BLOCK_SIZE == 0
    @assert h % BLOCK_SIZE == 0
    # pad each row of the local tile by one element to reduce bank conflicts
    lmem = cl.LocalMem(Float32, BLOCK_SIZE * (BLOCK_SIZE + 1))
    cl.set_args!(k, dst, src, uint32(h), uint32(w), lmem)
    cl.enqueue_kernel(queue, k, (h, w), (BLOCK_SIZE, BLOCK_SIZE))
end

function benchmark_transpose()
    gpu_device = first(cl.devices(:gpu))
    ctx = cl.Context(gpu_device)
    for dev in cl.devices(ctx)
        @assert dev[:local_mem_size] > 0
    end

    prg = cl.Program(ctx, source=naive_transpose) |> cl.build!
    naive_kern = cl.Kernel(prg, "transpose")

    prg = cl.Program(ctx, source=block_transpose) |> cl.build!
    block_kernel = cl.Kernel(prg, "transpose")

    # profiling-enabled queue so kernel execution times can be read back
    queue = cl.CmdQueue(ctx, :profile)

    # matrix widths from 2^6 to 2^12, rounded down to a multiple of 32
    array_sizes = [int((2^i) / 32) * 32 for i in 6:12]
    mem_bandwidths = Float32[]

    for (name, method, kern) in (("naive", enqueue_naive_kernel, naive_kern),
                                 ("block", enqueue_block_kernel, block_kernel))
        for s in array_sizes
            src = rand(Float32, (s, s))
            a_buf   = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=src)
            a_t_buf = cl.Buffer(Float32, ctx, :w, sizeof(src))

            # warm up....
            for i in 1:4
                method(queue, kern, a_t_buf, a_buf, size(src))
            end

            # profile...
            count = 20
            events = cl.Event[]
            for i in 1:count
                push!(events, method(queue, kern, a_t_buf, a_buf, size(src)))
            end
            cl.wait(events[end])

            # total kernel time in nanoseconds; each transpose reads and
            # writes the full matrix, hence the factor of 2
            time = sum([evt[:profile_duration] for evt in events])
            mem_bw = 2 * sizeof(src) * count / (time * 1e-9)
            push!(mem_bandwidths, mem_bw)

            @printf("benchmarking %s, array size: %s^2, %.3f GB/s\n",
                    name, s, mem_bw / 1e9)

            # causes problems with ijulia's show...
            #cl.release!(a_buf)
            #cl.release!(a_t_buf)
        end
    end

    # plot results, sleep a little to prevent overlap with show..
    sleep(1)
    clf()
    n_sizes = length(array_sizes)
    naive_bandwidths = mem_bandwidths[1:n_sizes]
    block_bandwidths = mem_bandwidths[n_sizes+1:end]
    plot(array_sizes, naive_bandwidths / 1e9, "o-", label="naive")
    plot(array_sizes, block_bandwidths / 1e9, "o-", label="block")
    title("Transpose Tests for $(gpu_device[:name])")
    xlabel("Matrix width/height N")
    ylabel("Memory Bandwidth [GB/s]")
    legend(loc="best")
    grid(true)
    show()
end

benchmark_transpose()