using PyPlot

import OpenCL
const cl = OpenCL

const BLOCK_SIZE = 16

naive_transpose = "
__kernel void transpose(__global float *a_t,
                        __global float *a,
                        unsigned a_width,
                        unsigned a_height)
{
    int read_idx  = get_global_id(0) + get_global_id(1) * a_width;
    int write_idx = get_global_id(1) + get_global_id(0) * a_height;
    a_t[write_idx] = a[read_idx];
}
"

block_transpose = "
#define BLOCK_SIZE $(BLOCK_SIZE)

__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, BLOCK_SIZE, 1)))
void transpose(__global float *a_t,
               __global float *a,
               unsigned a_width,
               unsigned a_height,
               __local float *a_local)
{
    int base_idx_a   = get_group_id(0) * BLOCK_SIZE +
                       get_group_id(1) * (BLOCK_SIZE * a_width);
    int base_idx_a_t = get_group_id(1) * BLOCK_SIZE +
                       get_group_id(0) * (BLOCK_SIZE * a_height);

    int glob_idx_a   = base_idx_a   + get_local_id(0) + a_width  * get_local_id(1);
    int glob_idx_a_t = base_idx_a_t + get_local_id(0) + a_height * get_local_id(1);

    // stage a BLOCK_SIZE x BLOCK_SIZE tile in local memory ...
    a_local[get_local_id(1) * BLOCK_SIZE + get_local_id(0)] = a[glob_idx_a];

    barrier(CLK_LOCAL_MEM_FENCE);

    // ... then write it back with row/column indices swapped
    a_t[glob_idx_a_t] = a_local[get_local_id(0) * BLOCK_SIZE + get_local_id(1)];
}
"

function enqueue_naive_kernel{T}(queue::cl.CmdQueue, k::cl.Kernel,
                                 dst::cl.Buffer{T}, src::cl.Buffer{T}, dims::Dims)
    h, w = dims
    @assert w % BLOCK_SIZE == 0
    @assert h % BLOCK_SIZE == 0
    cl.set_args!(k, dst, src, uint32(h), uint32(w))
    cl.enqueue_kernel(queue, k, (h, w), (BLOCK_SIZE, BLOCK_SIZE))
end

function enqueue_block_kernel{T}(queue::cl.CmdQueue, k::cl.Kernel,
                                 dst::cl.Buffer{T}, src::cl.Buffer{T}, dims::Dims)
    h, w = dims
    @assert w % BLOCK_SIZE == 0
    @assert h % BLOCK_SIZE == 0
    # pad each row of the local tile by one element to reduce bank conflicts
    lmem = cl.LocalMem(Float32, BLOCK_SIZE * (BLOCK_SIZE + 1))
    cl.set_args!(k, dst, src, uint32(h), uint32(w), lmem)
    cl.enqueue_kernel(queue, k, (h, w), (BLOCK_SIZE, BLOCK_SIZE))
end

function benchmark_transpose()
    gpu_device = first(cl.devices(:gpu))
    ctx = cl.Context(gpu_device)
    for dev in cl.devices(ctx)
        @assert dev[:local_mem_size] > 0
    end

    prg = cl.Program(ctx, source=naive_transpose) |> cl.build!
    naive_kern = cl.Kernel(prg, "transpose")

    prg = cl.Program(ctx, source=block_transpose) |> cl.build!
    block_kernel = cl.Kernel(prg, "transpose")

    # profiling-enabled queue so kernel execution times can be read back
    queue = cl.CmdQueue(ctx, :profile)

    # matrix widths from 2^6 to 2^12, rounded down to a multiple of 32
    array_sizes = [int((2^i) / 32) * 32 for i in 6:12]
    mem_bandwidths = Float32[]

    for (name, method, kern) in (("naive", enqueue_naive_kernel, naive_kern),
                                 ("block", enqueue_block_kernel, block_kernel))
        for s in array_sizes
            src = rand(Float32, (s, s))
            a_buf   = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=src)
            a_t_buf = cl.Buffer(Float32, ctx, :w, sizeof(src))

            # warm up....
            for i in 1:4
                method(queue, kern, a_t_buf, a_buf, size(src))
            end

            # profile...
            count = 20
            events = cl.Event[]
            for i in 1:count
                push!(events, method(queue, kern, a_t_buf, a_buf, size(src)))
            end
            cl.wait(events[end])

            # total kernel time in nanoseconds; each transpose reads and
            # writes the full matrix, hence the factor of 2
            time = sum([evt[:profile_duration] for evt in events])
            mem_bw = 2 * sizeof(src) * count / (time * 1e-9)
            push!(mem_bandwidths, mem_bw)

            @printf("benchmarking %s, array size: %s^2, %.3f GB/s\n",
                    name, s, mem_bw / 1e9)

            # causes problems with ijulia's show...
            #cl.release!(a_buf)
            #cl.release!(a_t_buf)
        end
    end

    # plot results, sleep a little to prevent overlap with show..
    sleep(1)
    clf()
    n_sizes = length(array_sizes)
    naive_bandwidths = mem_bandwidths[1:n_sizes]
    block_bandwidths = mem_bandwidths[n_sizes+1:end]
    plot(array_sizes, naive_bandwidths / 1e9, "o-", label="naive")
    plot(array_sizes, block_bandwidths / 1e9, "o-", label="block")
    title("Transpose Tests for $(gpu_device[:name])")
    xlabel("Matrix width/height N")
    ylabel("Memory Bandwidth [GB/s]")
    legend(loc="best")
    grid(true)
    show()
end

benchmark_transpose()