# Low-level OpenCL API smoke test.
#
# One "sum" kernel (c = a + b, element-wise) is run three times in a row on the
# same command queue, re-binding its buffer arguments between launches:
#
#   pass 1:  C = A + B
#   pass 2:  D = E + C
#   pass 3:  F = G + D
#
# so the final buffer F must equal A + B + E + G, which is verified on the host.
import OpenCL
const cl = OpenCL

# OpenCL C source of the element-wise addition kernel. `count` guards against
# work-items whose global id lies past the end of the arrays.
test_source = " __kernel void sum(__global const float *a, __global const float *b, __global float *c, const unsigned int count) { int gid = get_global_id(0); if (gid < count) { c[gid] = a[gid] + b[gid]; } }"

# Use the first device reported by the OpenCL.jl wrapper.
device = first(cl.devices())

# Number of elements in every host array and device buffer.
length = 1024

# Host-side arrays. h_a, h_b, h_e and h_g are the random inputs; h_f receives
# the final result. h_c and h_d are allocated but never touched below (the
# intermediate results C and D stay on the device).
h_a = Array(cl.CL_float, length)
h_b = Array(cl.CL_float, length)
h_c = Array(cl.CL_float, length)
h_d = Array(cl.CL_float, length)
h_e = Array(cl.CL_float, length)
h_f = Array(cl.CL_float, length)
h_g = Array(cl.CL_float, length)

# Fill the four input arrays with random values.
for i in 1:length
    h_a[i] = cl.cl_float(rand())
    h_b[i] = cl.cl_float(rand())
    h_e[i] = cl.cl_float(rand())
    h_g[i] = cl.cl_float(rand())
end

# One-element array used as the errcode_ret out-parameter of the clCreate* calls.
err_code = Array(cl.CL_int, 1)

# create compute context (TODO: fails if function ptr's not passed...)
ctx_id = cl.api.clCreateContext(C_NULL, 1, [device.id],
                                cl.ctx_callback_ptr,
                                cl.raise_context_error,
                                err_code)
if err_code[1] != cl.CL_SUCCESS
    error("Failed to create context")
end

# create the command queue (properties = 0)
q_id = cl.api.clCreateCommandQueue(ctx_id, device.id, 0, err_code)
if err_code[1] != cl.CL_SUCCESS
    error("Failed to create command queue")
end

# create program
bytesource = bytestring(test_source)
prg_id = cl.api.clCreateProgramWithSource(ctx_id, 1, [bytesource], C_NULL, err_code)
if err_code[1] != cl.CL_SUCCESS
    error("Failed to create program")
end

# build program
err = cl.api.clBuildProgram(prg_id, 0, C_NULL, C_NULL, C_NULL, C_NULL)
if err != cl.CL_SUCCESS
    error("Failed to build program")
end

# create compute kernel
k_id = cl.api.clCreateKernel(prg_id, "sum", err_code)
if err_code[1] != cl.CL_SUCCESS
    error("Failed to create compute kernel")
end

# create input arrays in device memory, initialised from the host arrays
Aid = cl.api.clCreateBuffer(ctx_id, cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR,
                            sizeof(cl.CL_float) * length, h_a, err_code)
if err_code[1] != cl.CL_SUCCESS
    error("Error creating buffer A")
end

Bid = cl.api.clCreateBuffer(ctx_id, cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR,
                            sizeof(cl.CL_float) * length, h_b, err_code)
if err_code[1] != cl.CL_SUCCESS
    error("Error creating buffer B")
end

# E and G are only ever bound to the kernel's `const float *a` argument, i.e.
# the kernel reads them. They must therefore be READ_ONLY: reading a
# CL_MEM_WRITE_ONLY buffer inside a kernel is undefined per the OpenCL spec.
Eid = cl.api.clCreateBuffer(ctx_id, cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR,
                            sizeof(cl.CL_float) * length, h_e, err_code)
if err_code[1] != cl.CL_SUCCESS
    error("Error creating buffer E")
end

Gid = cl.api.clCreateBuffer(ctx_id, cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR,
                            sizeof(cl.CL_float) * length, h_g, err_code)
if err_code[1] != cl.CL_SUCCESS
    error("Error creating buffer G")
end

# create output arrays in device memory
# C and D are written by one pass and read by the next, hence READ_WRITE.
Cid = cl.api.clCreateBuffer(ctx_id, cl.CL_MEM_READ_WRITE,
                            sizeof(cl.CL_float) * length, C_NULL, err_code)
if err_code[1] != cl.CL_SUCCESS
    error("Error creating buffer C")
end

Did = cl.api.clCreateBuffer(ctx_id, cl.CL_MEM_READ_WRITE,
                            sizeof(cl.CL_float) * length, C_NULL, err_code)
if err_code[1] != cl.CL_SUCCESS
    error("Error creating buffer D")
end

# F is only written by the kernel (pass 3) and then read back by the host.
Fid = cl.api.clCreateBuffer(ctx_id, cl.CL_MEM_WRITE_ONLY,
                            sizeof(cl.CL_float) * length, C_NULL, err_code)
if err_code[1] != cl.CL_SUCCESS
    error("Error creating buffer F")
end

# pass 1: C = A + B
# Individual status codes are OR-ed together; any failure makes err non-zero.
err  = cl.api.clSetKernelArg(k_id, 0, sizeof(cl.CL_mem), [Aid])
err |= cl.api.clSetKernelArg(k_id, 1, sizeof(cl.CL_mem), [Bid])
err |= cl.api.clSetKernelArg(k_id, 2, sizeof(cl.CL_mem), [Cid])
# `count` (arg 3) is set once here and left unchanged for the later passes.
err |= cl.api.clSetKernelArg(k_id, 3, sizeof(cl.CL_uint), cl.CL_uint[length])
if err != cl.CL_SUCCESS
    error("Error setting kernel 1 args")
end

# 1-D global work size: one work-item per element; the local work size is
# passed as C_NULL.
nglobal = Csize_t[length,]
err = cl.api.clEnqueueNDRangeKernel(q_id, k_id, 1, C_NULL,
                                    nglobal, C_NULL, 0, C_NULL, C_NULL)
if err != cl.CL_SUCCESS
    error("Failed to execute kernel 1")
end

# pass 2: D = E + C
err  = cl.api.clSetKernelArg(k_id, 0, sizeof(cl.CL_mem), [Eid])
err |= cl.api.clSetKernelArg(k_id, 1, sizeof(cl.CL_mem), [Cid])
err |= cl.api.clSetKernelArg(k_id, 2, sizeof(cl.CL_mem), [Did])
if err != cl.CL_SUCCESS
    error("Error setting kernel 2 args")
end

err = cl.api.clEnqueueNDRangeKernel(q_id, k_id, 1, C_NULL,
                                    nglobal, C_NULL, 0, C_NULL, C_NULL)
if err != cl.CL_SUCCESS
    error("Failed to execute kernel 2")
end

# pass 3: F = G + D
err  = cl.api.clSetKernelArg(k_id, 0, sizeof(cl.CL_mem), [Gid])
err |= cl.api.clSetKernelArg(k_id, 1, sizeof(cl.CL_mem), [Did])
err |= cl.api.clSetKernelArg(k_id, 2, sizeof(cl.CL_mem), [Fid])
if err != cl.CL_SUCCESS
    error("Error setting kernel 3 args")
end

err = cl.api.clEnqueueNDRangeKernel(q_id, k_id, 1, C_NULL,
                                    nglobal, C_NULL, 0, C_NULL, C_NULL)
if err != cl.CL_SUCCESS
    error("Failed to execute kernel 3")
end

# read back the result from compute device...
# The read is requested as blocking (cl.CL_TRUE).
err = cl.api.clEnqueueReadBuffer(q_id, Fid, cl.CL_TRUE, 0,
                                 sizeof(cl.CL_float) * length, h_f,
                                 0, C_NULL, C_NULL)
if err != cl.CL_SUCCESS
    error("Failed to read output array")
end

# test results: every element of F must approximately equal a + b + e + g
ncorrect = 0
for i in 1:length
    tmp = h_a[i] + h_b[i] + h_e[i] + h_g[i]
    if isapprox(tmp, h_f[i])
        ncorrect += 1
    end
end

if ncorrect == length
    info("Success!")
else
    error("Results are incorrect!")
end