I have heard of OpenCL for a long time, but I haven't tried it until recently. Here's my note to write and run a simple vector addition OpenCL program.
The following source code is compiled and tested with Intel SDK for OpenCL Applications. The code should work with other OpenCL implementation as well.
Kernel Functions
First, we have to write a kernel function which will be run on the devices:
__kernel void vec_add(__global int *out,
__global const int *in1,
__global const int *in2) {
int i = get_global_id(0);
out[i] = in1[i] + in2[i];
}
Since the OpenCL run-time will launch many work items, we can simply process
one element at once. As a result, we are only adding in1[i]
and
in2[i]
and assigning the result to out[i]
.
The __kernel
is the attribute to mark the kernel functions. The
__global
is the attribute to annotate the address space, and
get_global_id()
is the built-in function which will return the index of
current work item.
Save this file as vec_add.cl
. We will load this file later in the host
program.
Host Program
Second, we have to write a host program. The host program is responsible to control the devices through the OpenCL run-time. In the host program, we have to:
- Initialize the OpenCL run-time.
- Get the platform IDs with
clGetPlatformIDs()
. - Get the device IDs with
clGetDeviceIDs()
. - Create the context with
clCreateContext()
.
- Get the platform IDs with
- Prepare the program (kernel functions.)
- Load the program.
- Build the program with
clBuildProgram()
.
- Allocate the memory buffers with
clCreateBuffer()
. - Create command queue with
clCreateCommandQueue()
[1]. - Create kernel data structure.
- Find kernel function from the program with
clCreateKernel()
. - Setup kernel arguments with
clSetKernelArg()
.
- Find kernel function from the program with
- Enqueue the input buffer write command with
clEnqueueWriteBuffer()
. - Enqueue the kernel function execution command with
clEnqueueNDRangeKernel()
. - Enqueue the output buffer read command with
clEnqueueReadBuffer()
. - Wait until all commands are finished with
clFinish()
. - Release the resources.
Here is the complete host program for the vector addition example [2]:
#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
int read_file(unsigned char **output, size_t *size, const char *name) {
FILE* fp = fopen(name, "rb");
if (!fp) {
return -1;
}
fseek(fp, 0, SEEK_END);
*size = ftell(fp);
fseek(fp, 0, SEEK_SET);
*output = (unsigned char *)malloc(*size);
if (!*output) {
fclose(fp);
return -1;
}
fread(*output, *size, 1, fp);
fclose(fp);
return 0;
}
void run_vec_add(size_t num_elems, size_t buf_size, cl_int* data) {
cl_int err;
// Query platforms and devices
cl_platform_id platform;
err = clGetPlatformIDs(1, &platform, NULL);
cl_device_id device;
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
const cl_context_properties prop[] = {
CL_CONTEXT_PLATFORM, (cl_context_properties)platform,
0
};
// Create context
cl_context ctx = clCreateContext(prop, 1, &device, NULL, NULL, &err);
// Create program
unsigned char* program_file = NULL;
size_t program_size = 0;
read_file(&program_file, &program_size, "vec_add.cl");
cl_program program =
clCreateProgramWithSource(ctx, 1, (const char **)&program_file,
&program_size, &err);
err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
free(program_file);
// Allocate memory buffers (on the device)
cl_mem a = clCreateBuffer(ctx, CL_MEM_READ_ONLY, buf_size, NULL, &err);
cl_mem b = clCreateBuffer(ctx, CL_MEM_READ_ONLY, buf_size, NULL, &err);
cl_mem c = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, buf_size, NULL, &err);
// Create command queue
cl_command_queue queue = clCreateCommandQueue(ctx, device, 0, NULL);
// Enqueue the write buffer commands
cl_event wb_events[2];
err = clEnqueueWriteBuffer(queue, a, CL_FALSE, 0, buf_size, data, 0,
NULL, &wb_events[0]);
err = clEnqueueWriteBuffer(queue, b, CL_FALSE, 0, buf_size, data, 0,
NULL, &wb_events[1]);
// Enqueue the kernel execution command
cl_kernel kernel = clCreateKernel(program, "vec_add", &err);
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &c);
err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &a);
err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &b);
const size_t global_offset = 0;
cl_event kernel_event;
err = clEnqueueNDRangeKernel(queue, kernel, 1, &global_offset,
&num_elems, NULL, 2, wb_events,
&kernel_event);
// Enqueue the read buffer command
err = clEnqueueReadBuffer(queue, c, CL_TRUE, 0, buf_size, data, 1,
&kernel_event, NULL);
// Wait until every commands are finished
err = clFinish(queue);
// Release the resources
clReleaseMemObject(a);
clReleaseMemObject(b);
clReleaseMemObject(c);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(ctx);
clReleaseDevice(device);
}
int main() {
cl_int i;
// Prepare the input data
const size_t num_elems = 10000000;
const size_t buf_size = sizeof(cl_int) * num_elems;
cl_int *data = (cl_int *)malloc(buf_size);
for (i = 0; i < num_elems; ++i) {
data[i] = i;
}
// Run the OpenCL program
run_vec_add(num_elems, buf_size, data);
// Check the answer
for (i = 0; i < num_elems; ++i) {
if (data[i] != 2 * i) {
fprintf(stderr, "Failed: %u\n", (unsigned)i);
}
}
return 0;
}
Save the code as vec_add.c
and compile with:
$ gcc vec_add.c -lOpenCL
Enjoy the result!
[1] | In OpenCL 2.0 implementations, clCreateCommandQueue() should be
replaced with clCreateCommandQueueWithProperties() ; otherwise, a
warning will be raised. Alternatively, add
-DCL_USE_DEPRECATED_OPENCL_2_0_APIS=1 to silence the warning. |
[2] | For simplicity, I am ignoring every possible error. The returned error code should handled properly in a robust software. |