Getting Started with OpenCL

I have heard of OpenCL for a long time, but I haven't tried it until recently. Here's my note to write and run a simple vector addition OpenCL program.

The following source code is compiled and tested with Intel SDK for OpenCL Applications. The code should work with other OpenCL implementation as well.

Kernel Functions

First, we have to write a kernel function which will be run on the devices:

__kernel void vec_add(__global int *out,
                      __global const int *in1,
                      __global const int *in2) {
  int i = get_global_id(0);
  out[i] = in1[i] + in2[i];

Since the OpenCL run-time will launch many work items, we can simply process one element at once. As a result, we are only adding in1[i] and in2[i] and assigning the result to out[i].

The __kernel is the attribute to mark the kernel functions. The __global is the attribute to annotate the address space, and get_global_id() is the built-in function which will return the index of current work item.

Save this file as We will load this file later in the host program.

Host Program

Second, we have to write a host program. The host program is responsible to control the devices through the OpenCL run-time. In the host program, we have to:

  • Initialize the OpenCL run-time.
    • Get the platform IDs with clGetPlatformIDs().
    • Get the device IDs with clGetDeviceIDs().
    • Create the context with clCreateContext().
  • Prepare the program (kernel functions.)
    • Load the program.
    • Build the program with clBuildProgram().
  • Allocate the memory buffers with clCreateBuffer().
  • Create command queue with clCreateCommandQueue() [1].
  • Create kernel data structure.
    • Find kernel function from the program with clCreateKernel().
    • Setup kernel arguments with clSetKernelArg().
  • Enqueue the input buffer write command with clEnqueueWriteBuffer().
  • Enqueue the kernel function execution command with clEnqueueNDRangeKernel().
  • Enqueue the output buffer read command with clEnqueueReadBuffer().
  • Wait until all commands are finished with clFinish().
  • Release the resources.

Here is the complete host program for the vector addition example [2]:

#include <CL/cl.h>

#include <stdio.h>
#include <stdlib.h>

int read_file(unsigned char **output, size_t *size, const char *name) {
  FILE* fp = fopen(name, "rb");
  if (!fp) {
    return -1;

  fseek(fp, 0, SEEK_END);
  *size = ftell(fp);
  fseek(fp, 0, SEEK_SET);

  *output = (unsigned char *)malloc(*size);
  if (!*output) {
    return -1;

  fread(*output, *size, 1, fp);
  return 0;

void run_vec_add(size_t num_elems, size_t buf_size, cl_int* data) {
  cl_int err;

  // Query platforms and devices
  cl_platform_id platform;
  err = clGetPlatformIDs(1, &platform, NULL);

  cl_device_id device;
  err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);

  const cl_context_properties prop[] = {
    CL_CONTEXT_PLATFORM, (cl_context_properties)platform,

  // Create context
  cl_context ctx = clCreateContext(prop, 1, &device, NULL, NULL, &err);

  // Create program
  unsigned char* program_file = NULL;
  size_t program_size = 0;
  read_file(&program_file, &program_size, "");

  cl_program program =
      clCreateProgramWithSource(ctx, 1, (const char **)&program_file,
                                &program_size, &err);

  err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);


  // Allocate memory buffers (on the device)
  cl_mem a = clCreateBuffer(ctx, CL_MEM_READ_ONLY, buf_size, NULL, &err);
  cl_mem b = clCreateBuffer(ctx, CL_MEM_READ_ONLY, buf_size, NULL, &err);
  cl_mem c = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, buf_size, NULL, &err);

  // Create command queue
  cl_command_queue queue = clCreateCommandQueue(ctx, device, 0, NULL);

  // Enqueue the write buffer commands
  cl_event wb_events[2];

  err = clEnqueueWriteBuffer(queue, a, CL_FALSE, 0, buf_size, data, 0,
                             NULL, &wb_events[0]);
  err = clEnqueueWriteBuffer(queue, b, CL_FALSE, 0, buf_size, data, 0,
                             NULL, &wb_events[1]);

  // Enqueue the kernel execution command
  cl_kernel kernel = clCreateKernel(program, "vec_add", &err);
  err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &c);
  err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &a);
  err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &b);

  const size_t global_offset = 0;
  cl_event kernel_event;
  err = clEnqueueNDRangeKernel(queue, kernel, 1, &global_offset,
                               &num_elems, NULL, 2, wb_events,

  // Enqueue the read buffer command
  err = clEnqueueReadBuffer(queue, c, CL_TRUE, 0, buf_size, data, 1,
                            &kernel_event, NULL);

  // Wait until every commands are finished
  err = clFinish(queue);

  // Release the resources

int main() {
  cl_int i;

  // Prepare the input data
  const size_t num_elems = 10000000;
  const size_t buf_size = sizeof(cl_int) * num_elems;
  cl_int *data = (cl_int *)malloc(buf_size);
  for (i = 0; i < num_elems; ++i) {
    data[i] = i;

  // Run the OpenCL program
  run_vec_add(num_elems, buf_size, data);

  // Check the answer
  for (i = 0; i < num_elems; ++i) {
    if (data[i] != 2 * i) {
      fprintf(stderr, "Failed: %u\n", (unsigned)i);

  return 0;

Save the code as vec_add.c and compile with:

$ gcc vec_add.c -lOpenCL

Enjoy the result!

[1]In OpenCL 2.0 implementations, clCreateCommandQueue() should be replaced with clCreateCommandQueueWithProperties(); otherwise, a warning will be raised. Alternatively, add -DCL_USE_DEPRECATED_OPENCL_2_0_APIS=1 to silence the warning.
[2]For simplicity, I am ignoring every possible error. The returned error code should handled properly in a robust software.