// Requests the OpenCL 3.0 API surface from the OpenCL headers.
// NOTE(review): the header include this macro applies to is not visible in this
// excerpt — confirm the macro is defined before that include.
#define CL_TARGET_OPENCL_VERSION 300
// Element count shared by every host array and device buffer in this program;
// each of them is sized as sizeof(float) * N bytes.
#define N 100000000 // 100 million elements
// OpenCL C source text of the "vec_add" kernel, handed to
// clCreateProgramWithSource at run time. The kernel takes two read-only
// __global float buffers (A, B) and one writable __global float buffer (C);
// each work-item starts by fetching its global index in dimension 0.
// NOTE(review): the rest of the kernel body and the end of this string literal
// are not visible in this excerpt.
const char* kernelSource =
"__kernel void vec_add(__global const float* A, __global const float* B, __global float* C) {\n"
" int i = get_global_id(0);\n"
// Reports a failed OpenCL call on stderr.
//   err - status code returned (or written through an errcode pointer) by the call
//   msg - short label identifying the call; callers pass the API function name
// NOTE(review): the status test that guards this message, and whatever follows it
// (e.g. terminating the program), are not visible in this excerpt — confirm.
void checkError(cl_int err, const char* msg) {
fprintf(stderr, "Error: %s (%d)\n", msg, err);
// Returns the current Windows performance-counter reading converted to seconds
// (counter ticks divided by ticks-per-second). Callers subtract two readings to
// obtain an elapsed wall-clock interval.
double getTimeInSeconds() {
// Both statics persist across calls.
// NOTE(review): presumably `initialized` guards the frequency query so it runs
// only once; that guard is not visible in this excerpt — confirm.
static LARGE_INTEGER frequency;
static BOOL initialized = FALSE;
QueryPerformanceFrequency(&frequency);
// NOTE(review): the declaration of `counter` is not visible in this excerpt.
QueryPerformanceCounter(&counter);
return (double)counter.QuadPart / (double)frequency.QuadPart;
// Benchmark body: adds two N-element float vectors on the CPU and on an OpenCL
// GPU device, prints both timings, and compares the results.
// NOTE(review): the enclosing function header, several declarations (err,
// platform, device, log_size, globalSize) and the loop bodies are not visible
// in this excerpt.

// Host-side arrays: two inputs and one output per implementation.
// NOTE(review): none of the four malloc results is checked in the visible lines;
// with N = 100000000 these are large allocations that can fail — confirm.
float *A = (float*) malloc(sizeof(float) * N);
float *B = (float*) malloc(sizeof(float) * N);
float *C_gpu = (float*) malloc(sizeof(float) * N);
float *C_cpu = (float*) malloc(sizeof(float) * N);
// Loop over all N elements (body not visible; presumably fills A and B).
for (size_t i = 0; i < N; ++i) {
// ===== CPU Version =====
// Wall-clock time around the CPU loop over all N elements.
double start_cpu = getTimeInSeconds();
for (size_t i = 0; i < N; ++i) {
double end_cpu = getTimeInSeconds();
printf("CPU execution time : %.4f seconds\n", end_cpu - start_cpu);
// ===== GPU Version =====
// Take the first available platform and its first GPU-type device.
err = clGetPlatformIDs(1, &platform, NULL);
checkError(err, "clGetPlatformIDs");
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
checkError(err, "clGetDeviceIDs");
// Single-device context with no error callback.
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
checkError(err, "clCreateContext");
// Command queue created with a null properties list (third argument is 0).
// Note the label passed to checkError names clCreateCommandQueue, not the
// ...WithProperties variant actually called.
cl_command_queue queue = clCreateCommandQueueWithProperties(context, device, 0, &err);
checkError(err, "clCreateCommandQueue");
// Create the program from the single kernelSource string; the NULL lengths
// argument means the string is treated as null-terminated.
cl_program program = clCreateProgramWithSource(context, 1, &kernelSource, NULL, &err);
checkError(err, "clCreateProgramWithSource");
// Build for the one selected device with no build options.
// NOTE(review): no check of this `err` is visible before the build log is
// fetched; presumably the log lines below sit inside a failure branch — confirm.
err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
// Two-call query: first ask for the log size, then fetch the log text itself.
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
// NOTE(review): this malloc result is unchecked and no free(log) is visible in
// this excerpt — confirm.
char *log = (char*) malloc(log_size);
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
fprintf(stderr, "Build log:\n%s\n", log);
cl_kernel kernel = clCreateKernel(program, "vec_add", &err);
checkError(err, "clCreateKernel");
// Device buffers. bufA and bufB are initialised from the host arrays at creation
// time (CL_MEM_COPY_HOST_PTR); bufC is created without a host pointer.
// All three calls write to the same `err`, so the single checkError below only
// sees the status of the bufC creation; failures creating bufA or bufB are
// overwritten before being checked.
cl_mem bufA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * N, A, &err);
cl_mem bufB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * N, B, &err);
cl_mem bufC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * N, NULL, &err);
checkError(err, "clCreateBuffer");
// Bind the buffers to kernel arguments 0..2 (A, B, C in the kernel signature).
// The return values of these three calls are discarded.
clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufA);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufB);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufC);
// Time the kernel launch: 1-D range of globalSize work-items, with the local
// work size left NULL.
// NOTE(review): clEnqueueNDRangeKernel only enqueues the command. No wait
// (e.g. clFinish) is visible between the enqueue and end_gpu; if none exists,
// this interval covers submission only, not kernel execution — confirm.
double start_gpu = getTimeInSeconds();
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, NULL, 0, NULL, NULL);
checkError(err, "clEnqueueNDRangeKernel");
double end_gpu = getTimeInSeconds();
printf("GPU execution time : %.4f seconds\n", end_gpu - start_gpu);
// Blocking read (CL_TRUE): copies all N results from bufC into C_gpu before
// returning. This transfer happens after end_gpu, so it is not part of the
// reported GPU time.
err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, sizeof(float) * N, C_gpu, 0, NULL, NULL);
checkError(err, "clEnqueueReadBuffer");
// ===== Verify Result =====
// Only the first 100 elements are compared, using exact float inequality.
for (int i = 0; i < 100; ++i) {
if (C_cpu[i] != C_gpu[i]) {
printf("Mismatch at index %d: CPU=%f, GPU=%f\n", i, C_cpu[i], C_gpu[i]);
// NOTE(review): the condition under which this success message is printed is
// not visible in this excerpt.
printf("Results match!\n");
// Release OpenCL objects, then the host arrays.
// NOTE(review): no clReleaseKernel(kernel) is visible among these releases —
// confirm the kernel object is released.
clReleaseMemObject(bufA);
clReleaseMemObject(bufB);
clReleaseMemObject(bufC);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
free(A); free(B); free(C_cpu); free(C_gpu);