#define CL_TARGET_OPENCL_VERSION 300
#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <windows.h>
 
#define N 100000000  // 100 million elements
 
/*
 * OpenCL C source for the `vec_add` kernel, compiled at run time.
 *
 * Each work-item takes its global id in dimension 0 as the index `i` and
 * stores A[i] + B[i] into C[i]. The kernel performs no bounds check, so the
 * global work size it is launched with must not exceed the number of
 * elements in the three buffers.
 */
const char* kernelSource =
"__kernel void vec_add(__global const float* A, __global const float* B, __global float* C) {\n"
"    int i = get_global_id(0);\n"
"    C[i] = A[i] + B[i];\n"
"}\n";
 
/*
 * Terminates the program if an OpenCL status code signals failure.
 *
 * err: status code to inspect; CL_SUCCESS means "nothing to do".
 * msg: short label (typically the name of the API call) included in the
 *      diagnostic written to stderr.
 *
 * On failure the label and numeric code are printed to stderr and the
 * process exits with status 1; on success the function simply returns.
 */
void checkError(cl_int err, const char* msg) {
    /* Fast path: the call succeeded, so there is nothing to report. */
    if (err == CL_SUCCESS) {
        return;
    }

    fprintf(stderr, "Error: %s (%d)\n", msg, err);
    exit(1);
}
 
/*
 * Returns a timestamp in seconds derived from the Windows performance
 * counter: the current counter value divided by the counter frequency.
 *
 * The frequency is queried once, on the first call, and cached in a
 * function-local static for subsequent calls. Only the difference between
 * two returned values is meaningful as an elapsed time.
 */
double getTimeInSeconds() {
    static LARGE_INTEGER ticksPerSecond;
    static BOOL frequencyCached = FALSE;
    LARGE_INTEGER now;

    /* Query the counter frequency lazily and remember it. */
    if (frequencyCached == FALSE) {
        QueryPerformanceFrequency(&ticksPerSecond);
        frequencyCached = TRUE;
    }

    QueryPerformanceCounter(&now);

    const double ticks = (double)now.QuadPart;
    const double rate = (double)ticksPerSecond.QuadPart;
    return ticks / rate;
}
 
/*
 * Allocates an uninitialized host array of `count` floats.
 *
 * label: name of the array, used only in the failure diagnostic.
 *
 * Never returns NULL: if the size computation would overflow or the
 * allocation fails, a message is written to stderr and the process exits
 * with status 1. The caller owns the returned memory and must free() it.
 */
static float *allocFloatArray(size_t count, const char *label) {
    float *p = NULL;

    /* Guard the multiplication below against size_t wrap-around. */
    if (count <= (size_t)-1 / sizeof *p) {
        p = malloc(count * sizeof *p);
    }
    if (p == NULL) {
        fprintf(stderr, "Error: out of memory allocating host array %s\n", label);
        exit(1);
    }
    return p;
}

/*
 * Benchmarks element-wise addition of two N-element float vectors on the
 * CPU and on the first GPU device of the first OpenCL platform, prints both
 * timings, and verifies that the two result vectors are identical.
 *
 * Returns 0 when every element matches, 1 when a mismatch is found. Any
 * allocation or OpenCL failure terminates the process with status 1.
 */
int main(void) {
    cl_int err;
    const size_t bytes = sizeof(float) * N;

    /* Each array is several hundred MB, so allocation failure is realistic. */
    float *A = allocFloatArray(N, "A");
    float *B = allocFloatArray(N, "B");
    float *C_gpu = allocFloatArray(N, "C_gpu");
    float *C_cpu = allocFloatArray(N, "C_cpu");

    for (size_t i = 0; i < N; ++i) {
        A[i] = i * 1.0f;
        B[i] = (N - i) * 1.0f;
    }

    // ===== CPU Version =====
    double start_cpu = getTimeInSeconds();
    for (size_t i = 0; i < N; ++i) {
        C_cpu[i] = A[i] + B[i];
    }
    double end_cpu = getTimeInSeconds();
    printf("CPU execution time : %.4f seconds\n", end_cpu - start_cpu);

    // ===== GPU Version =====
    cl_platform_id platform;
    cl_device_id device;
    err = clGetPlatformIDs(1, &platform, NULL);
    checkError(err, "clGetPlatformIDs");

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    checkError(err, "clGetDeviceIDs");

    cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    checkError(err, "clCreateContext");

    cl_command_queue queue = clCreateCommandQueueWithProperties(context, device, 0, &err);
    checkError(err, "clCreateCommandQueueWithProperties");

    cl_program program = clCreateProgramWithSource(context, 1, &kernelSource, NULL, &err);
    checkError(err, "clCreateProgramWithSource");

    err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    if (err != CL_SUCCESS) {
        /* Best effort: show the compiler output if it can be retrieved. */
        size_t log_size = 0;
        cl_int log_err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                                               0, NULL, &log_size);
        char *log = (log_err == CL_SUCCESS && log_size > 0) ? malloc(log_size) : NULL;
        if (log != NULL &&
            clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                                  log_size, log, NULL) == CL_SUCCESS) {
            fprintf(stderr, "Build log:\n%s\n", log);
        }
        free(log);
        /* err is a failure code here, so this reports it and exits. */
        checkError(err, "clBuildProgram");
    }

    cl_kernel kernel = clCreateKernel(program, "vec_add", &err);
    checkError(err, "clCreateKernel");

    /* Check every buffer individually; a later success must not mask an
       earlier failure. */
    cl_mem bufA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, A, &err);
    checkError(err, "clCreateBuffer(A)");
    cl_mem bufB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, B, &err);
    checkError(err, "clCreateBuffer(B)");
    cl_mem bufC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, &err);
    checkError(err, "clCreateBuffer(C)");

    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufA);
    checkError(err, "clSetKernelArg(0)");
    err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufB);
    checkError(err, "clSetKernelArg(1)");
    err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufC);
    checkError(err, "clSetKernelArg(2)");

    /* One work-item per element; the work-group size is left to the runtime. */
    size_t globalSize = N;
    double start_gpu = getTimeInSeconds();
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, NULL, 0, NULL, NULL);
    checkError(err, "clEnqueueNDRangeKernel");

    /* Block until the kernel has finished so the timing covers execution. */
    err = clFinish(queue);
    checkError(err, "clFinish");
    double end_gpu = getTimeInSeconds();
    printf("GPU execution time : %.4f seconds\n", end_gpu - start_gpu);

    err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, bytes, C_gpu, 0, NULL, NULL);
    checkError(err, "clEnqueueReadBuffer");

    // ===== Verify Result =====
    /* Compare every element, not just a prefix, and stop at the first
       difference. */
    int correct = 1;
    for (size_t i = 0; i < N; ++i) {
        if (C_cpu[i] != C_gpu[i]) {
            correct = 0;
            printf("Mismatch at index %lu: CPU=%f, GPU=%f\n",
                   (unsigned long)i, C_cpu[i], C_gpu[i]);
            break;
        }
    }

    if (correct) {
        printf("Results match!\n");
    }

    // ===== Cleanup =====
    clReleaseMemObject(bufA);
    clReleaseMemObject(bufB);
    clReleaseMemObject(bufC);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);

    free(A); free(B); free(C_cpu); free(C_gpu);

    /* Signal a verification failure to the caller through the exit status. */
    return correct ? 0 : 1;
}