OpenCL入門：(二：用GPU計算兩個數組和)

本文轉載自查看原文 2017-08-18 22:45 2076 OpenCL入門

本文編寫一個計算兩個數組和的程序，分別用CPU和GPU運算，統計各自的運算時間，並且校驗最後的運算結果。文中代碼偏多，原理部分建議閱讀下面的文章，其中介紹了OpenCL相關的名詞和概念。

http://opencl.codeplex.com/wikipage?title=OpenCL%20Tutorials%20-%201 （英文版）

http://www.cnblogs.com/leiben/archive/2012/06/05/2536508.html （博友翻譯的中文版）

一、創建工程

按照《OpenCL入門：(一：Intel核心顯卡OpenCL環境搭建)》中的步驟創建一個名為OpenCLSum的工程，並且添加一個OpenCLSum.cpp文件和一個OpenCLSum.cl文件（添加時選擇添加OpenCL文件）。

二、CPU計算代碼

用CPU求兩個數組和的代碼如下：

/*
 * Host (CPU) reference implementation of the element-wise array sum.
 *
 * For every index i in [0, num) stores nums1[i] + nums2[i] into sum[i].
 * Elements are processed in ascending index order. Nothing is written
 * when num is zero or negative.
 */
void RunAsCpu(
    const float *nums1, 
    const float *nums2, 
    float* sum, 
    const int num)
{
    const float *lhs = nums1;
    const float *rhs = nums2;
    float *out = sum;

    /* Count down the remaining elements while the three cursors advance. */
    for (int left = num; left > 0; --left)
    {
        *out = *lhs + *rhs;
        ++lhs;
        ++rhs;
        ++out;
    }
}

三、GPU計算代碼

在cl文件中添加如下代碼，

// A work-item count is supplied when this kernel is launched, so the
// function below is invoked once per work-item. Each invocation looks up
// its own index with get_global_id(0) and adds the two elements at that
// index.
__kernel void RunAsGpu(
    __global const float *nums1,
    __global const float *nums2,
    __global float* sum)
{
    // Index of this work-item along dimension 0.
    const int idx = get_global_id(0);

    const float lhs = nums1[idx];
    const float rhs = nums2[idx];
    sum[idx] = lhs + rhs;
}

四、主函數流程

流程請參考本文開始推薦的文章，有詳細說明，下面只在注釋中簡單說明

//計時函數
double time_stamp()
{
    LARGE_INTEGER curclock;
    LARGE_INTEGER freq;
    if (
        !QueryPerformanceCounter(&curclock) ||
        !QueryPerformanceFrequency(&freq)
        )
    {
        return -1;
    }

    return double(curclock.QuadPart) / freq.QuadPart;
}
// Terminates the program when an OpenCL call did not succeed.
//
// ERR is an expression yielding an OpenCL status code. When it differs from
// CL_SUCCESS, the code together with the current file and line is written to
// cerr and the process exits with status 1.
//
// The expansion is a self-contained brace block so that:
//   * ERR is evaluated exactly once (it used to be evaluated again when
//     printed), and is parenthesised so compound expressions are safe;
//   * the internal `if` can no longer pair with an `else` that follows the
//     macro invocation.
// It is intentionally not written as a do/while(0) wrapper: existing call
// sites invoke the macro both with and without a trailing semicolon, and a
// brace block accepts both forms.
#define OPENCL_CHECK_ERRORS(ERR)                                       \
    {                                                                  \
        const cl_int opencl_check_code_ = (ERR);                       \
        if (opencl_check_code_ != CL_SUCCESS)                          \
        {                                                              \
            cerr                                                       \
                << "OpenCL error with code " << opencl_check_code_     \
                << " happened in file " << __FILE__                    \
                << " at line " << __LINE__                             \
                << ". Exiting...\n";                                   \
            exit(1);                                                   \
        }                                                              \
    }
int main(int argc, const char** argv)
{
    cl_int error = 0;   // Used to handle error codes
    cl_context context;
    cl_command_queue queue;
    cl_device_id device;

    // 遍歷系統中所有OpenCL平台
    cl_uint num_of_platforms = 0;
    // 得到平台數目
    error = clGetPlatformIDs(0, 0, &num_of_platforms);
    OPENCL_CHECK_ERRORS(error);
    cout << "可用平台數: " << num_of_platforms << endl;

    cl_platform_id* platforms = new cl_platform_id[num_of_platforms];
    // 得到所有平台的ID
    error = clGetPlatformIDs(num_of_platforms, platforms, 0);
    OPENCL_CHECK_ERRORS(error);
    //遍歷平台，選擇一個Intel平台的
    cl_uint selected_platform_index = num_of_platforms;
    for (cl_uint i = 0; i < num_of_platforms; ++i)
    {
        size_t platform_name_length = 0;
        error = clGetPlatformInfo(
            platforms[i],
            CL_PLATFORM_NAME,
            0,
            0,
            &platform_name_length
        );
        OPENCL_CHECK_ERRORS(error);

        // 調用兩次，第一次是得到名稱的長度
        char* platform_name = new char[platform_name_length];
        error = clGetPlatformInfo(
            platforms[i],
            CL_PLATFORM_NAME,
            platform_name_length,
            platform_name,
            0
        );
        OPENCL_CHECK_ERRORS(error);

        cout << "    [" << i << "] " << platform_name;

        if (
            strstr(platform_name, "Intel") &&
            selected_platform_index == num_of_platforms // have not selected yet
            )
        {
            cout << " [Selected]";
            selected_platform_index = i;
        }

        cout << endl;
        delete[] platform_name;
    }
    if (selected_platform_index == num_of_platforms)
    {
        cerr
            << "沒有找到Intel平台\n";
        return 1;
    }
    // Device
    cl_platform_id platform = platforms[selected_platform_index];
    error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    OPENCL_CHECK_ERRORS(error)

    // Context
    context = clCreateContext(0, 1, &device, NULL, NULL, &error);
    OPENCL_CHECK_ERRORS(error)

    // Command-queue
    queue = clCreateCommandQueue(context, device, 0, &error);
    OPENCL_CHECK_ERRORS(error)

    //下面初始化測試數據(主機數據)
    const int size = 38888888;//大小和內存有關，僅作示例
    float* nums1_h = new float[size];
    float* nums2_h = new float[size];
    float* sum_h = new float[size];
    // Initialize both vectors
    for (int i = 0; i < size; i++) {
        nums1_h[i] = nums2_h[i] = (float)i;
    }
    //初始化設備數據
    const int mem_size = sizeof(float)*size;
    // 標志位表示數據只讀，並且從nums1_h和nums2_h復制數據
    cl_mem nums1_d = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, nums1_h, &error);
    OPENCL_CHECK_ERRORS(error)
    cl_mem nums2_d = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, nums2_h, &error);
    OPENCL_CHECK_ERRORS(error)
    cl_mem sum_d = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size, NULL, &error);
    OPENCL_CHECK_ERRORS(error)

    //讀取OpenCLSum.cl文件內容

    FILE* fp = fopen("OpenCLSum.cl", "rb");
    fseek(fp, 0, SEEK_END);
    size_t src_size = ftell(fp);
    fseek(fp, 0, SEEK_SET);
    const char* source = new char[src_size];
    fread((void*)source, 1, src_size, fp);
    fclose(fp);

    //創建編譯運行kernel函數
    cl_program program = clCreateProgramWithSource(context, 1, &source, &src_size, &error);
    OPENCL_CHECK_ERRORS(error)
    delete[] source;

    // Builds the program
    error = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    OPENCL_CHECK_ERRORS(error)

    // Shows the log
    char* build_log;
    size_t log_size;
    // First call to know the proper size
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
    build_log = new char[log_size + 1];
    // Second call to get the log
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
    build_log[log_size] = '\0';
    cout << build_log << endl;
    delete[] build_log;

    // Extracting the kernel
    cl_kernel run_as_gpu = clCreateKernel(program, "RunAsGpu", &error);
    OPENCL_CHECK_ERRORS(error)


    //運行kernel程序

    // Enqueuing parameters
    // Note that we inform the size of the cl_mem object, not the size of the memory pointed by it
    error = clSetKernelArg(run_as_gpu, 0, sizeof(cl_mem), &nums1_d);
    error |= clSetKernelArg(run_as_gpu, 1, sizeof(cl_mem), &nums2_d);
    error |= clSetKernelArg(run_as_gpu, 2, sizeof(cl_mem), &sum_d);
    OPENCL_CHECK_ERRORS(error)

    // Launching kernel
    size_t global_work_size = size;
    cout << "GPU 運行開始:" << time_stamp() << endl;
    error = clEnqueueNDRangeKernel(queue, run_as_gpu, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
    cout << "GPU 運行結束:" << time_stamp() << endl;
    OPENCL_CHECK_ERRORS(error)

    //取得kernel返回值
    float* gpu_sum = new float[size];
    clEnqueueReadBuffer(queue, sum_d, CL_TRUE, 0, mem_size, gpu_sum, 0, NULL, NULL);

    cout << "CPU 運行開始:" << time_stamp() << endl;
    RunAsCpu(nums1_h, nums2_h, sum_h, size);
    cout << "CPU 運行結束:" << time_stamp() << endl;

    assert(memcmp(sum_h, gpu_sum, size * sizeof(float)) == 0);

    delete[] nums1_h;
    delete[] nums2_h;
    delete[] sum_h;
    delete[] gpu_sum;
    delete[] platforms;
    clReleaseKernel(run_as_gpu);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    clReleaseMemObject(nums1_d);
    clReleaseMemObject(nums2_d);
    clReleaseMemObject(sum_d);
    return 0;

五、運行結果

由於運算比較簡單，CPU和GPU幾乎沒差別，在後續複雜運算中應該是會有差別的。

六、相關下載

工程下載

七、後續

看了幾篇文章後，似乎簡單使用OpenCL還是不複雜的，OpenCL的關鍵應該在於如何優化性能、如何調用kernel函數，才能將GPU的效果最優化。以後的文章一部分涉及OpenCL原理，一部分涉及到更複雜的運算，當然了，博主也是學習階段，沒有練手項目，只能從官方demos中找找了。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 CUDA學習（三）之使用GPU進行兩個數組相加 R語言中兩個數組(或向量)的外積怎樣計算回顧面試題：計算兩個數組交集有N個數的數組，找出這個數組中的兩個數，使得這兩個數的和最接近0 c語言計算兩個數的差值 JS - 計算兩個數組的交集、差集、並集、補集（多種實現方式） Leetcode練習(Python)：第349題：兩個數組的交集：給定兩個數組，編寫一個函數來計算它們的交集。 Java合並兩個數組或多個數組兩個數組對象合並找出不是兩個數組共有的元素