OpenCL 學習step by step (2) 一個簡單的OpenCL的程序

本文轉載自查看原文 2012-09-05 06:49 7562 OpenCL

現在，我們開始寫一個簡單的OpenCL程序，計算兩個數組相加的和，放到另一個數組中去。程序用cpu和gpu分別計算，最後驗證它們是否相等。OpenCL程序的流程大致如下：

下面是source code中的主要代碼：

// Adds two float arrays on the CPU and on an OpenCL GPU device, then reports
// whether the two results match.
int main(int argc, char* argv[])
    {
    // Three host-side buffers: the two inputs and the CPU reference result.
    float *buf1 = (float *)malloc(BUFSIZE * sizeof(float));
    float *buf2 = (float *)malloc(BUFSIZE * sizeof(float));
    float *buf = (float *)malloc(BUFSIZE * sizeof(float));
    if(!buf1 || !buf2 || !buf)
        {
        // malloc can fail; stop before the fill loops dereference a null pointer.
        printf("Error: Failed to allocate host buffers\n");
        free(buf1);
        free(buf2);
        free(buf);
        return -1;
        }

    // Fill buf1 and buf2 with pseudo-random values.
    int i;
    srand( (unsigned)time( NULL ) );
    for(i = 0; i < BUFSIZE; i++)
        buf1[i] = rand()%65535;

    srand( (unsigned)time( NULL ) +1000);
    for(i = 0; i < BUFSIZE; i++)
        buf2[i] = rand()%65535;

    // CPU reference: element-wise sum of buf1 and buf2.
    for(i = 0; i < BUFSIZE; i++)
        buf[i] = buf1[i] + buf2[i];

cl_uint status;
cl_platform_id platform;

//創建平台對象
status = clGetPlatformIDs( 1, &platform, NULL );

注意：如果我們系統中安裝不止一個opencl平台，比如我的os中，有intel和amd兩家opencl平台，用上面這行代碼，有可能會出錯，因為它得到了intel的opencl平台，而intel的平台只支持cpu，而我們後面的操作都是基於gpu，這時我們可以用下面的代碼，得到AMD的opencl平台。

// Enumerate every installed platform and prefer the AMD one; when no AMD
// platform is present the last platform whose vendor could be read is kept.
cl_uint numPlatforms;
std::string platformVendor;
status = clGetPlatformIDs(0, NULL, &numPlatforms);
if(status != CL_SUCCESS)
    {
    // Exit with a non-zero code so the failure is not reported as success.
    return -1;
    }
if (0 < numPlatforms)
    {
    cl_platform_id* platforms = new cl_platform_id[numPlatforms];
    status = clGetPlatformIDs(numPlatforms, platforms, NULL);
    if(status != CL_SUCCESS)
        {
        delete[] platforms;
        return -1;
        }

    // Initialised so it is a valid C string even if a vendor query fails.
    char platformName[100] = "";
    for (unsigned i = 0; i < numPlatforms; ++i)
        {
        status = clGetPlatformInfo(platforms[i],
            CL_PLATFORM_VENDOR,
            sizeof(platformName),
            platformName,
            NULL);
        if(status != CL_SUCCESS)
            {
            // The vendor string is unusable; do not select this platform.
            continue;
            }

        platform = platforms[i];
        platformVendor.assign(platformName);

        if (!strcmp(platformName, "Advanced Micro Devices, Inc."))
            {
            break;
            }
        }

    // Print the vendor of the platform that was actually selected.
    std::cout << "Platform found : " << platformVendor << "\n";
    delete[] platforms;
    }

cl_device_id device;

    //創建GPU設備
    clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU,
        1,
        &device,
        NULL);
    //創建context
    cl_context context = clCreateContext( NULL,
        1,
        &device,
        NULL, NULL, NULL);
    //創建命令隊列
    cl_command_queue queue = clCreateCommandQueue( context,
        device,
        CL_QUEUE_PROFILING_ENABLE, NULL );
    //創建三個OpenCL內存對象，並把buf1的內容通過隱式拷貝的方式
    //buf1內容拷貝到clbuf1,buf2的內容通過顯示拷貝的方式拷貝到clbuf2
    cl_mem clbuf1 = clCreateBuffer(context,
        CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
        BUFSIZE*sizeof(cl_float),buf1,
        NULL );

    cl_mem clbuf2 = clCreateBuffer(context,
        CL_MEM_READ_ONLY ,
        BUFSIZE*sizeof(cl_float),NULL,
        NULL );

cl_event writeEvt;

status = clEnqueueWriteBuffer(queue, clbuf2, 1,
0, BUFSIZE*sizeof(cl_float), buf2, 0, 0, 0);

上面這行代碼把buf2中的內容拷貝到clbuf2,因為buf2位於host端，clbuf2位於device端，所以這個函數會執行一次host到device的傳輸操作，或者說一次system memory到video memory的拷貝操作，所以我在該函數的後面放置了clFlush函數，表示把command queue中的所有命令提交到device(注意：該命令並不保證命令執行完成),所以我們調用函數waitForEventAndRelease來等待write緩沖的完成，waitForEventAndRelease 是一個用戶定義的函數，它的內容如下，主要代碼就是通過event來查詢我們的操作是否完成，沒完成的話，程序就一直block在這行代碼處，另外我們也可以用opencl中內置的函數clWaitForEvents來代替clFlush和waitForEventAndRelease。

// Blocks until the command associated with *event has finished, then releases
// the event.
//
// event: pointer to the event to wait on; a null pointer is rejected.
// Returns 0 when the command reached CL_COMPLETE and the event was released,
// and 1 when the pointer is null, the status query failed, the command ended
// with an error status, or the release failed.
int waitForEventAndRelease(cl_event *event)
    {
    if(!event)
        return 1;

    cl_int status = CL_SUCCESS;
    cl_int eventStatus = CL_QUEUED;
    // Poll the execution status. A failed query, or a negative status (the
    // command terminated abnormally), can never turn into CL_COMPLETE, so
    // leave the loop instead of spinning forever.
    while(eventStatus != CL_COMPLETE)
        {
        status = clGetEventInfo(
            *event,
            CL_EVENT_COMMAND_EXECUTION_STATUS,
            sizeof(cl_int),
            &eventStatus,
            NULL);
        if(status != CL_SUCCESS || eventStatus < 0)
            break;
        }

    const bool completed = (status == CL_SUCCESS && eventStatus == CL_COMPLETE);
    const cl_int releaseStatus = clReleaseEvent(*event);

    return (completed && releaseStatus == CL_SUCCESS) ? 0 : 1;
    }

     // Submit every queued command to the device (this does not wait for them).
     status = clFlush(queue);
     // Do not continue until the buf2 transfer has completed.
     waitForEventAndRelease(&writeEvt);

    // Write-only OpenCL buffer that will receive the kernel's output.
    cl_mem buffer = clCreateBuffer(
        context, CL_MEM_WRITE_ONLY, BUFSIZE * sizeof(cl_float), NULL, NULL);

kernel文件中放的是gpu中執行的代碼，它被放在一個單獨的文件add.cl中，本程序中kernel代碼非常簡單，只是執行兩個數組相加。kernel的代碼為：

// Element-wise vector addition: each work-item stores A[i] + B[i] into C[i],
// where i is its global index along dimension 0.
__kernel void vecadd(__global const float* A, __global const float* B, __global float* C)
{
    const int gid = get_global_id(0);
    C[gid] = A[gid] + B[gid];
}

   //kernel文件為add.cl
    const char * filename = "add.cl";
    std::string sourceStr;
    status = convertToString(filename, sourceStr);

convertToString也是用戶定義的函數，該函數把kernel源文件讀入到一個string中，它的代碼如下：

// Reads the whole file `filename` into `s`; used to load the kernel source.
//
// filename: path of the file to read.
// s:        receives the file contents; left untouched on failure.
// Returns 0 on success and 1 when the file cannot be opened or read.
int convertToString(const char *filename, std::string& s)
    {
    std::ifstream f(filename, (std::ifstream::in | std::ifstream::binary));
    if(!f.is_open())
        {
        printf("Error: Failed to open file %s\n", filename);
        return 1;
        }

    // Measure the file by seeking to its end. tellg() reports -1 on failure,
    // which must not be converted into an enormous unsigned size.
    f.seekg(0, std::ifstream::end);
    const std::streamoff fileSize = f.tellg();
    if(fileSize < 0)
        {
        printf("Error: Failed to read file %s\n", filename);
        return 1;
        }
    f.seekg(0, std::ifstream::beg);

    // Read straight into the string's storage: no temporary buffer to manage,
    // and the length is explicit so embedded NUL bytes do not truncate it.
    std::string contents(static_cast<size_t>(fileSize), '\0');
    if(fileSize > 0 && !f.read(&contents[0], fileSize))
        {
        printf("Error: Failed to read file %s\n", filename);
        return 1;
        }

    s.swap(contents);
    return 0;
    }

const char * source = sourceStr.c_str();
size_t sourceSize[] = { strlen(source) };

    // Create the program object from the kernel source text.
    cl_program program = clCreateProgramWithSource(
        context,
        1,
        &source,
        sourceSize,
        NULL);
    // Compile the program for the selected device.
    status = clBuildProgram( program, 1, &device, NULL, NULL, NULL );
    if(status != 0)
        {
        printf("clBuild failed:%d\n", status);
        // Initialised so that an empty string is printed, rather than stack
        // garbage, if the build log cannot be retrieved.
        char tbuf[0x10000] = "";
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(tbuf), tbuf, NULL);
        printf("\n%s\n", tbuf);
        return -1;
        }

    // Create the kernel object for the "vecadd" entry point.
    cl_kernel kernel = clCreateKernel( program, "vecadd", NULL );
    if(kernel == NULL)
        {
        // Setting arguments on a null kernel would only fail later, less clearly.
        printf("Error: Failed to create kernel vecadd\n");
        return -1;
        }
    // Bind the two input buffers and the output buffer as kernel arguments.
    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*) &clbuf1);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &clbuf2);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &buffer);

注意：在執行kernel時候，我們只設置了global work items數量，沒有設置group size，這時候，系統會使用默認的work group size，通常可能是256之類的。

    //執行kernel,Range用1維，work itmes size為BUFSIZE
    cl_event ev;
    size_t global_work_size = BUFSIZE;
    clEnqueueNDRangeKernel( queue,
        kernel,
        1,
        NULL,
        &global_work_size,
        NULL, 0, NULL, &ev);
   status = clFlush( queue );
   waitForEventAndRelease(&ev);

//數據拷回host內存
cl_float *ptr;

    cl_event mapevt;
    ptr = (cl_float *) clEnqueueMapBuffer( queue,
        buffer,
        CL_TRUE,
        CL_MAP_READ,
        0,
        BUFSIZE * sizeof(cl_float),
        0, NULL, NULL, NULL );

status = clFlush( queue );
waitForEventAndRelease(&mapevt);

    //結果驗證，和cpu計算的結果比較
    if(!memcmp(buf, ptr, BUFSIZE))
        printf("Verify passed\n");
    else printf("verify failed");

    if(buf)
        free(buf);
    if(buf1)
        free(buf1);
    if(buf2)
        free(buf2);

程序結束後，這些opencl對象一般會自動釋放，但是為了程序完整，養成一個好習慣，這兒我加上了手動釋放opencl對象的代碼。

    // Release every OpenCL object created above: memory objects, kernel,
    // program, command queue and context.
    clReleaseMemObject(clbuf1);
    clReleaseMemObject(clbuf2);
    clReleaseMemObject(buffer);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    return 0;
    }

程序執行后的界面如下：

完整的代碼請參考：

工程文件gclTutorial1

代碼下載：

http://files.cnblogs.com/mikewolf2002/gclTutorial.zip

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 OpenCL 學習step by step (1) 安裝AMD OpenCL APP ASP.NET MVC3 Step By Step 學習筆記 (2)——創建簡單MVC3應用程序 Django 學習筆記之六建立一個簡單的博客應用程序通過編寫一個簡單的漏洞掃描程序學習Python基本語句 OpenCl入門——實現簡單卷積 STEP標准的簡單介紹一個簡單的LED程序 Step by step 如何創建一個新森林 Step by Step 創建一個WCF Service OpenCL