Windows 7中使用AMD APP OpenCL的一個簡單例子

本文轉載自查看原文 2013-06-14 19:05 8083 Windows開發相關

自從Apple從08年正式將自己的OpenCL提交到Khronos Group開放標准組織后，先后獲得AMD、nVidia、Intel等大公司的支持。OpenCL能充分發揮GPU數據密集型大規模計算的能力，從而使得很多多媒體應用乃至科學計算能獲得大幅度的性能提升。

這里將主要介紹如何在Windows 7中使用AMD APP SDK中的OpenCL。

首先，我們可以先去AMD開發者官網——developer.amd.com，到這個網頁http://developer.amd.com/tools-and-sdks/heterogeneous-computing/amd-accelerated-parallel-processing-app-sdk/，根據你的系統來選擇下載AMD APP SDK。如果你使用的是Windows 7操作系統，那么在你安裝完畢后，安裝包會自動在系統環境變量中添加AMDAPPSDKROOT，我們后面將會利用這個環境變量來找包含的頭文件路徑以及連接庫的路徑。

然后，我們得有Visual Studio 2012 Express Edition或Professional Edition。我們可以先創建一個Win32 Console項目或Win32 Application項目，然后在菜單欄Project中的<項目名> Properties中找到C/C++，然后再選到General，在Additional Include Directories中輸入$(AMDAPPSDKROOT)\include。

然后我們再找到Preprocessor，在Preprocessor Definitions中添加宏_CRT_SECURE_NO_WARNINGS，這個宏用來關閉VC編譯器對fopen等C標准庫函數的安全警告，稍后代碼中讀取內核文件時會用到fopen。

我們再來，找到Linker，點擊后找到Additional Library Directories，添加$(AMDAPPSDKROOT)\lib\x86。

然后我們再點擊Input，在Additional Dependencies中添加OpenCL.lib。

這樣，我們就把所有的准備工作做好了。

接下來，我們可以先寫這個例子所需要的OpenCL Kernel代碼：

/*
 * Element-wise integer addition: dst[i] = src1[i] + src2[i].
 *
 * Each work-item handles the single element selected by its global ID in
 * dimension 0. There is no bounds check here, so the host is expected to
 * launch no more work-items than the buffers have elements.
 *
 * dst   output buffer (written)
 * src1  first input buffer (read)
 * src2  second input buffer (read)
 */
__kernel void MyCLAdd(__global int *dst, __global int *src1, __global int *src2)
{
    int item = get_global_id(0);
    int sum = src1[item] + src2[item];

    dst[item] = sum;
}

我們將上述代碼保存為cl_kernel.cl，放在這個工程放資源文件和源代碼的文件夾下。在VC++工程中，我們可以將這個文件添加到Resource篩選器下。

接下來，我們可以寫main函數，或其它什么函數來創建並運行這段OpenCL內核代碼。

#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
using namespace std;

int main(void)
{
        cl_uint numPlatforms = 0;           //the NO. of platforms
        cl_platform_id platform = nullptr;  //the chosen platform
        cl_context context = nullptr;       // OpenCL context
        cl_command_queue commandQueue = nullptr;
        cl_program program = nullptr;       // OpenCL kernel program object that'll be running on the compute device
        cl_mem input1MemObj = nullptr;      // input1 memory object for input argument 1
        cl_mem input2MemObj = nullptr;      // input2 memory object for input argument 2
        cl_mem outputMemObj = nullptr;      // output memory object for output
        cl_kernel kernel = nullptr;         // kernel object

        cl_int    status = clGetPlatformIDs(0, NULL, &numPlatforms);
        if (status != CL_SUCCESS)
        {
            cout<<"Error: Getting platforms!"<<endl;
            return 0;
        }

        /*For clarity, choose the first available platform. */
        if(numPlatforms > 0)
        {
            cl_platform_id* platforms = (cl_platform_id* )malloc(numPlatforms* sizeof(cl_platform_id));
            status = clGetPlatformIDs(numPlatforms, platforms, NULL);
            platform = platforms[0];
            free(platforms);
        }
        else
        {
            puts("Your system does not have any OpenCL platform!");
            return 0;
        }

        /*Step 2:Query the platform and choose the first GPU device if has one.Otherwise use the CPU as device.*/
        cl_uint                numDevices = 0;
        cl_device_id        *devices;
        status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);    
        if (numDevices == 0) //no GPU available.
        {
            cout << "No GPU device available."<<endl;
            cout << "Choose CPU as default device."<<endl;
            status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 0, NULL, &numDevices);    
            devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));

            status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, numDevices, devices, NULL);
        }
        else
        {
            devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
            status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
            cout << "The number of devices: " << numDevices << endl;
        }

        /*Step 3: Create context.*/
        context = clCreateContext(NULL,1, devices,NULL,NULL,NULL);

        /*Step 4: Creating command queue associate with the context.*/
        commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);

        /*Step 5: Create program object */
        // Read the kernel code to the buffer
        FILE *fp = fopen("cl_kernel.cl", "rb");
        if(fp == nullptr)
        {
            puts("The kernel file not found!");
            goto RELEASE_RESOURCES;
        }
        fseek(fp, 0, SEEK_END);
        size_t kernelLength = ftell(fp);
        fseek(fp, 0, SEEK_SET);
        char *kernelCodeBuffer = (char*)malloc(kernelLength + 1);
        fread(kernelCodeBuffer, 1, kernelLength, fp);
        kernelCodeBuffer[kernelLength] = '\0';
        fclose(fp);
        
        const char *aSource = kernelCodeBuffer;
        program = clCreateProgramWithSource(context, 1, &aSource, &kernelLength, NULL);

        /*Step 6: Build program. */
        status = clBuildProgram(program, 1,devices,NULL,NULL,NULL);

        /*Step 7: Initial inputs and output for the host and create memory objects for the kernel*/
        int __declspec(align(32)) input1Buffer[128];    // 32 bytes alignment to improve data copy
        int __declspec(align(32)) input2Buffer[128];
        int __declspec(align(32)) outputBuffer[128];

        // Do initialization
        int i;
        for(i = 0; i < 128; i++)
            input1Buffer[i] = input2Buffer[i] = i + 1;
        memset(outputBuffer, 0, sizeof(outputBuffer));

        // Create mmory object
        input1MemObj = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, 128 * sizeof(int), input1Buffer, nullptr);
        input2MemObj = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, 128 * sizeof(int), input2Buffer, nullptr);
        outputMemObj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 128 * sizeof(int), NULL, NULL);

        /*Step 8: Create kernel object */
        kernel = clCreateKernel(program,"MyCLAdd", NULL);

        /*Step 9: Sets Kernel arguments.*/
        status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&outputMemObj);
        status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&input1MemObj);
        status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&input2MemObj);

        /*Step 10: Running the kernel.*/
        size_t global_work_size[1] = { 128 };
        status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
        clFinish(commandQueue);     // Force wait until the OpenCL kernel is completed

        /*Step 11: Read the cout put back to host memory.*/
        status = clEnqueueReadBuffer(commandQueue, outputMemObj, CL_TRUE, 0, global_work_size[0] * sizeof(int), outputBuffer, 0, NULL, NULL);

        printf("Veryfy the rsults... ");
        for(i = 0; i < 128; i++)
        {
            if(outputBuffer[i] != (i + 1) * 2)
            {
                puts("Results not correct!");
                break;
            }
        }
        if(i == 128)
            puts("Correct!");

RELEASE_RESOURCES:
        /*Step 12: Clean the resources.*/
        status = clReleaseKernel(kernel);//*Release kernel.
        status = clReleaseProgram(program);    //Release the program object.
        status = clReleaseMemObject(input1MemObj);//Release mem object.
        status = clReleaseMemObject(input2MemObj);
        status = clReleaseMemObject(outputMemObj);
        status = clReleaseCommandQueue(commandQueue);//Release  Command queue.
        status = clReleaseContext(context);//Release context.

        free(devices);
}

我們直接編譯運行即可。

從程序末尾校驗循環的輸出中，我們可以發現，輸出結果完全正確。

這里需要注意的是，這個源文件必須保存為.cpp后綴，並且得用支持C++11標准的VC編譯器，比如VS2012 。當然如果用VS2010應該也能通過編譯，盡管我還沒試過。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 Ubuntu下使用AMD APP編寫OpenCL程序 AMD OpenCL 大學課程(1) Spring MVC：使用SimpleUrlHandlerMapping的一個簡單例子一個簡單的例子說明windows環境變量配置一個簡單的cmake例子一個簡單的SpringBoot例子一個簡單的CORBA例子一個簡單的死鎖例子 [MFC]_在vs2019中使用MFC快速構建簡單windows窗口程序簡單的使用rabbitmq的例子