本文編寫一個計算兩個數組之和的程序,分別用CPU和GPU進行運算,統計各自的運算時間,並校驗最後的運算結果。文中代碼偏多,原理部分建議閱讀下面的文章,其中介紹了OpenCL相關的名詞和概念。
http://opencl.codeplex.com/wikipage?title=OpenCL%20Tutorials%20-%201 (英文版)
http://www.cnblogs.com/leiben/archive/2012/06/05/2536508.html (博友翻譯的中文版)
一、創建工程
按照《OpenCL入門:(一:Intel核心顯卡OpenCL環境搭建)》中的步驟創建一個名為OpenCLSum的工程,並添加一個OpenCLSum.cpp文件和一個OpenCLSum.cl文件(添加時選擇添加OpenCL文件)。
二、CPU計算代碼
用CPU求兩個數組和的代碼如下:
// Host-side reference implementation of the element-wise vector sum.
//
// nums1, nums2: input arrays, each holding at least `num` floats.
// sum:          output array with room for at least `num` floats;
//               sum[k] receives nums1[k] + nums2[k] for k in [0, num).
// num:          number of elements to process; nothing is written when
//               it is zero or negative.
void RunAsCpu(
    const float *nums1,
    const float *nums2,
    float* sum,
    const int num)
{
    const float *lhs = nums1;
    const float *rhs = nums2;
    float *out = sum;
    int remaining = num;

    // Walk the three arrays in lock-step, front to back.
    while (remaining > 0)
    {
        *out = *lhs + *rhs;
        ++out;
        ++lhs;
        ++rhs;
        --remaining;
    }
}
三、GPU計算代碼
在cl文件中添加如下代碼,
// A thread count has to be configured when this kernel is launched, and
// every one of those threads invokes this function once. Each invocation
// therefore only needs get_global_id to find its own thread id and add
// the pair of elements stored at that position.
__kernel void RunAsGpu(
    __global const float *nums1,
    __global const float *nums2,
    __global float* sum)
{
    const int index = get_global_id(0);
    const float lhs = nums1[index];
    const float rhs = nums2[index];
    sum[index] = lhs + rhs;
}
四、主函數流程
流程請參考本文開頭推薦的文章,其中有詳細說明,下面只在註釋中簡單說明。
//計時函數 double time_stamp() { LARGE_INTEGER curclock; LARGE_INTEGER freq; if ( !QueryPerformanceCounter(&curclock) || !QueryPerformanceFrequency(&freq) ) { return -1; } return double(curclock.QuadPart) / freq.QuadPart; } #define OPENCL_CHECK_ERRORS(ERR) \ if(ERR != CL_SUCCESS) \ { \ cerr \ << "OpenCL error with code " << ERR \ << " happened in file " << __FILE__ \ << " at line " << __LINE__ \ << ". Exiting...\n"; \ exit(1); \ } int main(int argc, const char** argv) { cl_int error = 0; // Used to handle error codes cl_context context; cl_command_queue queue; cl_device_id device; // 遍歷系統中所有OpenCL平台 cl_uint num_of_platforms = 0; // 得到平台數目 error = clGetPlatformIDs(0, 0, &num_of_platforms); OPENCL_CHECK_ERRORS(error); cout << "可用平台數: " << num_of_platforms << endl; cl_platform_id* platforms = new cl_platform_id[num_of_platforms]; // 得到所有平台的ID error = clGetPlatformIDs(num_of_platforms, platforms, 0); OPENCL_CHECK_ERRORS(error); //遍歷平台,選擇一個Intel平台的 cl_uint selected_platform_index = num_of_platforms; for (cl_uint i = 0; i < num_of_platforms; ++i) { size_t platform_name_length = 0; error = clGetPlatformInfo( platforms[i], CL_PLATFORM_NAME, 0, 0, &platform_name_length ); OPENCL_CHECK_ERRORS(error); // 調用兩次,第一次是得到名稱的長度 char* platform_name = new char[platform_name_length]; error = clGetPlatformInfo( platforms[i], CL_PLATFORM_NAME, platform_name_length, platform_name, 0 ); OPENCL_CHECK_ERRORS(error); cout << " [" << i << "] " << platform_name; if ( strstr(platform_name, "Intel") && selected_platform_index == num_of_platforms // have not selected yet ) { cout << " [Selected]"; selected_platform_index = i; } cout << endl; delete[] platform_name; } if (selected_platform_index == num_of_platforms) { cerr << "沒有找到Intel平台\n"; return 1; } // Device cl_platform_id platform = platforms[selected_platform_index]; error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); OPENCL_CHECK_ERRORS(error) // Context context = clCreateContext(0, 1, &device, NULL, NULL, &error); 
OPENCL_CHECK_ERRORS(error) // Command-queue queue = clCreateCommandQueue(context, device, 0, &error); OPENCL_CHECK_ERRORS(error) //下面初始化測試數據(主機數據) const int size = 38888888;//大小和內存有關,僅作示例 float* nums1_h = new float[size]; float* nums2_h = new float[size]; float* sum_h = new float[size]; // Initialize both vectors for (int i = 0; i < size; i++) { nums1_h[i] = nums2_h[i] = (float)i; } //初始化設備數據 const int mem_size = sizeof(float)*size; // 標志位表示數據只讀,並且從nums1_h和nums2_h復制數據 cl_mem nums1_d = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, nums1_h, &error); OPENCL_CHECK_ERRORS(error) cl_mem nums2_d = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, nums2_h, &error); OPENCL_CHECK_ERRORS(error) cl_mem sum_d = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size, NULL, &error); OPENCL_CHECK_ERRORS(error) //讀取OpenCLSum.cl文件內容 FILE* fp = fopen("OpenCLSum.cl", "rb"); fseek(fp, 0, SEEK_END); size_t src_size = ftell(fp); fseek(fp, 0, SEEK_SET); const char* source = new char[src_size]; fread((void*)source, 1, src_size, fp); fclose(fp); //創建編譯運行kernel函數 cl_program program = clCreateProgramWithSource(context, 1, &source, &src_size, &error); OPENCL_CHECK_ERRORS(error) delete[] source; // Builds the program error = clBuildProgram(program, 1, &device, NULL, NULL, NULL); OPENCL_CHECK_ERRORS(error) // Shows the log char* build_log; size_t log_size; // First call to know the proper size clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); build_log = new char[log_size + 1]; // Second call to get the log clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL); build_log[log_size] = '\0'; cout << build_log << endl; delete[] build_log; // Extracting the kernel cl_kernel run_as_gpu = clCreateKernel(program, "RunAsGpu", &error); OPENCL_CHECK_ERRORS(error) //運行kernel程序 // Enqueuing parameters // Note that we inform the size of the cl_mem object, not the size of the memory pointed by 
it error = clSetKernelArg(run_as_gpu, 0, sizeof(cl_mem), &nums1_d); error |= clSetKernelArg(run_as_gpu, 1, sizeof(cl_mem), &nums2_d); error |= clSetKernelArg(run_as_gpu, 2, sizeof(cl_mem), &sum_d); OPENCL_CHECK_ERRORS(error) // Launching kernel size_t global_work_size = size; cout << "GPU 運行開始:" << time_stamp() << endl; error = clEnqueueNDRangeKernel(queue, run_as_gpu, 1, NULL, &global_work_size, NULL, 0, NULL, NULL); cout << "GPU 運行結束:" << time_stamp() << endl; OPENCL_CHECK_ERRORS(error) //取得kernel返回值 float* gpu_sum = new float[size]; clEnqueueReadBuffer(queue, sum_d, CL_TRUE, 0, mem_size, gpu_sum, 0, NULL, NULL); cout << "CPU 運行開始:" << time_stamp() << endl; RunAsCpu(nums1_h, nums2_h, sum_h, size); cout << "CPU 運行結束:" << time_stamp() << endl; assert(memcmp(sum_h, gpu_sum, size * sizeof(float)) == 0); delete[] nums1_h; delete[] nums2_h; delete[] sum_h; delete[] gpu_sum; delete[] platforms; clReleaseKernel(run_as_gpu); clReleaseCommandQueue(queue); clReleaseContext(context); clReleaseMemObject(nums1_d); clReleaseMemObject(nums2_d); clReleaseMemObject(sum_d); return 0;
五、運行結果
由於運算比較簡單,CPU和GPU幾乎沒差別,在後續複雜運算中應該是會有差別的。
六、相關下載
七、後續
看了幾篇文章後,感覺簡單使用OpenCL並不複雜;OpenCL的關鍵應該在於如何優化性能、如何調用kernel函數,才能讓GPU的效果達到最優。以後的文章一部分會涉及OpenCL原理,一部分會涉及更複雜的運算。當然,博主也還在學習階段,沒有練手項目,只能從官方demos中找找了。