To optimize a kernel program, we need some knowledge of the GPU's underlying architecture. This article gives a brief introduction to GPU memory and thread scheduling, and uses a small example to show how a kernel can be optimized simply by taking the memory layout into account.
1. GPU Bus Addressing and Coalesced Memory Access
Suppose X is a pointer to an array of 32-bit integers whose first element sits at address 0x00001232. A thread that needs to access element 0 might do so like this:
    int tmp = X[0];

Now assume the memory bus is 256 bits (32 bytes) wide. Memory accesses must be aligned to the bus width, so the memory system can only fetch from addresses such as 0x00000020 or 0x00000040 (0x20 = 32 bytes = 256 bits). To read the 4 bytes at 0x00001232, it therefore has to fetch the whole range 0x00001220-0x0000123f: 32 bytes are transferred, only 4 of them are useful, and 28 bytes are wasted.
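As a rough illustration (this is not from the original example; the 256-bit bus width is just the figure assumed above), the aligned segment that a 4-byte load at 0x00001232 actually touches can be computed like this:

    // Sketch: which aligned 32-byte segment does a 4-byte load at 0x00001232 hit,
    // assuming the 256-bit (32-byte) bus width used in the example above?
    #include <cstdio>
    int main()
    {
        unsigned addr      = 0x00001232;          // address of X[0]
        unsigned seg_start = addr & ~0x1Fu;       // round down to a 32-byte boundary -> 0x00001220
        unsigned seg_end   = seg_start + 32;      // one full bus transaction          -> 0x00001240
        printf("fetch 0x%08X - 0x%08X, useful bytes: 4, wasted: 28\n",
               seg_start, seg_end - 1);
        return 0;
    }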
In practice, to make better use of the bus bandwidth, the GPU coalesces memory accesses: it merges the reads issued by multiple threads into as few transactions as possible. For example, suppose 16 threads each read 4 bytes, covering addresses 0x00001232 through 0x00001271 in total. Without coalescing, 16 separate transactions are needed, each wasting 28 bytes. With coalescing, the first transaction fetches 0x00001220-0x0000123f, the second 0x00001240-0x0000125f and the third 0x00001260-0x0000127f: three transactions in total. This greatly reduces the number of memory accesses and improves kernel performance.
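To make the idea concrete, here is a minimal sketch (not part of the original example; the kernel and buffer names are made up) of the two access patterns. In the first kernel, consecutive work-items read consecutive floats, which the hardware can merge into a few wide transactions; in the second, neighboring work-items read addresses that are far apart, so their reads cannot be merged:

    // Hypothetical kernels, for illustration only.
    // Coalesced: work-item i reads in[i]; neighbors touch adjacent addresses.
    __kernel void copy_coalesced(__global const float* in, __global float* out)
    {
        int i = get_global_id(0);
        out[i] = in[i];
    }

    // Strided: work-item i reads in[i*stride]; neighbors are far apart in memory,
    // so their reads cannot be combined into one bus transaction.
    __kernel void copy_strided(__global const float* in, __global float* out, int stride)
    {
        int i = get_global_id(0);
        out[i] = in[i * stride];
    }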
2. Performance Optimization
Consider matrix multiplication: an M×P matrix A multiplied by a P×N matrix B gives the M×N matrix C. On the CPU the computation looks like this:
    #define M 1024
    #define P 512
    #define N 2048
    void RunAsCpu(
        const float *A,
        const float *B,
        float* C)
    {
        for (int i = 0; i < M; i++)
        {
            for (int j = 0; j < N; j++)
            {
                C[i*N + j] = 0.0;
                for (int k = 0; k < P; k++)
                {
                    C[i*N + j] += A[i*P + k] * B[k*N + j];
                }
            }
        }
    }

To run this on the GPU, we remove the two outer loops and create M*N work-items instead, with the first dimension of size M and the second of size N. The kernel might look like this:
    __kernel void RunAsGpu_1(
        __global float *A,
        __global float *B,
        int M,
        int N,
        int P,
        __global float* C)
    {
        int x = get_global_id(0);
        int y = get_global_id(1);
        float sum = 0;
        for(int i = 0; i < P; i++)
        {
            sum += A[x*P + i] * B[i*N + y];
        }
        C[x*N + y] = sum;
    }

If you think about it for a moment, you may notice there is a second option: make the first dimension of size N and the second of size M. The kernel for that scheme looks like this:
    __kernel void RunAsGpu_2(
        __global float *A,
        __global float *B,
        int M,
        int N,
        int P,
        __global float* C)
    {
        int x = get_global_id(0);
        int y = get_global_id(1);
        float sum = 0;
        for(int i = 0; i < P; i++)
        {
            sum += A[y*P + i] * B[i*N + x];
        }
        C[y*N + x] = sum;
    }

The two kernels produce exactly the same result, so how do they differ in efficiency? Use the following code for the host file and run it to see:

    #include <iostream>
    #include <CL/cl.h>
    #include <cassert>
    #include <windows.h>
    #include <ctime>
    #include <cstdio>
    #include <cstring>
    using namespace std;

    #define M 1024
    #define P 512
    #define N 2048

    void RunAsCpu(
        const float *A,
        const float *B,
        float* C)
    {
        for (int i = 0; i < M; i++)
        {
            for (int j = 0; j < N; j++)
            {
                C[i*N + j] = 0.0;
                for (int k = 0; k < P; k++)
                {
                    C[i*N + j] += A[i*P + k] * B[k*N + j];
                }
            }
        }
    }

    // Timing helper
    double time_stamp()
    {
        LARGE_INTEGER curclock;
        LARGE_INTEGER freq;
        if (
            !QueryPerformanceCounter(&curclock) ||
            !QueryPerformanceFrequency(&freq)
            )
        {
            return -1;
        }
        return double(curclock.QuadPart) / freq.QuadPart;
    }

    #define OPENCL_CHECK_ERRORS(ERR)                 \
        if(ERR != CL_SUCCESS)                        \
        {                                            \
            cerr                                     \
            << "OpenCL error with code " << ERR      \
            << " happened in file " << __FILE__      \
            << " at line " << __LINE__               \
            << ". Exiting...\n";                     \
            exit(1);                                 \
        }

    int main(int argc, const char** argv)
    {
        cl_int error = 0;   // Used to handle error codes
        cl_context context;
        cl_command_queue queue;
        cl_device_id device;

        // Enumerate all OpenCL platforms on the system
        cl_uint num_of_platforms = 0;
        // Get the number of platforms
        error = clGetPlatformIDs(0, 0, &num_of_platforms);
        OPENCL_CHECK_ERRORS(error);
        cout << "Available platforms: " << num_of_platforms << endl;

        cl_platform_id* platforms = new cl_platform_id[num_of_platforms];
        // Get the IDs of all platforms
        error = clGetPlatformIDs(num_of_platforms, platforms, 0);
        OPENCL_CHECK_ERRORS(error);

        // Walk the platforms and pick an Intel one
        cl_uint selected_platform_index = num_of_platforms;
        for (cl_uint i = 0; i < num_of_platforms; ++i)
        {
            size_t platform_name_length = 0;
            error = clGetPlatformInfo(
                platforms[i],
                CL_PLATFORM_NAME,
                0,
                0,
                &platform_name_length
            );
            OPENCL_CHECK_ERRORS(error);

            // Called twice: the first call only retrieves the length of the name
            char* platform_name = new char[platform_name_length];
            error = clGetPlatformInfo(
                platforms[i],
                CL_PLATFORM_NAME,
                platform_name_length,
                platform_name,
                0
            );
            OPENCL_CHECK_ERRORS(error);

            cout << " [" << i << "] " << platform_name;
            if (
                strstr(platform_name, "Intel") &&
                selected_platform_index == num_of_platforms // have not selected yet
                )
            {
                cout << " [Selected]";
                selected_platform_index = i;
            }
            cout << endl;
            delete[] platform_name;
        }
        if (selected_platform_index == num_of_platforms)
        {
            cerr << "No Intel platform found\n";
            return 1;
        }

        // Device
        cl_platform_id platform = platforms[selected_platform_index];
        error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
        OPENCL_CHECK_ERRORS(error)

        // Context
        context = clCreateContext(0, 1, &device, NULL, NULL, &error);
        OPENCL_CHECK_ERRORS(error)

        // Command-queue; CL_QUEUE_PROFILING_ENABLE must be set for kernel timing to work
        queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &error);
        OPENCL_CHECK_ERRORS(error)

        // Initialize the test data (host side)
        float* A_h = new float[M*P];
        float* B_h = new float[P*N];
        float* C_h = new float[M*N];
        //srand((unsigned)time(NULL));
        srand(100);
        for (int i = 0; i < M*P; i++)
            A_h[i] = rand() % 50;
        for (int i = 0; i < P*N; i++)
            B_h[i] = rand() % 50;

        // Initialize the device data
        // The flags mark the buffers as read-only and copy the data from A_h / B_h
        cl_mem A_d = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float)*M*P, A_h, &error);
        OPENCL_CHECK_ERRORS(error)
        cl_mem B_d = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float)*P*N, B_h, &error);
        OPENCL_CHECK_ERRORS(error)
        cl_mem C_d = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float)*M*N, NULL, &error);
        OPENCL_CHECK_ERRORS(error)

        cout << "CPU run start: " << time_stamp() << endl;
        RunAsCpu(A_h, B_h, C_h);
        cout << "CPU run end: " << time_stamp() << endl;

        // Read the contents of OpenCLMulMatrix.cl
        FILE* fp = fopen("OpenCLMulMatrix.cl", "rb");
        fseek(fp, 0, SEEK_END);
        size_t src_size = ftell(fp);
        fseek(fp, 0, SEEK_SET);
        const char* source = new char[src_size];
        fread((void*)source, 1, src_size, fp);
        fclose(fp);

        // Create and build the kernel program
        cl_program program = clCreateProgramWithSource(context, 1, &source, &src_size, &error);
        OPENCL_CHECK_ERRORS(error)
        delete[] source;

        // Builds the program
        error = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
        OPENCL_CHECK_ERRORS(error)

        // Shows the log
        char* build_log;
        size_t log_size;
        // First call to know the proper size
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        build_log = new char[log_size + 1];
        // Second call to get the log
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
        build_log[log_size] = '\0';
        cout << build_log << endl;
        delete[] build_log;

        // Extracting the kernel
        cl_kernel run_as_gpu_1 = clCreateKernel(program, "RunAsGpu_1", &error);
        OPENCL_CHECK_ERRORS(error)
        // Set the kernel arguments
        cl_int M_d = M;
        cl_int P_d = P;
        cl_int N_d = N;
        error = clSetKernelArg(run_as_gpu_1, 0, sizeof(cl_mem), &A_d);
        error |= clSetKernelArg(run_as_gpu_1, 1, sizeof(cl_mem), &B_d);
        error |= clSetKernelArg(run_as_gpu_1, 2, sizeof(int), &M_d);
        error |= clSetKernelArg(run_as_gpu_1, 3, sizeof(int), &N_d);
        error |= clSetKernelArg(run_as_gpu_1, 4, sizeof(int), &P_d);
        error |= clSetKernelArg(run_as_gpu_1, 5, sizeof(cl_mem), &C_d);
        OPENCL_CHECK_ERRORS(error)

        // Launch the kernel
        size_t globalws_1[2] = { M, N };
        cl_event ev;
        error = clEnqueueNDRangeKernel(queue, run_as_gpu_1, 2, NULL, globalws_1, NULL, 0, NULL, &ev);
        clFinish(queue);
        OPENCL_CHECK_ERRORS(error)
        // Compute the kernel execution time
        cl_ulong startTime, endTime;
        clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START,
            sizeof(cl_ulong), &startTime, NULL);
        clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END,
            sizeof(cl_ulong), &endTime, NULL);
        cl_ulong kernelExecTimeNs = endTime - startTime;
        printf("Gpu_1 run time: %8.6f ms\n", kernelExecTimeNs*1e-6);

        // Read back the kernel result
        float* gpu_C_1 = new float[M*N];
        clEnqueueReadBuffer(queue, C_d, CL_TRUE, 0, M*N*sizeof(float), gpu_C_1, 0, NULL, NULL);
        assert(memcmp(C_h, gpu_C_1, M*N * sizeof(float)) == 0);

        // Extracting the kernel
        cl_kernel run_as_gpu_2 = clCreateKernel(program, "RunAsGpu_2", &error);
        OPENCL_CHECK_ERRORS(error)
        // Set the kernel arguments
        error = clSetKernelArg(run_as_gpu_2, 0, sizeof(cl_mem), &A_d);
        error |= clSetKernelArg(run_as_gpu_2, 1, sizeof(cl_mem), &B_d);
        error |= clSetKernelArg(run_as_gpu_2, 2, sizeof(int), &M_d);
        error |= clSetKernelArg(run_as_gpu_2, 3, sizeof(int), &N_d);
        error |= clSetKernelArg(run_as_gpu_2, 4, sizeof(int), &P_d);
        error |= clSetKernelArg(run_as_gpu_2, 5, sizeof(cl_mem), &C_d);
        OPENCL_CHECK_ERRORS(error)

        // Launch the kernel
        size_t globalws_2[2] = { N, M };
        error = clEnqueueNDRangeKernel(queue, run_as_gpu_2, 2, NULL, globalws_2, NULL, 0, NULL, &ev);
        clFinish(queue);
        OPENCL_CHECK_ERRORS(error)
        // Compute the kernel execution time
        clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START,
            sizeof(cl_ulong), &startTime, NULL);
        clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END,
            sizeof(cl_ulong), &endTime, NULL);
        kernelExecTimeNs = endTime - startTime;
        printf("Gpu_2 run time: %8.6f ms\n", kernelExecTimeNs*1e-6);

        // Read back the kernel result
        float* gpu_C_2 = new float[M*N];
        clEnqueueReadBuffer(queue, C_d, CL_TRUE, 0, M*N * sizeof(float), gpu_C_2, 0, NULL, NULL);
        assert(memcmp(C_h, gpu_C_2, M*N * sizeof(float)) == 0);

        error = clEnqueueNDRangeKernel(queue, run_as_gpu_1, 2, NULL, globalws_1, NULL, 0, NULL, &ev);
        clFinish(queue);
        OPENCL_CHECK_ERRORS(error)
        // Compute the kernel execution time
        clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START,
            sizeof(cl_ulong), &startTime, NULL);
        clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END,
            sizeof(cl_ulong), &endTime, NULL);
        kernelExecTimeNs = endTime - startTime;
        printf("Gpu_1 run time: %8.6f ms\n", kernelExecTimeNs*1e-6);

        delete[] A_h;
        delete[] B_h;
        delete[] C_h;
        delete[] gpu_C_1;
        delete[] gpu_C_2;
        delete[] platforms;
        clReleaseKernel(run_as_gpu_1);
        clReleaseKernel(run_as_gpu_2);
        clReleaseCommandQueue(queue);
        clReleaseContext(context);
        clReleaseMemObject(A_d);
        clReleaseMemObject(B_d);
        clReleaseMemObject(C_d);
        return 0;
    }
3. Results
From the timing output we can see that although the two schemes produce the same result, their efficiency differs considerably. Why? As explained above, the GPU coalesces memory accesses to improve performance, and in the multi-dimensional case memory is laid out in row-major order: in a two-dimensional array with 5 columns, element (0,0) is followed in memory by (0,1), (0,2), (0,3), (0,4), then (1,0), and so on.
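For reference, a tiny sketch of the row-major addressing rule, which is exactly the indexing already used by A[i*P + k], B[k*N + j] and C[i*N + j] above (the helper name is made up for illustration):

    // Row-major addressing: element (row, col) of a matrix with numCols columns.
    inline int RowMajorIndex(int row, int col, int numCols)
    {
        return row * numCols + col;   // e.g. with 5 columns: (0,0)->0, (0,4)->4, (1,0)->5
    }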
When the GPU executes the work-items, it traverses the first dimension (dimension 0) first and then the second. So in the first scheme, where dimension 0 has size M and dimension 1 has size N, the accesses to B and C cannot be coalesced: the elements are visited in the order 00, 10, 20, 30, 40, 01, 11, 21, ... that is, column by column.
In the second scheme, the accesses to B and C can be coalesced: the elements are visited in the order 00, 01, 02, 03, 04, 10, 11, 12, ... that is, row by row.
Coalesced access reduces the number of memory requests and therefore improves performance.
4. Another Example
As an exercise, try adding the following kernel and measure its runtime:
    __kernel void RunAsGpu_3(
        __global float *A,
        __global float *B,
        int M,
        int N,
        int P,
        __global float* C)
    {
        int x = get_global_id(0);
        int y = get_global_id(1);
        C[x*N + y] = 0;
        for(int i = 0; i < P; i++)
        {
            C[x*N + y] += A[x*P + i] * B[i*N + y];
        }
    }
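Note that, unlike RunAsGpu_1, this kernel accumulates directly into the global buffer C instead of a private variable, so it issues a global write on every loop iteration. A sketch of the host-side additions needed to time it (mirroring the existing run_as_gpu_1 code, so variables such as program, queue, A_d, B_d, C_d, M_d, N_d, P_d, globalws_1, ev, error, startTime and endTime are assumed to already exist) might look like this:

    // Hypothetical host-side snippet for timing RunAsGpu_3; same pattern as above.
    cl_kernel run_as_gpu_3 = clCreateKernel(program, "RunAsGpu_3", &error);
    OPENCL_CHECK_ERRORS(error)
    error  = clSetKernelArg(run_as_gpu_3, 0, sizeof(cl_mem), &A_d);
    error |= clSetKernelArg(run_as_gpu_3, 1, sizeof(cl_mem), &B_d);
    error |= clSetKernelArg(run_as_gpu_3, 2, sizeof(int), &M_d);
    error |= clSetKernelArg(run_as_gpu_3, 3, sizeof(int), &N_d);
    error |= clSetKernelArg(run_as_gpu_3, 4, sizeof(int), &P_d);
    error |= clSetKernelArg(run_as_gpu_3, 5, sizeof(cl_mem), &C_d);
    OPENCL_CHECK_ERRORS(error)
    // Same global work size as RunAsGpu_1: dimension 0 is M, dimension 1 is N.
    error = clEnqueueNDRangeKernel(queue, run_as_gpu_3, 2, NULL, globalws_1, NULL, 0, NULL, &ev);
    clFinish(queue);
    OPENCL_CHECK_ERRORS(error)
    clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL);
    clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
    printf("Gpu_3 run time: %8.6f ms\n", (endTime - startTime) * 1e-6);
    clReleaseKernel(run_as_gpu_3);

Remember that the kernel source also has to be appended to OpenCLMulMatrix.cl so that clCreateKernel can find "RunAsGpu_3".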