▶ 包括帶有 CL_MEM_READ_ONLY、CL_MEM_WRITE_ONLY、CL_MEM_READ_WRITE 標識的顯式拷貝(函數 clEnqueueWriteBuffer 和 clEnqueueReadBuffer);帶有 CL_MEM_COPY_HOST_PTR 標識的隱式拷貝(不用拷貝函數,在設備上直接使用);以及使用函數 clEnqueueMapBuffer 直接在設備和主機之間映射(轉換)一段內存的指針
● 代碼
1 #include <cl.h> 2 #include <stdio.h> 3 #include <stdlib.h> 4 #include <time.h> 5 #include <iostream> 6 7 using namespace std; 8 const int nElement = 2048; 9 10 const char *programSource = " \ 11 __kernel void vectorAdd(__global int *A, __global int *B, __global int *C) \ 12 { \ 13 int idx = get_global_id(0); \ 14 C[idx] = A[idx] + B[idx]; \ 15 return; \ 16 } \ 17 "; 18 19 int main(int argc, char* argv[]) 20 { 21 const size_t dataSize = sizeof(int) * nElement; 22 int i, *A, *B, *C, *returnC; 23 24 A = (int *)malloc(dataSize); 25 B = (int *)malloc(dataSize); 26 C = (int *)malloc(dataSize); 27 for (srand((unsigned)time(NULL)), i = 0; i < nElement; A[i] = rand() % 65535, B[i] = rand() % 65535, C[i] = A[i] + B[i], i++); 28 29 cl_int status; 30 cl_platform_id platform; 31 clGetPlatformIDs(1, &platform, NULL); 32 cl_device_id device; 33 clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL); 34 cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &status); 35 cl_command_queue queue = clCreateCommandQueue(context, device, 0, &status); 36 cl_program program = clCreateProgramWithSource(context, 1, &programSource, NULL, &status); 37 clBuildProgram(program, 1, &device, NULL, NULL, NULL); 38 cl_kernel kernel = clCreateKernel(program, "vectorAdd", NULL); 39 cl_event writeEvent, runEvent, mapEvent; 40 41 //創建三個內存對象,把 A 隱式拷貝到 clA,把 B 顯示拷貝到 clB,clC 接收計算結果映射給 returnC 42 cl_mem clA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, dataSize, A, NULL); 43 cl_mem clB = clCreateBuffer(context, CL_MEM_READ_ONLY, dataSize, NULL, NULL); 44 cl_mem clC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, dataSize, NULL, NULL); 45 46 clEnqueueWriteBuffer(queue, clB, 1, 0, dataSize, B, 0, 0, &writeEvent); 47 48 clFlush(queue);// 出現需要等待的事件之前,把隊列中的任務全部提交掉 49 clWaitForEvents(1, &writeEvent);// 等待指定事件完成 50 51 // 執行內核 52 size_t global_work_size = nElement; 53 clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&clA); 54 clSetKernelArg(kernel, 1, sizeof(cl_mem), 
(void*)&clB); 55 clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&clC); 56 clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_work_size, NULL, 0, NULL, &runEvent); 57 clFlush(queue); 58 clWaitForEvents(1, &runEvent); 59 60 // 結果拷回 61 returnC = (cl_int *)clEnqueueMapBuffer(queue, clC, CL_TRUE, CL_MAP_READ, 0, dataSize, 0, NULL, &mapEvent, NULL); 62 clFlush(queue); 63 clWaitForEvents(1, &mapEvent); 64 65 //結果驗證 66 printf("Verify %s.\n", !memcmp(C, returnC, dataSize) ? "passed" : "failed");// 定義在 iostream 67 68 free(C); 69 free(A); 70 free(B); 71 clReleaseMemObject(clA); 72 clReleaseMemObject(clB); 73 clReleaseMemObject(clC); 74 clReleaseContext(context); 75 clReleaseCommandQueue(queue); 76 clReleaseProgram(program); 77 clReleaseEvent(writeEvent); 78 clReleaseEvent(runEvent); 79 clReleaseEvent(mapEvent); 80 getchar(); 81 return 0; 82 }
● 輸出結果
Verify passed.
● 注意
■ 事件在聲明以後,一定要先通過某個函數的 cl_event * /* event */ 參數給它賦值,才能對它調用 clWaitForEvents 或 clReleaseEvent,否則會報錯
● 使用 CL_MEM_COPY_HOST_PTR 和 CL_MEM_USE_HOST_PTR 的區別:前者創建一個獨立的緩衝區,只是使用了 host_ptr 的值來初始化,後續使用過程中主機端對 host_ptr 的修改不會影響到緩衝區的內容;後者直接使用 host_ptr (轉化為設備指針來使用),後續使用過程中主機端對 host_ptr 的修改仍會影響緩衝區內容(注意:OpenCL 規範允許實現把 host_ptr 的內容緩存在設備內存中,因此嚴格來說主機端應在 map/unmap 之間修改,且 host_ptr 必須在緩衝區釋放之前保持有效)
● 代碼
1 #include <cl.h> 2 #include <stdio.h> 3 #include <stdlib.h> 4 5 using namespace std; 6 const int nElement = 2048; 7 8 const char *programSource = " \ 9 __kernel void vectorAdd(__global int *A) \ 10 { \ 11 return; \ 12 } \ 13 "; 14 15 int main(int argc, char* argv[]) 16 { 17 const size_t dataSize = sizeof(int) * nElement; 18 int i, sum, *A, *returnA, *mapReturnA; 19 20 A = (int *)malloc(dataSize); 21 returnA = (int *)malloc(dataSize); 22 //mapReturnA = (int *)malloc(dataSize);// 注意用作內存映射的指針不需要申請內存,使用這個表達式會造成結尾處的運行時錯誤 23 for (i = 0; i < nElement; A[i++] = 1); 24 25 cl_int status; 26 cl_platform_id platform; 27 clGetPlatformIDs(1, &platform, NULL); 28 cl_device_id device; 29 clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL); 30 cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &status); 31 cl_command_queue queue = clCreateCommandQueue(context, device, 0, &status); 32 cl_program program = clCreateProgramWithSource(context, 1, &programSource, NULL, &status); 33 status = clBuildProgram(program, 1, &device, NULL, NULL, NULL); 34 cl_kernel kernel = clCreateKernel(program, "vectorAdd", &status); 35 36 cl_mem clA = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, dataSize, A, NULL); 37 //cl_mem clA = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, dataSize, A, NULL); 38 39 for (i = 0; i < nElement; A[i++]++);// 在創建緩沖區以后再修改 A 的值 40 41 size_t global_work_size = nElement; 42 clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&clA); 43 clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL); 44 45 clEnqueueReadBuffer(queue, clA, CL_TRUE, 0, dataSize, returnA, 0, NULL, NULL); 46 mapReturnA = (cl_int *)clEnqueueMapBuffer(queue, clA, CL_TRUE, CL_MAP_READ, 0, dataSize, 0, NULL, NULL, &status); 47 48 for (i = sum = 0; i < nElement; sum += A[i++]); 49 printf("sum A = %d\n", sum); 50 for (i = sum = 0; i < nElement; sum += returnA[i++]); 51 printf("sum returnA = %d\n", sum); 52 for (i = sum = 
0; i < nElement; sum += mapReturnA[i++]); 53 printf("sum mapReturnA = %d\n", sum); 54 55 free(A); 56 free(returnA); 57 clReleaseMemObject(clA); 58 clReleaseContext(context); 59 clReleaseCommandQueue(queue); 60 clReleaseProgram(program); 61 getchar(); 62 return 0; 63 }
● 輸出結果
■ 使用 CL_MEM_COPY_HOST_PTR,僅有主機端的值被修改
sum A = 4096 sum returnA = 2048 sum mapReturnA = 2048
■ 使用 CL_MEM_USE_HOST_PTR,設備端使用的緩衝區也遭到了修改
sum A = 4096 sum returnA = 4096 sum mapReturnA = 4096
