▶ 包括带有 CL_MEM_READ_ONLY、CL_MEM_WRITE_ONLY、CL_MEM_READ_WRITE 标识的显式拷贝(使用函数 clEnqueueWriteBuffer 和 clEnqueueReadBuffer);带有 CL_MEM_COPY_HOST_PTR 标识的隐式拷贝(不用拷贝函数,在设备上直接使用);以及使用函数 clEnqueueMapBuffer 直接在设备和主机之间映射(转换)一段内存的指针。
● 代码
1 #include <cl.h> 2 #include <stdio.h> 3 #include <stdlib.h> 4 #include <time.h> 5 #include <iostream> 6 7 using namespace std; 8 const int nElement = 2048; 9 10 const char *programSource = " \ 11 __kernel void vectorAdd(__global int *A, __global int *B, __global int *C) \ 12 { \ 13 int idx = get_global_id(0); \ 14 C[idx] = A[idx] + B[idx]; \ 15 return; \ 16 } \ 17 "; 18 19 int main(int argc, char* argv[]) 20 { 21 const size_t dataSize = sizeof(int) * nElement; 22 int i, *A, *B, *C, *returnC; 23 24 A = (int *)malloc(dataSize); 25 B = (int *)malloc(dataSize); 26 C = (int *)malloc(dataSize); 27 for (srand((unsigned)time(NULL)), i = 0; i < nElement; A[i] = rand() % 65535, B[i] = rand() % 65535, C[i] = A[i] + B[i], i++); 28 29 cl_int status; 30 cl_platform_id platform; 31 clGetPlatformIDs(1, &platform, NULL); 32 cl_device_id device; 33 clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL); 34 cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &status); 35 cl_command_queue queue = clCreateCommandQueue(context, device, 0, &status); 36 cl_program program = clCreateProgramWithSource(context, 1, &programSource, NULL, &status); 37 clBuildProgram(program, 1, &device, NULL, NULL, NULL); 38 cl_kernel kernel = clCreateKernel(program, "vectorAdd", NULL); 39 cl_event writeEvent, runEvent, mapEvent; 40 41 //创建三个内存对象,把 A 隐式拷贝到 clA,把 B 显示拷贝到 clB,clC 接收计算结果映射给 returnC 42 cl_mem clA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, dataSize, A, NULL); 43 cl_mem clB = clCreateBuffer(context, CL_MEM_READ_ONLY, dataSize, NULL, NULL); 44 cl_mem clC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, dataSize, NULL, NULL); 45 46 clEnqueueWriteBuffer(queue, clB, 1, 0, dataSize, B, 0, 0, &writeEvent); 47 48 clFlush(queue);// 出现需要等待的事件之前,把队列中的任务全部提交掉 49 clWaitForEvents(1, &writeEvent);// 等待指定事件完成 50 51 // 执行内核 52 size_t global_work_size = nElement; 53 clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&clA); 54 clSetKernelArg(kernel, 1, sizeof(cl_mem), 
(void*)&clB); 55 clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&clC); 56 clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_work_size, NULL, 0, NULL, &runEvent); 57 clFlush(queue); 58 clWaitForEvents(1, &runEvent); 59 60 // 结果拷回 61 returnC = (cl_int *)clEnqueueMapBuffer(queue, clC, CL_TRUE, CL_MAP_READ, 0, dataSize, 0, NULL, &mapEvent, NULL); 62 clFlush(queue); 63 clWaitForEvents(1, &mapEvent); 64 65 //结果验证 66 printf("Verify %s.\n", !memcmp(C, returnC, dataSize) ? "passed" : "failed");// 定义在 iostream 67 68 free(C); 69 free(A); 70 free(B); 71 clReleaseMemObject(clA); 72 clReleaseMemObject(clB); 73 clReleaseMemObject(clC); 74 clReleaseContext(context); 75 clReleaseCommandQueue(queue); 76 clReleaseProgram(program); 77 clReleaseEvent(writeEvent); 78 clReleaseEvent(runEvent); 79 clReleaseEvent(mapEvent); 80 getchar(); 81 return 0; 82 }
● 输出结果
Verify passed.
● 注意
■ 事件(cl_event)在声明以后,必须先作为某个入队函数(例如 clEnqueueWriteBuffer、clEnqueueNDRangeKernel、clEnqueueMapBuffer)的 cl_event * /* event */ 参数传入,由该函数为它赋值,之后才能对它调用 clWaitForEvents 或 clReleaseEvent,否则会报错。
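● CL_MEM_COPY_HOST_PTR 和 CL_MEM_USE_HOST_PTR 的区别:前者创建一个独立的缓冲区,只是使用 host_ptr 指向的数据来初始化,后续使用过程中主机端对 host_ptr 的修改不会影响到缓冲区的内容;后者直接使用 host_ptr 指向的内存(转化为设备指针来使用),后续使用过程中主机端对 host_ptr 的修改仍会影响缓冲区内容。(注意:OpenCL 规范允许实现把 CL_MEM_USE_HOST_PTR 缓冲区的内容缓存在设备内存中,因此主机端的修改何时对设备可见取决于具体实现;规范的做法是通过映射/解除映射来同步。)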
● 代码
1 #include <cl.h> 2 #include <stdio.h> 3 #include <stdlib.h> 4 5 using namespace std; 6 const int nElement = 2048; 7 8 const char *programSource = " \ 9 __kernel void vectorAdd(__global int *A) \ 10 { \ 11 return; \ 12 } \ 13 "; 14 15 int main(int argc, char* argv[]) 16 { 17 const size_t dataSize = sizeof(int) * nElement; 18 int i, sum, *A, *returnA, *mapReturnA; 19 20 A = (int *)malloc(dataSize); 21 returnA = (int *)malloc(dataSize); 22 //mapReturnA = (int *)malloc(dataSize);// 注意用作内存映射的指针不需要申请内存,使用这个表达式会造成结尾处的运行时错误 23 for (i = 0; i < nElement; A[i++] = 1); 24 25 cl_int status; 26 cl_platform_id platform; 27 clGetPlatformIDs(1, &platform, NULL); 28 cl_device_id device; 29 clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL); 30 cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &status); 31 cl_command_queue queue = clCreateCommandQueue(context, device, 0, &status); 32 cl_program program = clCreateProgramWithSource(context, 1, &programSource, NULL, &status); 33 status = clBuildProgram(program, 1, &device, NULL, NULL, NULL); 34 cl_kernel kernel = clCreateKernel(program, "vectorAdd", &status); 35 36 cl_mem clA = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, dataSize, A, NULL); 37 //cl_mem clA = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, dataSize, A, NULL); 38 39 for (i = 0; i < nElement; A[i++]++);// 在创建缓冲区以后再修改 A 的值 40 41 size_t global_work_size = nElement; 42 clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&clA); 43 clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL); 44 45 clEnqueueReadBuffer(queue, clA, CL_TRUE, 0, dataSize, returnA, 0, NULL, NULL); 46 mapReturnA = (cl_int *)clEnqueueMapBuffer(queue, clA, CL_TRUE, CL_MAP_READ, 0, dataSize, 0, NULL, NULL, &status); 47 48 for (i = sum = 0; i < nElement; sum += A[i++]); 49 printf("sum A = %d\n", sum); 50 for (i = sum = 0; i < nElement; sum += returnA[i++]); 51 printf("sum returnA = %d\n", sum); 52 for (i = sum = 
0; i < nElement; sum += mapReturnA[i++]); 53 printf("sum mapReturnA = %d\n", sum); 54 55 free(A); 56 free(returnA); 57 clReleaseMemObject(clA); 58 clReleaseContext(context); 59 clReleaseCommandQueue(queue); 60 clReleaseProgram(program); 61 getchar(); 62 return 0; 63 }
● 输出结果
■ 使用 CL_MEM_COPY_HOST_PTR,仅有主机端的值被修改
sum A = 4096
sum returnA = 2048
sum mapReturnA = 2048
■ 使用 CL_MEM_USE_HOST_PTR,设备端使用的缓冲区也遭到了修改
sum A = 4096
sum returnA = 4096
sum mapReturnA = 4096