clEnqueueNDRangeKernel
cl_int clEnqueueNDRangeKernel (
    cl_command_queue command_queue,   // 命令队列
    cl_kernel kernel,                 // 核
    cl_uint work_dim,                 // 数据的维度
    const size_t *global_work_offset, // 各维度上的全局ID偏移量
    const size_t *global_work_size,   // 各维度上的工作项数量
    const size_t *local_work_size,    // 各维度上一个工作组中工作项的数量
    cl_uint num_events_in_wait_list,
    const cl_event *event_wait_list,
    cl_event *event )
clEnqueueTask
cl_int clEnqueueTask (
    cl_command_queue command_queue,   // 命令队列
    cl_kernel kernel,                 // 核
    cl_uint num_events_in_wait_list,
    const cl_event *event_wait_list,
    cl_event *event )
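clEnqueueTask 和 clEnqueueNDRangeKernel 的功能都是将核执行命令加入命令队列。clEnqueueTask 只用单个工作项执行核,相当于以 work_dim = 1、global_work_size[0] = 1、local_work_size[0] = 1 调用 clEnqueueNDRangeKernel;而 clEnqueueNDRangeKernel 可以把数据划分到多个工作组和工作项上并行处理,从而更好地划分数据、充分利用设备的资源。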
工作组和工作项的特点:
1、同一工作组中的工作项可以访问同一块局部内存(local memory)
2、同一工作组中的工作项之间可以进行同步(例如使用 barrier)
工作项相关函数:get_work_dim、get_global_size、get_global_id、get_global_offset、get_local_size、get_local_id
工作组相关函数:get_num_groups、get_group_id
demo:
// Kernel function
/*
 * id_check: each work-item writes one float that encodes its own IDs.
 *
 * The value is laid out as  G0*10 + G1 + L0*0.1 + L1*0.01  where G0/G1 are
 * the global IDs in dimensions 0/1 and L0/L1 are the local IDs, so the
 * printed number shows which work-item produced each element.
 *
 * output: destination buffer in global memory; the element written is
 *         (row * get_global_size(0) + col), with row/col being the global
 *         IDs after the global offset has been subtracted.
 */
__kernel void id_check(__global float *output) {

   /* Work-item coordinates in both dimensions */
   const size_t gx = get_global_id(0);
   const size_t gy = get_global_id(1);
   const size_t lx = get_local_id(0);
   const size_t ly = get_local_id(1);

   /* Remove the global offset so the position starts at zero */
   const int col = gx - get_global_offset(0);
   const int row = gy - get_global_offset(1);

   /* Flatten (row, col) using the dimension-0 global size as row width */
   const int slot = row * get_global_size(0) + col;

   /* Global IDs fill the integer digits, local IDs the decimals */
   const float global_part = gx * 10.0f + gy * 1.0f;
   const float local_part = lx * 0.1f + ly * 0.01f;

   output[slot] = global_part + local_part;
}
#define _CRT_SECURE_NO_WARNINGS #define PROGRAM_FILE "id_check.cl" #define KERNEL_FUNC "id_check" #include <stdio.h> #include <stdlib.h> #include <string.h> #ifdef MAC #include <OpenCL/cl.h> #else #include <CL/cl.h> #endif /* Find a GPU or CPU associated with the first available platform */ // 获取平台 获取设备 cl_device_id create_device() { cl_platform_id platform; cl_device_id dev; int err; /* Identify a platform */ err = clGetPlatformIDs(1, &platform, NULL); if(err < 0) { perror("Couldn't identify a platform"); exit(1); } /* Access a device */ err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL); if(err == CL_DEVICE_NOT_FOUND) { err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL); } if(err < 0) { perror("Couldn't access any devices"); exit(1); } return dev; } /* Create program from a file and compile it */ // 创建 cl_program 编译cl_program cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) { cl_program program; FILE *program_handle; char *program_buffer, *program_log; size_t program_size, log_size; int err; /* Read program file and place content into buffer */ program_handle = fopen(filename, "r"); if(program_handle == NULL) { perror("Couldn't find the program file"); exit(1); } fseek(program_handle, 0, SEEK_END); program_size = ftell(program_handle); rewind(program_handle); program_buffer = (char*)malloc(program_size + 1); program_buffer[program_size] = '\0'; fread(program_buffer, sizeof(char), program_size, program_handle); fclose(program_handle); /* Create program from file */ program = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err); if(err < 0) { perror("Couldn't create the program"); exit(1); } free(program_buffer); /* Build program */ err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if(err < 0) { /* Find size of log and print to std output */ clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); program_log = (char*) malloc(log_size + 1); 
program_log[log_size] = '\0'; clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL); printf("%s\n", program_log); free(program_log); exit(1); } return program; } int main() { /* OpenCL data structures */ cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_int i, err; /* Data and buffers */ size_t dim = 2; size_t global_offset[] = {3, 5}; size_t global_size[] = {6, 4}; size_t local_size[] = {3, 2}; float test[24]; cl_mem test_buffer; /* Create a device and context */ // 获取设备 device = create_device(); // 获取上下文 context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err < 0) { perror("Couldn't create a context"); exit(1); } /* Build the program and create a kernel */ // 获取编译后的 cl_program program = build_program(context, device, PROGRAM_FILE); // 创建核 kernel = clCreateKernel(program, KERNEL_FUNC, &err); if(err < 0) { perror("Couldn't create a kernel"); exit(1); }; /* Create a write-only buffer to hold the output data */ // 创建 cl_mem test_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(test), NULL, &err); if(err < 0) { perror("Couldn't create a buffer"); exit(1); }; /* Create kernel argument */ // 设置核参数 err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &test_buffer); if(err < 0) { perror("Couldn't set a kernel argument"); exit(1); }; /* Create a command queue */ // 创建命令队列 queue = clCreateCommandQueue(context, device, 0, &err); if(err < 0) { perror("Couldn't create a command queue"); exit(1); }; /* Enqueue kernel */ //Enqueues a command to execute a kernel on a device. 
/** * cl_int clEnqueueNDRangeKernel ( cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) * **/ // dim =2 //size_t global_offset[] = {3, 5}; //size_t global_size[] = {6, 4}; //size_t local_size[] = {3, 2}; err = clEnqueueNDRangeKernel(queue, kernel, dim, global_offset, global_size, local_size, 0, NULL, NULL); if(err < 0) { perror("Couldn't enqueue the kernel"); exit(1); } /* Read and print the result */ // 从设备中读取结果 err = clEnqueueReadBuffer(queue, test_buffer, CL_TRUE, 0, sizeof(test), &test, 0, NULL, NULL); if(err < 0) { perror("Couldn't read the buffer"); exit(1); } for(i=0; i<24; i+=6) { printf("%.2f %.2f %.2f %.2f %.2f %.2f\n", test[i], test[i+1], test[i+2], test[i+3], test[i+4], test[i+5]); } /* Deallocate resources */ clReleaseMemObject(test_buffer); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return 0; }