▶ 书上的代码改进而成,从文件读入一张 256 阶灰度图,按照给定的卷积窗口计算卷积,并输出到文件中。
● 代码,使用 9 格的均值窗口,居然硬读写 .bmp 文件,算是了解一下该文件的具体格式,留作纪念吧。
1 // convolution.cl 2 __kernel void convolution01(__read_only image2d_t inputImage, __write_only image2d_t outputImage, 3 int imageRow, int imageCol, __constant float* filter, int filterWidth, sampler_t sampler) 4 { 5 const int row = get_global_id(0), col = get_global_id(1); // 注意工作项的顺序,可以和图像读取不一样 6 const int halfWidth = filterWidth / 2; 7 float4 sum = { 0.0f, 0.0f, 0.0f, 0.0f }, pixel; // 输出数据类型是四元浮点数 8 int i, j, filterIdx; // 卷积窗口单独用一个下标遍历 9 for (filterIdx = 0, i = -halfWidth; i <= halfWidth; i++) 10 { 11 for (j = -halfWidth; j <= halfWidth; j++) 12 { 13 pixel = read_imagef(inputImage, sampler, (int2)(col + j, row + i)); // 读取目标坐标,注意列在前行在后 14 sum.x += pixel.x * filter[filterIdx++]; // 采用了单通道,只有第一分量有效 15 } 16 } 17 if (row < imageRow && col < imageCol) // 将落在有效范围内的计算数据输出 18 write_imagef(outputImage, (int2)(col, row), sum); 19 return; 20 }
1 // main.cpp 2 #include <stdio.h> 3 #include <stdlib.h> 4 #include <math.h> 5 #include <cl.h> 6 7 #pragma warning(disable : 4996) 8 9 char *sourceText = "D:/Code/OpenCL/OpenCLProjectTemp/OpenCLProjectTemp/convolution.cl"; 10 const char *inputFile = "R:/input.bmp"; 11 const char *outputFile = "R:/output.bmp"; 12 13 bool floatEq(const float a, const float b)// 相等返回 1 14 { 15 return (b == 0) ? fabs(a) < 0.001 : fabs(a / b - 1) < 0.001; 16 } 17 18 int readText(const char* kernelPath, char **pcode)// 读取文本文件放入 pcode,返回字符串长度 19 { 20 FILE *fp; 21 int size; 22 //printf("<readText> File: %s\n", kernelPath); 23 fopen_s(&fp, kernelPath, "rb"); 24 if (!fp) 25 { 26 printf("<readText> Open file failed\n"); 27 getchar(); 28 exit(-1); 29 } 30 if (fseek(fp, 0, SEEK_END) != 0) 31 { 32 printf("<readText> Seek end of file failed\n"); 33 getchar(); 34 exit(-1); 35 } 36 if ((size = ftell(fp)) < 0) 37 { 38 printf("<readText> Get file position failed\n"); 39 getchar(); 40 exit(-1); 41 } 42 rewind(fp); 43 if ((*pcode = (char *)malloc(size + 1)) == NULL) 44 { 45 printf("<readText> Allocate space failed\n"); 46 getchar(); 47 exit(-1); 48 } 49 fread(*pcode, 1, size, fp); 50 (*pcode)[size] = '\0'; 51 fclose(fp); 52 return size + 1; 53 } 54 55 void storeImage(float *imageOut, const char *filename, const char *refFilename)// 输出图片 56 { 57 FILE *ifp, *ofp; 58 unsigned char *metaData, temp; 59 int offset, i, j, row, col, mod; 60 61 if (fopen_s(&ifp, refFilename, "rb") != 0)// 从 参考图片(输入文件)中读取需要的行列数 62 { 63 printf(filename); 64 exit(-1); 65 } 66 fseek(ifp, 10, SEEK_SET); 67 fread(&offset, 4, 1, ifp); 68 fseek(ifp, 18, SEEK_SET); 69 fread(&col, 4, 1, ifp); 70 fread(&row, 4, 1, ifp); 71 fseek(ifp, 0, SEEK_SET); 72 if ((metaData = (unsigned char *)malloc(offset)) == NULL) 73 { 74 printf("<storeImage> Allocate space failed\n"); 75 getchar(); 76 exit(-1); 77 } 78 fread(metaData, 1, offset, ifp); // 从输入文件中读取元信息 79 //printf("Output image %s\n", filename); 80 if (fopen_s(&ofp, filename, "wb") != 0) 81 { 82 printf("<storeImage> Open output file failed\n"); 83 getchar(); 84 exit(-1); 85 } 86 if (fwrite(metaData, 1, offset, ofp) != offset) // 将元信息原封不动的放入输出文件中 87 { 88 printf("<storeImage> Write output metaData failed\n"); 89 getchar(); 90 exit(-1); 91 } 92 for (i = row - 1, mod = (col % 4 == 0 ? 0 : 4 - col % 4); i >= 0; i--)// .bmp 行是颠倒的,倒着填充 93 { 94 for (j = 0; j < col; j++) 95 { 96 temp = (unsigned char)imageOut[i * col + j]; 97 fwrite(&temp, sizeof(unsigned char), 1, ofp); 98 } 99 for (j = 0; j < mod; fwrite(&temp, sizeof(unsigned char), 1, ofp), j++);// 列数非 4 的倍数时补上 junk padding 100 101 } 102 103 fclose(ifp); 104 fclose(ofp); 105 free(metaData); 106 return; 107 } 108 109 float *readImage(const char *filename, int *outputRow, int *outputCol)// 从文件读取图片 110 { 111 unsigned char temp; 112 int i, j, row, col, offset, mod; 113 float *outputImage; 114 FILE *fp; 115 116 if (fopen_s(&fp, filename, "rb") != 0) 117 { 118 printf("<readImage> Open file failed\n"); 119 getchar(); 120 exit(-1); 121 } 122 fseek(fp, 10, SEEK_SET); // 第 10 字节的位置 123 fread(&offset, 4, 1, fp); // 元信息大小 124 fseek(fp, 18, SEEK_SET); // 第 18 字节位置 125 fread(&col, 4, 1, fp); // 读取列数和行数 126 fread(&row, 4, 1, fp); 127 printf("<readImage> Input image %s, col = %d, row = %d\n", filename, col, row); 128 129 if ((outputImage = (float*)malloc(sizeof(float) * col * row)) == NULL) 130 { 131 printf("<readImage> Allocate space failed\n"); 132 getchar(); 133 exit(-1); 134 } 135 fseek(fp, offset, SEEK_SET);// 元信息结束的地方,开始读图像数据 136 fflush(NULL); 137 for (i = row - 1, mod = (col % 4 == 0 ? 0 : 4 - col % 4); i >= 0; i--)// .bmp 行是颠倒的,顺着读文件,倒着填充,mod 为列的 junk pading 厚度 138 { 139 for (j = 0; j < col; j++) 140 { 141 fread(&temp, sizeof(unsigned char), 1, fp); 142 outputImage[i * col + j] = (float)temp; 143 } 144 for (j = 0; j < mod; fread(&temp, sizeof(unsigned char), 1, fp), j++);// 读取 junk padding,不传入数据中(.bmp 文件中有这几列,但是不显示) 145 } 146 fclose(fp); 147 148 *outputRow = row; 149 *outputCol = col; 150 return outputImage; 151 } 152 153 int main() 154 { 155 int imageRow, imageCol, dataSize, row, col, i, j, correct; 156 float *inputImage, *outputImage, sum; 157 158 inputImage = readImage(inputFile, &imageRow, &imageCol);// 从文件读取图像数据和行列信息 159 dataSize = imageRow * imageCol * sizeof(float); 160 outputImage = (float*)malloc(dataSize); 161 162 const int filterWidth = 7, filterSize = filterWidth * filterWidth, halfFilterWidth = filterWidth / 2; 163 float filter[49] = 164 { 0, 0, 0, 0, 0, 0, 0, 165 0, 0, 0, 0, 0, 0, 0, 166 0, 0,1.f / 9,1.f / 9,1.f / 9, 0, 0, 167 0, 0,1.f / 9,1.f / 9,1.f / 9, 0, 0, 168 0, 0,1.f / 9,1.f / 9,1.f / 9, 0, 0, 169 0, 0, 0, 0, 0, 0, 0, 170 0, 0, 0, 0, 0, 0, 0 171 }; 172 173 cl_int status; 174 cl_uint nPlatform; 175 clGetPlatformIDs(0, NULL, &nPlatform); 176 cl_platform_id *listPlatform = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id)); 177 clGetPlatformIDs(nPlatform, listPlatform, NULL); 178 cl_uint nDevice = 0; 179 clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, 0, NULL, &nDevice); 180 cl_device_id *listDevice = (cl_device_id*)malloc(nDevice * sizeof(cl_device_id)); 181 clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, nDevice, listDevice, NULL); 182 cl_context context = clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status); 183 cl_command_queue queue = clCreateCommandQueue(context, listDevice[0], 0, &status); 184 185 cl_image_format format; // 图像格式描述符 186 format.image_channel_order = CL_R; // 单通道 187 format.image_channel_data_type = CL_FLOAT; // 浮点类型(读进来还是 uchar,但是计算需要浮点) 188 189 cl_mem d_inputImage, d_outputImage, d_filter; 190 if (true)// 旧 OpenCL 函数 clCreateImage2D 191 { 192 d_inputImage = clCreateImage2D(context, 0, &format, imageCol, imageRow, 0, NULL, &status); 193 d_outputImage = clCreateImage2D(context, 0, &format, imageCol, imageRow, 0, NULL, &status); 194 } 195 if (false)// 新 OpenCL 使用描述符图像描述符 cl_image_desc 和函数 clCreateImage 196 { 197 cl_image_desc desc; 198 desc.image_type = CL_MEM_OBJECT_IMAGE2D; 199 desc.image_width = imageCol; 200 desc.image_height = imageRow; 201 desc.image_depth = 0; 202 desc.image_array_size = 0; 203 desc.image_row_pitch = 0; 204 desc.image_slice_pitch = 0; 205 desc.num_mip_levels = 0; 206 desc.num_samples = 0; 207 desc.buffer = NULL; 208 209 d_inputImage = clCreateImage(context, CL_MEM_READ_ONLY, &format, &desc, NULL, &status); 210 d_outputImage = clCreateImage(context, CL_MEM_WRITE_ONLY, &format, &desc, NULL, &status); 211 } 212 d_filter = clCreateBuffer(context, 0, filterSize * sizeof(float), NULL, &status); 213 214 size_t origin[3] = { 0, 0, 0 }, region[3] = { imageCol, imageRow, 1 }; // 拷贝图像数据用的原点和尺寸,注意尺寸是先数列再数行 215 clEnqueueWriteImage(queue, d_inputImage, CL_TRUE, origin, region, 0, 0, inputImage, 0, NULL, NULL); 216 clEnqueueWriteBuffer(queue, d_filter, CL_FALSE, 0, filterSize * sizeof(float), filter, 0, NULL, NULL); 217 218 cl_sampler sampler = clCreateSampler(context, CL_FALSE, CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &status);// 采样器 219 220 char *code; 221 size_t length = readText(sourceText, &code); 222 cl_program program = clCreateProgramWithSource(context, 1, (const char **)&code, &length, NULL); 223 clBuildProgram(program, 1, listDevice, NULL, NULL, NULL); 224 cl_kernel kernel = clCreateKernel(program, "convolution01", &status); 225 226 clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage); 227 clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage); 228 clSetKernelArg(kernel, 2, sizeof(int), &imageRow); 229 clSetKernelArg(kernel, 3, sizeof(int), &imageCol); 230 clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_filter); 231 clSetKernelArg(kernel, 5, sizeof(int), &filterWidth); 232 clSetKernelArg(kernel, 6, sizeof(cl_sampler), &sampler); 233 234 size_t globalSize[2] = { imageRow, imageCol };// localSize = { 1, 1}; 可以用 NULL 代替 235 status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0, NULL, NULL); 236 clEnqueueReadImage(queue, d_outputImage, CL_TRUE, origin, region, 0, 0, outputImage, 0, NULL, NULL); 237 238 storeImage(outputImage, outputFile, inputFile);// 将输出图像写入文件中 239 240 for (row = 0, correct = 1; row < imageRow && correct; row++)// 检查计算结果 241 { 242 for (col = 0; col < imageCol && correct; col++) 243 { 244 sum = 0; 245 for (i = -halfFilterWidth; i <= halfFilterWidth; i++) 246 { 247 for (j = -halfFilterWidth; j <= halfFilterWidth; j++) 248 { 249 if (row + i >= 0 && row + i < imageRow && col + j >= 0 && col + j < imageCol) 250 sum += inputImage[(row + i) * imageCol + col + j] * filter[(i + halfFilterWidth) * filterWidth + j + halfFilterWidth]; 251 } 252 } 253 if (row >= halfFilterWidth && row < imageRow - halfFilterWidth && col >= halfFilterWidth && col < imageCol - halfFilterWidth && 254 !floatEq(outputImage[row * imageCol + col], sum)) 255 { 256 printf("Error at [%d,%d], output:%f, ref:%f\n", row, col, outputImage[row*imageCol + col], sum); 257 correct = 0; 258 } 259 } 260 } 261 if (correct) 262 printf("Result correct.\n"); 263 264 free(listPlatform); 265 free(listDevice); 266 free(inputImage); 267 free(outputImage); 268 free(code); 269 clReleaseContext(context); 270 clReleaseCommandQueue(queue); 271 clReleaseProgram(program); 272 clReleaseKernel(kernel); 273 clReleaseMemObject(d_inputImage); 274 clReleaseMemObject(d_outputImage); 275 clReleaseMemObject(d_filter); 276 clReleaseSampler(sampler); 277 getchar(); 278 return 0; 279 }
● 输出结果,给了一张 400 × 400 的图片参与,另外,计算一张 5040 × 7000 的图片需要 23 ms。
<readImage> Input image R:/input.bmp, width = 400, height = 400 Output image R:/output.bmp Result correct.
● 代码,使用局部内存优化
1 // convolution.cl 2 __kernel void convolution02(__global float* inputImage, __global float* outputImage, int imageRow, int imageCol, 3 __constant float* filter, int filterWidth, __local float* localMem, int localMemRow, int localMemCol) 4 { 5 const int groupCol = get_group_id(0) * get_local_size(0), groupRow = get_group_id(1) * get_local_size(1); 6 const int localCol = get_local_id(0), localRow = get_local_id(1); 7 const int globalCol = groupCol + localCol, globalRow = groupRow + localRow; 8 const int halfWidth = filterWidth / 2; 9 int i, j, curRow, curCol, filterIndex; 10 float sum; 11 12 // 将源图数据读入局部内存 13 for (i = localRow; i < localMemRow; i += get_local_size(1)) 14 { 15 curRow = groupRow + i; 16 for (j = localCol; j < localMemCol; j += get_local_size(0)) 17 { 18 curCol = groupCol + j; 19 if (curRow < imageRow && curCol < imageCol) 20 localMem[i * localMemCol + j] = inputImage[curRow * imageCol + curCol]; 21 } 22 } 23 barrier(CLK_LOCAL_MEM_FENCE); 24 25 // 计算卷积 26 if (globalRow < imageRow - filterWidth + 1 && globalCol < imageCol - filterWidth + 1)// 选取位于有效范围内的工作组 27 { 28 sum = 0.0f, filterIndex = 0; 29 for (i = localRow; i < localRow + filterWidth; i++) 30 { 31 for (j = localCol; j < localCol + filterWidth; j++) 32 sum += localMem[i * localMemCol + j] * filter[filterIndex++]; 33 } 34 // 循环展开 35 /* 36 for (i = localRow; i < localRow + filterWidth; i++) 37 { 38 int offset = i * localMemCol + localCol; 39 sum += localMem[offset++] * filter[filterIndex++];// 行数等于 filterWidth 40 sum += localMem[offset++] * filter[filterIndex++]; 41 sum += localMem[offset++] * filter[filterIndex++]; 42 sum += localMem[offset++] * filter[filterIndex++]; 43 sum += localMem[offset++] * filter[filterIndex++]; 44 sum += localMem[offset++] * filter[filterIndex++]; 45 sum += localMem[offset++] * filter[filterIndex++]; 46 } 47 */ 48 // 循环完全展开 49 /* 50 int offset = localRow*localMemCol + localCol; 51 sum += localMem[offset + 0] * filter[filterIndex++]; 52 sum += localMem[offset + 1] * filter[filterIndex++]; 53 sum += localMem[offset + 2] * filter[filterIndex++]; 54 sum += localMem[offset + 3] * filter[filterIndex++]; 55 sum += localMem[offset + 4] * filter[filterIndex++]; 56 sum += localMem[offset + 5] * filter[filterIndex++]; 57 sum += localMem[offset + 6] * filter[filterIndex++]; 58 offset += localMemCol; 59 sum += localMem[offset + 0] * filter[filterIndex++]; 60 sum += localMem[offset + 1] * filter[filterIndex++]; 61 sum += localMem[offset + 2] * filter[filterIndex++]; 62 sum += localMem[offset + 3] * filter[filterIndex++]; 63 sum += localMem[offset + 4] * filter[filterIndex++]; 64 sum += localMem[offset + 5] * filter[filterIndex++]; 65 sum += localMem[offset + 6] * filter[filterIndex++]; 66 offset += localMemCol; 67 sum += localMem[offset + 0] * filter[filterIndex++]; 68 sum += localMem[offset + 1] * filter[filterIndex++]; 69 sum += localMem[offset + 2] * filter[filterIndex++]; 70 sum += localMem[offset + 3] * filter[filterIndex++]; 71 sum += localMem[offset + 4] * filter[filterIndex++]; 72 sum += localMem[offset + 5] * filter[filterIndex++]; 73 sum += localMem[offset + 6] * filter[filterIndex++]; 74 offset += localMemCol; 75 sum += localMem[offset + 0] * filter[filterIndex++]; 76 sum += localMem[offset + 1] * filter[filterIndex++]; 77 sum += localMem[offset + 2] * filter[filterIndex++]; 78 sum += localMem[offset + 3] * filter[filterIndex++]; 79 sum += localMem[offset + 4] * filter[filterIndex++]; 80 sum += localMem[offset + 5] * filter[filterIndex++]; 81 sum += localMem[offset + 6] * filter[filterIndex++]; 82 offset += localMemCol; 83 sum += localMem[offset + 0] * filter[filterIndex++]; 84 sum += localMem[offset + 1] * filter[filterIndex++]; 85 sum += localMem[offset + 2] * filter[filterIndex++]; 86 sum += localMem[offset + 3] * filter[filterIndex++]; 87 sum += localMem[offset + 4] * filter[filterIndex++]; 88 sum += localMem[offset + 5] * filter[filterIndex++]; 89 sum += localMem[offset + 6] * filter[filterIndex++]; 90 offset += localMemCol; 91 sum += localMem[offset + 0] * filter[filterIndex++]; 92 sum += localMem[offset + 1] * filter[filterIndex++]; 93 sum += localMem[offset + 2] * filter[filterIndex++]; 94 sum += localMem[offset + 3] * filter[filterIndex++]; 95 sum += localMem[offset + 4] * filter[filterIndex++]; 96 sum += localMem[offset + 5] * filter[filterIndex++]; 97 sum += localMem[offset + 6] * filter[filterIndex++]; 98 offset += localMemCol; 99 sum += localMem[offset + 0] * filter[filterIndex++]; 100 sum += localMem[offset + 1] * filter[filterIndex++]; 101 sum += localMem[offset + 2] * filter[filterIndex++]; 102 sum += localMem[offset + 3] * filter[filterIndex++]; 103 sum += localMem[offset + 4] * filter[filterIndex++]; 104 sum += localMem[offset + 5] * filter[filterIndex++]; 105 sum += localMem[offset + 6] * filter[filterIndex++]; 106 */ 107 // 数据输出 108 outputImage[(globalRow + halfWidth) * imageCol + (globalCol + halfWidth)] = sum; 109 } 110 return; 111 } 112 113 __kernel void convolution03(__global float4* inputImage, __global float* outputImage, int imageRow, int imageCol, 114 __constant float* filter, int filterWidth, __local float* localMem, int localMemRow, int localMemCol) 115 { 116 const int groupCol4 = get_group_id(0) * get_local_size(0) / 4, groupRow4 = get_group_id(1) * get_local_size(1); 117 const int localId = get_local_id(1) * get_local_size(0) + get_local_id(0); 118 int localCol = (localId % (localMemCol / 4)), localRow = (localId / (localMemCol / 4)); 119 int globalCol = groupCol4 + localCol, globalRow = groupRow4 + localRow; 120 const int halfWidth = filterWidth / 2; 121 122 __local float4 *localImage4 = (__local float4*)&localMem[localRow*localMemCol + localCol * 4];// 局部内存数据 123 124 if (globalRow < imageRow && globalCol < imageCol / 4 && localRow < localMemRow) 125 localImage4[0] = inputImage[globalRow*imageCol / 4 + globalCol]; 126 barrier(CLK_LOCAL_MEM_FENCE); 127 128 // 重设 坐标以输出 129 localCol = get_local_id(0); 130 localRow = get_local_id(1); 131 globalCol = get_group_id(0)*get_local_size(0) + localCol; 132 globalRow = get_group_id(1)*get_local_size(1) + localRow; 133 134 // 计算卷积 135 int i, j, filterIndex; 136 float sum; 137 if (globalRow < imageRow - filterWidth + 1 && globalCol < imageCol - filterWidth + 1) 138 { 139 sum = 0.0f, filterIndex = 0; 140 for (i = localRow; i < localRow + filterWidth; i++) 141 { 142 for (int j = localCol; j < localCol + filterWidth; j++) 143 sum += localMem[i * localMemCol + j] * filter[filterIndex++]; 144 } 145 // 循环展开同 convolution02 146 // 输出数据 147 outputImage[(globalRow + halfWidth) * imageCol + (globalCol + halfWidth)] = 0;//sum; 148 } 149 return; 150 }
1 // main.cpp 2 #include <stdio.h> 3 #include <stdlib.h> 4 #include <math.h> 5 #include <cl.h> 6 7 #pragma warning(disable : 4996) 8 9 #define NON_OPTIMIZED // 不使用优化,使用函数 convolution02 10 //#define READ_ALIGNED // 使用内存对齐优化,使用函数 convolution02 11 //#define READ4 // 局部内存使用 float4 读取优化,使用函数 convolution03,有点问题 12 #define WGX 16 // 工作组尺寸 13 #define WGY 16 14 15 char *sourceText = "D:/Code/OpenCL/OpenCLProjectTemp/OpenCLProjectTemp/convolution.cl"; 16 const char *inputFile = "R:/input.bmp"; 17 const char *outputFile = "R:/output.bmp"; 18 19 unsigned int roundUp(unsigned int value, unsigned int base)// 将 value 向上取整到 multiple 的整数倍 20 { 21 unsigned int remainder = value % base; 22 return remainder == 0 ? value : (value + base - remainder); 23 } 24 25 int readText(const char* kernelPath, char **pcode) 26 { 27 FILE *fp; 28 int size; 29 //printf("<readText> File: %s\n", kernelPath); 30 fopen_s(&fp, kernelPath, "rb"); 31 if (!fp) 32 { 33 printf("<readText> Open file failed\n"); 34 getchar(); 35 exit(-1); 36 } 37 if (fseek(fp, 0, SEEK_END) != 0) 38 { 39 printf("<readText> Seek end of file failed\n"); 40 getchar(); 41 exit(-1); 42 } 43 if ((size = ftell(fp)) < 0) 44 { 45 printf("<readText> Get file position failed\n"); 46 getchar(); 47 exit(-1); 48 } 49 rewind(fp); 50 if ((*pcode = (char *)malloc(size + 1)) == NULL) 51 { 52 printf("<readText> Allocate space failed\n"); 53 getchar(); 54 exit(-1); 55 } 56 fread(*pcode, 1, size, fp); 57 (*pcode)[size] = '\0'; 58 fclose(fp); 59 return size + 1; 60 } 61 62 void storeImage(float *imageOut, const char *filename, const char *refFilename) 63 { 64 FILE *ifp, *ofp; 65 unsigned char *metaData, temp; 66 int offset, i, j, row, col, mod; 67 68 if (fopen_s(&ifp, refFilename, "rb") != 0) 69 { 70 printf(filename); 71 exit(-1); 72 } 73 fseek(ifp, 10, SEEK_SET); 74 fread(&offset, 4, 1, ifp); 75 fseek(ifp, 18, SEEK_SET); 76 fread(&col, 4, 1, ifp); 77 fread(&row, 4, 1, ifp); 78 fseek(ifp, 0, SEEK_SET); 79 if ((metaData = (unsigned char *)malloc(offset)) == NULL) 80 { 81 printf("<storeImage> Allocate space failed\n"); 82 getchar(); 83 exit(-1); 84 } 85 fread(metaData, 1, offset, ifp); 86 87 if (fopen_s(&ofp, filename, "wb") != 0) 88 { 89 printf("<storeImage> Open output file failed\n"); 90 getchar(); 91 exit(-1); 92 } 93 if (fwrite(metaData, 1, offset, ofp) != offset) 94 { 95 printf("<storeImage> Write output metaData failed\n"); 96 getchar(); 97 exit(-1); 98 } 99 for (i = row - 1, mod = (col % 4 == 0 ? 0 : 4 - col % 4); i >= 0; i--) 100 { 101 for (j = 0; j < col; j++) 102 { 103 temp = (unsigned char)imageOut[i * col + j]; 104 fwrite(&temp, sizeof(unsigned char), 1, ofp); 105 } 106 for (j = 0; j < mod; fwrite(&temp, sizeof(unsigned char), 1, ofp), j++); 107 108 } 109 110 fclose(ifp); 111 fclose(ofp); 112 free(metaData); 113 return; 114 } 115 116 float *readImage(const char *filename, int *outputRow, int *outputCol) 117 { 118 unsigned char temp; 119 int i, j, row, col, offset, mod; 120 float *outputImage; 121 FILE *fp; 122 123 if (fopen_s(&fp, filename, "rb") != 0) 124 { 125 printf("<readImage> Open file failed\n"); 126 getchar(); 127 exit(-1); 128 } 129 fseek(fp, 10, SEEK_SET); 130 fread(&offset, 4, 1, fp); 131 fseek(fp, 18, SEEK_SET); 132 fread(&col, 4, 1, fp); 133 fread(&row, 4, 1, fp); 134 printf("<readImage> Input image %s, col = %d, row = %d\n", filename, col, row); 135 136 if ((outputImage = (float*)malloc(sizeof(float) * col * row)) == NULL) 137 { 138 printf("<readImage> Allocate space failed\n"); 139 getchar(); 140 exit(-1); 141 } 142 fseek(fp, offset, SEEK_SET); 143 fflush(NULL); 144 for (i = row - 1, mod = (col % 4 == 0 ? 0 : 4 - col % 4); i >= 0; i--) 145 { 146 for (j = 0; j < col; j++) 147 { 148 fread(&temp, sizeof(unsigned char), 1, fp); 149 outputImage[i * col + j] = (float)temp; 150 } 151 for (j = 0; j < mod; fread(&temp, sizeof(unsigned char), 1, fp), j++); 152 } 153 fclose(fp); 154 155 *outputRow = row; 156 *outputCol = col; 157 return outputImage; 158 } 159 160 int main() 161 { 162 int imageRow, imageCol, dataSize, deviceRow, deviceCol, deviceDataSize; 163 float *inputImage, *outputImage; 164 inputImage = readImage(inputFile, &imageRow, &imageCol); 165 dataSize = imageRow * imageCol * sizeof(float); 166 outputImage = (float*)malloc(dataSize); 167 168 // 调整列数 169 #ifdef NON_OPTIMIZED // 不调整 170 deviceCol = imageCol; 171 #else // 增加道工作组尺寸的整数倍 172 deviceCol = roundUp(imageCol, WGX); 173 #endif 174 deviceRow = imageRow; // 行数不变 175 deviceDataSize = sizeof(float) * deviceRow * deviceCol; 176 177 const int filterWidth = 7, filterSize = filterWidth * filterWidth, halfFilterWidth = filterWidth / 2; 178 float filter[49] = 179 { 180 0, 0, 0, 0, 0, 0, 0, 181 0, 0, 0, 0, 0, 0, 0, 182 0, 0,1.f / 9,1.f / 9,1.f / 9, 0, 0, 183 0, 0,1.f / 9,1.f / 9,1.f / 9, 0, 0, 184 0, 0,1.f / 9,1.f / 9,1.f / 9, 0, 0, 185 0, 0, 0, 0, 0, 0, 0, 186 0, 0, 0, 0, 0, 0, 0 187 }; 188 189 cl_int status; 190 cl_uint nPlatform; 191 clGetPlatformIDs(0, NULL, &nPlatform); 192 cl_platform_id *listPlatform = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id)); 193 clGetPlatformIDs(nPlatform, listPlatform, NULL); 194 cl_uint nDevice = 0; 195 clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, 0, NULL, &nDevice); 196 cl_device_id *listDevice = (cl_device_id*)malloc(nDevice * sizeof(cl_device_id)); 197 clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, nDevice, listDevice, NULL); 198 cl_context context = clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status); 199 cl_command_queue queue = clCreateCommandQueue(context, listDevice[0], 0, &status); 200 201 // 使用普通的缓冲区,而不用 image 202 cl_mem d_inputImage = clCreateBuffer(context, CL_MEM_READ_ONLY, deviceDataSize, NULL, NULL); 203 cl_mem d_outputImage = clCreateBuffer(context, CL_MEM_WRITE_ONLY, deviceDataSize, NULL, NULL); 204 cl_mem d_filter = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * filterSize, NULL, NULL); 205 206 // 缓冲区写入 207 #if defined NON_OPTIMIZED // 直接写入 208 clEnqueueWriteBuffer(queue, d_inputImage, CL_TRUE, 0, deviceDataSize, inputImage, 0, NULL, NULL); 209 #else // 对齐写入 210 size_t d_origin[3] = { 0,0,0 }, h_origin[3] = { 0,0,0 }, region[3] = { sizeof(float) * deviceCol, deviceRow, 1 }; 211 clEnqueueWriteBufferRect(queue, d_inputImage, CL_TRUE, d_origin, h_origin, region, sizeof(float) * deviceCol, 0, sizeof(float) * imageCol, 0, inputImage, 0, NULL, NULL); 212 #endif 213 clEnqueueWriteBuffer(queue, d_filter, CL_TRUE, 0, sizeof(float) * filterSize, filter, 0, NULL, NULL); 214 215 char *code; 216 size_t length = readText(sourceText, &code); 217 cl_program program = clCreateProgramWithSource(context, 1, (const char **)&code, &length, &status); 218 status = clBuildProgram(program, 1, listDevice, NULL, NULL, NULL); 219 220 // 创建不同的内核 221 #if defined NON_OPTIMIZED || defined READ_ALIGNED 222 cl_kernel kernel = clCreateKernel(program, "convolution02", NULL); 223 #else 224 cl_kernel kernel = clCreateKernel(program, "convolution03", NULL); 225 #endif 226 227 size_t globalSize[2] = { roundUp(imageCol - filterWidth + 1, WGX), roundUp(imageRow - filterWidth + 1, WGY) }, localSize[2] = { WGX, WGY }; 228 229 // 局部内存大小 230 int localRow, localCol; 231 localRow = localSize[1] + filterWidth - 1; // 把一个工作组的大小垫起光环元素的宽度 232 #if defined NON_OPTIMIZED || defined READ_ALIGNED 233 localCol = localSize[0] + filterWidth - 1; 234 #else 235 localCol = roundUp(localSize[0] + filterWidth - 1, 4); // 垫起之外还要对齐到 4 的倍数上 236 #endif 237 clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage); 238 clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage); 239 clSetKernelArg(kernel, 2, sizeof(int), &deviceRow); 240 clSetKernelArg(kernel, 3, sizeof(int), &deviceCol); 241 clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_filter); 242 clSetKernelArg(kernel, 5, sizeof(int), &filterWidth); 243 clSetKernelArg(kernel, 6, sizeof(float) * localCol * localRow, NULL); 244 clSetKernelArg(kernel, 7, sizeof(int), &localRow); 245 clSetKernelArg(kernel, 8, sizeof(int), &localCol); 246 247 status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, localSize, 0, NULL, NULL); 248 249 // 结果写回 250 #if defined NON_OPTIMIZED 251 clEnqueueReadBuffer(queue, d_outputImage, CL_TRUE, 0, deviceDataSize, outputImage, 0, NULL, NULL); 252 #else // 最边上一圈 filterWidth / 2 的部分不写回 253 d_origin[0] = 3 * sizeof(float), d_origin[1] = 3, d_origin[2] = 0; 254 h_origin[0] = 3 * sizeof(float), h_origin[1] = 3, h_origin[2] = 0; 255 region[0] = (imageCol - filterWidth + 1) * sizeof(float), region[1] = (imageRow - filterWidth + 1), region[2] = 1; 256 clEnqueueReadBufferRect(queue, d_outputImage, CL_TRUE, d_origin, h_origin, region, sizeof(float) * deviceCol, 0, sizeof(float) * imageCol, 0, outputImage, 0, NULL, NULL); 257 #endif 258 storeImage(outputImage, outputFile, inputFile); 259 260 // 去掉了检查结果的部分 261 free(listPlatform); 262 free(listDevice); 263 free(inputImage); 264 free(outputImage); 265 free(code); 266 clReleaseContext(context); 267 clReleaseCommandQueue(queue); 268 clReleaseProgram(program); 269 clReleaseKernel(kernel); 270 clReleaseMemObject(d_inputImage); 271 clReleaseMemObject(d_outputImage); 272 clReleaseMemObject(d_filter); 273 printf("Finshed.\n"); 274 getchar(); 275 return 0; 276 }
● 输出结果,与上面的简单方法相同
● 用到的函数和定义
1 //cl.h 2 // 采样器越界处理方案 3 #define CL_ADDRESS_NONE 0x1130 4 #define CL_ADDRESS_CLAMP_TO_EDGE 0x1131 5 #define CL_ADDRESS_CLAMP 0x1132 6 #define CL_ADDRESS_REPEAT 0x1133 7 #define CL_ADDRESS_MIRRORED_REPEAT 0x1134 8 9 // 插值方案 10 #define CL_FILTER_NEAREST 0x1140 11 #define CL_FILTER_LINEAR 0x1141 12 13 // 用到的采样器和描述符的定义 14 typedef struct _cl_sampler* cl_sampler; 15 typedef struct _cl_image_format 16 { 17 cl_channel_order image_channel_order; 18 cl_channel_type image_channel_data_type; 19 } cl_image_format; 20 21 typedef struct _cl_image_desc 22 { 23 cl_mem_object_type image_type; 24 size_t image_width; 25 size_t image_height; 26 size_t image_depth; 27 size_t image_array_size; 28 size_t image_row_pitch; 29 size_t image_slice_pitch; 30 cl_uint num_mip_levels; 31 cl_uint num_samples; 32 cl_mem buffer; 33 } cl_image_desc; 34 35 extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL clCreateImage2D(// OpenCL1.2 中废弃的 image 创建函数 36 cl_context, // 上下文 37 cl_mem_flags, // 特殊标志 38 const cl_image_format *,// image 描述符 39 size_t, // 宽度 40 size_t, // 高度 41 size_t, // 行跨步 42 void *, // 自动传入主机数据 43 cl_int * // 返回结果状态的指针 44 ) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; 45 46 extern CL_API_ENTRY cl_mem CL_API_CALL clCreateImage(// OpenCL1.2 中开始使用的 image 创建函数 47 cl_context, // 上下文 48 cl_mem_flags, // 特殊标志 49 const cl_image_format *,// image 格式描述符 50 const cl_image_desc *, // image 描述符 51 void *, // 主机数据 52 cl_int * // 返回结果状态的指针 53 ) CL_API_SUFFIX__VERSION_1_2; 54 55 extern CL_API_ENTRY cl_sampler CL_API_CALL clCreateSampler(// 初始化采样器 56 cl_context, // 上下文 57 cl_bool, // 是否使用归一化坐标 58 cl_addressing_mode, // 越界处理方案 59 cl_filter_mode, // 差值方案 60 cl_int * // 返回结果状态的指针 61 ) CL_API_SUFFIX__VERSION_1_0; 62 63 extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueWriteBufferRect(// 矩形缓冲区写入 64 cl_command_queue, // 命令队列 65 cl_mem, // 目标缓冲区 66 cl_bool, // 阻塞标记 67 const size_t *, // 缓冲区写入起点 68 const size_t *, // 源数据写入起点 69 const size_t *, // 写入范围,是一个三维数组,分别为:一行数据量(Byte),行数,层数 70 size_t, // 缓冲区行间跨度 71 size_t, // 缓冲区层间跨度 72 size_t, // 源数据行间跨度 73 size_t, // 源数据层间跨度 74 const void *, // 源数据指针 75 cl_uint, // 时间列表长度 76 const cl_event *, // 时间列表 77 cl_event * // 本事件标记 78 ) CL_API_SUFFIX__VERSION_1_1; 79 80 extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReadBufferRect(// 矩形缓冲区读出,参数定义同上 81 cl_command_queue, 82 cl_mem, 83 cl_bool, 84 const size_t *, 85 const size_t *, 86 const size_t *, 87 size_t, 88 size_t, 89 size_t, 90 size_t, 91 void *, 92 cl_uint, 93 const cl_event *, 94 cl_event * 95 ) CL_API_SUFFIX__VERSION_1_1;