OpenCL 图像卷积 1


▶ 书上的代码改进而成,从文件读入一张 256 阶灰度图,按照给定的卷积窗口计算卷积,并输出到文件中。

● 代码,使用 9 格的均值窗口,居然硬读写 .bmp 文件,算是了解一下该文件的具体格式,留作纪念吧。

 1 // convolution.cl
 2 __kernel void convolution01(__read_only image2d_t inputImage, __write_only image2d_t outputImage,
 3     int imageRow, int imageCol, __constant float* filter, int filterWidth, sampler_t sampler)
 4 {
 5     const int row = get_global_id(0), col = get_global_id(1);   // 注意工作项的顺序,可以和图像读取不一样
 6     const int halfWidth = filterWidth / 2;    
 7     float4 sum = { 0.0f, 0.0f, 0.0f, 0.0f }, pixel;             // 输出数据类型是四元浮点数
 8     int i, j, filterIdx;                                        // 卷积窗口单独用一个下标遍历
 9     for (filterIdx = 0, i = -halfWidth; i <= halfWidth; i++)
10     {        
11         for (j = -halfWidth; j <= halfWidth; j++)
12         {
13             pixel = read_imagef(inputImage, sampler, (int2)(col + j, row + i)); // 读取目标坐标,注意列在前行在后
14             sum.x += pixel.x * filter[filterIdx++];                             // 采用了单通道,只有第一分量有效
15         }
16     }
17     if (row < imageRow && col < imageCol)                       // 将落在有效范围内的计算数据输出
18         write_imagef(outputImage, (int2)(col, row), sum);
19     return;
20 }
  1 // main.cpp
  2 #include <stdio.h>
  3 #include <stdlib.h>
  4 #include <math.h>
  5 #include <cl.h>
  6 
  7 #pragma warning(disable : 4996)
  8 
  9 char *sourceText = "D:/Code/OpenCL/OpenCLProjectTemp/OpenCLProjectTemp/convolution.cl";
 10 const char *inputFile = "R:/input.bmp";
 11 const char *outputFile = "R:/output.bmp";
 12 
 13 bool floatEq(const float a, const float b)// 相等返回 1
 14 {
 15     return (b == 0) ? fabs(a) < 0.001 : fabs(a / b - 1) < 0.001;
 16 }
 17 
 18 int readText(const char* kernelPath, char **pcode)// 读取文本文件放入 pcode,返回字符串长度
 19 {
 20     FILE *fp;
 21     int size;
 22     //printf("<readText> File: %s\n", kernelPath);
 23     fopen_s(&fp, kernelPath, "rb");
 24     if (!fp)
 25     {
 26         printf("<readText> Open file failed\n");
 27         getchar();
 28         exit(-1);
 29     }
 30     if (fseek(fp, 0, SEEK_END) != 0)
 31     {
 32         printf("<readText> Seek end of file failed\n");
 33         getchar();
 34         exit(-1);
 35     }
 36     if ((size = ftell(fp)) < 0)
 37     {
 38         printf("<readText> Get file position failed\n");
 39         getchar();
 40         exit(-1);
 41     }
 42     rewind(fp);
 43     if ((*pcode = (char *)malloc(size + 1)) == NULL)
 44     {
 45         printf("<readText> Allocate space failed\n");
 46         getchar();
 47         exit(-1);
 48     }
 49     fread(*pcode, 1, size, fp);
 50     (*pcode)[size] = '\0';
 51     fclose(fp);
 52     return size + 1;
 53 }
 54 
 55 void storeImage(float *imageOut, const char *filename, const char *refFilename)// 输出图片
 56 {
 57     FILE *ifp, *ofp;
 58     unsigned char *metaData, temp;
 59     int offset, i, j, row, col, mod;
 60 
 61     if (fopen_s(&ifp, refFilename, "rb") != 0)// 从 参考图片(输入文件)中读取需要的行列数
 62     {
 63         printf(filename);
 64         exit(-1);
 65     }
 66     fseek(ifp, 10, SEEK_SET);
 67     fread(&offset, 4, 1, ifp);
 68     fseek(ifp, 18, SEEK_SET);
 69     fread(&col, 4, 1, ifp);
 70     fread(&row, 4, 1, ifp);
 71     fseek(ifp, 0, SEEK_SET);
 72     if ((metaData = (unsigned char *)malloc(offset)) == NULL)
 73     {
 74         printf("<storeImage> Allocate space failed\n");
 75         getchar();
 76         exit(-1);
 77     }
 78     fread(metaData, 1, offset, ifp);                  // 从输入文件中读取元信息
 79     //printf("Output image %s\n", filename);
 80     if (fopen_s(&ofp, filename, "wb") != 0)
 81     {
 82         printf("<storeImage> Open output file failed\n");
 83         getchar();
 84         exit(-1);
 85     }
 86     if (fwrite(metaData, 1, offset, ofp) != offset)   //  将元信息原封不动的放入输出文件中
 87     {
 88         printf("<storeImage> Write output metaData failed\n");
 89         getchar();
 90         exit(-1);
 91     }
 92     for (i = row - 1, mod = (col % 4 == 0 ? 0 : 4 - col % 4); i >= 0; i--)// .bmp 行是颠倒的,倒着填充
 93     {
 94         for (j = 0; j < col; j++)
 95         {
 96             temp = (unsigned char)imageOut[i * col + j];
 97             fwrite(&temp, sizeof(unsigned char), 1, ofp);
 98         }
 99         for (j = 0; j < mod; fwrite(&temp, sizeof(unsigned char), 1, ofp), j++);// 列数非 4 的倍数时补上 junk padding
100         
101     }
102 
103     fclose(ifp);
104     fclose(ofp);
105     free(metaData);
106     return;
107 }
108 
109 float *readImage(const char *filename, int *outputRow, int *outputCol)// 从文件读取图片
110 {
111     unsigned char temp;
112     int i, j, row, col, offset, mod;
113     float *outputImage;
114     FILE *fp;
115 
116     if (fopen_s(&fp, filename, "rb") != 0)
117     {
118         printf("<readImage> Open file failed\n");
119         getchar();
120         exit(-1);
121     }
122     fseek(fp, 10, SEEK_SET);    // 第 10 字节的位置
123     fread(&offset, 4, 1, fp);   // 元信息大小
124     fseek(fp, 18, SEEK_SET);    // 第 18 字节位置
125     fread(&col, 4, 1, fp);      // 读取列数和行数
126     fread(&row, 4, 1, fp);
127     printf("<readImage> Input image %s, col = %d, row = %d\n", filename, col, row);
128 
129     if ((outputImage = (float*)malloc(sizeof(float) * col * row)) == NULL)
130     {
131         printf("<readImage> Allocate space failed\n");
132         getchar();
133         exit(-1);
134     }
135     fseek(fp, offset, SEEK_SET);// 元信息结束的地方,开始读图像数据
136     fflush(NULL);
137     for (i = row - 1, mod = (col % 4 == 0 ? 0 : 4 - col % 4); i >= 0; i--)// .bmp 行是颠倒的,顺着读文件,倒着填充,mod 为列的 junk pading 厚度
138     {
139         for (j = 0; j < col; j++)
140         {
141             fread(&temp, sizeof(unsigned char), 1, fp);
142             outputImage[i * col + j] = (float)temp;
143         }
144         for (j = 0; j < mod; fread(&temp, sizeof(unsigned char), 1, fp), j++);// 读取 junk padding,不传入数据中(.bmp 文件中有这几列,但是不显示)
145     }
146     fclose(fp);
147 
148     *outputRow = row;
149     *outputCol = col;
150     return outputImage;
151 }
152 
153 int main()
154 {
155     int imageRow, imageCol, dataSize, row, col, i, j, correct;
156     float *inputImage, *outputImage, sum;
157 
158     inputImage = readImage(inputFile, &imageRow, &imageCol);// 从文件读取图像数据和行列信息
159     dataSize = imageRow * imageCol * sizeof(float);
160     outputImage = (float*)malloc(dataSize);
161 
162     const int filterWidth = 7, filterSize = filterWidth * filterWidth, halfFilterWidth = filterWidth / 2;
163     float filter[49] =
164     { 0,      0,      0,      0,      0,      0,      0,
165         0,      0,      0,      0,      0,      0,      0,
166         0,      0,1.f / 9,1.f / 9,1.f / 9,      0,      0,
167         0,      0,1.f / 9,1.f / 9,1.f / 9,      0,      0,
168         0,      0,1.f / 9,1.f / 9,1.f / 9,      0,      0,
169         0,      0,      0,      0,      0,      0,      0,
170         0,      0,      0,      0,      0,      0,      0
171     };
172 
173     cl_int status;
174     cl_uint nPlatform;
175     clGetPlatformIDs(0, NULL, &nPlatform);
176     cl_platform_id *listPlatform = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id));
177     clGetPlatformIDs(nPlatform, listPlatform, NULL);
178     cl_uint nDevice = 0;
179     clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, 0, NULL, &nDevice);
180     cl_device_id *listDevice = (cl_device_id*)malloc(nDevice * sizeof(cl_device_id));
181     clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, nDevice, listDevice, NULL);
182     cl_context context = clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status);
183     cl_command_queue queue = clCreateCommandQueue(context, listDevice[0], 0, &status);
184 
185     cl_image_format format;                     // 图像格式描述符
186     format.image_channel_order = CL_R;          // 单通道
187     format.image_channel_data_type = CL_FLOAT;  // 浮点类型(读进来还是 uchar,但是计算需要浮点)
188 
189     cl_mem d_inputImage, d_outputImage, d_filter;
190     if (true)// 旧 OpenCL 函数 clCreateImage2D
191     {
192         d_inputImage = clCreateImage2D(context, 0, &format, imageCol, imageRow, 0, NULL, &status);
193         d_outputImage = clCreateImage2D(context, 0, &format, imageCol, imageRow, 0, NULL, &status);
194     }
195     if (false)// 新 OpenCL 使用描述符图像描述符 cl_image_desc 和函数 clCreateImage
196     {
197         cl_image_desc desc;
198         desc.image_type = CL_MEM_OBJECT_IMAGE2D;
199         desc.image_width = imageCol;
200         desc.image_height = imageRow;
201         desc.image_depth = 0;
202         desc.image_array_size = 0;
203         desc.image_row_pitch = 0;
204         desc.image_slice_pitch = 0;
205         desc.num_mip_levels = 0;
206         desc.num_samples = 0;
207         desc.buffer = NULL;
208 
209         d_inputImage = clCreateImage(context, CL_MEM_READ_ONLY, &format, &desc, NULL, &status);
210         d_outputImage = clCreateImage(context, CL_MEM_WRITE_ONLY, &format, &desc, NULL, &status);
211     }
212     d_filter = clCreateBuffer(context, 0, filterSize * sizeof(float), NULL, &status);
213 
214     size_t origin[3] = { 0, 0, 0 }, region[3] = { imageCol, imageRow, 1 };  // 拷贝图像数据用的原点和尺寸,注意尺寸是先数列再数行
215     clEnqueueWriteImage(queue, d_inputImage, CL_TRUE, origin, region, 0, 0, inputImage, 0, NULL, NULL);
216     clEnqueueWriteBuffer(queue, d_filter, CL_FALSE, 0, filterSize * sizeof(float), filter, 0, NULL, NULL);
217 
218     cl_sampler sampler = clCreateSampler(context, CL_FALSE, CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &status);// 采样器
219 
220     char *code;
221     size_t length = readText(sourceText, &code);
222     cl_program program = clCreateProgramWithSource(context, 1, (const char **)&code, &length, NULL);
223     clBuildProgram(program, 1, listDevice, NULL, NULL, NULL);
224     cl_kernel kernel = clCreateKernel(program, "convolution01", &status);
225 
226     clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage);
227     clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage);
228     clSetKernelArg(kernel, 2, sizeof(int), &imageRow);
229     clSetKernelArg(kernel, 3, sizeof(int), &imageCol);
230     clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_filter);
231     clSetKernelArg(kernel, 5, sizeof(int), &filterWidth);
232     clSetKernelArg(kernel, 6, sizeof(cl_sampler), &sampler);
233 
234     size_t globalSize[2] = { imageRow, imageCol };// localSize = { 1, 1}; 可以用 NULL 代替
235     status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0, NULL, NULL);
236     clEnqueueReadImage(queue, d_outputImage, CL_TRUE, origin, region, 0, 0, outputImage, 0, NULL, NULL);
237 
238     storeImage(outputImage, outputFile, inputFile);// 将输出图像写入文件中
239 
240     for (row = 0, correct = 1; row < imageRow && correct; row++)// 检查计算结果
241     {
242         for (col = 0; col < imageCol && correct; col++)
243         {
244             sum = 0;
245             for (i = -halfFilterWidth; i <= halfFilterWidth; i++)
246             {
247                 for (j = -halfFilterWidth; j <= halfFilterWidth; j++)
248                 {
249                     if (row + i >= 0 && row + i < imageRow && col + j >= 0 && col + j < imageCol)
250                         sum += inputImage[(row + i) * imageCol + col + j] * filter[(i + halfFilterWidth) * filterWidth + j + halfFilterWidth];
251                 }
252             }
253             if (row >= halfFilterWidth && row < imageRow - halfFilterWidth && col >= halfFilterWidth && col < imageCol - halfFilterWidth &&
254                 !floatEq(outputImage[row * imageCol + col], sum))
255             {
256                 printf("Error at [%d,%d], output:%f, ref:%f\n", row, col, outputImage[row*imageCol + col], sum);
257                 correct = 0;
258             }
259         }
260     }
261     if (correct)
262         printf("Result correct.\n");
263 
264     free(listPlatform);
265     free(listDevice);
266     free(inputImage);
267     free(outputImage);
268     free(code);
269     clReleaseContext(context);
270     clReleaseCommandQueue(queue);
271     clReleaseProgram(program);
272     clReleaseKernel(kernel);
273     clReleaseMemObject(d_inputImage);
274     clReleaseMemObject(d_outputImage);
275     clReleaseMemObject(d_filter);
276     clReleaseSampler(sampler);
277     getchar();
278     return 0;
279 }

● 输出结果,给了一张 400 × 400 的图片参与,另外,计算一张 5040 × 7000 的图片需要 23 ms。

<readImage> Input image R:/input.bmp, width = 400, height = 400
Output image R:/output.bmp
Result correct.

  

● 代码,使用局部内存优化

  1 // convolution.cl
  2 __kernel void convolution02(__global float* inputImage, __global float* outputImage, int  imageRow, int  imageCol, 
  3     __constant float* filter, int filterWidth, __local float* localMem, int  localMemRow, int  localMemCol)
  4 {
  5     const int groupCol = get_group_id(0) * get_local_size(0), groupRow = get_group_id(1) * get_local_size(1);
  6     const int localCol = get_local_id(0), localRow = get_local_id(1);
  7     const int globalCol = groupCol + localCol, globalRow = groupRow + localRow;
  8     const int halfWidth = filterWidth / 2;    
  9     int i, j, curRow, curCol, filterIndex;
 10     float sum;
 11 
 12     // 将源图数据读入局部内存
 13     for (i = localRow; i < localMemRow; i += get_local_size(1))
 14     {
 15         curRow = groupRow + i;
 16         for (j = localCol; j < localMemCol; j += get_local_size(0))
 17         {
 18             curCol = groupCol + j;
 19             if (curRow < imageRow && curCol < imageCol)
 20                 localMem[i * localMemCol + j] = inputImage[curRow * imageCol + curCol];
 21         }
 22     }
 23     barrier(CLK_LOCAL_MEM_FENCE);     
 24 
 25     // 计算卷积
 26     if (globalRow < imageRow - filterWidth + 1 && globalCol < imageCol - filterWidth + 1)// 选取位于有效范围内的工作组
 27     {
 28         sum = 0.0f, filterIndex = 0;
 29         for (i = localRow; i < localRow + filterWidth; i++)
 30         {
 31             for (j = localCol; j < localCol + filterWidth; j++)
 32                 sum += localMem[i * localMemCol + j] * filter[filterIndex++];
 33         }       
 34         // 循环展开
 35         /*
 36         for (i = localRow; i < localRow + filterWidth; i++)
 37         {
 38             int offset = i * localMemCol + localCol;
 39             sum += localMem[offset++] * filter[filterIndex++];// 行数等于 filterWidth
 40             sum += localMem[offset++] * filter[filterIndex++];
 41             sum += localMem[offset++] * filter[filterIndex++];
 42             sum += localMem[offset++] * filter[filterIndex++];
 43             sum += localMem[offset++] * filter[filterIndex++];
 44             sum += localMem[offset++] * filter[filterIndex++];
 45             sum += localMem[offset++] * filter[filterIndex++];
 46         }
 47         */
 48         // 循环完全展开
 49         /*        
 50         int offset = localRow*localMemCol + localCol;
 51         sum += localMem[offset + 0] * filter[filterIndex++];
 52         sum += localMem[offset + 1] * filter[filterIndex++];
 53         sum += localMem[offset + 2] * filter[filterIndex++];
 54         sum += localMem[offset + 3] * filter[filterIndex++];
 55         sum += localMem[offset + 4] * filter[filterIndex++];
 56         sum += localMem[offset + 5] * filter[filterIndex++];
 57         sum += localMem[offset + 6] * filter[filterIndex++];
 58         offset += localMemCol;
 59         sum += localMem[offset + 0] * filter[filterIndex++];
 60         sum += localMem[offset + 1] * filter[filterIndex++];
 61         sum += localMem[offset + 2] * filter[filterIndex++];
 62         sum += localMem[offset + 3] * filter[filterIndex++];
 63         sum += localMem[offset + 4] * filter[filterIndex++];
 64         sum += localMem[offset + 5] * filter[filterIndex++];
 65         sum += localMem[offset + 6] * filter[filterIndex++];
 66         offset += localMemCol;
 67         sum += localMem[offset + 0] * filter[filterIndex++];
 68         sum += localMem[offset + 1] * filter[filterIndex++];
 69         sum += localMem[offset + 2] * filter[filterIndex++];
 70         sum += localMem[offset + 3] * filter[filterIndex++];
 71         sum += localMem[offset + 4] * filter[filterIndex++];
 72         sum += localMem[offset + 5] * filter[filterIndex++];
 73         sum += localMem[offset + 6] * filter[filterIndex++];
 74         offset += localMemCol;
 75         sum += localMem[offset + 0] * filter[filterIndex++];
 76         sum += localMem[offset + 1] * filter[filterIndex++];
 77         sum += localMem[offset + 2] * filter[filterIndex++];
 78         sum += localMem[offset + 3] * filter[filterIndex++];
 79         sum += localMem[offset + 4] * filter[filterIndex++];
 80         sum += localMem[offset + 5] * filter[filterIndex++];
 81         sum += localMem[offset + 6] * filter[filterIndex++];
 82         offset += localMemCol;
 83         sum += localMem[offset + 0] * filter[filterIndex++];
 84         sum += localMem[offset + 1] * filter[filterIndex++];
 85         sum += localMem[offset + 2] * filter[filterIndex++];
 86         sum += localMem[offset + 3] * filter[filterIndex++];
 87         sum += localMem[offset + 4] * filter[filterIndex++];
 88         sum += localMem[offset + 5] * filter[filterIndex++];
 89         sum += localMem[offset + 6] * filter[filterIndex++];
 90         offset += localMemCol;
 91         sum += localMem[offset + 0] * filter[filterIndex++];
 92         sum += localMem[offset + 1] * filter[filterIndex++];
 93         sum += localMem[offset + 2] * filter[filterIndex++];
 94         sum += localMem[offset + 3] * filter[filterIndex++];
 95         sum += localMem[offset + 4] * filter[filterIndex++];
 96         sum += localMem[offset + 5] * filter[filterIndex++];
 97         sum += localMem[offset + 6] * filter[filterIndex++];
 98         offset += localMemCol;
 99         sum += localMem[offset + 0] * filter[filterIndex++];
100         sum += localMem[offset + 1] * filter[filterIndex++];
101         sum += localMem[offset + 2] * filter[filterIndex++];
102         sum += localMem[offset + 3] * filter[filterIndex++];
103         sum += localMem[offset + 4] * filter[filterIndex++];
104         sum += localMem[offset + 5] * filter[filterIndex++];
105         sum += localMem[offset + 6] * filter[filterIndex++];
106         */
107         // 数据输出
108         outputImage[(globalRow + halfWidth) * imageCol + (globalCol + halfWidth)] = sum;
109     }
110     return;
111 }
112 
113 __kernel void convolution03(__global float4* inputImage, __global float* outputImage, int  imageRow, int imageCol, 
114     __constant float* filter, int filterWidth, __local float* localMem, int  localMemRow, int  localMemCol)
115 {        
116     const int groupCol4 = get_group_id(0) * get_local_size(0) / 4, groupRow4 = get_group_id(1) * get_local_size(1);
117     const int localId = get_local_id(1) * get_local_size(0) + get_local_id(0); 
118     int localCol = (localId % (localMemCol / 4)), localRow = (localId / (localMemCol / 4));
119     int globalCol = groupCol4 + localCol, globalRow = groupRow4 + localRow;
120     const int halfWidth = filterWidth / 2; 
121     
122     __local float4 *localImage4 = (__local float4*)&localMem[localRow*localMemCol + localCol * 4];// 局部内存数据
123 
124     if (globalRow < imageRow && globalCol < imageCol / 4 && localRow < localMemRow)
125         localImage4[0] = inputImage[globalRow*imageCol / 4 + globalCol];
126     barrier(CLK_LOCAL_MEM_FENCE);
127 
128     // 重设 坐标以输出
129     localCol = get_local_id(0);
130     localRow = get_local_id(1);
131     globalCol = get_group_id(0)*get_local_size(0) + localCol;
132     globalRow = get_group_id(1)*get_local_size(1) + localRow;
133 
134     // 计算卷积
135     int i, j, filterIndex;
136     float sum;
137     if (globalRow < imageRow - filterWidth + 1 && globalCol < imageCol - filterWidth + 1)
138     {
139         sum = 0.0f, filterIndex = 0;
140         for (i = localRow; i < localRow + filterWidth; i++)
141         {
142             for (int j = localCol; j < localCol + filterWidth; j++)
143                 sum += localMem[i * localMemCol + j] * filter[filterIndex++];
144         }
145         // 循环展开同 convolution02        
146         // 输出数据 
147         outputImage[(globalRow + halfWidth) * imageCol + (globalCol + halfWidth)] = 0;//sum;
148     }
149     return;
150 }
  1 // main.cpp
  2 #include <stdio.h>
  3 #include <stdlib.h>
  4 #include <math.h>
  5 #include <cl.h>
  6 
  7 #pragma warning(disable : 4996)
  8 
  9 #define NON_OPTIMIZED   // 不使用优化,使用函数 convolution02
 10 //#define READ_ALIGNED  // 使用内存对齐优化,使用函数 convolution02
 11 //#define READ4         // 局部内存使用 float4 读取优化,使用函数 convolution03,有点问题
 12 #define WGX 16          // 工作组尺寸
 13 #define WGY 16
 14 
 15 char *sourceText = "D:/Code/OpenCL/OpenCLProjectTemp/OpenCLProjectTemp/convolution.cl";
 16 const char *inputFile = "R:/input.bmp";
 17 const char *outputFile = "R:/output.bmp";
 18 
 19 unsigned int roundUp(unsigned int value, unsigned int base)// 将 value 向上取整到 multiple 的整数倍
 20 {
 21     unsigned int remainder = value % base;
 22     return remainder == 0 ? value : (value + base - remainder);
 23 }
 24 
 25 int readText(const char* kernelPath, char **pcode)
 26 {
 27     FILE *fp;
 28     int size;
 29     //printf("<readText> File: %s\n", kernelPath);
 30     fopen_s(&fp, kernelPath, "rb");
 31     if (!fp)
 32     {
 33         printf("<readText> Open file failed\n");
 34         getchar();
 35         exit(-1);
 36     }
 37     if (fseek(fp, 0, SEEK_END) != 0)
 38     {
 39         printf("<readText> Seek end of file failed\n");
 40         getchar();
 41         exit(-1);
 42     }
 43     if ((size = ftell(fp)) < 0)
 44     {
 45         printf("<readText> Get file position failed\n");
 46         getchar();
 47         exit(-1);
 48     }
 49     rewind(fp);
 50     if ((*pcode = (char *)malloc(size + 1)) == NULL)
 51     {
 52         printf("<readText> Allocate space failed\n");
 53         getchar();
 54         exit(-1);
 55     }
 56     fread(*pcode, 1, size, fp);
 57     (*pcode)[size] = '\0';
 58     fclose(fp);
 59     return size + 1;
 60 }
 61 
 62 void storeImage(float *imageOut, const char *filename, const char *refFilename)
 63 {
 64     FILE *ifp, *ofp;
 65     unsigned char *metaData, temp;
 66     int offset, i, j, row, col, mod;
 67 
 68     if (fopen_s(&ifp, refFilename, "rb") != 0)
 69     {
 70         printf(filename);
 71         exit(-1);
 72     }
 73     fseek(ifp, 10, SEEK_SET);
 74     fread(&offset, 4, 1, ifp);
 75     fseek(ifp, 18, SEEK_SET);
 76     fread(&col, 4, 1, ifp);
 77     fread(&row, 4, 1, ifp);
 78     fseek(ifp, 0, SEEK_SET);
 79     if ((metaData = (unsigned char *)malloc(offset)) == NULL)
 80     {
 81         printf("<storeImage> Allocate space failed\n");
 82         getchar();
 83         exit(-1);
 84     }
 85     fread(metaData, 1, offset, ifp);
 86 
 87     if (fopen_s(&ofp, filename, "wb") != 0)
 88     {
 89         printf("<storeImage> Open output file failed\n");
 90         getchar();
 91         exit(-1);
 92     }
 93     if (fwrite(metaData, 1, offset, ofp) != offset)
 94     {
 95         printf("<storeImage> Write output metaData failed\n");
 96         getchar();
 97         exit(-1);
 98     }
 99     for (i = row - 1, mod = (col % 4 == 0 ? 0 : 4 - col % 4); i >= 0; i--)
100     {
101         for (j = 0; j < col; j++)
102         {
103             temp = (unsigned char)imageOut[i * col + j];
104             fwrite(&temp, sizeof(unsigned char), 1, ofp);
105         }
106         for (j = 0; j < mod; fwrite(&temp, sizeof(unsigned char), 1, ofp), j++);
107 
108     }
109 
110     fclose(ifp);
111     fclose(ofp);
112     free(metaData);
113     return;
114 }
115 
116 float *readImage(const char *filename, int *outputRow, int *outputCol)
117 {
118     unsigned char temp;
119     int i, j, row, col, offset, mod;
120     float *outputImage;
121     FILE *fp;
122 
123     if (fopen_s(&fp, filename, "rb") != 0)
124     {
125         printf("<readImage> Open file failed\n");
126         getchar();
127         exit(-1);
128     }
129     fseek(fp, 10, SEEK_SET);
130     fread(&offset, 4, 1, fp);
131     fseek(fp, 18, SEEK_SET);
132     fread(&col, 4, 1, fp);
133     fread(&row, 4, 1, fp);
134     printf("<readImage> Input image %s, col = %d, row = %d\n", filename, col, row);
135 
136     if ((outputImage = (float*)malloc(sizeof(float) * col * row)) == NULL)
137     {
138         printf("<readImage> Allocate space failed\n");
139         getchar();
140         exit(-1);
141     }
142     fseek(fp, offset, SEEK_SET);
143     fflush(NULL);
144     for (i = row - 1, mod = (col % 4 == 0 ? 0 : 4 - col % 4); i >= 0; i--)
145     {
146         for (j = 0; j < col; j++)
147         {
148             fread(&temp, sizeof(unsigned char), 1, fp);
149             outputImage[i * col + j] = (float)temp;
150         }
151         for (j = 0; j < mod; fread(&temp, sizeof(unsigned char), 1, fp), j++);
152     }
153     fclose(fp);
154 
155     *outputRow = row;
156     *outputCol = col;
157     return outputImage;
158 }
159 
160 int main()
161 {
162     int imageRow, imageCol, dataSize, deviceRow, deviceCol, deviceDataSize;
163     float *inputImage, *outputImage;
164     inputImage = readImage(inputFile, &imageRow, &imageCol);
165     dataSize = imageRow * imageCol * sizeof(float);
166     outputImage = (float*)malloc(dataSize);
167 
168     // 调整列数
169 #ifdef NON_OPTIMIZED        // 不调整
170     deviceCol = imageCol;
171 #else                       // 增加道工作组尺寸的整数倍
172     deviceCol = roundUp(imageCol, WGX);
173 #endif
174     deviceRow = imageRow;   // 行数不变
175     deviceDataSize = sizeof(float) * deviceRow * deviceCol;
176 
177     const int filterWidth = 7, filterSize = filterWidth * filterWidth, halfFilterWidth = filterWidth / 2;
178     float filter[49] =
179     {
180         0,      0,      0,      0,      0,      0,      0,
181         0,      0,      0,      0,      0,      0,      0,
182         0,      0,1.f / 9,1.f / 9,1.f / 9,      0,      0,
183         0,      0,1.f / 9,1.f / 9,1.f / 9,      0,      0,
184         0,      0,1.f / 9,1.f / 9,1.f / 9,      0,      0,
185         0,      0,      0,      0,      0,      0,      0,
186         0,      0,      0,      0,      0,      0,      0
187     };
188 
189     cl_int status;
190     cl_uint nPlatform;
191     clGetPlatformIDs(0, NULL, &nPlatform);
192     cl_platform_id *listPlatform = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id));
193     clGetPlatformIDs(nPlatform, listPlatform, NULL);
194     cl_uint nDevice = 0;
195     clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, 0, NULL, &nDevice);
196     cl_device_id *listDevice = (cl_device_id*)malloc(nDevice * sizeof(cl_device_id));
197     clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, nDevice, listDevice, NULL);
198     cl_context context = clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status);
199     cl_command_queue queue = clCreateCommandQueue(context, listDevice[0], 0, &status);
200 
201     // 使用普通的缓冲区,而不用 image
202     cl_mem d_inputImage = clCreateBuffer(context, CL_MEM_READ_ONLY, deviceDataSize, NULL, NULL);
203     cl_mem d_outputImage = clCreateBuffer(context, CL_MEM_WRITE_ONLY, deviceDataSize, NULL, NULL);
204     cl_mem d_filter = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * filterSize, NULL, NULL);
205 
206     // 缓冲区写入
207 #if defined NON_OPTIMIZED   // 直接写入
208     clEnqueueWriteBuffer(queue, d_inputImage, CL_TRUE, 0, deviceDataSize, inputImage, 0, NULL, NULL);
209 #else                       // 对齐写入
210     size_t d_origin[3] = { 0,0,0 }, h_origin[3] = { 0,0,0 }, region[3] = { sizeof(float) * deviceCol, deviceRow, 1 };
211     clEnqueueWriteBufferRect(queue, d_inputImage, CL_TRUE, d_origin, h_origin, region, sizeof(float) * deviceCol, 0, sizeof(float) * imageCol, 0, inputImage, 0, NULL, NULL);
212 #endif
213     clEnqueueWriteBuffer(queue, d_filter, CL_TRUE, 0, sizeof(float) * filterSize, filter, 0, NULL, NULL);
214 
215     char *code;
216     size_t length = readText(sourceText, &code);
217     cl_program program = clCreateProgramWithSource(context, 1, (const char **)&code, &length, &status);
218     status = clBuildProgram(program, 1, listDevice, NULL, NULL, NULL);
219 
220     // 创建不同的内核
221 #if defined NON_OPTIMIZED || defined READ_ALIGNED
222     cl_kernel kernel = clCreateKernel(program, "convolution02", NULL);
223 #else
224     cl_kernel kernel = clCreateKernel(program, "convolution03", NULL);
225 #endif
226 
227     size_t globalSize[2] = { roundUp(imageCol - filterWidth + 1, WGX), roundUp(imageRow - filterWidth + 1, WGY) }, localSize[2] = { WGX, WGY };
228 
229     // 局部内存大小
230     int localRow, localCol;
231     localRow = localSize[1] + filterWidth - 1;              // 把一个工作组的大小垫起光环元素的宽度
232 #if defined NON_OPTIMIZED || defined READ_ALIGNED
233     localCol = localSize[0] + filterWidth - 1;
234 #else 
235     localCol = roundUp(localSize[0] + filterWidth - 1, 4);  //  垫起之外还要对齐到 4 的倍数上
236 #endif      
237     clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage);
238     clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage);    
239     clSetKernelArg(kernel, 2, sizeof(int), &deviceRow);
240     clSetKernelArg(kernel, 3, sizeof(int), &deviceCol);    
241     clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_filter);
242     clSetKernelArg(kernel, 5, sizeof(int), &filterWidth);
243     clSetKernelArg(kernel, 6, sizeof(float) * localCol * localRow, NULL);
244     clSetKernelArg(kernel, 7, sizeof(int), &localRow);
245     clSetKernelArg(kernel, 8, sizeof(int), &localCol);
246 
247     status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, localSize, 0, NULL, NULL);
248 
249     // 结果写回
250 #if defined NON_OPTIMIZED
251     clEnqueueReadBuffer(queue, d_outputImage, CL_TRUE, 0, deviceDataSize, outputImage, 0, NULL, NULL);
252 #else   // 最边上一圈 filterWidth / 2 的部分不写回
253     d_origin[0] = 3 * sizeof(float), d_origin[1] = 3, d_origin[2] = 0;
254     h_origin[0] = 3 * sizeof(float), h_origin[1] = 3, h_origin[2] = 0;
255     region[0] = (imageCol - filterWidth + 1) * sizeof(float), region[1] = (imageRow - filterWidth + 1), region[2] = 1;
256     clEnqueueReadBufferRect(queue, d_outputImage, CL_TRUE, d_origin, h_origin, region, sizeof(float) * deviceCol, 0, sizeof(float) * imageCol, 0, outputImage, 0, NULL, NULL);
257 #endif
258     storeImage(outputImage, outputFile, inputFile);
259 
260     // 去掉了检查结果的部分
261     free(listPlatform);
262     free(listDevice);
263     free(inputImage);
264     free(outputImage);
265     free(code);
266     clReleaseContext(context);
267     clReleaseCommandQueue(queue);
268     clReleaseProgram(program);
269     clReleaseKernel(kernel);
270     clReleaseMemObject(d_inputImage);
271     clReleaseMemObject(d_outputImage);
272     clReleaseMemObject(d_filter);
273     printf("Finshed.\n");
274     getchar();
275     return 0;
276 }

● 输出结果,与上面的简单方法相同

● 用到的函数和定义

 1 //cl.h
 2 // 采样器越界处理方案
 3 #define CL_ADDRESS_NONE                             0x1130
 4 #define CL_ADDRESS_CLAMP_TO_EDGE                    0x1131
 5 #define CL_ADDRESS_CLAMP                            0x1132
 6 #define CL_ADDRESS_REPEAT                           0x1133
 7 #define CL_ADDRESS_MIRRORED_REPEAT                  0x1134
 8 
 9 // 插值方案
10 #define CL_FILTER_NEAREST                           0x1140
11 #define CL_FILTER_LINEAR                            0x1141
12 
13 // 用到的采样器和描述符的定义
14 typedef struct _cl_sampler* cl_sampler;
15 typedef struct _cl_image_format
16 {
17     cl_channel_order        image_channel_order;
18     cl_channel_type         image_channel_data_type;
19 } cl_image_format;
20 
21 typedef struct _cl_image_desc
22 {
23     cl_mem_object_type      image_type;
24     size_t                  image_width;
25     size_t                  image_height;
26     size_t                  image_depth;
27     size_t                  image_array_size;
28     size_t                  image_row_pitch;
29     size_t                  image_slice_pitch;
30     cl_uint                 num_mip_levels;
31     cl_uint                 num_samples;
32     cl_mem                  buffer;
33 } cl_image_desc;
34 
35 extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL clCreateImage2D(// OpenCL1.2 中废弃的 image 创建函数
36     cl_context,             // 上下文
37     cl_mem_flags,           // 特殊标志
38     const cl_image_format *,// image 描述符
39     size_t,                 // 宽度
40     size_t,                 // 高度
41     size_t,                 // 行跨步
42     void *,                 // 自动传入主机数据
43     cl_int *                // 返回结果状态的指针
44 ) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
45 
46 extern CL_API_ENTRY cl_mem CL_API_CALL clCreateImage(// OpenCL1.2 中开始使用的 image 创建函数
47     cl_context,             // 上下文
48     cl_mem_flags,           // 特殊标志
49     const cl_image_format *,// image 格式描述符
50     const cl_image_desc *,  // image 描述符
51     void *,                 // 主机数据
52     cl_int *                // 返回结果状态的指针
53 ) CL_API_SUFFIX__VERSION_1_2;
54 
55 extern CL_API_ENTRY cl_sampler CL_API_CALL clCreateSampler(// 初始化采样器
56     cl_context,         // 上下文
57     cl_bool,            // 是否使用归一化坐标
58     cl_addressing_mode, // 越界处理方案
59     cl_filter_mode,     // 差值方案
60     cl_int *            // 返回结果状态的指针
61 ) CL_API_SUFFIX__VERSION_1_0;
62 
63 extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueWriteBufferRect(// 矩形缓冲区写入
64     cl_command_queue,   // 命令队列
65     cl_mem,             // 目标缓冲区
66     cl_bool,            // 阻塞标记
67     const size_t *,     // 缓冲区写入起点
68     const size_t *,     // 源数据写入起点
69     const size_t *,     // 写入范围,是一个三维数组,分别为:一行数据量(Byte),行数,层数
70     size_t,             // 缓冲区行间跨度
71     size_t,             // 缓冲区层间跨度
72     size_t,             // 源数据行间跨度
73     size_t,             // 源数据层间跨度
74     const void *,       // 源数据指针
75     cl_uint,            // 时间列表长度
76     const cl_event *,   // 时间列表
77     cl_event *          // 本事件标记
78 ) CL_API_SUFFIX__VERSION_1_1;
79 
80 extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReadBufferRect(// 矩形缓冲区读出,参数定义同上
81     cl_command_queue,
82     cl_mem,
83     cl_bool,
84     const size_t *,
85     const size_t *,
86     const size_t *,
87     size_t,
88     size_t,
89     size_t,
90     size_t,
91     void *,
92     cl_uint,
93     const cl_event *,
94     cl_event *
95 ) CL_API_SUFFIX__VERSION_1_1;

 


免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM