▶ 書上的代碼改進而成,從文件讀入一張 256 階灰度圖,按照給定的卷積窗口計算卷積,並輸出到文件中。
● 代碼,使用 9 格的均值窗口,居然硬讀寫 .bmp 文件,算是了解一下該文件的具體格式,留作紀念吧。
1 // convolution.cl 2 __kernel void convolution01(__read_only image2d_t inputImage, __write_only image2d_t outputImage, 3 int imageRow, int imageCol, __constant float* filter, int filterWidth, sampler_t sampler) 4 { 5 const int row = get_global_id(0), col = get_global_id(1); // 注意工作項的順序,可以和圖像讀取不一樣 6 const int halfWidth = filterWidth / 2; 7 float4 sum = { 0.0f, 0.0f, 0.0f, 0.0f }, pixel; // 輸出數據類型是四元浮點數 8 int i, j, filterIdx; // 卷積窗口單獨用一個下標遍歷 9 for (filterIdx = 0, i = -halfWidth; i <= halfWidth; i++) 10 { 11 for (j = -halfWidth; j <= halfWidth; j++) 12 { 13 pixel = read_imagef(inputImage, sampler, (int2)(col + j, row + i)); // 讀取目標坐標,注意列在前行在后 14 sum.x += pixel.x * filter[filterIdx++]; // 采用了單通道,只有第一分量有效 15 } 16 } 17 if (row < imageRow && col < imageCol) // 將落在有效范圍內的計算數據輸出 18 write_imagef(outputImage, (int2)(col, row), sum); 19 return; 20 }
1 // main.cpp 2 #include <stdio.h> 3 #include <stdlib.h> 4 #include <math.h> 5 #include <cl.h> 6 7 #pragma warning(disable : 4996) 8 9 char *sourceText = "D:/Code/OpenCL/OpenCLProjectTemp/OpenCLProjectTemp/convolution.cl"; 10 const char *inputFile = "R:/input.bmp"; 11 const char *outputFile = "R:/output.bmp"; 12 13 bool floatEq(const float a, const float b)// 相等返回 1 14 { 15 return (b == 0) ? fabs(a) < 0.001 : fabs(a / b - 1) < 0.001; 16 } 17 18 int readText(const char* kernelPath, char **pcode)// 讀取文本文件放入 pcode,返回字符串長度 19 { 20 FILE *fp; 21 int size; 22 //printf("<readText> File: %s\n", kernelPath); 23 fopen_s(&fp, kernelPath, "rb"); 24 if (!fp) 25 { 26 printf("<readText> Open file failed\n"); 27 getchar(); 28 exit(-1); 29 } 30 if (fseek(fp, 0, SEEK_END) != 0) 31 { 32 printf("<readText> Seek end of file failed\n"); 33 getchar(); 34 exit(-1); 35 } 36 if ((size = ftell(fp)) < 0) 37 { 38 printf("<readText> Get file position failed\n"); 39 getchar(); 40 exit(-1); 41 } 42 rewind(fp); 43 if ((*pcode = (char *)malloc(size + 1)) == NULL) 44 { 45 printf("<readText> Allocate space failed\n"); 46 getchar(); 47 exit(-1); 48 } 49 fread(*pcode, 1, size, fp); 50 (*pcode)[size] = '\0'; 51 fclose(fp); 52 return size + 1; 53 } 54 55 void storeImage(float *imageOut, const char *filename, const char *refFilename)// 輸出圖片 56 { 57 FILE *ifp, *ofp; 58 unsigned char *metaData, temp; 59 int offset, i, j, row, col, mod; 60 61 if (fopen_s(&ifp, refFilename, "rb") != 0)// 從 參考圖片(輸入文件)中讀取需要的行列數 62 { 63 printf(filename); 64 exit(-1); 65 } 66 fseek(ifp, 10, SEEK_SET); 67 fread(&offset, 4, 1, ifp); 68 fseek(ifp, 18, SEEK_SET); 69 fread(&col, 4, 1, ifp); 70 fread(&row, 4, 1, ifp); 71 fseek(ifp, 0, SEEK_SET); 72 if ((metaData = (unsigned char *)malloc(offset)) == NULL) 73 { 74 printf("<storeImage> Allocate space failed\n"); 75 getchar(); 76 exit(-1); 77 } 78 fread(metaData, 1, offset, ifp); // 從輸入文件中讀取元信息 79 //printf("Output image %s\n", filename); 80 if (fopen_s(&ofp, filename, "wb") != 0) 81 { 82 printf("<storeImage> Open output file failed\n"); 83 getchar(); 84 exit(-1); 85 } 86 if (fwrite(metaData, 1, offset, ofp) != offset) // 將元信息原封不動的放入輸出文件中 87 { 88 printf("<storeImage> Write output metaData failed\n"); 89 getchar(); 90 exit(-1); 91 } 92 for (i = row - 1, mod = (col % 4 == 0 ? 0 : 4 - col % 4); i >= 0; i--)// .bmp 行是顛倒的,倒着填充 93 { 94 for (j = 0; j < col; j++) 95 { 96 temp = (unsigned char)imageOut[i * col + j]; 97 fwrite(&temp, sizeof(unsigned char), 1, ofp); 98 } 99 for (j = 0; j < mod; fwrite(&temp, sizeof(unsigned char), 1, ofp), j++);// 列數非 4 的倍數時補上 junk padding 100 101 } 102 103 fclose(ifp); 104 fclose(ofp); 105 free(metaData); 106 return; 107 } 108 109 float *readImage(const char *filename, int *outputRow, int *outputCol)// 從文件讀取圖片 110 { 111 unsigned char temp; 112 int i, j, row, col, offset, mod; 113 float *outputImage; 114 FILE *fp; 115 116 if (fopen_s(&fp, filename, "rb") != 0) 117 { 118 printf("<readImage> Open file failed\n"); 119 getchar(); 120 exit(-1); 121 } 122 fseek(fp, 10, SEEK_SET); // 第 10 字節的位置 123 fread(&offset, 4, 1, fp); // 元信息大小 124 fseek(fp, 18, SEEK_SET); // 第 18 字節位置 125 fread(&col, 4, 1, fp); // 讀取列數和行數 126 fread(&row, 4, 1, fp); 127 printf("<readImage> Input image %s, col = %d, row = %d\n", filename, col, row); 128 129 if ((outputImage = (float*)malloc(sizeof(float) * col * row)) == NULL) 130 { 131 printf("<readImage> Allocate space failed\n"); 132 getchar(); 133 exit(-1); 134 } 135 fseek(fp, offset, SEEK_SET);// 元信息結束的地方,開始讀圖像數據 136 fflush(NULL); 137 for (i = row - 1, mod = (col % 4 == 0 ? 0 : 4 - col % 4); i >= 0; i--)// .bmp 行是顛倒的,順着讀文件,倒着填充,mod 為列的 junk pading 厚度 138 { 139 for (j = 0; j < col; j++) 140 { 141 fread(&temp, sizeof(unsigned char), 1, fp); 142 outputImage[i * col + j] = (float)temp; 143 } 144 for (j = 0; j < mod; fread(&temp, sizeof(unsigned char), 1, fp), j++);// 讀取 junk padding,不傳入數據中(.bmp 文件中有這幾列,但是不顯示) 145 } 146 fclose(fp); 147 148 *outputRow = row; 149 *outputCol = col; 150 return outputImage; 151 } 152 153 int main() 154 { 155 int imageRow, imageCol, dataSize, row, col, i, j, correct; 156 float *inputImage, *outputImage, sum; 157 158 inputImage = readImage(inputFile, &imageRow, &imageCol);// 從文件讀取圖像數據和行列信息 159 dataSize = imageRow * imageCol * sizeof(float); 160 outputImage = (float*)malloc(dataSize); 161 162 const int filterWidth = 7, filterSize = filterWidth * filterWidth, halfFilterWidth = filterWidth / 2; 163 float filter[49] = 164 { 0, 0, 0, 0, 0, 0, 0, 165 0, 0, 0, 0, 0, 0, 0, 166 0, 0,1.f / 9,1.f / 9,1.f / 9, 0, 0, 167 0, 0,1.f / 9,1.f / 9,1.f / 9, 0, 0, 168 0, 0,1.f / 9,1.f / 9,1.f / 9, 0, 0, 169 0, 0, 0, 0, 0, 0, 0, 170 0, 0, 0, 0, 0, 0, 0 171 }; 172 173 cl_int status; 174 cl_uint nPlatform; 175 clGetPlatformIDs(0, NULL, &nPlatform); 176 cl_platform_id *listPlatform = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id)); 177 clGetPlatformIDs(nPlatform, listPlatform, NULL); 178 cl_uint nDevice = 0; 179 clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, 0, NULL, &nDevice); 180 cl_device_id *listDevice = (cl_device_id*)malloc(nDevice * sizeof(cl_device_id)); 181 clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, nDevice, listDevice, NULL); 182 cl_context context = clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status); 183 cl_command_queue queue = clCreateCommandQueue(context, listDevice[0], 0, &status); 184 185 cl_image_format format; // 圖像格式描述符 186 format.image_channel_order = CL_R; // 單通道 187 format.image_channel_data_type = CL_FLOAT; // 浮點類型(讀進來還是 uchar,但是計算需要浮點) 188 189 cl_mem d_inputImage, d_outputImage, d_filter; 190 if (true)// 舊 OpenCL 函數 clCreateImage2D 191 { 192 d_inputImage = clCreateImage2D(context, 0, &format, imageCol, imageRow, 0, NULL, &status); 193 d_outputImage = clCreateImage2D(context, 0, &format, imageCol, imageRow, 0, NULL, &status); 194 } 195 if (false)// 新 OpenCL 使用描述符圖像描述符 cl_image_desc 和函數 clCreateImage 196 { 197 cl_image_desc desc; 198 desc.image_type = CL_MEM_OBJECT_IMAGE2D; 199 desc.image_width = imageCol; 200 desc.image_height = imageRow; 201 desc.image_depth = 0; 202 desc.image_array_size = 0; 203 desc.image_row_pitch = 0; 204 desc.image_slice_pitch = 0; 205 desc.num_mip_levels = 0; 206 desc.num_samples = 0; 207 desc.buffer = NULL; 208 209 d_inputImage = clCreateImage(context, CL_MEM_READ_ONLY, &format, &desc, NULL, &status); 210 d_outputImage = clCreateImage(context, CL_MEM_WRITE_ONLY, &format, &desc, NULL, &status); 211 } 212 d_filter = clCreateBuffer(context, 0, filterSize * sizeof(float), NULL, &status); 213 214 size_t origin[3] = { 0, 0, 0 }, region[3] = { imageCol, imageRow, 1 }; // 拷貝圖像數據用的原點和尺寸,注意尺寸是先數列再數行 215 clEnqueueWriteImage(queue, d_inputImage, CL_TRUE, origin, region, 0, 0, inputImage, 0, NULL, NULL); 216 clEnqueueWriteBuffer(queue, d_filter, CL_FALSE, 0, filterSize * sizeof(float), filter, 0, NULL, NULL); 217 218 cl_sampler sampler = clCreateSampler(context, CL_FALSE, CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &status);// 采樣器 219 220 char *code; 221 size_t length = readText(sourceText, &code); 222 cl_program program = clCreateProgramWithSource(context, 1, (const char **)&code, &length, NULL); 223 clBuildProgram(program, 1, listDevice, NULL, NULL, NULL); 224 cl_kernel kernel = clCreateKernel(program, "convolution01", &status); 225 226 clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage); 227 clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage); 228 clSetKernelArg(kernel, 2, sizeof(int), &imageRow); 229 clSetKernelArg(kernel, 3, sizeof(int), &imageCol); 230 clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_filter); 231 clSetKernelArg(kernel, 5, sizeof(int), &filterWidth); 232 clSetKernelArg(kernel, 6, sizeof(cl_sampler), &sampler); 233 234 size_t globalSize[2] = { imageRow, imageCol };// localSize = { 1, 1}; 可以用 NULL 代替 235 status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0, NULL, NULL); 236 clEnqueueReadImage(queue, d_outputImage, CL_TRUE, origin, region, 0, 0, outputImage, 0, NULL, NULL); 237 238 storeImage(outputImage, outputFile, inputFile);// 將輸出圖像寫入文件中 239 240 for (row = 0, correct = 1; row < imageRow && correct; row++)// 檢查計算結果 241 { 242 for (col = 0; col < imageCol && correct; col++) 243 { 244 sum = 0; 245 for (i = -halfFilterWidth; i <= halfFilterWidth; i++) 246 { 247 for (j = -halfFilterWidth; j <= halfFilterWidth; j++) 248 { 249 if (row + i >= 0 && row + i < imageRow && col + j >= 0 && col + j < imageCol) 250 sum += inputImage[(row + i) * imageCol + col + j] * filter[(i + halfFilterWidth) * filterWidth + j + halfFilterWidth]; 251 } 252 } 253 if (row >= halfFilterWidth && row < imageRow - halfFilterWidth && col >= halfFilterWidth && col < imageCol - halfFilterWidth && 254 !floatEq(outputImage[row * imageCol + col], sum)) 255 { 256 printf("Error at [%d,%d], output:%f, ref:%f\n", row, col, outputImage[row*imageCol + col], sum); 257 correct = 0; 258 } 259 } 260 } 261 if (correct) 262 printf("Result correct.\n"); 263 264 free(listPlatform); 265 free(listDevice); 266 free(inputImage); 267 free(outputImage); 268 free(code); 269 clReleaseContext(context); 270 clReleaseCommandQueue(queue); 271 clReleaseProgram(program); 272 clReleaseKernel(kernel); 273 clReleaseMemObject(d_inputImage); 274 clReleaseMemObject(d_outputImage); 275 clReleaseMemObject(d_filter); 276 clReleaseSampler(sampler); 277 getchar(); 278 return 0; 279 }
● 輸出結果,給了一張 400 × 400 的圖片參與,另外,計算一張 5040 × 7000 的圖片需要 23 ms。
<readImage> Input image R:/input.bmp, width = 400, height = 400 Output image R:/output.bmp Result correct.
● 代碼,使用局部內存優化
1 // convolution.cl 2 __kernel void convolution02(__global float* inputImage, __global float* outputImage, int imageRow, int imageCol, 3 __constant float* filter, int filterWidth, __local float* localMem, int localMemRow, int localMemCol) 4 { 5 const int groupCol = get_group_id(0) * get_local_size(0), groupRow = get_group_id(1) * get_local_size(1); 6 const int localCol = get_local_id(0), localRow = get_local_id(1); 7 const int globalCol = groupCol + localCol, globalRow = groupRow + localRow; 8 const int halfWidth = filterWidth / 2; 9 int i, j, curRow, curCol, filterIndex; 10 float sum; 11 12 // 將源圖數據讀入局部內存 13 for (i = localRow; i < localMemRow; i += get_local_size(1)) 14 { 15 curRow = groupRow + i; 16 for (j = localCol; j < localMemCol; j += get_local_size(0)) 17 { 18 curCol = groupCol + j; 19 if (curRow < imageRow && curCol < imageCol) 20 localMem[i * localMemCol + j] = inputImage[curRow * imageCol + curCol]; 21 } 22 } 23 barrier(CLK_LOCAL_MEM_FENCE); 24 25 // 計算卷積 26 if (globalRow < imageRow - filterWidth + 1 && globalCol < imageCol - filterWidth + 1)// 選取位於有效范圍內的工作組 27 { 28 sum = 0.0f, filterIndex = 0; 29 for (i = localRow; i < localRow + filterWidth; i++) 30 { 31 for (j = localCol; j < localCol + filterWidth; j++) 32 sum += localMem[i * localMemCol + j] * filter[filterIndex++]; 33 } 34 // 循環展開 35 /* 36 for (i = localRow; i < localRow + filterWidth; i++) 37 { 38 int offset = i * localMemCol + localCol; 39 sum += localMem[offset++] * filter[filterIndex++];// 行數等於 filterWidth 40 sum += localMem[offset++] * filter[filterIndex++]; 41 sum += localMem[offset++] * filter[filterIndex++]; 42 sum += localMem[offset++] * filter[filterIndex++]; 43 sum += localMem[offset++] * filter[filterIndex++]; 44 sum += localMem[offset++] * filter[filterIndex++]; 45 sum += localMem[offset++] * filter[filterIndex++]; 46 } 47 */ 48 // 循環完全展開 49 /* 50 int offset = localRow*localMemCol + localCol; 51 sum += localMem[offset + 0] * filter[filterIndex++]; 52 sum += localMem[offset + 1] * filter[filterIndex++]; 53 sum += localMem[offset + 2] * filter[filterIndex++]; 54 sum += localMem[offset + 3] * filter[filterIndex++]; 55 sum += localMem[offset + 4] * filter[filterIndex++]; 56 sum += localMem[offset + 5] * filter[filterIndex++]; 57 sum += localMem[offset + 6] * filter[filterIndex++]; 58 offset += localMemCol; 59 sum += localMem[offset + 0] * filter[filterIndex++]; 60 sum += localMem[offset + 1] * filter[filterIndex++]; 61 sum += localMem[offset + 2] * filter[filterIndex++]; 62 sum += localMem[offset + 3] * filter[filterIndex++]; 63 sum += localMem[offset + 4] * filter[filterIndex++]; 64 sum += localMem[offset + 5] * filter[filterIndex++]; 65 sum += localMem[offset + 6] * filter[filterIndex++]; 66 offset += localMemCol; 67 sum += localMem[offset + 0] * filter[filterIndex++]; 68 sum += localMem[offset + 1] * filter[filterIndex++]; 69 sum += localMem[offset + 2] * filter[filterIndex++]; 70 sum += localMem[offset + 3] * filter[filterIndex++]; 71 sum += localMem[offset + 4] * filter[filterIndex++]; 72 sum += localMem[offset + 5] * filter[filterIndex++]; 73 sum += localMem[offset + 6] * filter[filterIndex++]; 74 offset += localMemCol; 75 sum += localMem[offset + 0] * filter[filterIndex++]; 76 sum += localMem[offset + 1] * filter[filterIndex++]; 77 sum += localMem[offset + 2] * filter[filterIndex++]; 78 sum += localMem[offset + 3] * filter[filterIndex++]; 79 sum += localMem[offset + 4] * filter[filterIndex++]; 80 sum += localMem[offset + 5] * filter[filterIndex++]; 81 sum += localMem[offset + 6] * filter[filterIndex++]; 82 offset += localMemCol; 83 sum += localMem[offset + 0] * filter[filterIndex++]; 84 sum += localMem[offset + 1] * filter[filterIndex++]; 85 sum += localMem[offset + 2] * filter[filterIndex++]; 86 sum += localMem[offset + 3] * filter[filterIndex++]; 87 sum += localMem[offset + 4] * filter[filterIndex++]; 88 sum += localMem[offset + 5] * filter[filterIndex++]; 89 sum += localMem[offset + 6] * filter[filterIndex++]; 90 offset += localMemCol; 91 sum += localMem[offset + 0] * filter[filterIndex++]; 92 sum += localMem[offset + 1] * filter[filterIndex++]; 93 sum += localMem[offset + 2] * filter[filterIndex++]; 94 sum += localMem[offset + 3] * filter[filterIndex++]; 95 sum += localMem[offset + 4] * filter[filterIndex++]; 96 sum += localMem[offset + 5] * filter[filterIndex++]; 97 sum += localMem[offset + 6] * filter[filterIndex++]; 98 offset += localMemCol; 99 sum += localMem[offset + 0] * filter[filterIndex++]; 100 sum += localMem[offset + 1] * filter[filterIndex++]; 101 sum += localMem[offset + 2] * filter[filterIndex++]; 102 sum += localMem[offset + 3] * filter[filterIndex++]; 103 sum += localMem[offset + 4] * filter[filterIndex++]; 104 sum += localMem[offset + 5] * filter[filterIndex++]; 105 sum += localMem[offset + 6] * filter[filterIndex++]; 106 */ 107 // 數據輸出 108 outputImage[(globalRow + halfWidth) * imageCol + (globalCol + halfWidth)] = sum; 109 } 110 return; 111 } 112 113 __kernel void convolution03(__global float4* inputImage, __global float* outputImage, int imageRow, int imageCol, 114 __constant float* filter, int filterWidth, __local float* localMem, int localMemRow, int localMemCol) 115 { 116 const int groupCol4 = get_group_id(0) * get_local_size(0) / 4, groupRow4 = get_group_id(1) * get_local_size(1); 117 const int localId = get_local_id(1) * get_local_size(0) + get_local_id(0); 118 int localCol = (localId % (localMemCol / 4)), localRow = (localId / (localMemCol / 4)); 119 int globalCol = groupCol4 + localCol, globalRow = groupRow4 + localRow; 120 const int halfWidth = filterWidth / 2; 121 122 __local float4 *localImage4 = (__local float4*)&localMem[localRow*localMemCol + localCol * 4];// 局部內存數據 123 124 if (globalRow < imageRow && globalCol < imageCol / 4 && localRow < localMemRow) 125 localImage4[0] = inputImage[globalRow*imageCol / 4 + globalCol]; 126 barrier(CLK_LOCAL_MEM_FENCE); 127 128 // 重設 坐標以輸出 129 localCol = get_local_id(0); 130 localRow = get_local_id(1); 131 globalCol = get_group_id(0)*get_local_size(0) + localCol; 132 globalRow = get_group_id(1)*get_local_size(1) + localRow; 133 134 // 計算卷積 135 int i, j, filterIndex; 136 float sum; 137 if (globalRow < imageRow - filterWidth + 1 && globalCol < imageCol - filterWidth + 1) 138 { 139 sum = 0.0f, filterIndex = 0; 140 for (i = localRow; i < localRow + filterWidth; i++) 141 { 142 for (int j = localCol; j < localCol + filterWidth; j++) 143 sum += localMem[i * localMemCol + j] * filter[filterIndex++]; 144 } 145 // 循環展開同 convolution02 146 // 輸出數據 147 outputImage[(globalRow + halfWidth) * imageCol + (globalCol + halfWidth)] = 0;//sum; 148 } 149 return; 150 }
1 // main.cpp 2 #include <stdio.h> 3 #include <stdlib.h> 4 #include <math.h> 5 #include <cl.h> 6 7 #pragma warning(disable : 4996) 8 9 #define NON_OPTIMIZED // 不使用優化,使用函數 convolution02 10 //#define READ_ALIGNED // 使用內存對齊優化,使用函數 convolution02 11 //#define READ4 // 局部內存使用 float4 讀取優化,使用函數 convolution03,有點問題 12 #define WGX 16 // 工作組尺寸 13 #define WGY 16 14 15 char *sourceText = "D:/Code/OpenCL/OpenCLProjectTemp/OpenCLProjectTemp/convolution.cl"; 16 const char *inputFile = "R:/input.bmp"; 17 const char *outputFile = "R:/output.bmp"; 18 19 unsigned int roundUp(unsigned int value, unsigned int base)// 將 value 向上取整到 multiple 的整數倍 20 { 21 unsigned int remainder = value % base; 22 return remainder == 0 ? value : (value + base - remainder); 23 } 24 25 int readText(const char* kernelPath, char **pcode) 26 { 27 FILE *fp; 28 int size; 29 //printf("<readText> File: %s\n", kernelPath); 30 fopen_s(&fp, kernelPath, "rb"); 31 if (!fp) 32 { 33 printf("<readText> Open file failed\n"); 34 getchar(); 35 exit(-1); 36 } 37 if (fseek(fp, 0, SEEK_END) != 0) 38 { 39 printf("<readText> Seek end of file failed\n"); 40 getchar(); 41 exit(-1); 42 } 43 if ((size = ftell(fp)) < 0) 44 { 45 printf("<readText> Get file position failed\n"); 46 getchar(); 47 exit(-1); 48 } 49 rewind(fp); 50 if ((*pcode = (char *)malloc(size + 1)) == NULL) 51 { 52 printf("<readText> Allocate space failed\n"); 53 getchar(); 54 exit(-1); 55 } 56 fread(*pcode, 1, size, fp); 57 (*pcode)[size] = '\0'; 58 fclose(fp); 59 return size + 1; 60 } 61 62 void storeImage(float *imageOut, const char *filename, const char *refFilename) 63 { 64 FILE *ifp, *ofp; 65 unsigned char *metaData, temp; 66 int offset, i, j, row, col, mod; 67 68 if (fopen_s(&ifp, refFilename, "rb") != 0) 69 { 70 printf(filename); 71 exit(-1); 72 } 73 fseek(ifp, 10, SEEK_SET); 74 fread(&offset, 4, 1, ifp); 75 fseek(ifp, 18, SEEK_SET); 76 fread(&col, 4, 1, ifp); 77 fread(&row, 4, 1, ifp); 78 fseek(ifp, 0, SEEK_SET); 79 if ((metaData = (unsigned char *)malloc(offset)) == NULL) 80 { 81 printf("<storeImage> Allocate space failed\n"); 82 getchar(); 83 exit(-1); 84 } 85 fread(metaData, 1, offset, ifp); 86 87 if (fopen_s(&ofp, filename, "wb") != 0) 88 { 89 printf("<storeImage> Open output file failed\n"); 90 getchar(); 91 exit(-1); 92 } 93 if (fwrite(metaData, 1, offset, ofp) != offset) 94 { 95 printf("<storeImage> Write output metaData failed\n"); 96 getchar(); 97 exit(-1); 98 } 99 for (i = row - 1, mod = (col % 4 == 0 ? 0 : 4 - col % 4); i >= 0; i--) 100 { 101 for (j = 0; j < col; j++) 102 { 103 temp = (unsigned char)imageOut[i * col + j]; 104 fwrite(&temp, sizeof(unsigned char), 1, ofp); 105 } 106 for (j = 0; j < mod; fwrite(&temp, sizeof(unsigned char), 1, ofp), j++); 107 108 } 109 110 fclose(ifp); 111 fclose(ofp); 112 free(metaData); 113 return; 114 } 115 116 float *readImage(const char *filename, int *outputRow, int *outputCol) 117 { 118 unsigned char temp; 119 int i, j, row, col, offset, mod; 120 float *outputImage; 121 FILE *fp; 122 123 if (fopen_s(&fp, filename, "rb") != 0) 124 { 125 printf("<readImage> Open file failed\n"); 126 getchar(); 127 exit(-1); 128 } 129 fseek(fp, 10, SEEK_SET); 130 fread(&offset, 4, 1, fp); 131 fseek(fp, 18, SEEK_SET); 132 fread(&col, 4, 1, fp); 133 fread(&row, 4, 1, fp); 134 printf("<readImage> Input image %s, col = %d, row = %d\n", filename, col, row); 135 136 if ((outputImage = (float*)malloc(sizeof(float) * col * row)) == NULL) 137 { 138 printf("<readImage> Allocate space failed\n"); 139 getchar(); 140 exit(-1); 141 } 142 fseek(fp, offset, SEEK_SET); 143 fflush(NULL); 144 for (i = row - 1, mod = (col % 4 == 0 ? 0 : 4 - col % 4); i >= 0; i--) 145 { 146 for (j = 0; j < col; j++) 147 { 148 fread(&temp, sizeof(unsigned char), 1, fp); 149 outputImage[i * col + j] = (float)temp; 150 } 151 for (j = 0; j < mod; fread(&temp, sizeof(unsigned char), 1, fp), j++); 152 } 153 fclose(fp); 154 155 *outputRow = row; 156 *outputCol = col; 157 return outputImage; 158 } 159 160 int main() 161 { 162 int imageRow, imageCol, dataSize, deviceRow, deviceCol, deviceDataSize; 163 float *inputImage, *outputImage; 164 inputImage = readImage(inputFile, &imageRow, &imageCol); 165 dataSize = imageRow * imageCol * sizeof(float); 166 outputImage = (float*)malloc(dataSize); 167 168 // 調整列數 169 #ifdef NON_OPTIMIZED // 不調整 170 deviceCol = imageCol; 171 #else // 增加道工作組尺寸的整數倍 172 deviceCol = roundUp(imageCol, WGX); 173 #endif 174 deviceRow = imageRow; // 行數不變 175 deviceDataSize = sizeof(float) * deviceRow * deviceCol; 176 177 const int filterWidth = 7, filterSize = filterWidth * filterWidth, halfFilterWidth = filterWidth / 2; 178 float filter[49] = 179 { 180 0, 0, 0, 0, 0, 0, 0, 181 0, 0, 0, 0, 0, 0, 0, 182 0, 0,1.f / 9,1.f / 9,1.f / 9, 0, 0, 183 0, 0,1.f / 9,1.f / 9,1.f / 9, 0, 0, 184 0, 0,1.f / 9,1.f / 9,1.f / 9, 0, 0, 185 0, 0, 0, 0, 0, 0, 0, 186 0, 0, 0, 0, 0, 0, 0 187 }; 188 189 cl_int status; 190 cl_uint nPlatform; 191 clGetPlatformIDs(0, NULL, &nPlatform); 192 cl_platform_id *listPlatform = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id)); 193 clGetPlatformIDs(nPlatform, listPlatform, NULL); 194 cl_uint nDevice = 0; 195 clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, 0, NULL, &nDevice); 196 cl_device_id *listDevice = (cl_device_id*)malloc(nDevice * sizeof(cl_device_id)); 197 clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, nDevice, listDevice, NULL); 198 cl_context context = clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status); 199 cl_command_queue queue = clCreateCommandQueue(context, listDevice[0], 0, &status); 200 201 // 使用普通的緩沖區,而不用 image 202 cl_mem d_inputImage = clCreateBuffer(context, CL_MEM_READ_ONLY, deviceDataSize, NULL, NULL); 203 cl_mem d_outputImage = clCreateBuffer(context, CL_MEM_WRITE_ONLY, deviceDataSize, NULL, NULL); 204 cl_mem d_filter = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * filterSize, NULL, NULL); 205 206 // 緩沖區寫入 207 #if defined NON_OPTIMIZED // 直接寫入 208 clEnqueueWriteBuffer(queue, d_inputImage, CL_TRUE, 0, deviceDataSize, inputImage, 0, NULL, NULL); 209 #else // 對齊寫入 210 size_t d_origin[3] = { 0,0,0 }, h_origin[3] = { 0,0,0 }, region[3] = { sizeof(float) * deviceCol, deviceRow, 1 }; 211 clEnqueueWriteBufferRect(queue, d_inputImage, CL_TRUE, d_origin, h_origin, region, sizeof(float) * deviceCol, 0, sizeof(float) * imageCol, 0, inputImage, 0, NULL, NULL); 212 #endif 213 clEnqueueWriteBuffer(queue, d_filter, CL_TRUE, 0, sizeof(float) * filterSize, filter, 0, NULL, NULL); 214 215 char *code; 216 size_t length = readText(sourceText, &code); 217 cl_program program = clCreateProgramWithSource(context, 1, (const char **)&code, &length, &status); 218 status = clBuildProgram(program, 1, listDevice, NULL, NULL, NULL); 219 220 // 創建不同的內核 221 #if defined NON_OPTIMIZED || defined READ_ALIGNED 222 cl_kernel kernel = clCreateKernel(program, "convolution02", NULL); 223 #else 224 cl_kernel kernel = clCreateKernel(program, "convolution03", NULL); 225 #endif 226 227 size_t globalSize[2] = { roundUp(imageCol - filterWidth + 1, WGX), roundUp(imageRow - filterWidth + 1, WGY) }, localSize[2] = { WGX, WGY }; 228 229 // 局部內存大小 230 int localRow, localCol; 231 localRow = localSize[1] + filterWidth - 1; // 把一個工作組的大小墊起光環元素的寬度 232 #if defined NON_OPTIMIZED || defined READ_ALIGNED 233 localCol = localSize[0] + filterWidth - 1; 234 #else 235 localCol = roundUp(localSize[0] + filterWidth - 1, 4); // 墊起之外還要對齊到 4 的倍數上 236 #endif 237 clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage); 238 clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage); 239 clSetKernelArg(kernel, 2, sizeof(int), &deviceRow); 240 clSetKernelArg(kernel, 3, sizeof(int), &deviceCol); 241 clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_filter); 242 clSetKernelArg(kernel, 5, sizeof(int), &filterWidth); 243 clSetKernelArg(kernel, 6, sizeof(float) * localCol * localRow, NULL); 244 clSetKernelArg(kernel, 7, sizeof(int), &localRow); 245 clSetKernelArg(kernel, 8, sizeof(int), &localCol); 246 247 status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, localSize, 0, NULL, NULL); 248 249 // 結果寫回 250 #if defined NON_OPTIMIZED 251 clEnqueueReadBuffer(queue, d_outputImage, CL_TRUE, 0, deviceDataSize, outputImage, 0, NULL, NULL); 252 #else // 最邊上一圈 filterWidth / 2 的部分不寫回 253 d_origin[0] = 3 * sizeof(float), d_origin[1] = 3, d_origin[2] = 0; 254 h_origin[0] = 3 * sizeof(float), h_origin[1] = 3, h_origin[2] = 0; 255 region[0] = (imageCol - filterWidth + 1) * sizeof(float), region[1] = (imageRow - filterWidth + 1), region[2] = 1; 256 clEnqueueReadBufferRect(queue, d_outputImage, CL_TRUE, d_origin, h_origin, region, sizeof(float) * deviceCol, 0, sizeof(float) * imageCol, 0, outputImage, 0, NULL, NULL); 257 #endif 258 storeImage(outputImage, outputFile, inputFile); 259 260 // 去掉了檢查結果的部分 261 free(listPlatform); 262 free(listDevice); 263 free(inputImage); 264 free(outputImage); 265 free(code); 266 clReleaseContext(context); 267 clReleaseCommandQueue(queue); 268 clReleaseProgram(program); 269 clReleaseKernel(kernel); 270 clReleaseMemObject(d_inputImage); 271 clReleaseMemObject(d_outputImage); 272 clReleaseMemObject(d_filter); 273 printf("Finshed.\n"); 274 getchar(); 275 return 0; 276 }
● 輸出結果,與上面的簡單方法相同
● 用到的函數和定義
1 //cl.h 2 // 采樣器越界處理方案 3 #define CL_ADDRESS_NONE 0x1130 4 #define CL_ADDRESS_CLAMP_TO_EDGE 0x1131 5 #define CL_ADDRESS_CLAMP 0x1132 6 #define CL_ADDRESS_REPEAT 0x1133 7 #define CL_ADDRESS_MIRRORED_REPEAT 0x1134 8 9 // 插值方案 10 #define CL_FILTER_NEAREST 0x1140 11 #define CL_FILTER_LINEAR 0x1141 12 13 // 用到的采樣器和描述符的定義 14 typedef struct _cl_sampler* cl_sampler; 15 typedef struct _cl_image_format 16 { 17 cl_channel_order image_channel_order; 18 cl_channel_type image_channel_data_type; 19 } cl_image_format; 20 21 typedef struct _cl_image_desc 22 { 23 cl_mem_object_type image_type; 24 size_t image_width; 25 size_t image_height; 26 size_t image_depth; 27 size_t image_array_size; 28 size_t image_row_pitch; 29 size_t image_slice_pitch; 30 cl_uint num_mip_levels; 31 cl_uint num_samples; 32 cl_mem buffer; 33 } cl_image_desc; 34 35 extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL clCreateImage2D(// OpenCL1.2 中廢棄的 image 創建函數 36 cl_context, // 上下文 37 cl_mem_flags, // 特殊標志 38 const cl_image_format *,// image 描述符 39 size_t, // 寬度 40 size_t, // 高度 41 size_t, // 行跨步 42 void *, // 自動傳入主機數據 43 cl_int * // 返回結果狀態的指針 44 ) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; 45 46 extern CL_API_ENTRY cl_mem CL_API_CALL clCreateImage(// OpenCL1.2 中開始使用的 image 創建函數 47 cl_context, // 上下文 48 cl_mem_flags, // 特殊標志 49 const cl_image_format *,// image 格式描述符 50 const cl_image_desc *, // image 描述符 51 void *, // 主機數據 52 cl_int * // 返回結果狀態的指針 53 ) CL_API_SUFFIX__VERSION_1_2; 54 55 extern CL_API_ENTRY cl_sampler CL_API_CALL clCreateSampler(// 初始化采樣器 56 cl_context, // 上下文 57 cl_bool, // 是否使用歸一化坐標 58 cl_addressing_mode, // 越界處理方案 59 cl_filter_mode, // 差值方案 60 cl_int * // 返回結果狀態的指針 61 ) CL_API_SUFFIX__VERSION_1_0; 62 63 extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueWriteBufferRect(// 矩形緩沖區寫入 64 cl_command_queue, // 命令隊列 65 cl_mem, // 目標緩沖區 66 cl_bool, // 阻塞標記 67 const size_t *, // 緩沖區寫入起點 68 const size_t *, // 源數據寫入起點 69 const size_t *, // 寫入范圍,是一個三維數組,分別為:一行數據量(Byte),行數,層數 70 size_t, // 緩沖區行間跨度 71 size_t, // 緩沖區層間跨度 72 size_t, // 源數據行間跨度 73 size_t, // 源數據層間跨度 74 const void *, // 源數據指針 75 cl_uint, // 時間列表長度 76 const cl_event *, // 時間列表 77 cl_event * // 本事件標記 78 ) CL_API_SUFFIX__VERSION_1_1; 79 80 extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReadBufferRect(// 矩形緩沖區讀出,參數定義同上 81 cl_command_queue, 82 cl_mem, 83 cl_bool, 84 const size_t *, 85 const size_t *, 86 const size_t *, 87 size_t, 88 size_t, 89 size_t, 90 size_t, 91 void *, 92 cl_uint, 93 const cl_event *, 94 cl_event * 95 ) CL_API_SUFFIX__VERSION_1_1;