二維數組 cudaMallocPitch() 和三維數組 cudaMalloc3D() 的使用


▶ 使用函數 cudaMallocPitch() 和配套的函數 cudaMemcpy2D() 來使用二維數組。C 中二維數組內存分配是轉化為一維數組,連貫緊湊,每次訪問數組中的元素都必須從數組首元素開始遍歷;而 cuda 中這樣分配的二維數組內存保證了數組每一行首元素的地址值都按照 256 或 512 的倍數對齊,提高訪問效率,但使得每行末尾元素與下一行首元素地址可能不連貫,使用指針尋址時要注意考慮尾部。

1 // cuda_rumtime_api.h
2 extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, size_t *pitch, size_t widthByte, size_t height);
3  
4 extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);

● cudaMAllocPitch() 傳入存儲器指針 **devPtr,偏移值的指針 *pitch,數組行字節數 widthByte,數組行數 height。函數返回后指針指向分配的內存(每行地址對齊到 AlignByte 字節,為 256B 或 512B),偏移值指針指向的值為該行實際字節數(= sizeof(datatype) * width + alignByte - 1) / alignByte)。

● cudaMemcpy2D() 傳入目標存儲器的指針 *dst,目標存儲器行字節數 dpitch,源存儲器指針 *src,源存儲器行字節數 spitch,數組行字節數 width,數組行數 height,拷貝方向 kind。這里要求存儲器行字節數不小於數組行字節數,多出來的部分就是每行尾部空白部分。

● 整個測試代碼。

 1 #include <stdio.h>
 2 #include <malloc.h>
 3 #include <cuda_runtime_api.h>
 4 #include "device_launch_parameters.h"
 5 
 6 __global__ void myKernel(float* devPtr, int height, int width, int pitch)
 7 {
 8     int row, col;
 9     float *rowHead;
10     
11     for (row = 0; row < height; row++)
12     {
13         rowHead = (float*)((char*)devPtr + row * pitch);
14         
15         for (col = 0; col < width; col++)
16         {
17             printf("\t%f", rowHead[col]);// 逐個打印並自增 1
18             rowHead[col]++;
19         }
20         printf("\n");
21     }
22 }
23 
24 int main()
25 {
26     size_t width = 6;
27     size_t height = 5;
28     float *h_data, *d_data;
29     size_t pitch;
30 
31     h_data = (float *)malloc(sizeof(float)*width*height);
32     for (int i = 0; i < width*height; i++)
33         h_data[i] = (float)i;
34 
35     printf("\n\tAlloc memory.");
36     cudaMallocPitch((void **)&d_data, &pitch, sizeof(float)*width, height);
37     printf("\n\tPitch = %d B\n", pitch);
38                                 
39     printf("\n\tCopy to Device.\n");
40     cudaMemcpy2D(d_data, pitch, h_data, sizeof(float)*width, sizeof(float)*width, height, cudaMemcpyHostToDevice);
41     
42     myKernel << <1, 1 >> > (d_data, height, width, pitch);
43     cudaDeviceSynchronize();
44 
45     printf("\n\tCopy back to Host.\n");
46     cudaMemcpy2D(h_data, sizeof(float)*width, d_data, pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost);
47 
48     for (int i = 0; i < width*height; i++)
49     {
50         printf("\t%f", h_data[i]);
51         if ((i + 1) % width == 0)
52             printf("\n");
53     }               
54 
55     free(h_data);
56     cudaFree(d_data);
57 
58     getchar();
59     return 0;
60 }

 ● 輸出結果:

        Alloc memory.
        Pitch = 512 B

        Copy to Device.
        0.000000        1.000000        2.000000        3.000000        4.000000        5.000000
        6.000000        7.000000        8.000000        9.000000        10.000000       11.000000
        12.000000       13.000000       14.000000       15.000000       16.000000       17.000000
        18.000000       19.000000       20.000000       21.000000       22.000000       23.000000
        24.000000       25.000000       26.000000       27.000000       28.000000       29.000000

        Copy back to Host.
        1.000000        2.000000        3.000000        4.000000        5.000000        6.000000
        7.000000        8.000000        9.000000        10.000000       11.000000       12.000000
        13.000000       14.000000       15.000000       16.000000       17.000000       18.000000
        19.000000       20.000000       21.000000       22.000000       23.000000       24.000000
        25.000000       26.000000       27.000000       28.000000       29.000000       30.000000

 

▶ 使用函數 cudaMalloc3D() 和配套的函數 cudaMemcpy3D() 來使用三維數組。因為涉及的參數較多,需要定義一些用來傳參的結構,形式上和二維數組的使用有較大差距,不好看。

● 涉及的相關代碼

 1 // driver_types.h
 2 struct cudaArray;                      // cuda 數組
 3 typedef struct cudaArray * cudaArray_t;// cuda 指針
 4 
 5 struct __device_builtin__ cudaPitchedPtr
 6 {
 7     void   *ptr;      // 實際數組指針(用完后要用 cudaFree() 釋放掉)
 8     size_t  pitch;    // 數組行字節數
 9     size_t  xsize;    // 數組列數 
10     size_t  ysize;    // 數組行數
11 };
12 
13 struct __device_builtin__ cudaExtent
14 {
15     size_t width;     // 數組行字節數 
16     size_t height;    // 數組行數 
17     size_t depth;     // 數組層數
18 };
19 
20 struct __device_builtin__ cudaPos
21 {
22     size_t x;
23     size_t y;
24     size_t z;
25 };
26 
27 struct __device_builtin__ cudaMemcpy3DParms
28 {
29     cudaArray_t            srcArray;  // 原數組指針
30     struct cudaPos         srcPos;    // 原數組偏移 
31     struct cudaPitchedPtr  srcPtr;    // ?Pitched source memory address 
32 
33     cudaArray_t            dstArray;  // 目標數組指針 
34     struct cudaPos         dstPos;    // 目標數組偏移
35     struct cudaPitchedPtr  dstPtr;    // ?Pitched destination memory address 
36 
37     struct cudaExtent      extent;    // 數組實際尺寸(去掉對齊用的空白部分) 
38     enum cudaMemcpyKind    kind;      // 拷貝類型 
39 };
40 
41 // driver_functions.h
42 static __inline__ __host__ struct cudaPitchedPtr make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz)
43 {                                     // 簡單生成 cudaPitchedPtr 結構的方法
44     struct cudaPitchedPtr s;
45 
46     s.ptr = d;
47     s.pitch = p;
48     s.xsize = xsz;
49     s.ysize = ysz;
50 
51     return s;
52 }
53 
54 static __inline__ __host__ struct cudaPos make_cudaPos(size_t x, size_t y, size_t z)
55 {                                     // 簡單的生成 cudaPos 結構的方法
56     struct cudaPos p;
57 
58     p.x = x;
59     p.y = y;
60     p.z = z;
61 
62     return p;
63 }
64 
65 static __inline__ __host__ struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d)
66 {                                     // 簡單的生成 cudaExtent 結構的方法
67     struct cudaExtent e;
68 
69     e.width = w;
70     e.height = h;
71     e.depth = d;
72 
73     return e;
74 }
75 
76 // cuda_runtime_api.h
77 extern __host__ cudaError_t CUDARTAPI cudaMalloc3D(struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent extent);
78 
79 extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3DParms *p);

 

● 完整的測試程序

 1 #include <stdio.h>
 2 #include <malloc.h>
 3 #include <cuda_runtime_api.h>
 4 #include "device_launch_parameters.h"
 5 #include <driver_functions.h>
 6 
 7 __global__ void myKernel(cudaPitchedPtr devPitchedPtr, cudaExtent extent)
 8 {
 9     float * devPtr = (float *)devPitchedPtr.ptr;
10     float *sliceHead, *rowHead;
11         // 可以定義為 char * 作面、行遷移的時候直接加減字節數,取行內元素的時候再換回 float *
12 
13     for (int z = 0; z < extent.depth; z++)
14     {
15         sliceHead = (float *)((char *)devPtr + z * devPitchedPtr.pitch * extent.height);
16         for (int y = 0; y < extent.height; y++)
17         {
18             rowHead = (float*)((char *)sliceHead + y * devPitchedPtr.pitch);
19             for (int x = 0; x < extent.width / sizeof(float); x++)// extent 存儲的是行有效字節數,要除以元素大小 
20             {
21                 printf("\t%f",rowHead[x]);// 逐個打印並自增 1
22                 rowHead[x]++;
23             }
24             printf("\n");
25         }
26         printf("\n");
27     }
28 }
29 
30 int main()
31 {
32     size_t width = 2;
33     size_t height = 3;
34     size_t depth = 4;
35     float *h_data;
36 
37     cudaPitchedPtr d_data;
38     cudaExtent extent;
39     cudaMemcpy3DParms cpyParm;
40 
41     h_data = (float *)malloc(sizeof(float) * width * height * depth);
42     for (int i = 0; i < width * height * depth; i++)
43         h_data[i] = (float)i;
44 
45     printf("\n\tAlloc memory.");
46     extent = make_cudaExtent(sizeof(float) * width, height, depth);
47     cudaMalloc3D(&d_data, extent);
48                                 
49     printf("\n\tCopy to Device.\n");
50     cpyParm = {0};
51     cpyParm.srcPtr = make_cudaPitchedPtr((void*)h_data, sizeof(float) * width, width, height);
52     cpyParm.dstPtr = d_data;
53     cpyParm.extent = extent;
54     cpyParm.kind = cudaMemcpyHostToDevice;
55     cudaMemcpy3D(&cpyParm);
56 
57     myKernel << <1, 1 >> > (d_data, extent);
58     cudaDeviceSynchronize();
59 
60     printf("\n\tCopy back to Host.\n");
61     cpyParm = { 0 };
62     cpyParm.srcPtr = d_data;
63     cpyParm.dstPtr = make_cudaPitchedPtr((void*)h_data, sizeof(float) * width, width, height);
64     cpyParm.extent = extent;
65     cpyParm.kind = cudaMemcpyDeviceToHost;
66     cudaMemcpy3D(&cpyParm);
67 
68     for (int i = 0; i < width*height*depth; i++)
69     {
70         printf("\t%f", h_data[i]);
71         if ((i + 1) % width == 0)
72             printf("\n");
73         if ((i + 1) % (width*height) == 0)
74             printf("\n");
75     }               
76 
77     free(h_data);
78     cudaFree(d_data.ptr);
79     getchar();
80     return 0;
81 }

 ● 輸出結果:

        Alloc memory.
        Copy to Device.
        0.000000        1.000000
        2.000000        3.000000
        4.000000        5.000000

        6.000000        7.000000
        8.000000        9.000000
        10.000000       11.000000

        12.000000       13.000000
        14.000000       15.000000
        16.000000       17.000000

        18.000000       19.000000
        20.000000       21.000000
        22.000000       23.000000


        Copy back to Host.
        1.000000        2.000000
        3.000000        4.000000
        5.000000        6.000000

        7.000000        8.000000
        9.000000        10.000000
        11.000000       12.000000

        13.000000       14.000000
        15.000000       16.000000
        17.000000       18.000000

        19.000000       20.000000
        21.000000       22.000000
        23.000000       24.000000        

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM