▶ 線程束表決函數(Warp Vote Functions)
● 用於同一線程束內各線程通信和計算規約指標。
// device_functions.h, CUDA < 9.0 (legacy warp vote intrinsics, no mask argument)
__DEVICE_FUNCTIONS_STATIC_DECL__ int __all(int a)
{
    int result;
    // Convert `a` to a predicate, vote across the warp, then select 1/0.
    asm __volatile__("{ \n\t"
                     ".reg .pred \t%%p1; \n\t"
                     ".reg .pred \t%%p2; \n\t"
                     "setp.ne.u32 \t%%p1, %1, 0; \n\t"
                     "vote.all.pred \t%%p2, %%p1; \n\t"
                     "selp.s32 \t%0, 1, 0, %%p2; \n\t"
                     "}" : "=r"(result) : "r"(a));
    return result;
}

__DEVICE_FUNCTIONS_STATIC_DECL__ int __any(int a)
{
    int result;
    asm __volatile__("{ \n\t"
                     ".reg .pred \t%%p1; \n\t"
                     ".reg .pred \t%%p2; \n\t"
                     "setp.ne.u32 \t%%p1, %1, 0; \n\t"
                     "vote.any.pred \t%%p2, %%p1; \n\t"
                     "selp.s32 \t%0, 1, 0, %%p2; \n\t"
                     "}" : "=r"(result) : "r"(a));
    return result;
}

__DEVICE_FUNCTIONS_STATIC_DECL__
#if defined(__CUDACC_RTC__) || defined(__CUDACC_INTEGRATED__)
unsigned int __ballot(int a)
#else
int __ballot(int a)
#endif
{
    int result;
    // One bit per lane: bit k is set iff lane k's predicate is non-zero.
    asm __volatile__("{ \n\t"
                     ".reg .pred \t%%p1; \n\t"
                     "setp.ne.u32 \t%%p1, %1, 0; \n\t"
                     "vote.ballot.b32 \t%0, %%p1; \n\t"
                     "}" : "=r"(result) : "r"(a));
    return result;
}

// device_functions.h, CUDA >= 9.0: the three legacy intrinsics above were
// improved and deprecated in favor of masked `_sync` variants, and two new
// functions (__uni_sync, __activemask) were added.
int __all_sync(unsigned int mask, int predicate);
int __any_sync(unsigned int mask, int predicate);
int __uni_sync(unsigned int mask, int predicate);
unsigned int __ballot_sync(unsigned int mask, int predicate);
unsigned int __activemask();

// sm_30_intrinsics.hpp, CUDA >= 9.0: implementations forward to NVVM builtins.
__SM_30_INTRINSICS_DECL__ int __all_sync(unsigned mask, int pred)
{
    extern __device__ __device_builtin__ int __nvvm_vote_all_sync(unsigned int mask, int pred);
    return __nvvm_vote_all_sync(mask, pred);
}

__SM_30_INTRINSICS_DECL__ int __any_sync(unsigned mask, int pred)
{
    extern __device__ __device_builtin__ int __nvvm_vote_any_sync(unsigned int mask, int pred);
    return __nvvm_vote_any_sync(mask, pred);
}

__SM_30_INTRINSICS_DECL__ int __uni_sync(unsigned mask, int pred)
{
    extern __device__ __device_builtin__ int __nvvm_vote_uni_sync(unsigned int mask, int pred);
    return __nvvm_vote_uni_sync(mask, pred);
}

__SM_30_INTRINSICS_DECL__ unsigned __ballot_sync(unsigned mask, int pred)
{
    extern __device__ __device_builtin__ unsigned int __nvvm_vote_ballot_sync(unsigned int mask, int pred);
    return __nvvm_vote_ballot_sync(mask, pred);
}

__SM_30_INTRINSICS_DECL__ unsigned __activemask()  // fixed: missing space after the macro
{
    unsigned ret;
    int predicate = 1;
    // Implemented here as a ballot on a predicate that is true for every
    // active lane, so the result has one bit set per active lane.
    asm volatile ("{ .reg .pred p; setp.ne.u32 p, %1, 0; vote.ballot.b32 %0, p; } " : "=r"(ret) : "r"(predicate));
    return ret;
}
● 在設備代碼的一個線程中調用 __all(predicate)、__any(predicate)、__ballot(predicate) 時(CUDA 8.0 的版本都沒有 mask 參數),該線程所在的線程束中所有線程(標號 0 ~ 31,稱為 lane ID)求變量 predicate 的值,並按照一定的規律返回一個整型值。
● __all() 當且僅當所有線程的 predicate 非零時返回 1,否則返回 0。
● __any() 當且僅當至少有一個線程的 predicate 非零時返回 1,否則返回 0。
● __ballot() 返回一個無符號整數,代表了該線程束內變量 predicate 的非零值分布情況:predicate 為零的線程,返回值中對應的位為 0;predicate 非零的線程,對應的位為 1。
● CUDA 9.0 對以上函數進行了改進,變成了 __all_sync()、__any_sync()、__ballot_sync()。添加了參數 unsigned int mask(注意也是 32 bit),用來指定線程束中的特定線程參與 predicate 的計算(而不像 CUDA 8.0 中那樣全員參與),不參加計算的線程結果按 0 計。函數強制同步了所有被 mask 指定的線程:被指定的線程都必須執行到該函數的調用處(即使其 predicate 不被關心),否則結果未定義。
● __uni_sync() 當且僅當被 mask 指定線程的 predicate 全部非零或全部為零時返回 1,否則返回 0。
● __activemask() 返回一個無符號整數,代表了該線程束內活動線程的分布情況。該線程活動則返回值該位為 1,否則為 0 。該函數沒有 mask參數,必須全員參加。
● CUDA8.0 上的測試代碼
#include <stdio.h>
#include <malloc.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "device_functions.h"

// CUDA 8.0 warp vote demo: 128 threads = 4 warps; every lane of a warp
// receives that warp's vote result.

// b[tid] = 1 iff every lane in tid's warp has a[tid] > 48.
__global__ void vote_all(int *a, int *b, int n)
{
    int tid = threadIdx.x;
    if (tid >= n)  // was `tid > n`: tid == n would read a[n] out of bounds
        return;
    int temp = a[tid];
    b[tid] = __all(temp > 48);
}

// b[tid] = 1 iff at least one lane in tid's warp has a[tid] > 48.
__global__ void vote_any(int *a, int *b, int n)
{
    int tid = threadIdx.x;
    if (tid >= n)
        return;
    int temp = a[tid];
    b[tid] = __any(temp > 48);
}

// b[tid] = bitmask of lanes in tid's warp whose value lies in (42, 53).
__global__ void vote_ballot(int *a, int *b, int n)
{
    int tid = threadIdx.x;
    if (tid >= n)
        return;
    int temp = a[tid];
    b[tid] = __ballot(temp > 42 && temp < 53);
}

// Prints one result array, m values per row.
static void print_result(const char *name, const int *h_b, int n, int m)
{
    printf("%s:", name);
    for (int i = 0; i < n; ++i)
    {
        if (!(i % m))
            printf("\n");
        printf("%d ", h_b[i]);
    }
    printf("\n");
}

int main()
{
    int *h_a, *h_b, *d_a, *d_b;
    int n = 128, m = 32;  // 4 warps; print one warp (32 values) per row
    int nsize = n * sizeof(int);

    h_a = (int *)malloc(nsize);
    h_b = (int *)malloc(nsize);
    for (int i = 0; i < n; ++i)
        h_a[i] = i;
    memset(h_b, 0, nsize);
    cudaMalloc(&d_a, nsize);
    cudaMalloc(&d_b, nsize);
    cudaMemcpy(d_a, h_a, nsize, cudaMemcpyHostToDevice);
    cudaMemset(d_b, 0, nsize);

    vote_all << <1, n >> >(d_a, d_b, n);
    cudaMemcpy(h_b, d_b, nsize, cudaMemcpyDeviceToHost);  // blocking copy also synchronizes
    print_result("vote_all()", h_b, n, m);

    vote_any << <1, n >> >(d_a, d_b, n);
    cudaMemcpy(h_b, d_b, nsize, cudaMemcpyDeviceToHost);
    print_result("vote_any()", h_b, n, m);

    vote_ballot << <1, n >> >(d_a, d_b, n);
    cudaMemcpy(h_b, d_b, nsize, cudaMemcpyDeviceToHost);
    printf("vote_ballot():");
    for (int i = 0; i < n; ++i)
    {
        if (!(i % m))
            printf("\n");
        printf("%u ", h_b[i]);  // print as unsigned: the ballot result is a 32-bit mask
    }
    printf("\n");

    // Release device and host buffers (missing in the original listing).
    cudaFree(d_a);
    cudaFree(d_b);
    free(h_a);
    free(h_b);
    getchar();
    return 0;
}
● 輸出結果。其中 2095104₁₀ = 0000 0000 0001 1111 1111 1000 0000 0000₂,即第二個線程束(標號 32 ~ 63)中,從第 11 位(從 0 數起,對應全局標號 43)起連續 10 位為 1,其余為 0。
vote_all(): 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 vote_any(): 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 vote_ballot(): 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
● CUDA9.0 上的測試代碼:
#include <stdio.h>
#include <malloc.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "device_functions.h"

// CUDA 9.0 warp vote demo: the `_sync` variants take an extra `mask`
// argument selecting which lanes participate (0xffffffff = all 32).

// b[tid] = 1 iff every lane in tid's warp has a[tid] > 48.
__global__ void vote_all(int *a, int *b, int n)
{
    int tid = threadIdx.x;
    if (tid >= n)  // was `tid > n`: tid == n would read a[n] out of bounds
        return;
    int temp = a[tid];
    b[tid] = __all_sync(0xffffffff, temp > 48);  // note the added mask argument
}

// b[tid] = 1 iff at least one lane in tid's warp has a[tid] > 48.
__global__ void vote_any(int *a, int *b, int n)
{
    int tid = threadIdx.x;
    if (tid >= n)
        return;
    int temp = a[tid];
    b[tid] = __any_sync(0xffffffff, temp > 48);
}

// b[tid] = bitmask of lanes whose value lies in (42, 53).
__global__ void vote_ballot(int *a, int *b, int n)
{
    int tid = threadIdx.x;
    if (tid >= n)
        return;
    int temp = a[tid];
    b[tid] = __ballot_sync(0xffffffff, temp > 42 && temp < 53);
}

// b[tid] = 1 iff the predicate is uniform (all zero or all non-zero) across the warp.
__global__ void vote_union(int *a, int *b, int n)
{
    int tid = threadIdx.x;
    if (tid >= n)
        return;
    int temp = a[tid];
    b[tid] = __uni_sync(0xffffffff, temp > 42 && temp < 53);
}

// Records the active-lane mask in b[0]. Odd-numbered threads return early on
// purpose, so only even lanes remain active (expected mask 0x55555555).
// (The original comment claimed even threads were killed; `tid % 2` is
// non-zero for odd tids, so odd threads are the ones that exit.)
__global__ void vote_active(int *a, int *b, int n)
{
    int tid = threadIdx.x;
    if (tid >= n || tid % 2)  // kill off all odd-numbered threads
        return;
    b[0] = __activemask();  // every surviving lane writes the same value
}

// Prints one result array, m values per row.
static void print_result(const char *name, const int *h_b, int n, int m)
{
    printf("%s:", name);
    for (int i = 0; i < n; ++i)
    {
        if (!(i % m))
            printf("\n");
        printf("%d ", h_b[i]);
    }
    printf("\n");
}

int main()
{
    int *h_a, *h_b, *d_a, *d_b;
    int n = 128, m = 32;  // 4 warps; print one warp (32 values) per row
    int nsize = n * sizeof(int);

    h_a = (int *)malloc(nsize);
    h_b = (int *)malloc(nsize);
    for (int i = 0; i < n; ++i)
        h_a[i] = i;
    memset(h_b, 0, nsize);
    cudaMalloc(&d_a, nsize);
    cudaMalloc(&d_b, nsize);
    cudaMemcpy(d_a, h_a, nsize, cudaMemcpyHostToDevice);
    cudaMemset(d_b, 0, nsize);

    vote_all << <1, n >> >(d_a, d_b, n);
    cudaMemcpy(h_b, d_b, nsize, cudaMemcpyDeviceToHost);  // blocking copy also synchronizes
    print_result("vote_all()", h_b, n, m);

    vote_any << <1, n >> >(d_a, d_b, n);
    cudaMemcpy(h_b, d_b, nsize, cudaMemcpyDeviceToHost);
    print_result("vote_any()", h_b, n, m);

    vote_union << <1, n >> >(d_a, d_b, n);
    cudaMemcpy(h_b, d_b, nsize, cudaMemcpyDeviceToHost);
    print_result("vote_union()", h_b, n, m);

    vote_ballot << <1, n >> >(d_a, d_b, n);
    cudaMemcpy(h_b, d_b, nsize, cudaMemcpyDeviceToHost);
    printf("vote_ballot():");
    for (int i = 0; i < n; ++i)
    {
        if (!(i % m))
            printf("\n");
        printf("%u ", h_b[i]);  // print as unsigned: the ballot result is a 32-bit mask
    }
    printf("\n");

    vote_active << <1, n >> >(d_a, d_b, n);
    cudaMemcpy(h_b, d_b, sizeof(int), cudaMemcpyDeviceToHost);
    printf("vote_active():\n%u ", h_b[0]);  // print as unsigned: active-lane mask
    printf("\n");

    // Release device and host buffers (missing in the original listing).
    cudaFree(d_a);
    cudaFree(d_b);
    free(h_a);
    free(h_b);
    getchar();
    return 0;
}
● 輸出結果。其中 2095104 同 CUDA 8.0 中的情況;1431655765₁₀ = 0101 0101 0101 0101 0101 0101 0101 0101₂,即所有奇數號線程都不活躍(核函數中 tid % 2 非零的線程提前 return 掉了,只剩偶數號線程活躍)。
vote_all(): 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 vote_any(): 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 vote_union(): 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 vote_ballot(): 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 vote_active():
1431655765
▶ 線程束匹配函數(Warp Match Functions),要求 cc ≥ 7.0 的設備。
● 與線程束表決函數類似,對線程束內指定的線程進行計算,返回滿足條件的線程編號構成的無符號整數。T 可以是 int,unsigned int,long,unsigned long,long long,unsigned long long,float,double 。
// Warp match functions (require cc >= 7.0). T may be int, unsigned int, long,
// unsigned long, long long, unsigned long long, float, or double.
unsigned int __match_any_sync(unsigned mask, T value);
unsigned int __match_all_sync(unsigned mask, T value, int *pred);
● __match_any_sync() 比較 mask 指定的所有線程中的變量 value,返回具有相同值的線程編號構成的無符號整數。
● __match_all_sync() 比較 mask 指定的所有線程中的變量 value,當所有被指定的線程具有相同值的時候返回 mask 且 *pred 被置為 true,否則返回 0 且置 *pred 為 false。
▶ 線程束統籌函數(Warp Shuffle Functions)
● 定義在 sm_30_intrinsics.hpp 中,與 Warp Vote Functions 兩者構成了整個頭文件。T 可以是 int,unsigned int,long,unsigned long,long long,unsigned long long,float,double,__half,__half2 。
// sm_30_intrinsics.h, CUDA < 9.0 (legacy, no mask argument).
// Fixed from the original listing: var has type T (not int), and the
// up/down variants take an `unsigned int delta` offset, not a srcLane.
T __shfl(T var, int srcLane, int width = warpSize);
T __shfl_up(T var, unsigned int delta, int width = warpSize);
T __shfl_down(T var, unsigned int delta, int width = warpSize);
T __shfl_xor(T var, int laneMask, int width = warpSize);

// sm_30_intrinsics.h, CUDA >= 9.0: masked, synchronizing variants.
T __shfl_sync(unsigned mask, T var, int srcLane, int width = warpSize);
T __shfl_up_sync(unsigned mask, T var, unsigned int delta, int width = warpSize);
T __shfl_down_sync(unsigned mask, T var, unsigned int delta, int width = warpSize);
T __shfl_xor_sync(unsigned mask, T var, int laneMask, int width = warpSize);
● 此處說明的圖,以及后面的規約計算代碼來源:http://blog.csdn.net/bruce_0712/article/details/64926471
● __shfl_sync() 被 mask 指定的線程返回標號為 srcLane 的線程中的變量 var 的值,其余線程返回0 。如下圖例子中,調用 shfl_sync(mask, x, 2, 16); ,則標號為 2 的線程向標號為 0 ~ 15 的線程廣播了其變量 x 的值;標號為 18 的線程向標號為 16 ~ 31 的線程廣播了其變量 x 的值。

● __shfl_up_sync() 被 mask 指定的線程返回向前偏移為 delta 的線程中的變量 var 的值,其余線程返回0 。如下圖例子中,調用 shfl_up_sync(mask, x, 2, 16); ,則標號為 2 ~15 的線程分別獲得標號為 0 ~ 13 的線程中變量 x 的值;標號為 18 ~31 的線程分別獲得標號為 16 ~ 29 的線程中變量 x 的值。

● __shfl_down_sync() 被 mask 指定的線程返回向后偏移為 delta 的線程中的變量 var 的值,其余線程返回0 。如下圖例子中,調用 shfl_down_sync(mask, x, 2, 16); ,則標號為 0 ~13 的線程分別獲得標號為 2 ~ 15 的線程中變量 x 的值;標號為 16 ~29 的線程分別獲得標號為 18 ~ 31 的線程中變量 x 的值。

● __shfl_xor_sync() 被 mask 指定的線程返回「標號等於自身標號與 laneMask 按位異或」的線程中的變量 var 的值,其余線程返回 0。如下圖例子中,調用 __shfl_xor_sync(mask, x, 1, 16); ,則相鄰線程兩兩交換變量 x 的值(標號 0 ↔ 1,2 ↔ 3,……,30 ↔ 31)。

● __shfl_xor_sync() 的參數 laneMask 說明:
■ 當 n = 2ᵏ 時,表現為將連續的 n 個元素看做一個整體,與其后方連續的 n 個元素的整體做交換,但是兩個整體的內部不做交換。例如 [0, 1, 2, 3, 4, 5, 6, 7] 做 n = 2 的變換得到 [2, 3, 0, 1, 6, 7, 4, 5] 。
■ 當 n ≠ 2ᵏ 時,先將 n 拆分成若干 2 的冪之和,分別做這些層次上的變換。這種操作是良定義的(二元輪換滿足交換律和結合律)。例如 [0, 1, 2, 3, 4, 5, 6, 7] 做 n = 3 的變換時,先做 n = 2 的變換,得到 [2, 3, 0, 1, 6, 7, 4, 5],再做 n = 1 的變換,得到 [3, 2, 1, 0, 7, 6, 5, 4] 。
● 測試代碼
#include <stdio.h>
#include <malloc.h>
#include <cuda_runtime.h>
#include <string.h>
#include "device_launch_parameters.h"
#include "device_functions.h"

// CUDA >= 9.0 warp shuffle demo kernels. Each kernel shuffles the negated
// value so that the data movement is visible against the original array.

// Broadcast: every lane of each 16-lane segment receives segment-lane 0's temp.
__global__ void shfl(int *a, int *b, int n)
{
    int tid = threadIdx.x;
    if (tid >= n)  // was `tid > n`: tid == n would read a[n] out of bounds
        return;
    int temp = -a[tid];  // broadcast the negation of the original value
    b[tid] = a[tid];     // pre-fill with the original value

    b[tid] = __shfl_sync(0x00000000, temp, 0, 16);
    // NOTE(review): varying mask (0xffffffff / 0x55555555 / 0x00000000) did not
    //   change the observed result in this test; mask selects which lanes must
    //   participate and synchronize — it does not select the data source.
    // 0  : source lane; an index >= 32 is observed to wrap (99 -> 99 % 32 = 3).
    // 16 : segment width; default 32, must be a power of two <= 32
    //      (> 32 is undefined; observed to behave as 32).
}

// Each lane reads from the lane `delta` positions below it within its segment.
__global__ void shfl_up(int *a, int *b, int n)
{
    int tid = threadIdx.x;
    if (tid >= n)
        return;
    int temp = -a[tid];
    b[tid] = a[tid];

    b[tid] = __shfl_up_sync(0x00000000, temp, 1, 16);
    // 1 is the offset (delta), not a source lane index
}

// Each lane reads from the lane `delta` positions above it within its segment.
__global__ void shfl_down(int *a, int *b, int n)
{
    int tid = threadIdx.x;
    if (tid >= n)
        return;
    int temp = -a[tid];
    b[tid] = a[tid];

    b[tid] = __shfl_down_sync(0x00000000, temp, 1, 16);
    // 1 is the offset (delta), not a source lane index
}

// Each lane reads from the lane whose index is its own index XOR laneMask.
__global__ void shfl_xor(int *a, int *b, int n)
{
    int tid = threadIdx.x;
    if (tid >= n)
        return;
    int temp = -a[tid];
    b[tid] = a[tid];

    b[tid] = __shfl_xor_sync(0x00000000, temp, 1, 16);
    // 1 is the laneMask (butterfly exchange); see the explanation earlier in this file
}

// Prints an int array, m values per row, 4 columns wide per value.
static void print_array(const char *name, const int *v, int n, int m)
{
    printf("%s:", name);
    for (int i = 0; i < n; ++i)
    {
        if (!(i % m))
            printf("\n");
        printf("%4d ", v[i]);
    }
    printf("\n");
}

int main()
{
    int *h_a, *h_b, *d_a, *d_b;
    int n = 128, m = 32;  // 4 warps; print one warp (32 values) per row
    int nsize = n * sizeof(int);

    h_a = (int *)malloc(nsize);
    h_b = (int *)malloc(nsize);
    for (int i = 0; i < n; ++i)
        h_a[i] = i;
    memset(h_b, 0, nsize);
    cudaMalloc(&d_a, nsize);
    cudaMalloc(&d_b, nsize);
    cudaMemcpy(d_a, h_a, nsize, cudaMemcpyHostToDevice);
    cudaMemset(d_b, 0, nsize);

    print_array("Inital Array", h_a, n, m);  // label kept byte-for-byte to match the recorded output

    shfl << <1, n >> >(d_a, d_b, n);
    cudaMemcpy(h_b, d_b, nsize, cudaMemcpyDeviceToHost);  // blocking copy also synchronizes
    print_array("shfl()", h_b, n, m);

    shfl_up << <1, n >> >(d_a, d_b, n);
    cudaMemcpy(h_b, d_b, nsize, cudaMemcpyDeviceToHost);
    print_array("shfl_up()", h_b, n, m);

    shfl_down << <1, n >> >(d_a, d_b, n);
    cudaMemcpy(h_b, d_b, nsize, cudaMemcpyDeviceToHost);
    print_array("shfl_down()", h_b, n, m);

    shfl_xor << <1, n >> >(d_a, d_b, n);
    cudaMemcpy(h_b, d_b, nsize, cudaMemcpyDeviceToHost);
    print_array("shfl_xor()", h_b, n, m);

    // Release device and host buffers (missing in the original listing).
    cudaFree(d_a);
    cudaFree(d_b);
    free(h_a);
    free(h_b);
    getchar();
    return 0;
}
● 輸出結果
Inital Array: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 shfl(): 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -32 -32 -32 -32 -32 -32 -32 -32 -32 -32 -32 -32 -32 -32 -32 -32 -48 -48 -48 -48 -48 -48 -48 -48 -48 -48 -48 -48 -48 -48 -48 -48 -64 -64 -64 -64 -64 -64 -64 -64 -64 -64 -64 -64 -64 -64 -64 -64 -80 -80 -80 -80 -80 -80 -80 -80 -80 -80 -80 -80 -80 -80 -80 -80 -96 -96 -96 -96 -96 -96 -96 -96 -96 -96 -96 -96 -96 -96 -96 -96 -112 -112 -112 -112 -112 -112 -112 -112 -112 -112 -112 -112 -112 -112 -112 -112 shfl_up(): 0 0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -16 -16 -17 -18 -19 -20 -21 -22 -23 -24 -25 -26 -27 -28 -29 -30 -32 -32 -33 -34 -35 -36 -37 -38 -39 -40 -41 -42 -43 -44 -45 -46 -48 -48 -49 -50 -51 -52 -53 -54 -55 -56 -57 -58 -59 -60 -61 -62 -64 -64 -65 -66 -67 -68 -69 -70 -71 -72 -73 -74 -75 -76 -77 -78 -80 -80 -81 -82 -83 -84 -85 -86 -87 -88 -89 -90 -91 -92 -93 -94 -96 -96 -97 -98 -99 -100 -101 -102 -103 -104 -105 -106 -107 -108 -109 -110 -112 -112 -113 -114 -115 -116 -117 -118 -119 -120 -121 -122 -123 -124 -125 -126 shfl_down(): -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -15 -17 -18 -19 -20 -21 -22 -23 -24 -25 -26 -27 -28 -29 -30 -31 -31 -33 -34 -35 -36 -37 -38 -39 -40 -41 -42 -43 -44 -45 -46 -47 -47 -49 -50 -51 -52 -53 -54 -55 -56 -57 -58 -59 -60 -61 -62 -63 -63 -65 -66 -67 -68 -69 -70 -71 -72 -73 -74 -75 -76 -77 -78 -79 -79 -81 -82 -83 -84 -85 -86 -87 -88 -89 -90 -91 -92 -93 -94 -95 -95 -97 -98 -99 -100 -101 -102 -103 -104 -105 -106 -107 -108 -109 -110 -111 -111 -113 -114 -115 -116 -117 -118 -119 -120 -121 -122 -123 -124 -125 -126 -127 
-127 shfl_xor(): -1 0 -3 -2 -5 -4 -7 -6 -9 -8 -11 -10 -13 -12 -15 -14 -17 -16 -19 -18 -21 -20 -23 -22 -25 -24 -27 -26 -29 -28 -31 -30 -33 -32 -35 -34 -37 -36 -39 -38 -41 -40 -43 -42 -45 -44 -47 -46 -49 -48 -51 -50 -53 -52 -55 -54 -57 -56 -59 -58 -61 -60 -63 -62 -65 -64 -67 -66 -69 -68 -71 -70 -73 -72 -75 -74 -77 -76 -79 -78 -81 -80 -83 -82 -85 -84 -87 -86 -89 -88 -91 -90 -93 -92 -95 -94 -97 -96 -99 -98 -101 -100 -103 -102 -105 -104 -107 -106 -109 -108 -111 -110 -113 -112 -115 -114 -117 -116 -119 -118 -121 -120 -123 -122 -125 -124 -127 -126
● 用 __shfl() 函數進行規約計算的代碼(只給出核函數代碼):
// Per-warp sum reduction using shuffle (legacy mask-less __shfl_down, CUDA < 9.0;
// on CUDA >= 9.0 / cc >= 7.0 use __shfl_down_sync(0xffffffff, sum, offset) instead).
// Assumes WARP_SIZE (== 32) is defined elsewhere -- TODO confirm.
// NOTE(review): only threadIdx.x == 0 stores a result, so dst[blockIdx.x] holds
// the sum of the block's FIRST warp only — correct when blockDim.x == WARP_SIZE;
// confirm callers launch with 32-thread blocks.
__global__ void reduce1(int *dst, int *src, const int n)
{
    int tidGlobal = threadIdx.x + blockDim.x * blockIdx.x;
    int tidLocal = threadIdx.x;

    // Guarded load (the original ignored n entirely): lanes past the end of
    // src contribute 0 instead of reading out of bounds.
    int sum = (tidGlobal < n) ? src[tidGlobal] : 0;

    // The original __syncthreads() here was removed: no shared memory is used,
    // and shuffle exchanges data within a warp without a block barrier.

    for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
        sum += __shfl_down(sum, offset);  // each step folds the upper half onto the lower half

    if (tidLocal == 0)
        dst[blockIdx.x] = sum;  // lane 0 holds the warp's total
}
▶ B.16. Warp matrix functions [PREVIEW FEATURE](略過),要求 cc ≥ 7.0 的設備。
▶ B.17. Profiler Counter Function(略過)
// device_functions.h
// Emits the PTX "pmevent" instruction to increment per-SM profiler hardware
// counter X once per warp. Valid X: 0..7 (counters 8..15 are reserved);
// read back via `nvprof --events prof_trigger_0x`.
#define __prof_trigger(X) asm __volatile__ ("pmevent \t" #X ";")
● 原文:Each multiprocessor has a set of sixteen hardware counters that an application can increment with a single instruction by calling the __prof_trigger() function. Increments by one per warp the per-multiprocessor hardware counter of index counter. Counters 8 to 15 are reserved and should not be used by applications. The value of counters 0, 1, ..., 7 can be obtained via nvprof by nvprof --events prof_trigger_0x where x is 0, 1, ..., 7. All counters are reset before each kernel launch (note that when collecting counters, kernel launches are synchronous as mentioned in Concurrent Execution between Host and Device).
