▶ 線程束表決函數(Warp Vote Functions)
● 用於同一線程束內各線程通信和計算規約指標。
// device_functions.h, CUDA < 9.0 (legacy warp vote intrinsics, no mask argument)
__DEVICE_FUNCTIONS_STATIC_DECL__ int __all(int a)
{
    int result;
    // Convert `a` to a predicate, vote across the warp, then select 1/0.
    asm __volatile__("{ \n\t"
                     ".reg .pred \t%%p1; \n\t"
                     ".reg .pred \t%%p2; \n\t"
                     "setp.ne.u32 \t%%p1, %1, 0; \n\t"
                     "vote.all.pred \t%%p2, %%p1; \n\t"
                     "selp.s32 \t%0, 1, 0, %%p2; \n\t"
                     "}" : "=r"(result) : "r"(a));
    return result;
}

__DEVICE_FUNCTIONS_STATIC_DECL__ int __any(int a)
{
    int result;
    asm __volatile__("{ \n\t"
                     ".reg .pred \t%%p1; \n\t"
                     ".reg .pred \t%%p2; \n\t"
                     "setp.ne.u32 \t%%p1, %1, 0; \n\t"
                     "vote.any.pred \t%%p2, %%p1; \n\t"
                     "selp.s32 \t%0, 1, 0, %%p2; \n\t"
                     "}" : "=r"(result) : "r"(a));
    return result;
}

__DEVICE_FUNCTIONS_STATIC_DECL__
#if defined(__CUDACC_RTC__) || defined(__CUDACC_INTEGRATED__)
unsigned int __ballot(int a)
#else
int __ballot(int a)
#endif
{
    int result;
    // One bit per lane: bit k is set iff lane k's predicate is non-zero.
    asm __volatile__("{ \n\t"
                     ".reg .pred \t%%p1; \n\t"
                     "setp.ne.u32 \t%%p1, %1, 0; \n\t"
                     "vote.ballot.b32 \t%0, %%p1; \n\t"
                     "}" : "=r"(result) : "r"(a));
    return result;
}

// device_functions.h, CUDA >= 9.0: the three legacy intrinsics above were
// improved and deprecated in favor of masked `_sync` variants, and two new
// functions (__uni_sync, __activemask) were added.
int __all_sync(unsigned int mask, int predicate);
int __any_sync(unsigned int mask, int predicate);
int __uni_sync(unsigned int mask, int predicate);
unsigned int __ballot_sync(unsigned int mask, int predicate);
unsigned int __activemask();

// sm_30_intrinsics.hpp, CUDA >= 9.0: implementations forward to NVVM builtins.
__SM_30_INTRINSICS_DECL__ int __all_sync(unsigned mask, int pred)
{
    extern __device__ __device_builtin__ int __nvvm_vote_all_sync(unsigned int mask, int pred);
    return __nvvm_vote_all_sync(mask, pred);
}

__SM_30_INTRINSICS_DECL__ int __any_sync(unsigned mask, int pred)
{
    extern __device__ __device_builtin__ int __nvvm_vote_any_sync(unsigned int mask, int pred);
    return __nvvm_vote_any_sync(mask, pred);
}

__SM_30_INTRINSICS_DECL__ int __uni_sync(unsigned mask, int pred)
{
    extern __device__ __device_builtin__ int __nvvm_vote_uni_sync(unsigned int mask, int pred);
    return __nvvm_vote_uni_sync(mask, pred);
}

__SM_30_INTRINSICS_DECL__ unsigned __ballot_sync(unsigned mask, int pred)
{
    extern __device__ __device_builtin__ unsigned int __nvvm_vote_ballot_sync(unsigned int mask, int pred);
    return __nvvm_vote_ballot_sync(mask, pred);
}

__SM_30_INTRINSICS_DECL__ unsigned __activemask()  // fixed: missing space after the macro
{
    unsigned ret;
    int predicate = 1;
    // Implemented here as a ballot on a predicate that is true for every
    // active lane, so the result has one bit set per active lane.
    asm volatile ("{ .reg .pred p; setp.ne.u32 p, %1, 0; vote.ballot.b32 %0, p; } " : "=r"(ret) : "r"(predicate));
    return ret;
}
● 在設備代碼的一個線程中調用 __all(predicate)、__any(predicate)、__ballot(predicate) 時(CUDA 8.0 的版本都沒有 mask 參數),該線程所在的線程束中所有線程(標號 0 ~ 31,稱為 lane ID)求變量 predicate 的值,並按照一定的規律返回一個整型值。
● __all() 當且僅當所有線程的 predicate 非零時返回 1,否則返回 0。
● __any() 當且僅當至少有一個線程的 predicate 非零時返回 1,否則返回 0。
● __ballot() 返回一個無符號整數,代表了該線程束內變量 predicate 的非零值分布情況:predicate 為零的線程,返回值中對應的位為 0;predicate 非零的線程,對應的位為 1。
● CUDA 9.0 對以上函數進行了改進,變成了 __all_sync()、__any_sync()、__ballot_sync()。添加了參數 unsigned int mask(注意也是 32 bit),用來指定線程束中的特定線程參與 predicate 的計算(而不像 CUDA 8.0 中那樣全員參與),不參加計算的線程結果按 0 計。函數強制同步了所有被 mask 指定的線程:被指定的線程都必須執行到該函數的調用處(即使其 predicate 不被關心),否則結果未定義。
● __uni_sync() 當且僅當被 mask 指定線程的 predicate 全部非零或全部為零時返回 1,否則返回 0。
● __activemask() 返回一個無符號整數,代表了該線程束內活動線程的分布情況。該線程活動則返回值該位為 1,否則為 0 。該函數沒有 mask參數,必須全員參加。
● CUDA8.0 上的測試代碼
#include <stdio.h>
#include <malloc.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "device_functions.h"

// CUDA 8.0 warp vote demo: 128 threads = 4 warps; every lane of a warp
// receives that warp's vote result.

// b[tid] = 1 iff every lane in tid's warp has a[tid] > 48.
__global__ void vote_all(int *a, int *b, int n)
{
    int tid = threadIdx.x;
    if (tid >= n)  // was `tid > n`: tid == n would read a[n] out of bounds
        return;
    int temp = a[tid];
    b[tid] = __all(temp > 48);
}

// b[tid] = 1 iff at least one lane in tid's warp has a[tid] > 48.
__global__ void vote_any(int *a, int *b, int n)
{
    int tid = threadIdx.x;
    if (tid >= n)
        return;
    int temp = a[tid];
    b[tid] = __any(temp > 48);
}

// b[tid] = bitmask of lanes in tid's warp whose value lies in (42, 53).
__global__ void vote_ballot(int *a, int *b, int n)
{
    int tid = threadIdx.x;
    if (tid >= n)
        return;
    int temp = a[tid];
    b[tid] = __ballot(temp > 42 && temp < 53);
}

// Prints one result array, m values per row.
static void print_result(const char *name, const int *h_b, int n, int m)
{
    printf("%s:", name);
    for (int i = 0; i < n; ++i)
    {
        if (!(i % m))
            printf("\n");
        printf("%d ", h_b[i]);
    }
    printf("\n");
}

int main()
{
    int *h_a, *h_b, *d_a, *d_b;
    int n = 128, m = 32;  // 4 warps; print one warp (32 values) per row
    int nsize = n * sizeof(int);

    h_a = (int *)malloc(nsize);
    h_b = (int *)malloc(nsize);
    for (int i = 0; i < n; ++i)
        h_a[i] = i;
    memset(h_b, 0, nsize);
    cudaMalloc(&d_a, nsize);
    cudaMalloc(&d_b, nsize);
    cudaMemcpy(d_a, h_a, nsize, cudaMemcpyHostToDevice);
    cudaMemset(d_b, 0, nsize);

    vote_all << <1, n >> >(d_a, d_b, n);
    cudaMemcpy(h_b, d_b, nsize, cudaMemcpyDeviceToHost);  // blocking copy also synchronizes
    print_result("vote_all()", h_b, n, m);

    vote_any << <1, n >> >(d_a, d_b, n);
    cudaMemcpy(h_b, d_b, nsize, cudaMemcpyDeviceToHost);
    print_result("vote_any()", h_b, n, m);

    vote_ballot << <1, n >> >(d_a, d_b, n);
    cudaMemcpy(h_b, d_b, nsize, cudaMemcpyDeviceToHost);
    printf("vote_ballot():");
    for (int i = 0; i < n; ++i)
    {
        if (!(i % m))
            printf("\n");
        printf("%u ", h_b[i]);  // print as unsigned: the ballot result is a 32-bit mask
    }
    printf("\n");

    // Release device and host buffers (missing in the original listing).
    cudaFree(d_a);
    cudaFree(d_b);
    free(h_a);
    free(h_b);
    getchar();
    return 0;
}
● 輸出結果。其中 2095104₁₀ = 0000 0000 0001 1111 1111 1000 0000 0000₂,即第二個線程束(標號 32 ~ 63)中,從第 11 位(從 0 數起,對應全局標號 43)起連續 10 位為 1,其余為 0。
vote_all(): 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 vote_any(): 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 vote_ballot(): 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
● CUDA9.0 上的測試代碼:
#include <stdio.h>
#include <malloc.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "device_functions.h"

// CUDA 9.0 warp vote demo: the `_sync` variants take an extra `mask`
// argument selecting which lanes participate (0xffffffff = all 32).

// b[tid] = 1 iff every lane in tid's warp has a[tid] > 48.
__global__ void vote_all(int *a, int *b, int n)
{
    int tid = threadIdx.x;
    if (tid >= n)  // was `tid > n`: tid == n would read a[n] out of bounds
        return;
    int temp = a[tid];
    b[tid] = __all_sync(0xffffffff, temp > 48);  // note the added mask argument
}

// b[tid] = 1 iff at least one lane in tid's warp has a[tid] > 48.
__global__ void vote_any(int *a, int *b, int n)
{
    int tid = threadIdx.x;
    if (tid >= n)
        return;
    int temp = a[tid];
    b[tid] = __any_sync(0xffffffff, temp > 48);
}

// b[tid] = bitmask of lanes whose value lies in (42, 53).
__global__ void vote_ballot(int *a, int *b, int n)
{
    int tid = threadIdx.x;
    if (tid >= n)
        return;
    int temp = a[tid];
    b[tid] = __ballot_sync(0xffffffff, temp > 42 && temp < 53);
}

// b[tid] = 1 iff the predicate is uniform (all zero or all non-zero) across the warp.
__global__ void vote_union(int *a, int *b, int n)
{
    int tid = threadIdx.x;
    if (tid >= n)
        return;
    int temp = a[tid];
    b[tid] = __uni_sync(0xffffffff, temp > 42 && temp < 53);
}

// Records the active-lane mask in b[0]. Odd-numbered threads return early on
// purpose, so only even lanes remain active (expected mask 0x55555555).
// (The original comment claimed even threads were killed; `tid % 2` is
// non-zero for odd tids, so odd threads are the ones that exit.)
__global__ void vote_active(int *a, int *b, int n)
{
    int tid = threadIdx.x;
    if (tid >= n || tid % 2)  // kill off all odd-numbered threads
        return;
    b[0] = __activemask();  // every surviving lane writes the same value
}

// Prints one result array, m values per row.
static void print_result(const char *name, const int *h_b, int n, int m)
{
    printf("%s:", name);
    for (int i = 0; i < n; ++i)
    {
        if (!(i % m))
            printf("\n");
        printf("%d ", h_b[i]);
    }
    printf("\n");
}

int main()
{
    int *h_a, *h_b, *d_a, *d_b;
    int n = 128, m = 32;  // 4 warps; print one warp (32 values) per row
    int nsize = n * sizeof(int);

    h_a = (int *)malloc(nsize);
    h_b = (int *)malloc(nsize);
    for (int i = 0; i < n; ++i)
        h_a[i] = i;
    memset(h_b, 0, nsize);
    cudaMalloc(&d_a, nsize);
    cudaMalloc(&d_b, nsize);
    cudaMemcpy(d_a, h_a, nsize, cudaMemcpyHostToDevice);
    cudaMemset(d_b, 0, nsize);

    vote_all << <1, n >> >(d_a, d_b, n);
    cudaMemcpy(h_b, d_b, nsize, cudaMemcpyDeviceToHost);  // blocking copy also synchronizes
    print_result("vote_all()", h_b, n, m);

    vote_any << <1, n >> >(d_a, d_b, n);
    cudaMemcpy(h_b, d_b, nsize, cudaMemcpyDeviceToHost);
    print_result("vote_any()", h_b, n, m);

    vote_union << <1, n >> >(d_a, d_b, n);
    cudaMemcpy(h_b, d_b, nsize, cudaMemcpyDeviceToHost);
    print_result("vote_union()", h_b, n, m);

    vote_ballot << <1, n >> >(d_a, d_b, n);
    cudaMemcpy(h_b, d_b, nsize, cudaMemcpyDeviceToHost);
    printf("vote_ballot():");
    for (int i = 0; i < n; ++i)
    {
        if (!(i % m))
            printf("\n");
        printf("%u ", h_b[i]);  // print as unsigned: the ballot result is a 32-bit mask
    }
    printf("\n");

    vote_active << <1, n >> >(d_a, d_b, n);
    cudaMemcpy(h_b, d_b, sizeof(int), cudaMemcpyDeviceToHost);
    printf("vote_active():\n%u ", h_b[0]);  // print as unsigned: active-lane mask
    printf("\n");

    // Release device and host buffers (missing in the original listing).
    cudaFree(d_a);
    cudaFree(d_b);
    free(h_a);
    free(h_b);
    getchar();
    return 0;
}
● 輸出結果。其中 2095104 同 CUDA 8.0 中的情況;1431655765₁₀ = 0101 0101 0101 0101 0101 0101 0101 0101₂,即所有奇數號線程都不活躍(核函數中 tid % 2 非零的線程提前 return 掉了,只剩偶數號線程活躍)。
vote_all(): 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 vote_any(): 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 vote_union(): 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 vote_ballot(): 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 2095104 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 vote_active():
1431655765
▶ 線程束匹配函數(Warp Match Functions),要求 cc ≥ 7.0 的設備。
● 與線程束表決函數類似,對線程束內指定的線程進行計算,返回滿足條件的線程編號構成的無符號整數。T 可以是 int,unsigned int,long,unsigned long,long long,unsigned long long,float,double 。
// Warp match functions (require cc >= 7.0). T may be int, unsigned int, long,
// unsigned long, long long, unsigned long long, float, or double.
unsigned int __match_any_sync(unsigned mask, T value);
unsigned int __match_all_sync(unsigned mask, T value, int *pred);
● __match_any_sync() 比較 mask 指定的所有線程中的變量 value,返回具有相同值的線程編號構成的無符號整數。
● __match_all_sync() 比較 mask 指定的所有線程中的變量 value,當所有被指定的線程具有相同值的時候返回 mask 且 *pred 被置為 true,否則返回 0 且置 *pred 為 false。
▶ 線程束統籌函數(Warp Shuffle Functions)
● 定義在 sm_30_intrinsics.hpp 中,與 Warp Vote Functions 兩者構成了整個頭文件。T 可以是 int,unsigned int,long,unsigned long,long long,unsigned long long,float,double,__half,__half2 。
// sm_30_intrinsics.h, CUDA < 9.0 (legacy, no mask argument).
// Fixed from the original listing: var has type T (not int), and the
// up/down variants take an `unsigned int delta` offset, not a srcLane.
T __shfl(T var, int srcLane, int width = warpSize);
T __shfl_up(T var, unsigned int delta, int width = warpSize);
T __shfl_down(T var, unsigned int delta, int width = warpSize);
T __shfl_xor(T var, int laneMask, int width = warpSize);

// sm_30_intrinsics.h, CUDA >= 9.0: masked, synchronizing variants.
T __shfl_sync(unsigned mask, T var, int srcLane, int width = warpSize);
T __shfl_up_sync(unsigned mask, T var, unsigned int delta, int width = warpSize);
T __shfl_down_sync(unsigned mask, T var, unsigned int delta, int width = warpSize);
T __shfl_xor_sync(unsigned mask, T var, int laneMask, int width = warpSize);
● 此處說明的圖,以及后面的規約計算代碼來源:http://blog.csdn.net/bruce_0712/article/details/64926471
● __shfl_sync() 被 mask 指定的線程返回標號為 srcLane 的線程中的變量 var 的值,其余線程返回0 。如下圖例子中,調用 shfl_sync(mask, x, 2, 16); ,則標號為 2 的線程向標號為 0 ~ 15 的線程廣播了其變量 x 的值;標號為 18 的線程向標號為 16 ~ 31 的線程廣播了其變量 x 的值。

● __shfl_up_sync() 被 mask 指定的線程返回向前偏移為 delta 的線程中的變量 var 的值,其余線程返回0 。如下圖例子中,調用 shfl_up_sync(mask, x, 2, 16); ,則標號為 2 ~15 的線程分別獲得標號為 0 ~ 13 的線程中變量 x 的值;標號為 18 ~31 的線程分別獲得標號為 16 ~ 29 的線程中變量 x 的值。

● __shfl_down_sync() 被 mask 指定的線程返回向后偏移為 delta 的線程中的變量 var 的值,其余線程返回0 。如下圖例子中,調用 shfl_down_sync(mask, x, 2, 16); ,則標號為 0 ~13 的線程分別獲得標號為 2 ~ 15 的線程中變量 x 的值;標號為 16 ~29 的線程分別獲得標號為 18 ~ 31 的線程中變量 x 的值。

● __shfl_xor_sync() 被 mask 指定的線程返回「標號等於自身標號與 laneMask 按位異或」的線程中的變量 var 的值,其余線程返回 0。如下圖例子中,調用 __shfl_xor_sync(mask, x, 1, 16); ,則相鄰線程兩兩交換變量 x 的值(標號 0 ↔ 1,2 ↔ 3,……,30 ↔ 31)。

● __shfl_xor_sync() 的參數 laneMask 說明:
■ 當 n = 2ᵏ 時,表現為將連續的 n 個元素看做一個整體,與其后方連續的 n 個元素的整體做交換,但是兩個整體的內部不做交換。例如 [0, 1, 2, 3, 4, 5, 6, 7] 做 n = 2 的變換得到 [2, 3, 0, 1, 6, 7, 4, 5] 。
■ 當 n ≠ 2ᵏ 時,先將 n 拆分成若干 2 的冪之和,分別做這些層次上的變換。這種操作是良定義的(二元輪換滿足交換律和結合律)。例如 [0, 1, 2, 3, 4, 5, 6, 7] 做 n = 3 的變換時,先做 n = 2 的變換,得到 [2, 3, 0, 1, 6, 7, 4, 5],再做 n = 1 的變換,得到 [3, 2, 1, 0, 7, 6, 5, 4] 。
● 測試代碼
#include <stdio.h>
#include <malloc.h>
#include <cuda_runtime.h>
#include <string.h>
#include "device_launch_parameters.h"
#include "device_functions.h"

// CUDA >= 9.0 warp shuffle demo kernels. Each kernel shuffles the negated
// value so that the data movement is visible against the original array.

// Broadcast: every lane of each 16-lane segment receives segment-lane 0's temp.
__global__ void shfl(int *a, int *b, int n)
{
    int tid = threadIdx.x;
    if (tid >= n)  // was `tid > n`: tid == n would read a[n] out of bounds
        return;
    int temp = -a[tid];  // broadcast the negation of the original value
    b[tid] = a[tid];     // pre-fill with the original value

    b[tid] = __shfl_sync(0x00000000, temp, 0, 16);
    // NOTE(review): varying mask (0xffffffff / 0x55555555 / 0x00000000) did not
    //   change the observed result in this test; mask selects which lanes must
    //   participate and synchronize — it does not select the data source.
    // 0  : source lane; an index >= 32 is observed to wrap (99 -> 99 % 32 = 3).
    // 16 : segment width; default 32, must be a power of two <= 32
    //      (> 32 is undefined; observed to behave as 32).
}

// Each lane reads from the lane `delta` positions below it within its segment.
__global__ void shfl_up(int *a, int *b, int n)
{
    int tid = threadIdx.x;
    if (tid >= n)
        return;
    int temp = -a[tid];
    b[tid] = a[tid];

    b[tid] = __shfl_up_sync(0x00000000, temp, 1, 16);
    // 1 is the offset (delta), not a source lane index
}

// Each lane reads from the lane `delta` positions above it within its segment.
__global__ void shfl_down(int *a, int *b, int n)
{
    int tid = threadIdx.x;
    if (tid >= n)
        return;
    int temp = -a[tid];
    b[tid] = a[tid];

    b[tid] = __shfl_down_sync(0x00000000, temp, 1, 16);
    // 1 is the offset (delta), not a source lane index
}

// Each lane reads from the lane whose index is its own index XOR laneMask.
__global__ void shfl_xor(int *a, int *b, int n)
{
    int tid = threadIdx.x;
    if (tid >= n)
        return;
    int temp = -a[tid];
    b[tid] = a[tid];

    b[tid] = __shfl_xor_sync(0x00000000, temp, 1, 16);
    // 1 is the laneMask (butterfly exchange); see the explanation earlier in this file
}

// Prints an int array, m values per row, 4 columns wide per value.
static void print_array(const char *name, const int *v, int n, int m)
{
    printf("%s:", name);
    for (int i = 0; i < n; ++i)
    {
        if (!(i % m))
            printf("\n");
        printf("%4d ", v[i]);
    }
    printf("\n");
}

int main()
{
    int *h_a, *h_b, *d_a, *d_b;
    int n = 128, m = 32;  // 4 warps; print one warp (32 values) per row
    int nsize = n * sizeof(int);

    h_a = (int *)malloc(nsize);
    h_b = (int *)malloc(nsize);
    for (int i = 0; i < n; ++i)
        h_a[i] = i;
    memset(h_b, 0, nsize);
    cudaMalloc(&d_a, nsize);
    cudaMalloc(&d_b, nsize);
    cudaMemcpy(d_a, h_a, nsize, cudaMemcpyHostToDevice);
    cudaMemset(d_b, 0, nsize);

    print_array("Inital Array", h_a, n, m);  // label kept byte-for-byte to match the recorded output

    shfl << <1, n >> >(d_a, d_b, n);
    cudaMemcpy(h_b, d_b, nsize, cudaMemcpyDeviceToHost);  // blocking copy also synchronizes
    print_array("shfl()", h_b, n, m);

    shfl_up << <1, n >> >(d_a, d_b, n);
    cudaMemcpy(h_b, d_b, nsize, cudaMemcpyDeviceToHost);
    print_array("shfl_up()", h_b, n, m);

    shfl_down << <1, n >> >(d_a, d_b, n);
    cudaMemcpy(h_b, d_b, nsize, cudaMemcpyDeviceToHost);
    print_array("shfl_down()", h_b, n, m);

    shfl_xor << <1, n >> >(d_a, d_b, n);
    cudaMemcpy(h_b, d_b, nsize, cudaMemcpyDeviceToHost);
    print_array("shfl_xor()", h_b, n, m);

    // Release device and host buffers (missing in the original listing).
    cudaFree(d_a);
    cudaFree(d_b);
    free(h_a);
    free(h_b);
    getchar();
    return 0;
}
● 輸出結果
Inital Array: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 shfl(): 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -32 -32 -32 -32 -32 -32 -32 -32 -32 -32 -32 -32 -32 -32 -32 -32 -48 -48 -48 -48 -48 -48 -48 -48 -48 -48 -48 -48 -48 -48 -48 -48 -64 -64 -64 -64 -64 -64 -64 -64 -64 -64 -64 -64 -64 -64 -64 -64 -80 -80 -80 -80 -80 -80 -80 -80 -80 -80 -80 -80 -80 -80 -80 -80 -96 -96 -96 -96 -96 -96 -96 -96 -96 -96 -96 -96 -96 -96 -96 -96 -112 -112 -112 -112 -112 -112 -112 -112 -112 -112 -112 -112 -112 -112 -112 -112 shfl_up(): 0 0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -16 -16 -17 -18 -19 -20 -21 -22 -23 -24 -25 -26 -27 -28 -29 -30 -32 -32 -33 -34 -35 -36 -37 -38 -39 -40 -41 -42 -43 -44 -45 -46 -48 -48 -49 -50 -51 -52 -53 -54 -55 -56 -57 -58 -59 -60 -61 -62 -64 -64 -65 -66 -67 -68 -69 -70 -71 -72 -73 -74 -75 -76 -77 -78 -80 -80 -81 -82 -83 -84 -85 -86 -87 -88 -89 -90 -91 -92 -93 -94 -96 -96 -97 -98 -99 -100 -101 -102 -103 -104 -105 -106 -107 -108 -109 -110 -112 -112 -113 -114 -115 -116 -117 -118 -119 -120 -121 -122 -123 -124 -125 -126 shfl_down(): -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -15 -17 -18 -19 -20 -21 -22 -23 -24 -25 -26 -27 -28 -29 -30 -31 -31 -33 -34 -35 -36 -37 -38 -39 -40 -41 -42 -43 -44 -45 -46 -47 -47 -49 -50 -51 -52 -53 -54 -55 -56 -57 -58 -59 -60 -61 -62 -63 -63 -65 -66 -67 -68 -69 -70 -71 -72 -73 -74 -75 -76 -77 -78 -79 -79 -81 -82 -83 -84 -85 -86 -87 -88 -89 -90 -91 -92 -93 -94 -95 -95 -97 -98 -99 -100 -101 -102 -103 -104 -105 -106 -107 -108 -109 -110 -111 -111 -113 -114 -115 -116 -117 -118 -119 -120 -121 -122 -123 -124 -125 -126 -127 
-127 shfl_xor(): -1 0 -3 -2 -5 -4 -7 -6 -9 -8 -11 -10 -13 -12 -15 -14 -17 -16 -19 -18 -21 -20 -23 -22 -25 -24 -27 -26 -29 -28 -31 -30 -33 -32 -35 -34 -37 -36 -39 -38 -41 -40 -43 -42 -45 -44 -47 -46 -49 -48 -51 -50 -53 -52 -55 -54 -57 -56 -59 -58 -61 -60 -63 -62 -65 -64 -67 -66 -69 -68 -71 -70 -73 -72 -75 -74 -77 -76 -79 -78 -81 -80 -83 -82 -85 -84 -87 -86 -89 -88 -91 -90 -93 -92 -95 -94 -97 -96 -99 -98 -101 -100 -103 -102 -105 -104 -107 -106 -109 -108 -111 -110 -113 -112 -115 -114 -117 -116 -119 -118 -121 -120 -123 -122 -125 -124 -127 -126
● 用 __shfl() 函數進行規約計算的代碼(只給出核函數代碼):
// Per-warp sum reduction using shuffle (legacy mask-less __shfl_down, CUDA < 9.0;
// on CUDA >= 9.0 / cc >= 7.0 use __shfl_down_sync(0xffffffff, sum, offset) instead).
// Assumes WARP_SIZE (== 32) is defined elsewhere -- TODO confirm.
// NOTE(review): only threadIdx.x == 0 stores a result, so dst[blockIdx.x] holds
// the sum of the block's FIRST warp only — correct when blockDim.x == WARP_SIZE;
// confirm callers launch with 32-thread blocks.
__global__ void reduce1(int *dst, int *src, const int n)
{
    int tidGlobal = threadIdx.x + blockDim.x * blockIdx.x;
    int tidLocal = threadIdx.x;

    // Guarded load (the original ignored n entirely): lanes past the end of
    // src contribute 0 instead of reading out of bounds.
    int sum = (tidGlobal < n) ? src[tidGlobal] : 0;

    // The original __syncthreads() here was removed: no shared memory is used,
    // and shuffle exchanges data within a warp without a block barrier.

    for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
        sum += __shfl_down(sum, offset);  // each step folds the upper half onto the lower half

    if (tidLocal == 0)
        dst[blockIdx.x] = sum;  // lane 0 holds the warp's total
}
▶ B.16. Warp matrix functions [PREVIEW FEATURE](略過),要求 cc ≥ 7.0 的設備。
▶ B.17. Profiler Counter Function(略過)
// device_functions.h
// Emits the PTX "pmevent" instruction to increment per-SM profiler hardware
// counter X once per warp. Valid X: 0..7 (counters 8..15 are reserved);
// read back via `nvprof --events prof_trigger_0x`.
#define __prof_trigger(X) asm __volatile__ ("pmevent \t" #X ";")
● 原文:Each multiprocessor has a set of sixteen hardware counters that an application can increment with a single instruction by calling the __prof_trigger() function. Increments by one per warp the per-multiprocessor hardware counter of index counter. Counters 8 to 15 are reserved and should not be used by applications. The value of counters 0, 1, ..., 7 can be obtained via nvprof by nvprof --events prof_trigger_0x where x is 0, 1, ..., 7. All counters are reset before each kernel launch (note that when collecting counters, kernel launches are synchronous as mentioned in Concurrent Execution between Host and Device).
