問題:對於使用GPU計算時,都想知道kernel函數運行所耗費的時間,使用CUDA內置的API可以方便准確的獲得kernel運行時間。
在CPU上,可以使用clock()函數和GetTickCount()函數計時。
clock_t start, end; start = clock(); //執行步驟;
......
end = clock(); printf(" time (CPU) : %ld ms(毫秒) \n", (long)(end - start) * 1000 / CLOCKS_PER_SEC); // 注意:clock_t 是整數類型,必須換算成毫秒並用 %ld 打印,不能用 %f
int startTime, endTime; // 開始時間 startTime = GetTickCount(); //執行步驟;
......
endTime = GetTickCount(); cout << " 總時間為 : " << (double)(endTime - startTime)<< " ms " << endl;
對於CUDA核函數計時使用clock()或GetTickCount()函數結果不准確,計算歸約求和的例子如下:
//CPU計時 clock_t start, end; start = clock(); d_SharedMemoryTest << < NThreadX, ThreadX >> > (S_Para, MX); //調用核函數(M個包含N個線程的線程塊) cudaDeviceSynchronize(); end = clock(); clock_t time = end - start; printf(" time (GPU) : %f ms \n", (double)time * 1000.0 / CLOCKS_PER_SEC); // 必須換算並轉成 double,直接用 %f 打印 clock_t 是未定義行為
結果為0.000000 ms(明顯結果錯誤)。原因有二:一是 clock_t 是整數類型,用 %f 打印整數屬於未定義行為;二是 clock() 的計時粒度較粗,不適合測量毫秒級的核函數執行時間:

而使用CUDA內置API(cudaEvent_t)計時,主要代碼如下
//GPU計時 cudaEvent_t startTime, endTime; cudaEventCreate(&startTime); cudaEventCreate(&endTime); cudaEventRecord(startTime, 0); d_SharedMemoryTest << < NThreadX, ThreadX >> > (S_Para, MX); //調用核函數(M個包含N個線程的線程塊) cudaEventRecord(endTime, 0); cudaEventSynchronize(startTime); cudaEventSynchronize(endTime); float time; cudaEventElapsedTime(&time, startTime, endTime); printf(" time (GPU) : %f ms \n", time); cudaEventDestroy(startTime); cudaEventDestroy(endTime);
結果為39.848801 ms:

最后附上全部代碼:
#pragma once
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <iostream>
#include <cstdio>
#include <cstdlib>
using namespace std;

// Check every CUDA runtime call: kernels fail silently and an earlier sticky
// error makes every later call fail mysteriously if return codes are ignored.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            std::abort();                                                     \
        }                                                                     \
    } while (0)

const int NX = 10369000;   // total element count of the input array
const int ThreadX = 256;   // threads per block; must be a power of 4 (256 = 4^4) for the radix-4 reduction below

// Per-block radix-4 reduction in shared memory.
// Grid layout: 1-D grid of 1-D blocks of ThreadX threads; MX must be a
// multiple of blockDim.x (the host pads the array up to MX).
// On exit, para[blockIdx.x * blockDim.x] holds the partial sum of that
// block's ThreadX elements; the host adds the per-block partials together.
__global__ void d_SharedMemoryTest(double *para, int MX)
{
    int i = threadIdx.x;                               // index within the block
    int tid = blockIdx.x * blockDim.x + threadIdx.x;   // global index
    __shared__ double s_Para[ThreadX];                 // one tile of the input, block-sized

    // Stage this block's slice into shared memory; out-of-range lanes
    // contribute 0 so the reduction never reads uninitialized shared memory.
    s_Para[i] = (tid < MX) ? para[tid] : 0.0;
    __syncthreads();   // all loads must complete before any thread reduces

    // Radix-4 tree reduction: each round, every 4th surviving thread folds
    // in its three neighbours. 256 = 4*4*4*4, so 4 rounds reach index 0.
    for (int index = 1; index < blockDim.x; index *= 4)
    {
        if (i % (4 * index) == 0)
            s_Para[i] += s_Para[i + index] + s_Para[i + 2 * index] + s_Para[i + 3 * index];
        // Barrier kept OUTSIDE the divergent branch: __syncthreads() must be
        // reached by every thread of the block (the original placed the whole
        // loop inside `if (tid < MX)`, which only worked because MX happened
        // to be a multiple of blockDim.x).
        __syncthreads();
    }

    // Thread 0 publishes the block's partial sum back to global memory.
    if (i == 0)
        para[blockIdx.x * blockDim.x] = s_Para[0];
}

// Host driver: builds the input, runs the reduction kernel, times it with
// cudaEvent_t, and checks the GPU sum against a CPU reference.
void s_ParallelTest()
{
    double *Para;
    // Unified (managed) memory: addressable from both host and device.
    CUDA_CHECK(cudaMallocManaged((void **)&Para, sizeof(double) * NX));

    double ParaSum = 0;
    for (int i = 0; i < NX; i++)
    {
        Para[i] = 1;         // fill the array
        ParaSum += Para[i];  // CPU reference sum
    }
    cout << " CPU result = " << ParaSum << endl;

    int Blocks = (NX + ThreadX - 1) / ThreadX;   // ceil-div: NX is not a multiple of ThreadX
    cout << " block size : " << ThreadX << "  block count : " << Blocks << endl;

    double *S_Para;
    int MX = ThreadX * Blocks;                   // padded length, multiple of the block size
    CUDA_CHECK(cudaMallocManaged(&S_Para, sizeof(double) * MX));
    // Copy the input and ZERO the padded tail [NX, MX). The original left the
    // tail uninitialized, so the last block reduced garbage into its partial sum.
    for (int i = 0; i < MX; i++)
        S_Para[i] = (i < NX) ? Para[i] : 0.0;

    // GPU timing with CUDA events: clock()/GetTickCount() cannot time an
    // asynchronous kernel launch; events are recorded in-stream on the device.
    cudaEvent_t startTime, endTime;
    CUDA_CHECK(cudaEventCreate(&startTime));
    CUDA_CHECK(cudaEventCreate(&endTime));
    CUDA_CHECK(cudaEventRecord(startTime, 0));

    d_SharedMemoryTest << < Blocks, ThreadX >> > (S_Para, MX);
    CUDA_CHECK(cudaGetLastError());              // catch launch-configuration errors

    CUDA_CHECK(cudaEventRecord(endTime, 0));
    CUDA_CHECK(cudaEventSynchronize(endTime));   // blocks until the kernel and the end event complete
    float time;
    CUDA_CHECK(cudaEventElapsedTime(&time, startTime, endTime));
    printf(" time (GPU) : %f ms \n", time);
    CUDA_CHECK(cudaEventDestroy(startTime));
    CUDA_CHECK(cudaEventDestroy(endTime));

    // BUG FIX: d_ParaSum was uninitialized in the original (`double d_ParaSum;`),
    // so the accumulated GPU result started from garbage.
    double d_ParaSum = 0;
    for (int i = 0; i < Blocks; i++)
        d_ParaSum += S_Para[i * ThreadX];        // sum the per-block partials
    cout << " GPU result = " << d_ParaSum << endl;

    // Release managed allocations (the original leaked both).
    CUDA_CHECK(cudaFree(S_Para));
    CUDA_CHECK(cudaFree(Para));
}

int main()
{
    s_ParallelTest();
    system("pause");
    return 0;
}
