Since I have only just started learning CUDA, I have not yet organized things into a complete CUDA class; I simply modified the kernel skeleton that NVIDIA provides.
But it is more than enough for a first taste of the benefits the GPU brings us.
Here is the code:
/* Matrix multiplication: CPU version vs. GPU versions */
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <Windows.h>

// Selects which GPU variant to run
enum Type
{
    Mode1 = 1, // Mode 1: assign one thread to each element C[i][j]
    Mode2 = 2  // Mode 2: instead of having one thread compute a whole C[i][j], note that
               // C(i,j) = sum{ A(i,k) * B(k,j) } can be subdivided further:
               //   Csub(i,j) = sum{ A(i, ksub + offsetA) * B(ksub + offsetB, j) }, 0 <= ksub < blockSize
               //   C(i,j)    = sum{ Csub(i,j) }
               // That is, partition the matrices into n*n large sub-blocks; each block computes the
               // partial product of sub-block i and sub-block j, and the partial results are summed.
               // Shared memory is the main optimization used here.
};

cudaError_t addWithCuda(float *c, const float *a, const float *b, unsigned int WA, unsigned int HA, unsigned int WB, unsigned int HB, Type mode);

__global__ void MatrixMulGPU_1(float *c, const float *a, const float *b, unsigned int WA, unsigned int WB)
{
    float sum = 0;
    // Locate the row and column this thread is responsible for
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Thread(row, col) computes C(row, col); the dot product runs over WA,
    // the width of A (which must equal the height of B)
    for (int i = 0; i < WA; ++i)
    {
        sum += a[row * WA + i] * b[i * WB + col];
    }
    c[row * WB + col] = sum;
}

template<int BLOCK_SIZE>
__global__ void MatrixMulGPU_2(float *c, const float *a, const float *b, unsigned int WA, unsigned int WB)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    int aBegin = WA * BLOCK_SIZE * by;

    // Index of the last sub-matrix of A processed by the block
    int aEnd = aBegin + WA - 1;

    // Step size used to iterate through the sub-matrices of A
    int aStep = BLOCK_SIZE;

    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;

    // Step size used to iterate through the sub-matrices of B
    int bStep = BLOCK_SIZE * WB;

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int i = aBegin, j = bBegin; i <= aEnd; i += aStep, j += bStep)
    {
        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Load the matrices from device memory to shared memory;
        // each thread loads one element of each matrix
        As[ty][tx] = a[i + WA * ty + tx];
        Bs[ty][tx] = b[j + WB * ty + tx];

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element of the block sub-matrix
#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k)
        {
            Csub += As[ty][k] * Bs[k][tx];
        }

        // Synchronize to make sure that the preceding computation is done
        // before loading two new sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int k = WB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    c[k + WB * ty + tx] = Csub;
}

// CPU version
void MatrixMulCPU(float *_C, const float *_A, const float *_B, int WA, int HA, int WB, int HB)
{
    if (WA != HB)
    {
        printf("the matrix A and B cannot be multiplied!");
        exit(1);
    }

    for (int i = 0; i < HA; ++i)
    {
        for (int j = 0; j < WB; ++j)
        {
            for (int k = 0; k < WA; ++k)
            {
                // C is HA x WB, so each row of C is WB elements wide
                _C[i * WB + j] += _A[i * WA + k] * _B[k * WB + j];
            }
        }
    }
}

// Fill a matrix with random initial values
void randomInit(float *_data, int _size)
{
    for (int i = 0; i < _size; ++i)
    {
        _data[i] = rand() / (float)RAND_MAX * 100;
    }
}

// Print the matrix
void printMatrix(float *m_Matrix, int W, int H)
{
    for (int i = 0; i < W * H; ++i)
    {
        printf("%2.1f ", m_Matrix[i]);
        if ((i + 1) % W == 0)
            printf("\n");
    }
    printf("\n");
}

// Compare two result matrices. A small relative tolerance is used instead of
// exact equality: the CPU and GPU accumulate the floats in different orders,
// so the results will not be bit-identical.
bool CheckAnswer(const float *_C, const float *_D, unsigned int size)
{
    bool isRight = true;
    for (int i = 0; i < size && isRight; ++i)
    {
        if (fabs(_C[i] - _D[i]) > 1e-3f * fabs(_C[i]))
            isRight = false;
    }
    return isRight;
}

int main()
{
    const int width_A = 1024;
    const int height_A = 1024;
    const int width_B = 1024;
    const int height_B = 1024;

    float *A = (float *)malloc(sizeof(float) * height_A * width_A);
    float *B = (float *)malloc(sizeof(float) * height_B * width_B);
    float *C = (float *)malloc(sizeof(float) * height_A * width_B);
    float *D = (float *)malloc(sizeof(float) * height_A * width_B);
    float *E = (float *)malloc(sizeof(float) * height_A * width_B);

    memset(A, 0, sizeof(float) * height_A * width_A);
    memset(B, 0, sizeof(float) * height_B * width_B);
    memset(C, 0, sizeof(float) * height_A * width_B);
    memset(D, 0, sizeof(float) * height_A * width_B);
    memset(E, 0, sizeof(float) * height_A * width_B);

    // Seed the random number generator
    srand((unsigned)time(0));
    randomInit(B, height_B * width_B);
    randomInit(A, height_A * width_A);
    //printMatrix(B, width_B, height_B);
    //printMatrix(A, width_A, height_A);

    // CPU computation
    unsigned int tick1 = GetTickCount();
    MatrixMulCPU(C, A, B, width_A, height_A, width_B, height_B);
    printf("CPU use time : %dms\n", GetTickCount() - tick1);

    // GPU computation
    Type m_Mode = Mode1;
    unsigned int tick2 = GetTickCount();
    cudaError_t cudaStatus = addWithCuda(D, A, B, width_A, height_A, width_B, height_B, m_Mode);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "addWithCuda failed!\n");
        return 1;
    }
    printf("GPU mode1 use time : %dms\n", GetTickCount() - tick2);

    m_Mode = Mode2;
    unsigned int tick3 = GetTickCount();
    cudaStatus = addWithCuda(E, A, B, width_A, height_A, width_B, height_B, m_Mode);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "addWithCuda failed!\n");
        return 1;
    }
    printf("GPU mode2 use time : %dms\n", GetTickCount() - tick3);

    // Check whether the GPU results match the CPU result; the answer is wrong
    // if either GPU version disagrees with the CPU version
    if (!CheckAnswer(C, D, height_A * width_B) || !CheckAnswer(C, E, height_A * width_B))
        printf("The answer is wrong!");
    else
        printf("The answer is right!");

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    free(A);
    free(B);
    free(C);
    free(D);
    free(E);

    return 0;
}

// Helper function that runs the selected matrix-multiplication kernel on the GPU.
cudaError_t addWithCuda(float *c, const float *a, const float *b, unsigned int WA, unsigned int HA, unsigned int WB, unsigned int HB, Type mode)
{
    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for the three matrices (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, HA * WB * sizeof(float));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, HA * WA * sizeof(float));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, HB * WB * sizeof(float));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy the input matrices from host memory to the GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, HA * WA * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, HB * WB * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // One thread computes each C[i][j]; the grid assumes WB and HA are
    // multiples of block_size (true here for 1024 x 1024 with block_size = 16)
    int block_size = 16;
    dim3 Threads(block_size, block_size);
    dim3 Blocks(WB / block_size, HA / block_size);

    // Launch the selected kernel on the GPU.
    if (mode == Mode1)
    {
        MatrixMulGPU_1<<<Blocks, Threads>>>(dev_c, dev_a, dev_b, WA, WB);
    }
    if (mode == Mode2)
    {
        MatrixMulGPU_2<16><<<Blocks, Threads>>>(dev_c, dev_a, dev_b, WA, WB);
    }

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching the kernel!\n", cudaStatus);
        goto Error;
    }

    // Copy the output matrix from the GPU buffer back to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, HA * WB * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
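For reference, assuming the listing is saved as matrixMul.cu (my own file name), it should build with a plain nvcc invocation like the one below; the Windows.h / GetTickCount dependency means this particular listing targets Windows.

nvcc -O2 matrixMul.cu -o matrixMul.exe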
In total, the code exercises the CPU computation and the two GPU variants; the results of a run are as follows:
As can clearly be seen, the GPU runs much faster than the CPU, and the more finely the work is subdivided, the faster it runs.
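One caveat on these numbers: GetTickCount times the whole addWithCuda call, including device allocation and the host-device copies, and its resolution is coarse. To time just the kernel, CUDA events can be used instead; a minimal sketch (variable names are my own) wrapped around the Mode 1 launch inside addWithCuda might look like this:

// Hypothetical kernel-only timing with CUDA events; this would replace the
// GetTickCount bracketing used in main for a per-kernel measurement.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);
MatrixMulGPU_1<<<Blocks, Threads>>>(dev_c, dev_a, dev_b, WA, WB);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);   // wait until the kernel and the stop event complete

float elapsed_ms = 0.0f;
cudaEventElapsedTime(&elapsed_ms, start, stop);  // elapsed time between the two events
printf("kernel-only time : %.3f ms\n", elapsed_ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);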
Going forward, I would like to continue optimizing through more techniques (for example, texture binding).
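As a rough idea of what that could look like, here is a sketch only, using the legacy texture-reference API (deprecated and later removed in newer CUDA toolkits); texA, texB, and MatrixMulGPU_tex are my own names, not part of the code above. The inputs are bound to 1D texture references on the host and read through the texture cache in the kernel with tex1Dfetch.

// Sketch: read A and B through the read-only texture cache.
texture<float, 1, cudaReadModeElementType> texA;
texture<float, 1, cudaReadModeElementType> texB;

__global__ void MatrixMulGPU_tex(float *c, unsigned int WA, unsigned int WB)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    float sum = 0.0f;
    for (int i = 0; i < WA; ++i)
    {
        // Each fetch goes through the texture cache instead of plain global loads
        sum += tex1Dfetch(texA, row * WA + i) * tex1Dfetch(texB, i * WB + col);
    }
    c[row * WB + col] = sum;
}

// Host side, after the cudaMalloc/cudaMemcpy calls in addWithCuda:
//   cudaBindTexture(0, texA, dev_a, HA * WA * sizeof(float));
//   cudaBindTexture(0, texB, dev_b, HB * WB * sizeof(float));
//   MatrixMulGPU_tex<<<Blocks, Threads>>>(dev_c, WA, WB);
//   cudaUnbindTexture(texA);
//   cudaUnbindTexture(texB);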