CPU 矩陣乘法
能相乘的兩個矩陣,必須滿足一個矩陣的行數和第二個矩陣的列數相同.
A(N*P) * B(P*M) = C(N*M). 其中P是行數,N是列數, 從寬高的角度來說,即 A的寬度和B的高度是相同的.C矩陣 = ha * wb.
//C = A*B void MatrixMulCPU(float* _C,const float *_A,const float *_B,int _wa,int _ha,int _wb) { float sum = 0; for (int i = 0; i < _ha; ++i) { for (int j = 0; j < _wb; ++j) { sum = 0; for (int k = 0; k < _wa; ++k) {
//i*_wa得到當前對應的是A矩陣中的哪一行,+k對應當前行的哪一列.矩陣A的stride是wa
//j對應當前矩陣B的哪一列,+k*wb依次表示第0行的j列,第1行的j列...第wa-1行的j列.矩陣B的stride是wb sum += (float)_A[i*_wa+k]*(float)_B[k*_wb+ j]; } _C[i*_wb+j] = (float)sum; } } }
簡單矩陣乘法
怎么划分呢?首先最初的想法是 讓每一個線程計算一個C(i,j),那么估算一下,應該需要height_c*width_c,也就是ha*wb個線程。進一步,我們將矩陣按一個大方格Block划分,如果一個方格Block大小是16*16,那么矩陣80*48的可以表示為5(*16) * 3(*16),即5*3個大格子(Grid),所以grid的划分自然就是(height_c/16) *(width_c/16)個線程了。
計算版本0:
__global__ void matrix_kernel_0(float* _C,const float* _A,const float *_B,int _wa,int _wb) { float sum = 0; //找出該線程所在的行列 int row = blockIdx.y*blockDim.y + threadIdx.y; // X 對應矩陣row, Y對應舉證col int col = blockIdx.x*blockDim.x + threadIdx.x; //線程Thread(row,col)負責計算C(row,col) for (int i = 0; i < _wa; ++i) { sum += _A[row*_wa + i]*_B[i*_wb + col]; } _C[row*_wb + col] = sum; }
這個是Heterogeneous Parallel Programming lab3:Basic Matrix Matrix Multiplication的代碼:

#include <wb.h> #define wbCheck(stmt) \ do { \ cudaError_t err = stmt; \ if (err != cudaSuccess) { \ wbLog(ERROR, "Failed to run stmt ", #stmt); \ wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \ return -1; \ } \ } while (0) #if 0 //This is C verison matrixMUl //C = A*B void MatrixMulCPU(float* _C,const float *_A,const float *_B,int _wa,int _ha,int _wb) { float sum = 0; for (int i = 0; i < _ha; ++i) { for (int j = 0; j < _wb; ++j) { sum = 0; for (int k = 0; k < _wa; ++k) { sum += (float)_A[i*_wa+k]*(float)_B[k*_wb+ j]; } _C[i*_wb+j] = (float)sum; } } } #endif // Compute C = A * B , Matrix C = hA * wB = rowA * columnB __global__ void matrixMultiply(float *A, float *B, float *C, int numARows, int numAColumns, int numBRows, int numBColumns, int numCRows, int numCColumns) { //@@ Insert code to implement matrix multiplication here float sum = 0.0f; int row = blockIdx.y*blockDim.y + threadIdx.y; int col = blockIdx.x*blockDim.x + threadIdx.x; if(row < numCRows && col < numCColumns){ for (int i = 0; i < numAColumns; ++i) { sum += A[row*numAColumns + i] * B[i*numBColumns + col]; } C[row*numBColumns + col] = sum; } printf("C = %f\n",C[row*numBColumns + col]); } int main(int argc, char **argv) { wbArg_t args; float *hostA; // The A matrix float *hostB; // The B matrix float *hostC; // The output C matrix float *deviceA; float *deviceB; float *deviceC; int numARows; // number of rows in the matrix A int numAColumns; // number of columns in the matrix A int numBRows; // number of rows in the matrix B int numBColumns; // number of columns in the matrix B int numCRows; // number of rows in the matrix C (you have to set this) int numCColumns; // number of columns in the matrix C (you have to set this) args = wbArg_read(argc, argv); wbTime_start(Generic, "Importing data and creating memory on host"); hostA = ( float * )wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns); hostB = ( float * )wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns); //@@ Set numCRows and numCColumns numCRows = 0; numCColumns = 0; if(numAColumns != numBRows){ wbLog(TRACE, "numAColumns != numBRows, Break "); return 1; } numCRows = numARows; numCColumns = numBColumns; unsigned int A_size = numARows * numAColumns * sizeof(float); unsigned int B_size = numBRows * numBColumns * sizeof(float); unsigned int C_size = numCRows * numCColumns * sizeof(float); //@@ Allocate the hostC matrix hostC = ( float * )malloc(C_size); wbTime_stop(Generic, "Importing data and creating memory on host"); wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns); wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns); wbTime_start(GPU, "Allocating GPU memory."); //@@ Allocate GPU memory here wbCheck(cudaMalloc((void**)&deviceA, A_size)); wbCheck(cudaMalloc((void**)&deviceB, B_size)); wbCheck(cudaMalloc((void**)&deviceC, C_size)); wbTime_stop(GPU, "Allocating GPU memory."); wbTime_start(GPU, "Copying input memory to the GPU."); //@@ Copy memory to the GPU here wbCheck(cudaMemcpy(deviceA, hostA, A_size, cudaMemcpyHostToDevice)); wbCheck(cudaMemcpy(deviceB, hostB, B_size, cudaMemcpyHostToDevice)); wbCheck(cudaMemcpy(deviceC, hostC, C_size, cudaMemcpyHostToDevice)); wbTime_stop(GPU, "Copying input memory to the GPU."); //@@ Initialize the grid and block dimensions here dim3 DimGrid((numCColumns-1)/16+1, (numCRows-1)/16+1, 1); dim3 DimBlock(16, 16, 1); wbTime_start(Compute, "Performing CUDA computation"); //@@ Launch the GPU Kernel here matrixMultiply<<< DimGrid, DimBlock >>>(deviceA, deviceB, deviceC, numARows, numAColumns, numBRows, numBColumns, numCRows, numCColumns); cudaDeviceSynchronize(); wbTime_stop(Compute, "Performing CUDA computation"); wbTime_start(Copy, "Copying output memory to the CPU"); //@@ Copy the GPU memory back to the CPU here //@@ Copy the GPU memory back to the CPU here wbCheck(cudaMemcpy(hostC, deviceC, C_size, cudaMemcpyDeviceToHost)); wbTime_stop(Copy, "Copying output memory to the CPU"); wbTime_start(GPU, "Freeing GPU Memory"); //@@ Free the GPU memory here wbCheck(cudaFree(deviceA)); wbCheck(cudaFree(deviceB)); wbCheck(cudaFree(deviceC)); wbTime_stop(GPU, "Freeing GPU Memory"); wbSolution(args, hostC, numCRows, numCColumns); free(hostA); free(hostB); free(hostC); return 0; }
使用tile來划分矩陣乘法
另外一種思路,我們不讓每一個線程完整計算一個C(i,j),通過C(i,j) = sum { A(i,k)*B(k,j) }發現,我們還可以再細度划分:
Csub(i,j) = sum{A(i,ksub+offsetA)*B(ksub+offsetB,j)} 0<=ksub < blockSize
C(i,j) = sum{Csub(i,j)}
就是把矩陣分成n*n個大的子塊,然后每一個block負責計算子塊i 和 子塊j的子乘積,計算完畢后加起來則可。這里主要引入shared Memory來提高程序效率.
計算矩陣我們
__global__ void matrix_kernel_1(float* _C,const float* _A,const float *_B,int _wa,int _wb) //_wa是A矩陣的寬度,_wb是矩陣B的寬度 { int bx = blockIdx.x; //Block X的當前位置 int by = blockIdx.y; //Block y的當前位置 int tx = threadIdx.x; int ty = threadIdx.y; //該block要處理的A ,A的取值方向是X軸方向, B的取值方向是Y軸方向 int aBegin = _wa*(by*BLOCK_SIZE);//A(0,by) //在矩陣A上每個block的首地址 int aEnd = aBegin + _wa - 1; // int aStep = BLOCK_SIZE;//offsetA //因為A是橫向取值,所以step是blocksize int bBegin = BLOCK_SIZE*bx;//B(bx,0) //矩陣B的首地址 int bStep = BLOCK_SIZE*_wb;//offsetB //因為B是縱向取值,所以step是blocksize*_wb. float cSub = 0;
//每一個線程計算一個像素點,分成wa/block 次來計算,每次計算一段A(sub) * B(sub),最后累加得到C的結果.
//假設矩陣都是n*n的,那么舊的basicMatrix每個線程都需要執行2n次globalMemory的訪問,這里用到sharedMemory只需要執行2n/blocksize,每個線程可以提高blocksize倍,
//每個block里面的thread都是通過讀取sharedMemory來執行計算的,速度會非常快. for (int a = aBegin,b = bBegin; a <= aEnd; a += aStep,b += bStep) { __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; //每個線程負責一個元素拷貝,我們以block為單位來分析。假設blocksize=16, 一個block里面有 16*16個線程。
//每個block 可以填滿需要用到的 As, 和Bs大小的矩陣。這里就是矩陣A里面的16*16的數據可以填滿,保存在sharedMemory中。同樣B矩陣也是。 As[ty][tx] = _A[a + _wa*ty + tx]; Bs[ty][tx] = _B[b + _wb*ty + tx]; __syncthreads(); //同步使得矩陣A,和矩陣B的第一個tile*tile的數據保存在As和Bs里,供下面的計算使用. //每個線程負責計算一個子塊i 和 子塊j的子乘積寬度是block_size,執行到wa/block_size次,累加可得到C的值 for (int k = 0; k < BLOCK_SIZE; ++k) { cSub += As[ty][k]*Bs[k][tx]; } __syncthreads(); } //全局地址,向全局寄存器寫回去 //一個線程負責一個元素,一個block負責一個子塊 int cIndex = (by*BLOCK_SIZE + ty)*_wb + (bx*BLOCK_SIZE + tx); _C[cIndex] = cSub; }
二維矩陣的索引問題:
假設有一個32*48的矩陣,x的范圍[0,47], y的范圍是[0,31]。 在代碼中是以一個二維數組保存,內存是連續的。目前我們要找到point(23,23)的索引:
一維指針指向的應該是: 23*48 + 23.
但是我們同樣也可以用grid 和block 來划分這個矩陣:
grid(3,2) (2是列,即X維度,2是行,Y維度), block(16,16) 。 grid X的范圍是是[0,2], Y的范圍是[0,1]. 同理適用於block 我們也可以用下面的方式找到point(23,23)的索引:
point(23,23) 對應grid的坐標是(1,1) 對應的block坐標是(7,7)。block的寬度是16。
point.y = (by*Block_size + ty) = 1*16+7 =23
point.x = (bx*Block_size + tx)= 1*16+7 =23
point(x,y)的索引位置是: y * width + x
這個是Heterogeneous Parallel Programming lab4:Basic Matrix Matrix Multiplication的代碼:

#include <wb.h> #define wbCheck(stmt) \ do { \ cudaError_t err = stmt; \ if (err != cudaSuccess) { \ wbLog(ERROR, "Failed to run stmt ", #stmt); \ wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \ return -1; \ } \ } while (0) #define TILE_WIDTH 32 //block size ,each thread to calucate each block // Compute C = A * B __global__ void matrixMultiplyShared(float *A, float *B, float *C, int numARows, int numAColumns, int numBRows, int numBColumns, int numCRows, int numCColumns) { //@@ Insert code to implement matrix multiplication here //@@ You have to use shared memory for this MP __shared__ float sharedM[TILE_WIDTH][TILE_WIDTH]; __shared__ float sharedN[TILE_WIDTH][TILE_WIDTH]; int bx = blockIdx.x; int by = blockIdx.y; int tx = threadIdx.x; int ty = threadIdx.y; int row = by*TILE_WIDTH + ty; int col = bx*TILE_WIDTH + tx; float v = 0.0; for (int i = 0; i < (int)(ceil((float)numAColumns/TILE_WIDTH)); i++) { if (i*TILE_WIDTH + tx < numAColumns && row < numARows) sharedM[ty][tx] = A[row*numAColumns + i*TILE_WIDTH + tx]; else sharedM[ty][tx] = 0.0; if (i*TILE_WIDTH + ty < numBRows && col < numBColumns) sharedN[ty][tx] = B[(i*TILE_WIDTH + ty)*numBColumns + col]; else sharedN[ty][tx] = 0.0; __syncthreads(); for(int j = 0; j < TILE_WIDTH; j++) v += sharedM[ty][j] * sharedN[j][tx]; __syncthreads(); } if (row < numCRows && col < numCColumns) C[row*numCColumns + col] = v; } int main(int argc, char **argv) { wbArg_t args; float *hostA; // The A matrix float *hostB; // The B matrix float *hostC; // The output C matrix float *deviceA; float *deviceB; float *deviceC; int numARows; // number of rows in the matrix A int numAColumns; // number of columns in the matrix A int numBRows; // number of rows in the matrix B int numBColumns; // number of columns in the matrix B int numCRows; // number of rows in the matrix C (you have to set this) int numCColumns; // number of columns in the matrix C (you have to set this) args = wbArg_read(argc, argv); wbTime_start(Generic, "Importing data and creating memory on host"); hostA = ( float * )wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns); hostB = ( float * )wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns); //@@ Set numCRows and numCColumns numCRows = 0; numCColumns = 0; if(numAColumns != numBRows){ wbLog(TRACE, "numAColumns != numBRows, Break "); return 1; } numCRows = numARows; numCColumns = numBColumns; unsigned int A_size = numARows * numAColumns * sizeof(float); unsigned int B_size = numBRows * numBColumns * sizeof(float); unsigned int C_size = numCRows * numCColumns * sizeof(float); //@@ Allocate the hostC matrix hostC = ( float * )malloc(C_size); wbTime_stop(Generic, "Importing data and creating memory on host"); wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns); wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns); wbTime_start(GPU, "Allocating GPU memory."); //@@ Allocate GPU memory here wbCheck(cudaMalloc((void**)&deviceA, A_size)); wbCheck(cudaMalloc((void**)&deviceB, B_size)); wbCheck(cudaMalloc((void**)&deviceC, C_size)); wbTime_stop(GPU, "Allocating GPU memory."); wbTime_start(GPU, "Copying input memory to the GPU."); //@@ Copy memory to the GPU here wbCheck(cudaMemcpy(deviceA, hostA, A_size, cudaMemcpyHostToDevice)); wbCheck(cudaMemcpy(deviceB, hostB, B_size, cudaMemcpyHostToDevice)); wbCheck(cudaMemcpy(deviceC, hostC, C_size, cudaMemcpyHostToDevice)); wbTime_stop(GPU, "Copying input memory to the GPU."); //@@ Initialize the grid and block dimensions here dim3 DimGrid(ceil(numCColumns / 32.0), ceil(numCRows / 32.0), 1); dim3 DimBlock(TILE_WIDTH, TILE_WIDTH, 1); wbTime_start(Compute, "Performing CUDA computation"); //@@ Launch the GPU Kernel here matrixMultiplyShared<<< DimGrid, DimBlock >>>(deviceA, deviceB, deviceC, numARows, numAColumns, numBRows, numBColumns, numCRows, numCColumns); cudaDeviceSynchronize(); wbTime_stop(Compute, "Performing CUDA computation"); wbTime_start(Copy, "Copying output memory to the CPU"); //@@ Copy the GPU memory back to the CPU here //@@ Copy the GPU memory back to the CPU here wbCheck(cudaMemcpy(hostC, deviceC, C_size, cudaMemcpyDeviceToHost)); wbTime_stop(Copy, "Copying output memory to the CPU"); wbTime_start(GPU, "Freeing GPU Memory"); //@@ Free the GPU memory here wbCheck(cudaFree(deviceA)); wbCheck(cudaFree(deviceB)); wbCheck(cudaFree(deviceC)); wbTime_stop(GPU, "Freeing GPU Memory"); wbSolution(args, hostC, numCRows, numCColumns); free(hostA); free(hostB); free(hostC); return 0; }