CPU 矩陣乘法

能相乘的兩個矩陣，必須滿足一個矩陣的行數和第二個矩陣的列數相同.

A(N*P) * B(P*M) = C(N*M).　其中P是行數，N是列數，　從寬高的角度來說，即 A的寬度和B的高度是相同的．C矩陣 = ha * wb.

其中C(i,j) = A矩陣中的i行和B矩陣中的j列進行點乘得到該點的值．

//C = A*B
void MatrixMulCPU(float* _C,const float *_A,const float *_B,int _wa,int _ha,int _wb)
{
    float sum = 0;
    for (int i = 0; i < _ha; ++i)
    {
        for (int j = 0; j < _wb; ++j)
        {
            sum = 0;
            for (int k = 0; k < _wa; ++k)
            {
　　　　　　　//i*_wa得到當前對應的是A矩陣中的哪一行，+k對應當前行的哪一列．矩陣Ａ的stride是wa
　　　　　　　//j對應當前矩陣Ｂ的哪一列，＋k*wb依次表示第０行的j列,第１行的j列...第wa-1行的j列．矩陣Ｂ的stride是wb
                sum += (float)_A[i*_wa+k]*(float)_B[k*_wb+ j];
            }
            _C[i*_wb+j] = (float)sum;
        }
    }
}

簡單矩陣乘法

C(i,j) = sum { A(i,k)*B(k,j) } 0<=k < _wa;耦合程度很小，所以我們可以通過划分區域的方法，讓每個線程負責一個區域。
怎么划分呢？首先最初的想法是讓每一個線程計算一個C(i,j)，那么估算一下，應該需要height_c*width_c，也就是ha*wb個線程。進一步，我們將矩陣按一個大方格Block划分，如果一個方格Block大小是16*16，那么矩陣80*48的可以表示為5(*16) * 3(*16)，即5*3個大格子(Grid)，所以grid的划分自然就是(height_c/16) *(width_c/16)個線程了。

好了，划分完后，內核代碼如下：　這個kernel的代碼只是把外層兩個循環變成

計算版本0：

__global__ void matrix_kernel_0(float* _C,const float* _A,const float *_B,int _wa,int _wb)
{
    float sum = 0;
    //找出該線程所在的行列
    int row = blockIdx.y*blockDim.y + threadIdx.y;  // X 對應矩陣row, Y對應舉證col
    int col = blockIdx.x*blockDim.x + threadIdx.x;

    //線程Thread(row,col)負責計算C(row,col)
    for (int i = 0; i < _wa; ++i)
    {
        sum += _A[row*_wa + i]*_B[i*_wb + col];
    }
    _C[row*_wb + col] = sum;
}

這個是Heterogeneous Parallel Programming　lab3:Basic Matrix Matrix Multiplication的代碼：

#include <wb.h>

#define wbCheck(stmt)                                                          \
  do {                                                                         \
    cudaError_t err = stmt;                                                    \
    if (err != cudaSuccess) {                                                  \
      wbLog(ERROR, "Failed to run stmt ", #stmt);                              \
      wbLog(ERROR, "Got CUDA error ...  ", cudaGetErrorString(err));           \
      return -1;                                                               \
    }                                                                          \
  } while (0)


#if 0 //This is C verison matrixMUl
//C = A*B
void MatrixMulCPU(float* _C,const float *_A,const float *_B,int _wa,int _ha,int _wb)
{
    float sum = 0;
    for (int i = 0; i < _ha; ++i)
    {
        for (int j = 0; j < _wb; ++j)
        {
            sum = 0;
            for (int k = 0; k < _wa; ++k)
            {
                sum += (float)_A[i*_wa+k]*(float)_B[k*_wb+ j];
            }
            _C[i*_wb+j] = (float)sum;
        }
    }
}
#endif


// Compute C = A * B , Matrix C = hA * wB = rowA * columnB
__global__ void matrixMultiply(float *A, float *B, float *C, int numARows,
                               int numAColumns, int numBRows, int numBColumns,
                               int numCRows, int numCColumns) {
  //@@ Insert code to implement matrix multiplication here
     float sum = 0.0f;

    int row = blockIdx.y*blockDim.y + threadIdx.y;
    int col = blockIdx.x*blockDim.x + threadIdx.x;


    if(row < numCRows && col < numCColumns){
        for (int i = 0; i < numAColumns; ++i)
        {
            sum += A[row*numAColumns + i] * B[i*numBColumns + col];
        }
        C[row*numBColumns + col] = sum;
    }
    printf("C = %f\n",C[row*numBColumns + col]);

}


int main(int argc, char **argv) {
  wbArg_t args;
  float *hostA; // The A matrix
  float *hostB; // The B matrix
  float *hostC; // The output C matrix
  float *deviceA;
  float *deviceB;
  float *deviceC;
  int numARows;    // number of rows in the matrix A
  int numAColumns; // number of columns in the matrix A
  int numBRows;    // number of rows in the matrix B
  int numBColumns; // number of columns in the matrix B
  int numCRows;    // number of rows in the matrix C (you have to set this)
  int numCColumns; // number of columns in the matrix C (you have to set this)

  args = wbArg_read(argc, argv);

  wbTime_start(Generic, "Importing data and creating memory on host");
  hostA =
      ( float * )wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
  hostB =
      ( float * )wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
  //@@ Set numCRows and numCColumns
  numCRows = 0;
  numCColumns = 0;
  if(numAColumns != numBRows){
    wbLog(TRACE, "numAColumns != numBRows, Break ");
    return 1;
  }
  numCRows = numARows;
  numCColumns = numBColumns;
  unsigned int A_size = numARows * numAColumns * sizeof(float);
  unsigned int B_size = numBRows * numBColumns * sizeof(float);
  unsigned int C_size = numCRows * numCColumns * sizeof(float);
  //@@ Allocate the hostC matrix
  hostC = ( float * )malloc(C_size);
  wbTime_stop(Generic, "Importing data and creating memory on host");

  wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
  wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);

  wbTime_start(GPU, "Allocating GPU memory.");
  //@@ Allocate GPU memory here
  wbCheck(cudaMalloc((void**)&deviceA, A_size)); 
  wbCheck(cudaMalloc((void**)&deviceB, B_size)); 
  wbCheck(cudaMalloc((void**)&deviceC, C_size)); 
  wbTime_stop(GPU, "Allocating GPU memory.");

  wbTime_start(GPU, "Copying input memory to the GPU.");
  //@@ Copy memory to the GPU here
  wbCheck(cudaMemcpy(deviceA, hostA, A_size, cudaMemcpyHostToDevice));
  wbCheck(cudaMemcpy(deviceB, hostB, B_size, cudaMemcpyHostToDevice));
  wbCheck(cudaMemcpy(deviceC, hostC, C_size, cudaMemcpyHostToDevice));
  wbTime_stop(GPU, "Copying input memory to the GPU.");

  //@@ Initialize the grid and block dimensions here
  dim3 DimGrid((numCColumns-1)/16+1, (numCRows-1)/16+1, 1);
  dim3 DimBlock(16, 16, 1);

  wbTime_start(Compute, "Performing CUDA computation");
  //@@ Launch the GPU Kernel here
  matrixMultiply<<< DimGrid, DimBlock >>>(deviceA, deviceB, deviceC, numARows, numAColumns, numBRows, numBColumns, numCRows, numCColumns);
  cudaDeviceSynchronize();
  wbTime_stop(Compute, "Performing CUDA computation");

  wbTime_start(Copy, "Copying output memory to the CPU");
  //@@ Copy the GPU memory back to the CPU here
  //@@ Copy the GPU memory back to the CPU here
  wbCheck(cudaMemcpy(hostC, deviceC, C_size, cudaMemcpyDeviceToHost));

  wbTime_stop(Copy, "Copying output memory to the CPU");

  wbTime_start(GPU, "Freeing GPU Memory");
  //@@ Free the GPU memory here
  wbCheck(cudaFree(deviceA));
  wbCheck(cudaFree(deviceB));
  wbCheck(cudaFree(deviceC));

  wbTime_stop(GPU, "Freeing GPU Memory");

  wbSolution(args, hostC, numCRows, numCColumns);

  free(hostA);
  free(hostB);
  free(hostC);

  return 0;
}

View Code

使用tile來划分矩陣乘法

另外一種思路，我們不讓每一個線程完整計算一個C(i,j)，通過C(i,j) = sum { A(i,k)*B(k,j) }發現，我們還可以再細度划分：
Csub(i,j) = sum{A(i,ksub+offsetA)*B(ksub+offsetB,j)} 0<=ksub < blockSize
C(i,j) = sum{Csub(i,j)}
就是把矩陣分成n*n個大的子塊，然后每一個block負責計算子塊i 和子塊j的子乘積，計算完畢后加起來則可。這里主要引入shared Memory來提高程序效率．

計算矩陣我們

__global__ void matrix_kernel_1(float* _C,const float* _A,const float *_B,int _wa,int _wb) //_wa是A矩陣的寬度，_wb是矩陣B的寬度
{
    int bx = blockIdx.x; //Block X的當前位置
    int by = blockIdx.y;　//Block　y的當前位置
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    //該block要處理的A ,A的取值方向是X軸方向， B的取值方向是Y軸方向
    int aBegin = _wa*(by*BLOCK_SIZE);//A(0,by) //在矩陣A上每個block的首地址
    int aEnd = aBegin + _wa - 1; //
    int aStep = BLOCK_SIZE;//offsetA  //因為A是橫向取值，所以step是blocksize

    int bBegin = BLOCK_SIZE*bx;//B(bx,0) //矩陣B的首地址
    int bStep = BLOCK_SIZE*_wb;//offsetB //因為B是縱向取值，所以step是blocksize*_wb.
    
    float cSub = 0;
//每一個線程計算一個像素點，分成wa/block 次來計算，每次計算一段A(sub) * B(sub)，最后累加得到C的結果．
//假設矩陣都是n*n的，那么舊的basicMatrix每個線程都需要執行2n次globalMemory的訪問，這里用到sharedMemory只需要執行2n／blocksize，每個線程可以提高blocksize倍，
／／每個block里面的thread都是通過讀取sharedMemory來執行計算的，速度會非常快．
    for (int a = aBegin,b = bBegin; a <= aEnd; a += aStep,b += bStep)
    {
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
        //每個線程負責一個元素拷貝,我們以block為單位來分析。假設blocksize=16, 一個block里面有 16*16個線程。
        //每個block 可以填滿需要用到的 As, 和Bs大小的矩陣。這里就是矩陣A里面的16*16的數據可以填滿，保存在sharedMemory中。同樣B矩陣也是。
        As[ty][tx] = _A[a + _wa*ty + tx];   
        Bs[ty][tx] = _B[b + _wb*ty + tx];

        __syncthreads();　//同步使得矩陣Ａ，和矩陣Ｂ的第一個tile*tile的數據保存在As和Bs里，供下面的計算使用．
        
        //每個線程負責計算一個子塊i 和 子塊j的子乘積寬度是block_size,執行到wa/block_size次，累加可得到Ｃ的值
        for (int k = 0; k < BLOCK_SIZE; ++k)
        {
            cSub += As[ty][k]*Bs[k][tx];　
        }

        __syncthreads();
    }

    //全局地址，向全局寄存器寫回去
    //一個線程負責一個元素，一個block負責一個子塊
    int cIndex = (by*BLOCK_SIZE + ty)*_wb + (bx*BLOCK_SIZE + tx);
    _C[cIndex] = cSub;
}

二維矩陣的索引問題:

假設有一個32*48的矩陣，x的范圍[0,47], y的范圍是[0,31]。在代碼中是以一個二維數組保存，內存是連續的。目前我們要找到point(23,23)的索引：

一維指針指向的應該是: 23*48 + 23.

但是我們同樣也可以用grid 和block 來划分這個矩陣:

grid(3,2) (2是列，即X維度，2是行，Y維度), block(16,16) 。 grid X的范圍是是[0,2], Y的范圍是[0,1]. 同理適用於block 我們也可以用下面的方式找到point(23,23)的索引：

point(23,23) 對應grid的坐標是（1,1）對應的block坐標是(7,7)。block的寬度是16。

point.y = (by*Block_size + ty) = 1*16+7 =23

point.x = (bx*Block_size + tx)= 1*16+7 =23

point(x,y)的索引位置是： y * width + x

這個是Heterogeneous Parallel Programming　lab4:Basic Matrix Matrix Multiplication的代碼：

#include <wb.h>

#define wbCheck(stmt)                                                          \
  do {                                                                         \
    cudaError_t err = stmt;                                                    \
    if (err != cudaSuccess) {                                                  \
      wbLog(ERROR, "Failed to run stmt ", #stmt);                              \
      wbLog(ERROR, "Got CUDA error ...  ", cudaGetErrorString(err));           \
      return -1;                                                               \
    }                                                                          \
  } while (0)

#define TILE_WIDTH 32  //block size ,each thread to calucate each block

// Compute C = A * B
__global__ void matrixMultiplyShared(float *A, float *B, float *C, int numARows,
                                     int numAColumns, int numBRows,
                                     int numBColumns, int numCRows,
                                     int numCColumns) {
  //@@ Insert code to implement matrix multiplication here
  //@@ You have to use shared memory for this MP
    
    __shared__ float sharedM[TILE_WIDTH][TILE_WIDTH];
    __shared__ float sharedN[TILE_WIDTH][TILE_WIDTH];
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int row = by*TILE_WIDTH + ty;
    int col = bx*TILE_WIDTH + tx;
    float v = 0.0;

    for (int i = 0; i < (int)(ceil((float)numAColumns/TILE_WIDTH)); i++)
    {
        if (i*TILE_WIDTH + tx < numAColumns && row < numARows)
            sharedM[ty][tx] = A[row*numAColumns + i*TILE_WIDTH + tx];
        else
            sharedM[ty][tx] = 0.0;

        if (i*TILE_WIDTH + ty < numBRows && col < numBColumns)
            sharedN[ty][tx] = B[(i*TILE_WIDTH + ty)*numBColumns + col];
        else
            sharedN[ty][tx] = 0.0;
        __syncthreads();

        for(int j = 0; j < TILE_WIDTH; j++)
            v += sharedM[ty][j] * sharedN[j][tx];
        __syncthreads();
    }

    if (row < numCRows && col < numCColumns)
        C[row*numCColumns + col] = v;
    
}

int main(int argc, char **argv) {
  wbArg_t args;
  float *hostA; // The A matrix
  float *hostB; // The B matrix
  float *hostC; // The output C matrix
  float *deviceA;
  float *deviceB;
  float *deviceC;
  int numARows;    // number of rows in the matrix A
  int numAColumns; // number of columns in the matrix A
  int numBRows;    // number of rows in the matrix B
  int numBColumns; // number of columns in the matrix B
  int numCRows;    // number of rows in the matrix C (you have to set this)
  int numCColumns; // number of columns in the matrix C (you have to set this)

  args = wbArg_read(argc, argv);

  wbTime_start(Generic, "Importing data and creating memory on host");
  hostA =
      ( float * )wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
  hostB =
      ( float * )wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
  //@@ Set numCRows and numCColumns
  numCRows = 0;
  numCColumns = 0;
  if(numAColumns != numBRows){
    wbLog(TRACE, "numAColumns != numBRows, Break ");
    return 1;
  }
  numCRows = numARows;
  numCColumns = numBColumns;
  unsigned int A_size = numARows * numAColumns * sizeof(float);
  unsigned int B_size = numBRows * numBColumns * sizeof(float);
  unsigned int C_size = numCRows * numCColumns * sizeof(float);
  //@@ Allocate the hostC matrix
  hostC = ( float * )malloc(C_size);
  wbTime_stop(Generic, "Importing data and creating memory on host");

  wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
  wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);

  wbTime_start(GPU, "Allocating GPU memory.");
  //@@ Allocate GPU memory here
  wbCheck(cudaMalloc((void**)&deviceA, A_size)); 
  wbCheck(cudaMalloc((void**)&deviceB, B_size)); 
  wbCheck(cudaMalloc((void**)&deviceC, C_size)); 
  wbTime_stop(GPU, "Allocating GPU memory.");

  wbTime_start(GPU, "Copying input memory to the GPU.");
  //@@ Copy memory to the GPU here
  wbCheck(cudaMemcpy(deviceA, hostA, A_size, cudaMemcpyHostToDevice));
  wbCheck(cudaMemcpy(deviceB, hostB, B_size, cudaMemcpyHostToDevice));
  wbCheck(cudaMemcpy(deviceC, hostC, C_size, cudaMemcpyHostToDevice));
  wbTime_stop(GPU, "Copying input memory to the GPU.");

  //@@ Initialize the grid and block dimensions here
  dim3 DimGrid(ceil(numCColumns / 32.0), ceil(numCRows / 32.0), 1);
  dim3 DimBlock(TILE_WIDTH, TILE_WIDTH, 1);

  wbTime_start(Compute, "Performing CUDA computation");
  //@@ Launch the GPU Kernel here
  matrixMultiplyShared<<< DimGrid, DimBlock >>>(deviceA, deviceB, deviceC, numARows, numAColumns, numBRows, numBColumns, numCRows, numCColumns);
  cudaDeviceSynchronize();
  wbTime_stop(Compute, "Performing CUDA computation");

  wbTime_start(Copy, "Copying output memory to the CPU");
  //@@ Copy the GPU memory back to the CPU here
  //@@ Copy the GPU memory back to the CPU here
  wbCheck(cudaMemcpy(hostC, deviceC, C_size, cudaMemcpyDeviceToHost));

  wbTime_stop(Copy, "Copying output memory to the CPU");

  wbTime_start(GPU, "Freeing GPU Memory");
  //@@ Free the GPU memory here
  wbCheck(cudaFree(deviceA));
  wbCheck(cudaFree(deviceB));
  wbCheck(cudaFree(deviceC));

  wbTime_stop(GPU, "Freeing GPU Memory");

  wbSolution(args, hostC, numCRows, numCColumns);

  free(hostA);
  free(hostB);
  free(hostC);

  return 0;
}

View Code

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 CUDA 矩陣乘法終極優化指南 cuda中用cublas庫做矩陣乘法矩陣乘法矩陣乘法矩陣乘法與矩陣加速矩陣乘法以及矩陣的逆，轉置矩陣系列之一：矩陣乘法的意義 4*4矩陣乘法終於搞定矩陣乘法的理解（轉）理解矩陣乘法