Since I have only just started learning CUDA, I have not yet organized things into a complete CUDA class; I simply modified the kernel skeleton that NVIDIA provides.
But it is more than enough for a first taste of the benefits the GPU brings us.
Here is the code:
/* Matrix multiplication: CPU version vs. GPU versions */
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <Windows.h>

// Selects which GPU variant to run
enum Type
{
    Mode1 = 1, // Mode 1: assign one thread to each element C[i][j]
    Mode2 = 2  // Mode 2: instead of having one thread compute a whole C[i][j], note that
               // C(i,j) = sum{ A(i,k) * B(k,j) } can be subdivided further:
               //   Csub(i,j) = sum{ A(i, ksub + offsetA) * B(ksub + offsetB, j) }, 0 <= ksub < blockSize
               //   C(i,j)    = sum{ Csub(i,j) }
               // That is, partition the matrices into n*n large sub-blocks; each block computes the
               // partial product of sub-block i and sub-block j, and the partial results are summed.
               // Shared memory is the main optimization used here.
};

cudaError_t addWithCuda(float *c, const float *a, const float *b, unsigned int WA, unsigned int HA, unsigned int WB, unsigned int HB, Type mode);

__global__ void MatrixMulGPU_1(float *c, const float *a, const float *b, unsigned int WA, unsigned int WB)
{
    float sum = 0;
    // Locate the row and column this thread is responsible for
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Thread(row, col) computes C(row, col); the dot product runs over WA,
    // the width of A (which must equal the height of B)
    for (int i = 0; i < WA; ++i)
    {
        sum += a[row * WA + i] * b[i * WB + col];
    }
    c[row * WB + col] = sum;
}

template<int BLOCK_SIZE>
__global__ void MatrixMulGPU_2(float *c, const float *a, const float *b, unsigned int WA, unsigned int WB)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    int aBegin = WA * BLOCK_SIZE * by;

    // Index of the last sub-matrix of A processed by the block
    int aEnd = aBegin + WA - 1;

    // Step size used to iterate through the sub-matrices of A
    int aStep = BLOCK_SIZE;

    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;

    // Step size used to iterate through the sub-matrices of B
    int bStep = BLOCK_SIZE * WB;

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int i = aBegin, j = bBegin; i <= aEnd; i += aStep, j += bStep)
    {
        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Load the matrices from device memory to shared memory;
        // each thread loads one element of each matrix
        As[ty][tx] = a[i + WA * ty + tx];
        Bs[ty][tx] = b[j + WB * ty + tx];

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element of the block sub-matrix
#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k)
        {
            Csub += As[ty][k] * Bs[k][tx];
        }

        // Synchronize to make sure that the preceding computation is done
        // before loading two new sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int k = WB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    c[k + WB * ty + tx] = Csub;
}

// CPU version
void MatrixMulCPU(float *_C, const float *_A, const float *_B, int WA, int HA, int WB, int HB)
{
    if (WA != HB)
    {
        printf("the matrix A and B cannot be multiplied!");
        exit(1);
    }

    for (int i = 0; i < HA; ++i)
    {
        for (int j = 0; j < WB; ++j)
        {
            for (int k = 0; k < WA; ++k)
            {
                // C is HA x WB, so each row of C is WB elements wide
                _C[i * WB + j] += _A[i * WA + k] * _B[k * WB + j];
            }
        }
    }
}

// Fill a matrix with random initial values
void randomInit(float *_data, int _size)
{
    for (int i = 0; i < _size; ++i)
    {
        _data[i] = rand() / (float)RAND_MAX * 100;
    }
}

// Print the matrix
void printMatrix(float *m_Matrix, int W, int H)
{
    for (int i = 0; i < W * H; ++i)
    {
        printf("%2.1f ", m_Matrix[i]);
        if ((i + 1) % W == 0)
            printf("\n");
    }
    printf("\n");
}

// Compare two result matrices. A small relative tolerance is used instead of
// exact equality: the CPU and GPU accumulate the floats in different orders,
// so the results will not be bit-identical.
bool CheckAnswer(const float *_C, const float *_D, unsigned int size)
{
    bool isRight = true;
    for (int i = 0; i < size && isRight; ++i)
    {
        if (fabs(_C[i] - _D[i]) > 1e-3f * fabs(_C[i]))
            isRight = false;
    }
    return isRight;
}

int main()
{
    const int width_A = 1024;
    const int height_A = 1024;
    const int width_B = 1024;
    const int height_B = 1024;

    float *A = (float *)malloc(sizeof(float) * height_A * width_A);
    float *B = (float *)malloc(sizeof(float) * height_B * width_B);
    float *C = (float *)malloc(sizeof(float) * height_A * width_B);
    float *D = (float *)malloc(sizeof(float) * height_A * width_B);
    float *E = (float *)malloc(sizeof(float) * height_A * width_B);

    memset(A, 0, sizeof(float) * height_A * width_A);
    memset(B, 0, sizeof(float) * height_B * width_B);
    memset(C, 0, sizeof(float) * height_A * width_B);
    memset(D, 0, sizeof(float) * height_A * width_B);
    memset(E, 0, sizeof(float) * height_A * width_B);

    // Seed the random number generator
    srand((unsigned)time(0));
    randomInit(B, height_B * width_B);
    randomInit(A, height_A * width_A);
    //printMatrix(B, width_B, height_B);
    //printMatrix(A, width_A, height_A);

    // CPU computation
    unsigned int tick1 = GetTickCount();
    MatrixMulCPU(C, A, B, width_A, height_A, width_B, height_B);
    printf("CPU use time : %dms\n", GetTickCount() - tick1);

    // GPU computation
    Type m_Mode = Mode1;
    unsigned int tick2 = GetTickCount();
    cudaError_t cudaStatus = addWithCuda(D, A, B, width_A, height_A, width_B, height_B, m_Mode);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "addWithCuda failed!\n");
        return 1;
    }
    printf("GPU mode1 use time : %dms\n", GetTickCount() - tick2);

    m_Mode = Mode2;
    unsigned int tick3 = GetTickCount();
    cudaStatus = addWithCuda(E, A, B, width_A, height_A, width_B, height_B, m_Mode);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "addWithCuda failed!\n");
        return 1;
    }
    printf("GPU mode2 use time : %dms\n", GetTickCount() - tick3);

    // Check whether the GPU results match the CPU result; the answer is wrong
    // if either GPU version disagrees with the CPU version
    if (!CheckAnswer(C, D, height_A * width_B) || !CheckAnswer(C, E, height_A * width_B))
        printf("The answer is wrong!");
    else
        printf("The answer is right!");

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    free(A);
    free(B);
    free(C);
    free(D);
    free(E);

    return 0;
}

// Helper function that runs the selected matrix-multiplication kernel on the GPU.
cudaError_t addWithCuda(float *c, const float *a, const float *b, unsigned int WA, unsigned int HA, unsigned int WB, unsigned int HB, Type mode)
{
    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for the three matrices (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, HA * WB * sizeof(float));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, HA * WA * sizeof(float));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, HB * WB * sizeof(float));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy the input matrices from host memory to the GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, HA * WA * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, HB * WB * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // One thread computes each C[i][j]; the grid assumes WB and HA are
    // multiples of block_size (true here for 1024 x 1024 with block_size = 16)
    int block_size = 16;
    dim3 Threads(block_size, block_size);
    dim3 Blocks(WB / block_size, HA / block_size);

    // Launch the selected kernel on the GPU.
    if (mode == Mode1)
    {
        MatrixMulGPU_1<<<Blocks, Threads>>>(dev_c, dev_a, dev_b, WA, WB);
    }
    if (mode == Mode2)
    {
        MatrixMulGPU_2<16><<<Blocks, Threads>>>(dev_c, dev_a, dev_b, WA, WB);
    }

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching the kernel!\n", cudaStatus);
        goto Error;
    }

    // Copy the output matrix from the GPU buffer back to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, HA * WB * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
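For reference, assuming the listing is saved as matrixMul.cu (my own file name), it should build with a plain nvcc invocation like the one below; the Windows.h / GetTickCount dependency means this particular listing targets Windows.

nvcc -O2 matrixMul.cu -o matrixMul.exe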
In total, the code exercises the CPU computation and the two GPU variants; the results of a run are as follows:
As can clearly be seen, the GPU runs much faster than the CPU, and the more finely the work is subdivided, the faster it runs.
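One caveat on these numbers: GetTickCount times the whole addWithCuda call, including device allocation and the host-device copies, and its resolution is coarse. To time just the kernel, CUDA events can be used instead; a minimal sketch (variable names are my own) wrapped around the Mode 1 launch inside addWithCuda might look like this:

// Hypothetical kernel-only timing with CUDA events; this would replace the
// GetTickCount bracketing used in main for a per-kernel measurement.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);
MatrixMulGPU_1<<<Blocks, Threads>>>(dev_c, dev_a, dev_b, WA, WB);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);   // wait until the kernel and the stop event complete

float elapsed_ms = 0.0f;
cudaEventElapsedTime(&elapsed_ms, start, stop);  // elapsed time between the two events
printf("kernel-only time : %.3f ms\n", elapsed_ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);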
Going forward, I would like to continue optimizing through more techniques (for example, texture binding).
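As a rough idea of what that could look like, here is a sketch only, using the legacy texture-reference API (deprecated and later removed in newer CUDA toolkits); texA, texB, and MatrixMulGPU_tex are my own names, not part of the code above. The inputs are bound to 1D texture references on the host and read through the texture cache in the kernel with tex1Dfetch.

// Sketch: read A and B through the read-only texture cache.
texture<float, 1, cudaReadModeElementType> texA;
texture<float, 1, cudaReadModeElementType> texB;

__global__ void MatrixMulGPU_tex(float *c, unsigned int WA, unsigned int WB)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    float sum = 0.0f;
    for (int i = 0; i < WA; ++i)
    {
        // Each fetch goes through the texture cache instead of plain global loads
        sum += tex1Dfetch(texA, row * WA + i) * tex1Dfetch(texB, i * WB + col);
    }
    c[row * WB + col] = sum;
}

// Host side, after the cudaMalloc/cudaMemcpy calls in addWithCuda:
//   cudaBindTexture(0, texA, dev_a, HA * WA * sizeof(float));
//   cudaBindTexture(0, texB, dev_b, HB * WB * sizeof(float));
//   MatrixMulGPU_tex<<<Blocks, Threads>>>(dev_c, WA, WB);
//   cudaUnbindTexture(texA);
//   cudaUnbindTexture(texB);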