CUDA學習5 常量內存與紋理內存

本文轉載自查看原文 2017-02-20 11:47 3525 工具學習

1.常量內存

當線程束中的所有線程都訪問相同的只讀數據時，使用常量內存將獲得額外的性能提升。

常量內存大小限制為64k。

常量內存其實只是全局內存的一種虛擬地址形式，並沒有特殊保留的常量內存塊。常量內存有兩個特性，一個是高速緩存，另一個是它支持將單個值廣播到線程束中的每個線程。但要注意的是，對於那些數據不太集中或者數據重用率不高的內存訪問，盡量不要使用常量內存。

當常量內存將數據分配或廣播到線程束中的每個線程時（注意，實際上硬件會將單次內存讀取操作廣播到半個線程束），廣播能夠在單個周期內發生，因此這個特性是非常有用的。雖然當所有16個線程都讀取相同地址時，這個功能可以極大提高性能，但當所有16個線程分別讀取不同的地址時，它實際上會降低性能。如果半個線程束中的所有16個線程需要訪問常量內存中的不同數據，那么這個16次不同的讀取操作會被串行化，從而需要16倍的時間來發出請求。但如果從全局內存中讀取，那么這些請求就會同時發出。這種情況下，從常量內存讀取就會慢於從全局內存中讀取。

需要注意的是，當我們聲明一個內核常量的時候，在編譯器將CUDA C代碼轉換成PTX匯編代碼時會用字面值（0x55555555）直接替換常量值(data)的地址。
const int data = 0x55555555; int d = data; //此時data會直接編譯為字面值0x55555555
但當我們聲明的是一個常量數組時，編譯器在將C代碼轉換成PTX匯編代碼時將會使用數組地址在匯編代碼中。
const int data[3] = {0x11111111, 0x22222222, 0x33333333}; int d = data[1]; //此時data[1]會被編譯為data[1]的地址
這時，在費米（計算能力為2.x的硬件）架構的設備上，全局內存借助一級緩存也能達到與常量內存相同的訪問速度。只有在計算能力為1.x的設備上，由於全局內存沒有用到緩存技術，此時使用常量內存才會獲得明顯的性能提升。

下例中使用常量內存性能並未獲得提升（Time to generate與不使用常量內存接近）。

運行《CUDA By Example》第六章示例有約8%的提升（4.8ms到5.2ms，小樣本）。

#include <windows.h>
#include <iostream>

__constant__ float dev_input[5*5*24*24];  //57600<64000
__global__ void MaxPool2d(const int height, const int pooled_height, float* top_data)
{
    int x = blockIdx.x;
    int y = blockIdx.y;
    int dx = gridDim.x;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int dtx = blockDim.x;
    int dty = blockDim.y;
    float s = -10000.0;
    int index2 = y*dx*dtx*dty + x*dtx*dty + ty*dtx + tx;
    int index = y*dx*height*height + x*height*height + ty*pooled_height*height + tx*pooled_height;
    for (int u = 0; u < pooled_height && (u + pooled_height*ty)<height; ++u)
    for (int v = 0; v < pooled_height && (v + pooled_height*tx)<height; ++v)
    if (*(dev_input + index + u*height + v)>s)
        s = *(dev_input + index + u*height + v);
    *(top_data + index2) = s;
}

int main()
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    const int N = 5, M = 5, H = 24, W = 24, D = 2;
    const int PH = H / D + H % D;
    int image_size = N*M*H*W*sizeof(float);
    int out_size = N*M*PH*PH*sizeof(float);
    float mul_by = 0.01;
    float *input, *output, *dev_output;
    
    input = new float[image_size];
    output = new float[out_size];
    for (int i = 0; i<N*M*H*W; i++)
        *(input + i) = i*mul_by;

    cudaMalloc((void**)&dev_output, out_size);
    //cudaMalloc((void**)&dev_input, image_size);
    cudaMemcpyToSymbol(dev_input, input, image_size);
    dim3    grid(M, N);
    dim3    threads(PH, PH);
    DWORD start_time = GetTickCount();
    cudaEventRecord(start,0);
    MaxPool2d << <grid, threads >> >( H, D, dev_output);
    cudaMemcpy(output, dev_output, out_size, cudaMemcpyDeviceToHost);
    DWORD end_time = GetTickCount();
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime,
        start, stop);
    std::cout << "Time to generate: "<<elapsedTime<< "ms\n";
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    std::cout << "Cost: " << end_time - start_time << "ms." << std::endl;
    for (int i = 0; i<10; i++)
        std::cout << *(output + i) << std::endl;

    //cudaFree(dev_input);
    cudaFree(dev_output);
    delete[] output;
    delete[] input;
    system("pause");
}

/*
Time to generate: 0.071552ms
Cost: 0ms.
0.25
0.27
0.29
0.31
0.33
0.35
0.37
0.39
0.41
0.43
*/

2.紋理內存

和常量內存一樣，紋理內存是另一種類型的只讀內存，在特定的訪問模式中（以下例子並非這種特定的訪問模式），紋理內存同樣能夠提升性能。

介紹摘自《GPU高性能編程CUDA實戰》。

紋理內存緩存在芯片上，因此在某些情況中，它能夠減少對內存的請求並提供更高效的內存帶寬。紋理緩存是專門為那些在內存訪問模式中存在大量空間局部性(Spatial Locality)的圖形應用程序而設計的。在某個計算應用程序中，這意味着一個線程讀取的位置可能與鄰近線程的讀取位置“非常接近”，如下圖所示。

從數學的角度，上圖中的4個地址並非連續的，在一般的CPU緩存中，這些地址將不會緩存。但由於GPU紋理緩存是專門為了加速這種訪問模式而設計的，因此如果在這種情況中使用紋理內存而不是全局內存，那么將會獲得性能的提升。

#include "device_launch_parameters.h"
#include "cuda_runtime.h"
#include <iostream>

#define max(a,b) (a>b?a:b)
texture<float> t_input;

__global__ void MaxPool2d(const int height, const int pooled_height, float* top_data)
{
    int x = blockIdx.x;
    int y = blockIdx.y;
    int dx = gridDim.x;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int dtx = blockDim.x;
    int dty = blockDim.y;
    float s = -10000.0;
    float a1, a2, a3, a4, a12, a34;
    int index2 = y*dx*dtx*dty + x*dtx*dty + ty*dtx + tx;
    int index = y*dx*height*height + x*height*height + ty*pooled_height*height + tx*pooled_height;
    int index3 = 0;
    bool b1 = 1 + 2 * ty == height;
    bool b2 = 1 + 2 * tx == height;
    
    if (b1&&b2)
    {
        a1 = tex1Dfetch(t_input, index );
        s = max(a1, s);
    }
    if ( !b2)
    {
        a1 = tex1Dfetch(t_input, index );
        a2 = tex1Dfetch(t_input, index + 1);
        a12 = max(a1,a2);
        s = max(a12, s);
        index3 = height + 1;
    }
    if (!b1)
    {
        a3 = tex1Dfetch(t_input, index + index3);
        a4 = tex1Dfetch(t_input, index + height);
        a34 = max(a3, a4);
        s = max(a34, s);
    }
    *(top_data + index2) = s;
}

int main()
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    const int N = 5, M = 5, H = 25, W = 25, D = 2;
    const int PH = H / D + H % D;
    const int image_size = N*M*H*W*sizeof(float);
    const int out_size = N*M*PH*PH*sizeof(float);
    float mul_by = 0.01;
    float *input, *output, *dev_input, *dev_output;


    input = new float[image_size];
    output = new float[out_size];
    for (int i = 0; i<N*M*H*W; i++)
        *(input + i) = i*mul_by;

    cudaMalloc((void**)&dev_output, out_size);
    cudaMalloc((void**)&dev_input, image_size);
    cudaBindTexture(NULL, t_input, dev_input,
        image_size);
    cudaMemcpy(dev_input, input, image_size,
        cudaMemcpyHostToDevice);

    dim3    grid(M, N);
    dim3    threads(PH, PH);
    cudaEventRecord(start, 0);
    MaxPool2d <<<grid, threads >>>(H, D, dev_output);
    cudaMemcpy(output, dev_output, out_size, cudaMemcpyDeviceToHost);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime,
        start, stop);
    std::cout << "Time to generate: " << elapsedTime << "ms\n";
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    for (int i = 0; i<10; i++)
        std::cout << *(output + i) << std::endl;

    cudaFree(dev_input);
    cudaFree(dev_output);
    cudaUnbindTexture(t_input);
    delete[] output;
    delete[] input;
    system("pause");
}

/*
Time to generate: 0.128448ms
*/

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 CUDA 紋理內存 cuda紋理內存的使用 CUDA學習筆記（三）——CUDA內存顯存與紋理內存詳解 cuda學習3-共享內存和同步 CUDA -- 內存分配 Java內存中的常量池 CUDA共享內存的使用示例 Java內存分配之堆、棧和常量池 CUDA學習（四）之使用全局內存進行歸約求和（一個包含N個線程的線程塊）