CUDA紋理內存帶有專用的只讀緩存,並針對二維空間局部性做了優化,因此在處理圖像這類相鄰線程訪問相鄰像素的數據時,通常比直接訪問全局內存更快,是一個提升性能的好方法。
貼一段自己寫的簡單的實現兩幅圖像加權和的代碼,使用紋理內存實現。
輸入:兩幅圖 lena, moon
輸出:兩幅圖像加權和
#include <opencv2/opencv.hpp>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

using namespace std;
using namespace cv;

// Print the failing call site and terminate when a CUDA runtime call fails.
// Kernel launches do not return an error directly, so pair every launch with
// CUDA_CHECK(cudaGetLastError()).
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t cudaCheckErr_ = (call);                                  \
        if (cudaCheckErr_ != cudaSuccess) {                                  \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(cudaCheckErr_));                      \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

// Texture reference type used for both inputs: 4 x 8-bit texels that are
// returned to the kernel as float4 values normalized to [0, 1].
typedef texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> BgraTexture;

// CUDA texture references (legacy texture-reference API), one per input image.
BgraTexture refTex1;
BgraTexture refTex2;
// CUDA arrays backing the two textures.
cudaArray* cuArray1 = NULL;
cudaArray* cuArray2 = NULL;
// Channel format of the CUDA arrays: four unsigned 8-bit components.
cudaChannelFormatDesc cuDesc = cudaCreateChannelDesc<uchar4>();

// Blend two normalized samples and convert the result to an 8-bit value.
// Rounds to nearest instead of truncating, so round-off in the normalized
// float texels (e.g. 199.99998) does not produce off-by-one outputs.
__device__ __forceinline__ uchar blendToByte(float a, float b, float alpha)
{
    const float v = (alpha * a + (1.0f - alpha) * b) * 255.0f;
    return static_cast<uchar>(fminf(fmaxf(v, 0.0f), 255.0f) + 0.5f);
}

// Writes the 50/50 weighted sum of refTex1 and refTex2 into pDstImgData.
//
// Launch layout: 2D grid of 2D blocks, one thread per output pixel; threads
// outside the image do nothing. No shared memory is used.
//
// pDstImgData : device buffer of imgHeight * imgWidth * channels bytes,
//               tightly packed, interleaved channels.
// imgHeight   : image height in pixels.
// imgWidth    : image width in pixels.
// channels    : bytes per output pixel. Must be at least 3; the 4th byte
//               (alpha, written as 0) is only stored when channels >= 4.
//
// Preconditions: refTex1 and refTex2 are bound to arrays of at least
// imgWidth x imgHeight texels and use unnormalized coordinates.
__global__ void weightAddKerkel(uchar *pDstImgData, int imgHeight, int imgWidth, int channels)
{
    const int tidx = blockDim.x * blockIdx.x + threadIdx.x;
    const int tidy = blockDim.y * blockIdx.y + threadIdx.y;

    if (tidx >= imgWidth || tidy >= imgHeight)
        return;

    // With linear filtering and unnormalized coordinates, texel (i, j) is
    // centred at (i + 0.5, j + 0.5). Sampling at the bare integer coordinate
    // would return a blend of neighbouring texels and blur the image.
    const float u = tidx + 0.5f;
    const float v = tidy + 0.5f;
    const float4 lenaBGR = tex2D(refTex1, u, v);
    const float4 moonBGR = tex2D(refTex2, u, v);

    // size_t arithmetic avoids int overflow for very large images.
    const size_t idx = (static_cast<size_t>(tidy) * imgWidth + tidx) * channels;
    const float alpha = 0.5f;
    pDstImgData[idx + 0] = blendToByte(lenaBGR.x, moonBGR.x, alpha);
    pDstImgData[idx + 1] = blendToByte(lenaBGR.y, moonBGR.y, alpha);
    pDstImgData[idx + 2] = blendToByte(lenaBGR.z, moonBGR.z, alpha);
    if (channels > 3)
        pDstImgData[idx + 3] = 0;
}

// Allocates a CUDA array matching a 4-channel 8-bit image, configures the
// texture reference (clamp addressing, unnormalized coordinates, linear
// filtering), binds it to the array and uploads the pixels.
static void bindImageToTexture(const Mat &img, BgraTexture &tex, cudaArray *&arr)
{
    tex.addressMode[0] = cudaAddressModeClamp;
    tex.addressMode[1] = cudaAddressModeClamp;
    tex.normalized = false;
    tex.filterMode = cudaFilterModeLinear;

    CUDA_CHECK(cudaMallocArray(&arr, &cuDesc, img.cols, img.rows));
    CUDA_CHECK(cudaBindTextureToArray(tex, arr));
    // The 2D copy honours the Mat row stride (img.step), so it is correct even
    // if the Mat is not stored contiguously.
    CUDA_CHECK(cudaMemcpy2DToArray(arr, 0, 0, img.data, img.step,
                                   img.cols * sizeof(uchar4), img.rows,
                                   cudaMemcpyHostToDevice));
}

// Loads data/lena.jpg and data/moon.jpg, blends them 50/50 on the GPU through
// texture fetches and shows the result in a window.
// Returns EXIT_SUCCESS, or EXIT_FAILURE if an image cannot be loaded.
int main()
{
    Mat Lena = imread("data/lena.jpg");
    Mat moon = imread("data/moon.jpg");
    if (Lena.empty() || moon.empty())
    {
        cerr << "Failed to load data/lena.jpg or data/moon.jpg" << endl;
        return EXIT_FAILURE;
    }

    // Both inputs are uploaded and sampled with Lena's dimensions; bring moon
    // to the same size so the upload never reads past its buffer.
    if (moon.size() != Lena.size())
    {
        Mat resized;
        resize(moon, resized, Lena.size());
        moon = resized;
    }

    cvtColor(Lena, Lena, CV_BGR2BGRA);
    cvtColor(moon, moon, CV_BGR2BGRA);
    const int imgWidth = Lena.cols;
    const int imgHeight = Lena.rows;
    const int channels = Lena.channels();
    const size_t imgBytes = static_cast<size_t>(imgWidth) * imgHeight * channels * sizeof(uchar);

    // Configure the textures, bind them to CUDA arrays and upload both images.
    bindImageToTexture(Lena, refTex1, cuArray1);
    bindImageToTexture(moon, refTex2, cuArray2);

    // Output image on the host and its device-side buffer.
    Mat dstImg = Mat::zeros(imgHeight, imgWidth, CV_8UC4);
    uchar *pDstImgData = NULL;
    CUDA_CHECK(cudaMalloc(&pDstImgData, imgBytes));

    // Launch one thread per pixel; ceil-div so partial tiles are covered.
    dim3 block(16, 16);
    dim3 grid((imgWidth + block.x - 1) / block.x, (imgHeight + block.y - 1) / block.y);
    weightAddKerkel<<<grid, block>>>(pDstImgData, imgHeight, imgWidth, channels);
    CUDA_CHECK(cudaGetLastError());        // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());   // errors raised while the kernel ran

    // Copy the blended image back to the host.
    CUDA_CHECK(cudaMemcpy(dstImg.data, pDstImgData, imgBytes, cudaMemcpyDeviceToHost));

    // Release GPU resources before blocking on the UI.
    CUDA_CHECK(cudaUnbindTexture(refTex1));
    CUDA_CHECK(cudaUnbindTexture(refTex2));
    CUDA_CHECK(cudaFreeArray(cuArray1));
    CUDA_CHECK(cudaFreeArray(cuArray2));
    cuArray1 = NULL;
    cuArray2 = NULL;
    CUDA_CHECK(cudaFree(pDstImgData));

    // Display the result.
    namedWindow("show");
    imshow("show", dstImg);
    waitKey(0);
    return EXIT_SUCCESS;
}