1. Matrix multiplication with HIP
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>

#define M 4
#define K 4
#define N 4

// Fill a row x col matrix with random values in [0, 9].
void initial(double *list, int row, int col)
{
    double *num = list;
    for (int i = 0; i < row * col; i++)
    {
        num[i] = rand() % 10;
    }
}

// Reference matrix multiplication on the CPU: C = A * B.
void CpuMatrix(double *A, double *B, double *C)
{
    int i, j, k;

    for (i = 0; i < M; i++)
    {
        for (j = 0; j < N; j++)
        {
            double sum = 0;
            for (k = 0; k < K; k++)
            {
                sum += A[i * K + k] * B[k * N + j];
            }
            C[i * N + j] = sum;
        }
    }
}

// GPU kernel: each thread computes one element of C.
__global__ void GpuMatrix(double *dev_A, double *dev_B, double *dev_C)
{
    int ix = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;  // column of C
    int iy = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;  // row of C
    if (ix < N && iy < M)
    {
        double sum = 0;
        for (int k = 0; k < K; k++)
        {
            sum += dev_A[iy * K + k] * dev_B[k * N + ix];
        }
        dev_C[iy * N + ix] = sum;
    }
}

// Print a row x col matrix, one row per line.
void printMatrix(double *list, int row, int col)
{
    double *p = list;
    for (int i = 0; i < row; i++)
    {
        for (int j = 0; j < col; j++)
        {
            printf("%10lf", p[j]);
        }
        p = p + col;
        printf("\n");
    }
}

int main(int argc, char **argv)
{
    int Axy = M * K;
    int Abytes = Axy * sizeof(double);

    int Bxy = K * N;
    int Bbytes = Bxy * sizeof(double);

    int nxy = M * N;
    int nbytes = nxy * sizeof(double);

    float time_cpu, time_gpu;

    clock_t start_cpu, stop_cpu;

    hipEvent_t start_GPU, stop_GPU;

    double *host_A, *host_B, *host_C, *c_CPU;
    host_A = (double *)malloc(Abytes);
    host_B = (double *)malloc(Bbytes);
    host_C = (double *)malloc(nbytes);
    c_CPU = (double *)malloc(nbytes);

    initial(host_A, M, K);
    printf("A:(%d,%d):\n", M, K);
    printMatrix(host_A, M, K);

    initial(host_B, K, N);
    printf("B:(%d,%d):\n", K, N);
    printMatrix(host_B, K, N);

    // start_cpu = clock();
    CpuMatrix(host_A, host_B, host_C);
    // stop_cpu = clock();

    printf("Host_C:(%d,%d):\n", M, N);
    // printf("\nCPU time is %f(ms)\n", (float)(stop_cpu - start_cpu) / CLOCKS_PER_SEC);
    printMatrix(host_C, M, N);

    double *dev_A, *dev_B, *dev_C;
    hipMalloc(&dev_A, Axy * sizeof(double));
    hipMalloc(&dev_B, Bxy * sizeof(double));
    hipMalloc(&dev_C, nxy * sizeof(double));

    // The launch is far larger than the 4x4 problem; the bounds check in the
    // kernel keeps the extra threads from writing out of range.
    dim3 block(1024, 1);
    dim3 grid(64, 64);

    // Copy the inputs from the host to the device.
    hipMemcpy(dev_A, host_A, Abytes, hipMemcpyHostToDevice);
    hipMemcpy(dev_B, host_B, Bbytes, hipMemcpyHostToDevice);

    hipEventCreate(&start_GPU);
    hipEventCreate(&stop_GPU);
    hipEventRecord(start_GPU, 0);
    hipLaunchKernelGGL(GpuMatrix, grid, block, 0, 0, dev_A, dev_B, dev_C);
    hipEventRecord(stop_GPU, 0);
    hipEventSynchronize(start_GPU);
    hipEventSynchronize(stop_GPU);
    hipEventElapsedTime(&time_gpu, start_GPU, stop_GPU);  // elapsed time is already in milliseconds
    printf("\nThe time from GPU:\t%f(ms)\n", time_gpu);
    hipDeviceSynchronize();
    hipEventDestroy(start_GPU);
    hipEventDestroy(stop_GPU);

    hipMemcpy(c_CPU, dev_C, nbytes, hipMemcpyDeviceToHost);
    printf("device_C:(%d,%d):\n", M, N);
    printMatrix(c_CPU, M, N);

    hipFree(dev_A);
    hipFree(dev_B);
    hipFree(dev_C);
    free(host_A);
    free(host_B);
    free(host_C);
    free(c_CPU);

    return 0;
}
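The listing above does not check the return codes of the HIP runtime calls. A small helper along the following lines (a sketch, not part of the original program; the macro name HIP_CHECK is an assumption) makes a failing hipMalloc or hipMemcpy visible immediately:

#include <stdio.h>
#include <stdlib.h>
#include <hip/hip_runtime.h>

// Hypothetical helper, not used in the listing above: abort with a message
// if a HIP runtime call does not return hipSuccess.
#define HIP_CHECK(cmd)                                                      \
    do {                                                                    \
        hipError_t err = (cmd);                                             \
        if (err != hipSuccess) {                                            \
            fprintf(stderr, "HIP error: %s at %s:%d\n",                     \
                    hipGetErrorString(err), __FILE__, __LINE__);            \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Example use: HIP_CHECK(hipMalloc(&dev_A, Abytes));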
The results are as follows:
2. HIP matrix multiplication using a struct
Shared memory is allocated with the __shared__ memory space specifier. Shared memory is expected to be much faster than global memory, as mentioned in the thread hierarchy description and detailed in the shared memory description. Therefore, any opportunity to replace global memory accesses with shared memory accesses should be exploited, as the shared-memory matrix multiplication example in part 4 below illustrates.
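As a minimal, self-contained sketch of the specifier (a hypothetical kernel written only to illustrate __shared__ and __syncthreads(); it is not one of the examples in this report and assumes the block has at most 256 threads), a block can stage data in shared memory and synchronize before reusing it:

#include <hip/hip_runtime.h>

// Hypothetical kernel for illustration: each block copies its elements into
// shared memory, synchronizes, and then reads them back from the fast
// on-chip storage instead of global memory.
__global__ void scale_via_shared(const float *in, float *out, int n)
{
    __shared__ float tile[256];                     // one copy per thread block
    int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    if (i < n)
        tile[hipThreadIdx_x] = in[i];               // one read from global memory
    __syncthreads();                                // make the tile visible to the whole block
    if (i < n)
        out[i] = 2.0f * tile[hipThreadIdx_x];       // later accesses hit shared memory
}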
The following example code is a straightforward implementation of matrix multiplication that does not use shared memory. Each thread reads one row of A and one column of B and computes the corresponding element of C, as shown in Figure 9. As a result, A is read B.width times from global memory and B is read A.height times.

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>

// Row-major matrix: element (row, col) is stored at elements[row * width + col].
typedef struct {
    int width;
    int height;
    float *elements;
} Matrix;

#define BLOCK_SIZE 4

__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Fill an array of N floats with random values in [0, 9].
void initial(float *A, int N)
{
    int i;
    for (i = 0; i < N; i++)
    {
        A[i] = rand() % 10;
    }
}

// Print the first N elements of A, breaking the line after every A.width values.
void shuchu(Matrix A, int N)
{
    int j = 0;
    for (int i = 0; i < N; i++)
    {
        if (j == A.width)
        {
            printf("\n");
            j = 0;
            i--;
        }
        else
        {
            printf("%15lf", A.elements[i]);
            j++;
        }
    }
}

// GPU kernel without shared memory: each thread reads one row of A and one
// column of B from global memory and computes one element of C.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    float Cvalue = 0;
    int row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
    int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    for (int e = 0; e < A.width; ++e)
    {
        Cvalue += A.elements[row * A.width + e] * B.elements[e * B.width + col];
    }
    C.elements[row * C.width + col] = Cvalue;
}

// Reference matrix multiplication on the CPU.
void CpuMatrix(Matrix A, Matrix B, Matrix C)
{
    int M, N, K;
    M = A.height;
    N = B.width;
    K = A.width;
    int i, j, k;
    for (i = 0; i < M; i++)
    {
        for (j = 0; j < N; j++)
        {
            float sum = 0.0;
            for (k = 0; k < K; k++)
            {
                sum += A.elements[i * K + k] * B.elements[k * N + j];
            }
            C.elements[i * N + j] = sum;
        }
    }
}

void MatMul(Matrix A, Matrix B, Matrix C)
{
    Matrix d_A;
    Matrix d_B;
    Matrix d_C;
    d_A.width = A.width;
    d_A.height = A.height;
    d_B.width = B.width;
    d_B.height = B.height;
    d_C.width = C.width;
    d_C.height = C.height;
    size_t size_A = A.width * A.height * sizeof(float);
    size_t size_B = B.width * B.height * sizeof(float);
    size_t size_C = C.width * C.height * sizeof(float);

    hipMalloc(&d_A.elements, size_A);
    hipMalloc(&d_B.elements, size_B);
    hipMalloc(&d_C.elements, size_C);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);

    hipMemcpy(d_A.elements, A.elements, size_A, hipMemcpyHostToDevice);
    hipMemcpy(d_B.elements, B.elements, size_B, hipMemcpyHostToDevice);

    // Time the kernel with HIP events.
    float gpu_time;
    hipEvent_t start_GPU, stop_GPU;
    hipEventCreate(&start_GPU);
    hipEventCreate(&stop_GPU);
    hipEventRecord(start_GPU, 0);

    hipLaunchKernelGGL(MatMulKernel, dimGrid, dimBlock, 0, 0, d_A, d_B, d_C);

    hipEventRecord(stop_GPU, 0);
    hipEventSynchronize(start_GPU);
    hipEventSynchronize(stop_GPU);
    hipEventElapsedTime(&gpu_time, start_GPU, stop_GPU);  // elapsed time is already in milliseconds
    hipDeviceSynchronize();
    printf("\nGPU time is: %f(ms)\n", gpu_time);
    hipEventDestroy(start_GPU);
    hipEventDestroy(stop_GPU);
    hipMemcpy(C.elements, d_C.elements, size_C, hipMemcpyDeviceToHost);

    printf("\nGPU result is :\n");
    shuchu(C, C.width * C.height);
    printf("\n");
    hipFree(d_A.elements);
    hipFree(d_B.elements);
    hipFree(d_C.elements);
}

int main()
{
    Matrix A;
    Matrix B;
    Matrix C;
    A.width = BLOCK_SIZE;
    A.height = BLOCK_SIZE;
    B.width = BLOCK_SIZE;
    B.height = BLOCK_SIZE;
    C.width = BLOCK_SIZE;
    C.height = BLOCK_SIZE;

    int size_A = A.width * A.height * sizeof(float);
    int size_B = B.width * B.height * sizeof(float);
    int size_C = C.width * C.height * sizeof(float);

    A.elements = (float *)malloc(size_A);
    B.elements = (float *)malloc(size_B);
    C.elements = (float *)malloc(size_C);

    initial(A.elements, A.height * A.width);
    printf("A:\n");
    shuchu(A, A.width * A.height);

    printf("\nB:\n");
    initial(B.elements, B.height * B.width);
    shuchu(B, B.width * B.height);

    // Compute the CPU reference result (timing code left commented out).
    clock_t start_CPU, stop_CPU;
    double cpu_time;
    start_CPU = clock();
    CpuMatrix(A, B, C);
    stop_CPU = clock();
    // cpu_time = (double)(stop_CPU - start_CPU) / CLOCKS_PER_SEC;
    // printf("\nCPU time is %lf(ms)\n", cpu_time);
    printf("\nCPU result :\n");
    shuchu(C, C.width * C.height);
    printf("\n");

    MatMul(A, B, C);
    return 0;
}
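The program prints both the CPU and the GPU results so they can be compared by eye. If an automated check is preferred, a small host-side helper along these lines could be added (a sketch; the name compare_results and the tolerance 1e-5 are assumptions, not part of the original code):

#include <math.h>

// Hypothetical helper: return 1 if two float arrays agree element-wise
// within a small absolute tolerance, 0 otherwise.
int compare_results(const float *ref, const float *gpu, int n)
{
    for (int i = 0; i < n; i++)
    {
        if (fabsf(ref[i] - gpu[i]) > 1e-5f)
            return 0;   // mismatch found
    }
    return 1;           // all elements match
}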
The run output is as follows:
3. HIP array addition using a struct
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>

// One-dimensional array of width elements.
typedef struct {
    int width;
    float *elements;
} Matrix;

#define BLOCK_SIZE 4

__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Fill an array of N floats with random values in [0, 9].
void initial(float *A, int N)
{
    int i;
    for (i = 0; i < N; i++)
    {
        A[i] = rand() % 10;
    }
}

// Print the first N elements of A.
void shuchu(Matrix A, int N)
{
    for (int i = 0; i < N; i++)
    {
        printf("%10lf", A.elements[i]);
    }
    printf("\n");
}

// GPU kernel: each thread adds one pair of elements.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

    C.elements[col] = A.elements[col] + B.elements[col];
}

// Reference array addition on the CPU.
void CpuMatrix(Matrix A, Matrix B, Matrix C)
{
    int N;
    N = B.width;
    int i;
    for (i = 0; i < N; i++)
    {
        C.elements[i] = A.elements[i] + B.elements[i];
    }
}

void MatMul(Matrix A, Matrix B, Matrix C)
{
    Matrix d_A;
    Matrix d_B;
    Matrix d_C;
    d_A.width = A.width;
    d_B.width = B.width;
    d_C.width = C.width;

    size_t size_A = A.width * sizeof(float);
    size_t size_B = B.width * sizeof(float);
    size_t size_C = C.width * sizeof(float);

    hipMalloc(&d_A.elements, size_A);
    hipMalloc(&d_B.elements, size_B);
    hipMalloc(&d_C.elements, size_C);
    // One thread per element; the kernel only uses the x index.
    dim3 dimBlock(BLOCK_SIZE);
    dim3 dimGrid(1);

    hipMemcpy(d_A.elements, A.elements, size_A, hipMemcpyHostToDevice);
    hipMemcpy(d_B.elements, B.elements, size_B, hipMemcpyHostToDevice);

    // Time the kernel with HIP events.
    float gpu_time;
    hipEvent_t start_GPU, stop_GPU;
    hipEventCreate(&start_GPU);
    hipEventCreate(&stop_GPU);
    hipEventRecord(start_GPU, 0);

    hipLaunchKernelGGL(MatMulKernel, dimGrid, dimBlock, 0, 0, d_A, d_B, d_C);

    hipEventRecord(stop_GPU, 0);
    hipEventSynchronize(start_GPU);
    hipEventSynchronize(stop_GPU);
    hipEventElapsedTime(&gpu_time, start_GPU, stop_GPU);  // elapsed time is already in milliseconds
    hipDeviceSynchronize();
    printf("\nGPU time is: %f(ms)\n", gpu_time);
    hipEventDestroy(start_GPU);
    hipEventDestroy(stop_GPU);
    hipMemcpy(C.elements, d_C.elements, size_C, hipMemcpyDeviceToHost);

    printf("\nGPU result is :\n");
    shuchu(C, C.width);
    printf("\n");
    hipFree(d_A.elements);
    hipFree(d_B.elements);
    hipFree(d_C.elements);
}

int main()
{
    Matrix A;
    Matrix B;
    Matrix C;
    A.width = BLOCK_SIZE;
    B.width = BLOCK_SIZE;
    C.width = BLOCK_SIZE;

    int size_A = A.width * sizeof(float);
    int size_B = B.width * sizeof(float);
    int size_C = C.width * sizeof(float);

    A.elements = (float *)malloc(size_A);
    B.elements = (float *)malloc(size_B);
    C.elements = (float *)malloc(size_C);

    initial(A.elements, A.width);
    printf("A:\n");
    shuchu(A, A.width);

    printf("\nB:\n");
    initial(B.elements, B.width);
    shuchu(B, B.width);

    CpuMatrix(A, B, C);

    printf("\nCPU result :\n");
    shuchu(C, C.width);

    printf("\n");
    MatMul(A, B, C);
    return 0;
}
The run output is as follows:
4. Matrix multiplication using shared memory (with a struct)
The following example code is an implementation of matrix multiplication that does use shared memory. In this implementation, each thread block is responsible for computing one square sub-matrix Csub of C, and each thread within the block computes one element of Csub. As shown in Figure 10, Csub is equal to the product of two rectangular matrices: the sub-matrix of A of dimension (A.width, block_size) that has the same row indices as Csub, and the sub-matrix of B of dimension (block_size, A.width) that has the same column indices as Csub. To fit the device's resources, these two rectangular matrices are divided into as many square matrices of dimension block_size as necessary, and Csub is computed as the sum of the products of these square matrices. Each of these products is computed by first loading the two corresponding square matrices from global memory into shared memory, with one thread loading one element of each, and then having each thread compute one element of the product. Each thread accumulates the result of these products into a register and, once done, writes the result to global memory.
By blocking the computation this way, we take advantage of fast shared memory and save a great deal of global memory bandwidth, since A is now read from global memory only (B.width / block_size) times and B only (A.height / block_size) times. For the 4x4 matrices used here with block_size = 4, each input matrix is read from global memory only once.
Compared with the earlier example code, the Matrix type is extended with a stride field, so that sub-matrices can be represented efficiently with the same type. __device__ functions are used to get and set elements and to build any sub-matrix from a matrix.

#include <stdio.h>
#include <stdlib.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>

// Row-major matrix; stride is the row pitch of the underlying storage, so the
// same type can also describe a sub-matrix of a larger matrix.
typedef struct {
    int width;
    int height;
    int stride;
    float *elements;
} Matrix;

#define BLOCK_SIZE 4

// Fill an array of N floats with random values in [0, 9].
void initial(float *A, int N)
{
    int i;
    for (i = 0; i < N; i++)
    {
        A[i] = rand() % 10;
    }
}

// Read element (row, col) of A.
__device__ float GetElement(const Matrix A, int row, int col)
{
    return A.elements[row * A.stride + col];
}

// Write element (row, col) of A.
__device__ void SetElement(Matrix A, int row, int col, float value)
{
    A.elements[row * A.stride + col] = value;
}

// Return the BLOCK_SIZE x BLOCK_SIZE sub-matrix of A whose upper-left corner
// is at block coordinates (row, col).
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
    Matrix Asub;
    Asub.width = BLOCK_SIZE;
    Asub.height = BLOCK_SIZE;
    Asub.stride = A.stride;
    Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col];
    return Asub;
}

// Print the first N elements of A, breaking the line after every A.width values.
void shuchu(Matrix A, int N)
{
    int j = 0;
    for (int i = 0; i < N; i++)
    {
        if (j == A.width)
        {
            printf("\n");
            j = 0;
            i--;
        }
        else
        {
            printf("%15lf", A.elements[i]);
            j++;
        }
    }
    printf("\n");
}

// Tiled matrix multiplication: each block computes one BLOCK_SIZE x BLOCK_SIZE
// sub-matrix Csub of C, staging the corresponding tiles of A and B in shared memory.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    int blockRow = hipBlockIdx_y;
    int blockCol = hipBlockIdx_x;
    Matrix Csub = GetSubMatrix(C, blockRow, blockCol);
    float Cvalue = 0;
    int row = hipThreadIdx_y;
    int col = hipThreadIdx_x;
    for (int m = 0; m < (A.width / BLOCK_SIZE); ++m)
    {
        Matrix Asub = GetSubMatrix(A, blockRow, m);
        Matrix Bsub = GetSubMatrix(B, m, blockCol);
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Each thread loads one element of each tile from global memory.
        As[row][col] = GetElement(Asub, row, col);
        Bs[row][col] = GetElement(Bsub, row, col);
        __syncthreads();

        for (int e = 0; e < BLOCK_SIZE; ++e)
        {
            Cvalue += As[row][e] * Bs[e][col];
        }
        __syncthreads();
    }
    // Write the accumulated result to global memory once, after all tiles are processed.
    SetElement(Csub, row, col, Cvalue);
}

void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    Matrix d_A;
    d_A.width = d_A.stride = A.width;
    d_A.height = A.height;
    size_t size = A.width * A.height * sizeof(float);
    hipMalloc(&d_A.elements, size);
    hipMemcpy(d_A.elements, A.elements, size, hipMemcpyHostToDevice);

    Matrix d_B;
    d_B.width = d_B.stride = B.width;
    d_B.height = B.height;
    size = B.width * B.height * sizeof(float);
    hipMalloc(&d_B.elements, size);
    hipMemcpy(d_B.elements, B.elements, size, hipMemcpyHostToDevice);

    Matrix d_C;
    d_C.width = d_C.stride = C.width;
    d_C.height = C.height;
    size = C.width * C.height * sizeof(float);
    hipMalloc(&d_C.elements, size);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);

    // Time the kernel with HIP events.
    float gpu_time;
    hipEvent_t start_GPU, stop_GPU;
    hipEventCreate(&start_GPU);
    hipEventCreate(&stop_GPU);
    hipEventRecord(start_GPU, 0);

    hipLaunchKernelGGL(MatMulKernel, dimGrid, dimBlock, 0, 0, d_A, d_B, d_C);

    hipEventRecord(stop_GPU, 0);
    hipEventSynchronize(start_GPU);
    hipEventSynchronize(stop_GPU);
    hipEventElapsedTime(&gpu_time, start_GPU, stop_GPU);  // elapsed time is already in milliseconds
    hipDeviceSynchronize();
    printf("\nGPU time is: %f(ms)\n", gpu_time);
    hipEventDestroy(start_GPU);
    hipEventDestroy(stop_GPU);

    hipMemcpy(C.elements, d_C.elements, size, hipMemcpyDeviceToHost);

    printf("\nGPU result is:\n");
    shuchu(C, C.width * C.height);

    hipFree(d_A.elements);
    hipFree(d_B.elements);
    hipFree(d_C.elements);
}

// Reference matrix multiplication on the CPU.
void CpuMatrix(Matrix A, Matrix B, Matrix C)
{
    int M, N, K;
    M = A.height;
    N = B.width;
    K = A.width;
    int i, j, k;
    for (i = 0; i < M; i++)
    {
        for (j = 0; j < N; j++)
        {
            float sum = 0.0;
            for (k = 0; k < K; k++)
            {
                sum += A.elements[i * K + k] * B.elements[k * N + j];
            }
            C.elements[i * N + j] = sum;
        }
    }
}

int main()
{
    Matrix A;
    Matrix B;
    Matrix C;

    A.width = BLOCK_SIZE;
    A.height = BLOCK_SIZE;
    B.width = BLOCK_SIZE;
    B.height = BLOCK_SIZE;
    C.width = BLOCK_SIZE;
    C.height = BLOCK_SIZE;

    int size_A = A.width * A.height * sizeof(float);
    int size_B = B.width * B.height * sizeof(float);
    int size_C = C.width * C.height * sizeof(float);

    A.elements = (float *)malloc(size_A);
    B.elements = (float *)malloc(size_B);
    C.elements = (float *)malloc(size_C);

    initial(A.elements, A.height * A.width);
    printf("A:\n");
    shuchu(A, A.width * A.height);

    printf("\nB:\n");
    initial(B.elements, B.height * B.width);
    shuchu(B, B.width * B.height);

    CpuMatrix(A, B, C);
    printf("\nCPU result :\n");
    shuchu(C, C.width * C.height);
    MatMul(A, B, C);
    return 0;
}
The run output is as follows: