1. Matrix multiplication with HIP
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>

#define M 4
#define K 4
#define N 4

// Fill a row x col matrix with random values in [0, 9].
void initial(double *list, int row, int col)
{
    double *num = list;
    for (int i = 0; i < row * col; i++)
    {
        num[i] = rand() % 10;
    }
}

// Reference matrix multiplication on the CPU: C = A * B.
void CpuMatrix(double *A, double *B, double *C)
{
    int i, j, k;

    for (i = 0; i < M; i++)
    {
        for (j = 0; j < N; j++)
        {
            double sum = 0;
            for (k = 0; k < K; k++)
            {
                sum += A[i * K + k] * B[k * N + j];
            }
            C[i * N + j] = sum;
        }
    }
}

// GPU kernel: each thread computes one element of C.
__global__ void GpuMatrix(double *dev_A, double *dev_B, double *dev_C)
{
    int ix = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;  // column of C
    int iy = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;  // row of C
    if (ix < N && iy < M)
    {
        double sum = 0;
        for (int k = 0; k < K; k++)
        {
            sum += dev_A[iy * K + k] * dev_B[k * N + ix];
        }
        dev_C[iy * N + ix] = sum;
    }
}

// Print a row x col matrix, one row per line.
void printMatrix(double *list, int row, int col)
{
    double *p = list;
    for (int i = 0; i < row; i++)
    {
        for (int j = 0; j < col; j++)
        {
            printf("%10lf", p[j]);
        }
        p = p + col;
        printf("\n");
    }
}

int main(int argc, char **argv)
{
    int Axy = M * K;
    int Abytes = Axy * sizeof(double);

    int Bxy = K * N;
    int Bbytes = Bxy * sizeof(double);

    int nxy = M * N;
    int nbytes = nxy * sizeof(double);

    float time_cpu, time_gpu;

    clock_t start_cpu, stop_cpu;

    hipEvent_t start_GPU, stop_GPU;

    double *host_A, *host_B, *host_C, *c_CPU;
    host_A = (double *)malloc(Abytes);
    host_B = (double *)malloc(Bbytes);
    host_C = (double *)malloc(nbytes);
    c_CPU = (double *)malloc(nbytes);

    initial(host_A, M, K);
    printf("A:(%d,%d):\n", M, K);
    printMatrix(host_A, M, K);

    initial(host_B, K, N);
    printf("B:(%d,%d):\n", K, N);
    printMatrix(host_B, K, N);

    // start_cpu = clock();
    CpuMatrix(host_A, host_B, host_C);
    // stop_cpu = clock();

    printf("Host_C:(%d,%d):\n", M, N);
    // printf("\nCPU time is %f(ms)\n", (float)(stop_cpu - start_cpu) / CLOCKS_PER_SEC);
    printMatrix(host_C, M, N);

    double *dev_A, *dev_B, *dev_C;
    hipMalloc(&dev_A, Axy * sizeof(double));
    hipMalloc(&dev_B, Bxy * sizeof(double));
    hipMalloc(&dev_C, nxy * sizeof(double));

    // The launch is far larger than the 4x4 problem; the bounds check in the
    // kernel keeps the extra threads from writing out of range.
    dim3 block(1024, 1);
    dim3 grid(64, 64);

    // Copy the inputs from the host to the device.
    hipMemcpy(dev_A, host_A, Abytes, hipMemcpyHostToDevice);
    hipMemcpy(dev_B, host_B, Bbytes, hipMemcpyHostToDevice);

    hipEventCreate(&start_GPU);
    hipEventCreate(&stop_GPU);
    hipEventRecord(start_GPU, 0);
    hipLaunchKernelGGL(GpuMatrix, grid, block, 0, 0, dev_A, dev_B, dev_C);
    hipEventRecord(stop_GPU, 0);
    hipEventSynchronize(start_GPU);
    hipEventSynchronize(stop_GPU);
    hipEventElapsedTime(&time_gpu, start_GPU, stop_GPU);  // elapsed time is already in milliseconds
    printf("\nThe time from GPU:\t%f(ms)\n", time_gpu);
    hipDeviceSynchronize();
    hipEventDestroy(start_GPU);
    hipEventDestroy(stop_GPU);

    hipMemcpy(c_CPU, dev_C, nbytes, hipMemcpyDeviceToHost);
    printf("device_C:(%d,%d):\n", M, N);
    printMatrix(c_CPU, M, N);

    hipFree(dev_A);
    hipFree(dev_B);
    hipFree(dev_C);
    free(host_A);
    free(host_B);
    free(host_C);
    free(c_CPU);

    return 0;
}
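The listing above does not check the return codes of the HIP runtime calls. A small helper along the following lines (a sketch, not part of the original program; the macro name HIP_CHECK is an assumption) makes a failing hipMalloc or hipMemcpy visible immediately:

#include <stdio.h>
#include <stdlib.h>
#include <hip/hip_runtime.h>

// Hypothetical helper, not used in the listing above: abort with a message
// if a HIP runtime call does not return hipSuccess.
#define HIP_CHECK(cmd)                                                      \
    do {                                                                    \
        hipError_t err = (cmd);                                             \
        if (err != hipSuccess) {                                            \
            fprintf(stderr, "HIP error: %s at %s:%d\n",                     \
                    hipGetErrorString(err), __FILE__, __LINE__);            \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Example use: HIP_CHECK(hipMalloc(&dev_A, Abytes));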
The results are as follows:
2. HIP matrix multiplication using a struct
Shared memory is allocated with the __shared__ memory space specifier. Shared memory is expected to be much faster than global memory, as mentioned in the thread hierarchy description and detailed in the shared memory description. Therefore, any opportunity to replace global memory accesses with shared memory accesses should be exploited, as the shared-memory matrix multiplication example in part 4 below illustrates.
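As a minimal, self-contained sketch of the specifier (a hypothetical kernel written only to illustrate __shared__ and __syncthreads(); it is not one of the examples in this report and assumes the block has at most 256 threads), a block can stage data in shared memory and synchronize before reusing it:

#include <hip/hip_runtime.h>

// Hypothetical kernel for illustration: each block copies its elements into
// shared memory, synchronizes, and then reads them back from the fast
// on-chip storage instead of global memory.
__global__ void scale_via_shared(const float *in, float *out, int n)
{
    __shared__ float tile[256];                     // one copy per thread block
    int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    if (i < n)
        tile[hipThreadIdx_x] = in[i];               // one read from global memory
    __syncthreads();                                // make the tile visible to the whole block
    if (i < n)
        out[i] = 2.0f * tile[hipThreadIdx_x];       // later accesses hit shared memory
}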
The following example code is a straightforward implementation of matrix multiplication that does not use shared memory. Each thread reads one row of A and one column of B and computes the corresponding element of C, as shown in Figure 9. As a result, A is read B.width times from global memory and B is read A.height times.

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>

// Row-major matrix: element (row, col) is stored at elements[row * width + col].
typedef struct {
    int width;
    int height;
    float *elements;
} Matrix;

#define BLOCK_SIZE 4

__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Fill an array of N floats with random values in [0, 9].
void initial(float *A, int N)
{
    int i;
    for (i = 0; i < N; i++)
    {
        A[i] = rand() % 10;
    }
}

// Print the first N elements of A, breaking the line after every A.width values.
void shuchu(Matrix A, int N)
{
    int j = 0;
    for (int i = 0; i < N; i++)
    {
        if (j == A.width)
        {
            printf("\n");
            j = 0;
            i--;
        }
        else
        {
            printf("%15lf", A.elements[i]);
            j++;
        }
    }
}

// GPU kernel without shared memory: each thread reads one row of A and one
// column of B from global memory and computes one element of C.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    float Cvalue = 0;
    int row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
    int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    for (int e = 0; e < A.width; ++e)
    {
        Cvalue += A.elements[row * A.width + e] * B.elements[e * B.width + col];
    }
    C.elements[row * C.width + col] = Cvalue;
}

// Reference matrix multiplication on the CPU.
void CpuMatrix(Matrix A, Matrix B, Matrix C)
{
    int M, N, K;
    M = A.height;
    N = B.width;
    K = A.width;
    int i, j, k;
    for (i = 0; i < M; i++)
    {
        for (j = 0; j < N; j++)
        {
            float sum = 0.0;
            for (k = 0; k < K; k++)
            {
                sum += A.elements[i * K + k] * B.elements[k * N + j];
            }
            C.elements[i * N + j] = sum;
        }
    }
}

void MatMul(Matrix A, Matrix B, Matrix C)
{
    Matrix d_A;
    Matrix d_B;
    Matrix d_C;
    d_A.width = A.width;
    d_A.height = A.height;
    d_B.width = B.width;
    d_B.height = B.height;
    d_C.width = C.width;
    d_C.height = C.height;
    size_t size_A = A.width * A.height * sizeof(float);
    size_t size_B = B.width * B.height * sizeof(float);
    size_t size_C = C.width * C.height * sizeof(float);

    hipMalloc(&d_A.elements, size_A);
    hipMalloc(&d_B.elements, size_B);
    hipMalloc(&d_C.elements, size_C);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);

    hipMemcpy(d_A.elements, A.elements, size_A, hipMemcpyHostToDevice);
    hipMemcpy(d_B.elements, B.elements, size_B, hipMemcpyHostToDevice);

    // Time the kernel with HIP events.
    float gpu_time;
    hipEvent_t start_GPU, stop_GPU;
    hipEventCreate(&start_GPU);
    hipEventCreate(&stop_GPU);
    hipEventRecord(start_GPU, 0);

    hipLaunchKernelGGL(MatMulKernel, dimGrid, dimBlock, 0, 0, d_A, d_B, d_C);

    hipEventRecord(stop_GPU, 0);
    hipEventSynchronize(start_GPU);
    hipEventSynchronize(stop_GPU);
    hipEventElapsedTime(&gpu_time, start_GPU, stop_GPU);  // elapsed time is already in milliseconds
    hipDeviceSynchronize();
    printf("\nGPU time is: %f(ms)\n", gpu_time);
    hipEventDestroy(start_GPU);
    hipEventDestroy(stop_GPU);
    hipMemcpy(C.elements, d_C.elements, size_C, hipMemcpyDeviceToHost);

    printf("\nGPU result is :\n");
    shuchu(C, C.width * C.height);
    printf("\n");
    hipFree(d_A.elements);
    hipFree(d_B.elements);
    hipFree(d_C.elements);
}

int main()
{
    Matrix A;
    Matrix B;
    Matrix C;
    A.width = BLOCK_SIZE;
    A.height = BLOCK_SIZE;
    B.width = BLOCK_SIZE;
    B.height = BLOCK_SIZE;
    C.width = BLOCK_SIZE;
    C.height = BLOCK_SIZE;

    int size_A = A.width * A.height * sizeof(float);
    int size_B = B.width * B.height * sizeof(float);
    int size_C = C.width * C.height * sizeof(float);

    A.elements = (float *)malloc(size_A);
    B.elements = (float *)malloc(size_B);
    C.elements = (float *)malloc(size_C);

    initial(A.elements, A.height * A.width);
    printf("A:\n");
    shuchu(A, A.width * A.height);

    printf("\nB:\n");
    initial(B.elements, B.height * B.width);
    shuchu(B, B.width * B.height);

    // Compute the CPU reference result (timing code left commented out).
    clock_t start_CPU, stop_CPU;
    double cpu_time;
    start_CPU = clock();
    CpuMatrix(A, B, C);
    stop_CPU = clock();
    // cpu_time = (double)(stop_CPU - start_CPU) / CLOCKS_PER_SEC;
    // printf("\nCPU time is %lf(ms)\n", cpu_time);
    printf("\nCPU result :\n");
    shuchu(C, C.width * C.height);
    printf("\n");

    MatMul(A, B, C);
    return 0;
}
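The program prints both the CPU and the GPU results so they can be compared by eye. If an automated check is preferred, a small host-side helper along these lines could be added (a sketch; the name compare_results and the tolerance 1e-5 are assumptions, not part of the original code):

#include <math.h>

// Hypothetical helper: return 1 if two float arrays agree element-wise
// within a small absolute tolerance, 0 otherwise.
int compare_results(const float *ref, const float *gpu, int n)
{
    for (int i = 0; i < n; i++)
    {
        if (fabsf(ref[i] - gpu[i]) > 1e-5f)
            return 0;   // mismatch found
    }
    return 1;           // all elements match
}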
The run output is as follows:
3. HIP array addition using a struct
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>

// One-dimensional array of width elements.
typedef struct {
    int width;
    float *elements;
} Matrix;

#define BLOCK_SIZE 4

__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Fill an array of N floats with random values in [0, 9].
void initial(float *A, int N)
{
    int i;
    for (i = 0; i < N; i++)
    {
        A[i] = rand() % 10;
    }
}

// Print the first N elements of A.
void shuchu(Matrix A, int N)
{
    for (int i = 0; i < N; i++)
    {
        printf("%10lf", A.elements[i]);
    }
    printf("\n");
}

// GPU kernel: each thread adds one pair of elements.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

    C.elements[col] = A.elements[col] + B.elements[col];
}

// Reference array addition on the CPU.
void CpuMatrix(Matrix A, Matrix B, Matrix C)
{
    int N;
    N = B.width;
    int i;
    for (i = 0; i < N; i++)
    {
        C.elements[i] = A.elements[i] + B.elements[i];
    }
}

void MatMul(Matrix A, Matrix B, Matrix C)
{
    Matrix d_A;
    Matrix d_B;
    Matrix d_C;
    d_A.width = A.width;
    d_B.width = B.width;
    d_C.width = C.width;

    size_t size_A = A.width * sizeof(float);
    size_t size_B = B.width * sizeof(float);
    size_t size_C = C.width * sizeof(float);

    hipMalloc(&d_A.elements, size_A);
    hipMalloc(&d_B.elements, size_B);
    hipMalloc(&d_C.elements, size_C);
    // One thread per element; the kernel only uses the x index.
    dim3 dimBlock(BLOCK_SIZE);
    dim3 dimGrid(1);

    hipMemcpy(d_A.elements, A.elements, size_A, hipMemcpyHostToDevice);
    hipMemcpy(d_B.elements, B.elements, size_B, hipMemcpyHostToDevice);

    // Time the kernel with HIP events.
    float gpu_time;
    hipEvent_t start_GPU, stop_GPU;
    hipEventCreate(&start_GPU);
    hipEventCreate(&stop_GPU);
    hipEventRecord(start_GPU, 0);

    hipLaunchKernelGGL(MatMulKernel, dimGrid, dimBlock, 0, 0, d_A, d_B, d_C);

    hipEventRecord(stop_GPU, 0);
    hipEventSynchronize(start_GPU);
    hipEventSynchronize(stop_GPU);
    hipEventElapsedTime(&gpu_time, start_GPU, stop_GPU);  // elapsed time is already in milliseconds
    hipDeviceSynchronize();
    printf("\nGPU time is: %f(ms)\n", gpu_time);
    hipEventDestroy(start_GPU);
    hipEventDestroy(stop_GPU);
    hipMemcpy(C.elements, d_C.elements, size_C, hipMemcpyDeviceToHost);

    printf("\nGPU result is :\n");
    shuchu(C, C.width);
    printf("\n");
    hipFree(d_A.elements);
    hipFree(d_B.elements);
    hipFree(d_C.elements);
}

int main()
{
    Matrix A;
    Matrix B;
    Matrix C;
    A.width = BLOCK_SIZE;
    B.width = BLOCK_SIZE;
    C.width = BLOCK_SIZE;

    int size_A = A.width * sizeof(float);
    int size_B = B.width * sizeof(float);
    int size_C = C.width * sizeof(float);

    A.elements = (float *)malloc(size_A);
    B.elements = (float *)malloc(size_B);
    C.elements = (float *)malloc(size_C);

    initial(A.elements, A.width);
    printf("A:\n");
    shuchu(A, A.width);

    printf("\nB:\n");
    initial(B.elements, B.width);
    shuchu(B, B.width);

    CpuMatrix(A, B, C);

    printf("\nCPU result :\n");
    shuchu(C, C.width);

    printf("\n");
    MatMul(A, B, C);
    return 0;
}
The run output is as follows:
4. Matrix multiplication using shared memory (with a struct)
The following example code is an implementation of matrix multiplication that does use shared memory. In this implementation, each thread block is responsible for computing one square sub-matrix Csub of C, and each thread within the block computes one element of Csub. As shown in Figure 10, Csub is equal to the product of two rectangular matrices: the sub-matrix of A of dimension (A.width, block_size) that has the same row indices as Csub, and the sub-matrix of B of dimension (block_size, A.width) that has the same column indices as Csub. To fit the device's resources, these two rectangular matrices are divided into as many square matrices of dimension block_size as necessary, and Csub is computed as the sum of the products of these square matrices. Each of these products is computed by first loading the two corresponding square matrices from global memory into shared memory, with one thread loading one element of each, and then having each thread compute one element of the product. Each thread accumulates the result of these products into a register and, once done, writes the result to global memory.
By blocking the computation this way, we take advantage of fast shared memory and save a great deal of global memory bandwidth, since A is now read from global memory only (B.width / block_size) times and B only (A.height / block_size) times. For the 4x4 matrices used here with block_size = 4, each input matrix is read from global memory only once.
Compared with the earlier example code, the Matrix type is extended with a stride field, so that sub-matrices can be represented efficiently with the same type. __device__ functions are used to get and set elements and to build any sub-matrix from a matrix.

#include <stdio.h>
#include <stdlib.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>

// Row-major matrix; stride is the row pitch of the underlying storage, so the
// same type can also describe a sub-matrix of a larger matrix.
typedef struct {
    int width;
    int height;
    int stride;
    float *elements;
} Matrix;

#define BLOCK_SIZE 4

// Fill an array of N floats with random values in [0, 9].
void initial(float *A, int N)
{
    int i;
    for (i = 0; i < N; i++)
    {
        A[i] = rand() % 10;
    }
}

// Read element (row, col) of A.
__device__ float GetElement(const Matrix A, int row, int col)
{
    return A.elements[row * A.stride + col];
}

// Write element (row, col) of A.
__device__ void SetElement(Matrix A, int row, int col, float value)
{
    A.elements[row * A.stride + col] = value;
}

// Return the BLOCK_SIZE x BLOCK_SIZE sub-matrix of A whose upper-left corner
// is at block coordinates (row, col).
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
    Matrix Asub;
    Asub.width = BLOCK_SIZE;
    Asub.height = BLOCK_SIZE;
    Asub.stride = A.stride;
    Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col];
    return Asub;
}

// Print the first N elements of A, breaking the line after every A.width values.
void shuchu(Matrix A, int N)
{
    int j = 0;
    for (int i = 0; i < N; i++)
    {
        if (j == A.width)
        {
            printf("\n");
            j = 0;
            i--;
        }
        else
        {
            printf("%15lf", A.elements[i]);
            j++;
        }
    }
    printf("\n");
}

// Tiled matrix multiplication: each block computes one BLOCK_SIZE x BLOCK_SIZE
// sub-matrix Csub of C, staging the corresponding tiles of A and B in shared memory.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    int blockRow = hipBlockIdx_y;
    int blockCol = hipBlockIdx_x;
    Matrix Csub = GetSubMatrix(C, blockRow, blockCol);
    float Cvalue = 0;
    int row = hipThreadIdx_y;
    int col = hipThreadIdx_x;
    for (int m = 0; m < (A.width / BLOCK_SIZE); ++m)
    {
        Matrix Asub = GetSubMatrix(A, blockRow, m);
        Matrix Bsub = GetSubMatrix(B, m, blockCol);
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Each thread loads one element of each tile from global memory.
        As[row][col] = GetElement(Asub, row, col);
        Bs[row][col] = GetElement(Bsub, row, col);
        __syncthreads();

        for (int e = 0; e < BLOCK_SIZE; ++e)
        {
            Cvalue += As[row][e] * Bs[e][col];
        }
        __syncthreads();
    }
    // Write the accumulated result to global memory once, after all tiles are processed.
    SetElement(Csub, row, col, Cvalue);
}

void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    Matrix d_A;
    d_A.width = d_A.stride = A.width;
    d_A.height = A.height;
    size_t size = A.width * A.height * sizeof(float);
    hipMalloc(&d_A.elements, size);
    hipMemcpy(d_A.elements, A.elements, size, hipMemcpyHostToDevice);

    Matrix d_B;
    d_B.width = d_B.stride = B.width;
    d_B.height = B.height;
    size = B.width * B.height * sizeof(float);
    hipMalloc(&d_B.elements, size);
    hipMemcpy(d_B.elements, B.elements, size, hipMemcpyHostToDevice);

    Matrix d_C;
    d_C.width = d_C.stride = C.width;
    d_C.height = C.height;
    size = C.width * C.height * sizeof(float);
    hipMalloc(&d_C.elements, size);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);

    // Time the kernel with HIP events.
    float gpu_time;
    hipEvent_t start_GPU, stop_GPU;
    hipEventCreate(&start_GPU);
    hipEventCreate(&stop_GPU);
    hipEventRecord(start_GPU, 0);

    hipLaunchKernelGGL(MatMulKernel, dimGrid, dimBlock, 0, 0, d_A, d_B, d_C);

    hipEventRecord(stop_GPU, 0);
    hipEventSynchronize(start_GPU);
    hipEventSynchronize(stop_GPU);
    hipEventElapsedTime(&gpu_time, start_GPU, stop_GPU);  // elapsed time is already in milliseconds
    hipDeviceSynchronize();
    printf("\nGPU time is: %f(ms)\n", gpu_time);
    hipEventDestroy(start_GPU);
    hipEventDestroy(stop_GPU);

    hipMemcpy(C.elements, d_C.elements, size, hipMemcpyDeviceToHost);

    printf("\nGPU result is:\n");
    shuchu(C, C.width * C.height);

    hipFree(d_A.elements);
    hipFree(d_B.elements);
    hipFree(d_C.elements);
}

// Reference matrix multiplication on the CPU.
void CpuMatrix(Matrix A, Matrix B, Matrix C)
{
    int M, N, K;
    M = A.height;
    N = B.width;
    K = A.width;
    int i, j, k;
    for (i = 0; i < M; i++)
    {
        for (j = 0; j < N; j++)
        {
            float sum = 0.0;
            for (k = 0; k < K; k++)
            {
                sum += A.elements[i * K + k] * B.elements[k * N + j];
            }
            C.elements[i * N + j] = sum;
        }
    }
}

int main()
{
    Matrix A;
    Matrix B;
    Matrix C;

    A.width = BLOCK_SIZE;
    A.height = BLOCK_SIZE;
    B.width = BLOCK_SIZE;
    B.height = BLOCK_SIZE;
    C.width = BLOCK_SIZE;
    C.height = BLOCK_SIZE;

    int size_A = A.width * A.height * sizeof(float);
    int size_B = B.width * B.height * sizeof(float);
    int size_C = C.width * C.height * sizeof(float);

    A.elements = (float *)malloc(size_A);
    B.elements = (float *)malloc(size_B);
    C.elements = (float *)malloc(size_C);

    initial(A.elements, A.height * A.width);
    printf("A:\n");
    shuchu(A, A.width * A.height);

    printf("\nB:\n");
    initial(B.elements, B.height * B.width);
    shuchu(B, B.width * B.height);

    CpuMatrix(A, B, C);
    printf("\nCPU result :\n");
    shuchu(C, C.width * C.height);
    MatMul(A, B, C);
    return 0;
}
The run output is as follows: