python對CUDA擴展有不錯的支持,CUDA通過大量線程的並行化可以大幅提高代碼計算速度,一般python常用numba、pycuda套件來支持CUDA擴展。numba通過JIT編譯器只需將numba裝飾器應用到python函數中即可實現CUDA加速,而pycuda需要基於C/C++編寫kernel,其移植性、直觀性更佳,這里主要介紹pycuda的使用。
1.向量加法
示例使用了1個block,block中含有400個線程,每個線程計算向量加法最終結果的一個值。
import numpy
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
mod = SourceModule("""
__global__ void vect_add(float *dest, float *a, float *b)
{
const int i = threadIdx.x;
dest[i] = a[i] + b[i];
}
""")
vect_add = mod.get_function("vect_add")
a = numpy.random.randn(400).astype(numpy.float32)
b = numpy.random.randn(400).astype(numpy.float32)
dest = numpy.zeros_like(a)
vect_add(cuda.Out(dest), cuda.In(a), cuda.In(b), block=(400,1,1), grid=(1,1))
print(dest-(a+b))
2.矩陣乘法
示例使用的是25x25個block,每個block中為16x16的線程,每個線程計算矩陣乘法最終結果的一個值。
import numpy
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
mod = SourceModule("""
__global__ void matrix_mul(float *dest, float *a, float *b, int width)
{
int i = threadIdx.x + blockDim.x * blockIdx.x;
int j = threadIdx.y + blockDim.y * blockIdx.y;
float sum = 0;
for(int k=0;k<width;k++)
{
float a_k = a[j*width+k];
float b_k = b[k*width+i];
sum += a_k*b_k;
}
dest[j*width+i] = sum;
}
""")
matrix_mul = mod.get_function("matrix_mul")
a = numpy.random.randn(400, 400).astype(numpy.float32)
b = numpy.random.randn(400, 400).astype(numpy.float32)
dest = numpy.zeros_like(a)
width = numpy.int32(400)
matrix_mul(cuda.Out(dest), cuda.In(a), cuda.In(b), width, block=(16,16,1), grid=(25,25))
print(dest)
print(numpy.dot(a,b))
3.矩陣乘法優化
我們可以使用矩陣分塊乘法對上面的矩陣乘法代碼進行優化,例如將6x6的矩陣A切分成9個2x2的小塊,以A_11~A_33命名每個小塊,則對於矩陣乘法AxB=C(假設維度都為6x6),有A_11xB_11+A_12xB_21+A_13xB_31=C_11,具體做法是將每個矩陣小塊需要使用的數據先讀取到共享內存中再進行矩陣小塊的乘法,以減少對全局內存的訪問,從而達到有效加速的目的。
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
mod = SourceModule("""
#define BLOCK_SIZE 16
typedef struct {
int width;
int height;
int stride;
int __padding; //為了和64位的elements指針對齊
float* elements;
} Matrix;
// 讀取矩陣元素
__device__ float GetElement(const Matrix A, int row, int col)
{
return A.elements[row * A.stride + col];
}
// 賦值矩陣元素
__device__ void SetElement(Matrix A, int row, int col, float value)
{
A.elements[row * A.stride + col] = value;
}
// 獲取 16x16 的子矩陣
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
Matrix Asub;
Asub.width = BLOCK_SIZE;
Asub.height = BLOCK_SIZE;
Asub.stride = A.stride;
Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col];
return Asub;
}
__global__ void matrix_mul(Matrix *A, Matrix *B, Matrix *C)
{
int blockRow = blockIdx.y;
int blockCol = blockIdx.x;
int row = threadIdx.y;
int col = threadIdx.x;
Matrix Csub = GetSubMatrix(*C, blockRow, blockCol);
// 每個線程通過累加Cvalue計算Csub的一個值
float Cvalue = 0;
// 為了計算Csub遍歷所有需要的Asub和Bsub
for (int m = 0; m < (A->width / BLOCK_SIZE); ++m)
{
Matrix Asub = GetSubMatrix(*A, blockRow, m);
Matrix Bsub = GetSubMatrix(*B, m, blockCol);
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
As[row][col] = GetElement(Asub, row, col);
Bs[row][col] = GetElement(Bsub, row, col);
__syncthreads();
for (int e = 0; e < BLOCK_SIZE; ++e)
Cvalue += As[row][e] * Bs[e][col];
__syncthreads();
}
SetElement(Csub, row, col, Cvalue);
}
""")
class MatrixStruct(object):
def __init__(self, array):
self._cptr = None
self.shape, self.dtype = array.shape, array.dtype
self.width = np.int32(self.shape[1])
self.height = np.int32(self.shape[0])
self.stride = self.width
self.elements = cuda.to_device(array) # 分配內存並拷貝數組數據至device,返回其地址
def send_to_gpu(self):
self._cptr = cuda.mem_alloc(self.nbytes()) # 分配一個C結構體所占的內存
cuda.memcpy_htod(int(self._cptr), self.width.tobytes()) # 拷貝數據至device,下同
cuda.memcpy_htod(int(self._cptr)+4, self.height.tobytes())
cuda.memcpy_htod(int(self._cptr)+8, self.stride.tobytes())
cuda.memcpy_htod(int(self._cptr)+16, np.intp(int(self.elements)).tobytes())
def get_from_gpu(self):
return cuda.from_device(self.elements, self.shape, self.dtype) # 從device取回數組數據
def nbytes(self):
return self.width.nbytes * 4 + np.intp(0).nbytes
a = np.random.randn(400,400).astype(np.float32)
b = np.random.randn(400,400).astype(np.float32)
c = np.zeros_like(a)
A = MatrixStruct(a)
B = MatrixStruct(b)
C = MatrixStruct(c)
A.send_to_gpu()
B.send_to_gpu()
C.send_to_gpu()
matrix_mul = mod.get_function("matrix_mul")
matrix_mul(A._cptr, B._cptr, C._cptr, block=(16,16,1), grid=(25,25))
result = C.get_from_gpu()
print(np.dot(a,b))
print(result)