本文的起源來自最近一個讓我非常不爽的事。
我最近在改一個開源RNN工具包currennt(http://sourceforge.net/projects/currennt/),想用它實現RNNLM功能。
currennt使用了大量的面向對象的編程技巧,可以使用GPU,向量運算使用了thrust庫(https://code.google.com/p/thrust/)。
RNNLM(http://rnnlm.org/)也有相應開源實現,非常算法風格的代碼,向量運算就是自己使用數組實現的。
結果……大出我的意料,在不使用GPU的情況下,currennt慢成狗!我不斷地修改,直到最後幾乎完全在currennt裡重寫了一個RNNLM……速度才終於一致了。這花費了我大量時間,最關鍵的是我根本沒打算花這些時間,算是計劃外開銷。
所以這裡乾脆對常用的幾種向量運算做個評測,下回遇到至少心裡有數。
參與評測的向量實現包括:
- C++ array
- C++ STL vector
- C++ thrust(CPU)
- C++ thrust(GPU)
- python
- python numpy
評測指標包括:
- 創建、填充向量
- 向量點乘,相乘
- 矩陣相乘
測試環境:
Intel Xeon CPU E5649@2.53GHz x24
VS2010
python 2.7.6 (32bit)
thrust v1.5
numpy 1.8.1
C++ array
創建全0向量:0.000s,幾乎不占用時間
// Allocate a zero-initialized buffer of 10^8 floats with calloc.
int vector_size=100000000; float* vector=(float*)calloc(vector_size,sizeof(float));
創建+填充向量:0.140s
// Allocate 10^8 zeroed floats, then overwrite every element with 0.01.
int vector_size=100000000;
float* vector=(float*)calloc(vector_size,sizeof(float));
for (int i=0;i<vector_size;++i){
// 0.01 is a double literal; it is converted to float on assignment.
vector[i]=0.01;
}
向量點乘:0.390s
// Dot product: accumulate the element-wise products of vector1 and vector2
// into a single float.
float sum=0;
for(int i=0;i<vector_size;++i){
sum+=vector1[i]*vector2[i];
}
向量相乘:0.265s
// Element-wise product: vector3[i] = vector1[i] * vector2[i] for every index.
// No accumulator is involved, so none is declared.
for(int i=0;i<vector_size;++i){
    vector3[i]=vector1[i]*vector2[i];
}
矩陣乘向量:0.344s
// Matrix-vector product for a row-major 2000 x 50000 matrix held in a flat array.
int matrix1_colnum=50000;
int matrix1_rownum=2000;
int matrix1_size=matrix1_colnum*matrix1_rownum;
// vector1 is the matrix: element (row, col) lives at row*matrix1_colnum+col.
float* vector1=(float*)calloc(matrix1_size,sizeof(float));
for (int i=0;i<matrix1_size;++i){
    vector1[i]=0.01;
}
// vector2 is the right-hand-side vector, one entry per matrix column.
float* vector2=(float*)calloc(matrix1_colnum,sizeof(float));
for (int i=0;i<matrix1_colnum;++i){
    vector2[i]=0.02;
}
// Allocate the zeroed result before starting the clock, as the matrix-matrix
// snippet does, so that only the multiplication itself is timed.
float* vector3=(float*)calloc(matrix1_rownum,sizeof(float));
start_t=clock();
for(int row=0;row<matrix1_rownum;++row){
    for(int col=0;col<matrix1_colnum;++col){
        vector3[row]+=vector1[row*matrix1_colnum+col]*vector2[col];
    }
}
end_t=clock();
矩陣乘矩陣:0.749s
(耗費時間與matrix1_rownum*matrix1_colnum*matrix2_colnum成正比)
// Matrix-matrix product of a 200 x 5000 matrix (vector1) and a 5000 x 200
// matrix (vector2), both stored row-major in flat arrays; the 200 x 200 result
// goes into vector3.
int matrix1_rownum=200;
int matrix1_colnum=5000;
int matrix1_size=matrix1_colnum*matrix1_rownum;
float* vector1=(float*)calloc(matrix1_size,sizeof(float));
for (int i=0;i<matrix1_size;++i){
vector1[i]=0.01;
}
int matrix2_rownum=5000;
int matrix2_colnum=200;
int matrix2_size=matrix2_rownum*matrix2_colnum;
float* vector2=(float*)calloc(matrix2_size,sizeof(float));
for (int i=0;i<matrix2_size;++i){
vector2[i]=0.02;
}
int matrix3_size=matrix1_rownum*matrix2_colnum;
// calloc zeroes the result, which the += in the inner loop relies on.
float* vector3=(float*)calloc(matrix3_size,sizeof(float));
start_t=clock();
// Only the triple loop is timed. The innermost index col1 walks vector1
// contiguously but steps through vector2 with a stride of matrix2_colnum.
for(int row1=0;row1<matrix1_rownum;++row1){
for(int col2=0;col2<matrix2_colnum;++col2){
for(int col1=0;col1<matrix1_colnum;++col1){
vector3[row1*matrix2_colnum+col2]+=vector1[row1*matrix1_colnum+col1]*vector2[col1*matrix2_colnum+col2];
}
}
}
end_t=clock();
C++ STL vector
創建全0向量:0.140s
// Construct a std::vector of 10^8 floats; the size-only constructor
// value-initializes every element to 0.
int vect_size=100000000;
vector<float> vector(vect_size);
創建+填充向量:0.140s
// Construct a std::vector of 10^8 floats, each initialized to 0.01.
int vect_size=100000000; vector<float> vector(vect_size,0.01);
向量點乘:0.375s
// Dot product of two 10^8-element std::vector<float>; only the accumulation
// loop sits between the two clock() calls.
int vect_size=100000000;
vector<float> vector1(vect_size,0.01);
vector<float> vector2(vect_size,0.02);
start_t=clock();
float sum=0;
for(int i=0;i<vect_size;++i){
sum+=vector1[i]*vector2[i];
}
end_t=clock();
向量相乘:0.250s
// Element-wise product of two 10^8-element vectors into a preallocated third
// vector; the construction of vector3 happens before the timer starts.
int vect_size=100000000;
vector<float> vector1(vect_size,0.01);
vector<float> vector2(vect_size,0.02);
vector<float> vector3(vect_size);
start_t=clock();
for(int i=0;i<vect_size;++i){
vector3[i]=vector1[i]*vector2[i];
}
end_t=clock();
矩陣乘向量:0.390s
// Matrix-vector product with std::vector storage: vector1 is a row-major
// 2000 x 50000 matrix, vector2 has one entry per column, vector3 one per row.
int matrix1_colnum=50000;
int matrix1_rownum=2000;
int matrix1_size=matrix1_colnum*matrix1_rownum;
vector<float> vector1(matrix1_size,0.01);
vector<float> vector2(matrix1_colnum,0.02);
// Size-only construction zero-fills the result, which the += below relies on.
vector<float> vector3(matrix1_rownum);
start_t=clock();
for(int row=0;row<matrix1_rownum;++row){
for(int col=0;col<matrix1_colnum;++col){
vector3[row]+=vector1[row*matrix1_colnum+col]*vector2[col];
}
}
end_t=clock();
矩陣乘法:0.827s
// Matrix-matrix product with std::vector storage: (200 x 5000) times
// (5000 x 200), both row-major, producing a 200 x 200 result in vector3.
int matrix1_rownum=200;
int matrix1_colnum=5000;
int matrix1_size=matrix1_colnum*matrix1_rownum;
vector<float> vector1(matrix1_size,0.01);
int matrix2_rownum=5000;
int matrix2_colnum=200;
int matrix2_size=matrix2_rownum*matrix2_colnum;
vector<float> vector2(matrix2_size,0.02);
int matrix3_size=matrix1_rownum*matrix2_colnum;
// Zero-filled by the size-only constructor; the inner loop accumulates into it.
vector<float> vector3(matrix3_size);
start_t=clock();
// Same loop nest as the plain-array version: col1 is the innermost index.
for(int row1=0;row1<matrix1_rownum;++row1){
for(int col2=0;col2<matrix2_colnum;++col2){
for(int col1=0;col1<matrix1_colnum;++col1){
vector3[row1*matrix2_colnum+col2]+=vector1[row1*matrix1_colnum+col1]*vector2[col1*matrix2_colnum+col2];
}
}
}
end_t=clock();
C++ thrust(CPU)
創建全0向量:0.140s
// Construct a thrust::host_vector of 10^8 floats without an explicit fill value.
int vect_size=100000000; thrust::host_vector<float> vector1(vect_size);
創建+填充向量:0.140s
// Construct a thrust::host_vector of 10^8 floats, each initialized to 0.01.
int vect_size=100000000; thrust::host_vector<float> vector1(vect_size,0.01);
填充向量:0.078s
// Overwrite every element of the already-constructed vector1 with 0.01.
thrust::fill(vector1.begin(),vector1.end(),0.01);
向量點乘:0.359s
// Dot product on the host with Thrust: multiply element-wise into vector3,
// then sum vector3. Only the transform and the reduce are timed.
int vect_size=100000000;
thrust::host_vector<float> vector1(vect_size,(float)0.1);
thrust::host_vector<float> vector2(vect_size,(float)0.2);
thrust::host_vector<float> vector3(vect_size,(float)0.2);
start_t=clock();
thrust::transform(vector1.begin(),vector1.end(),vector2.begin(),vector3.begin(),thrust::multiplies<float>());
// The reduction has to add the products. Reducing with multiplies from an
// initial value of 0 always yields 0 instead of the dot product.
float sum=thrust::reduce(vector3.begin(),vector3.end(),(float)0,thrust::plus<float>());
end_t=clock();
向量相乘:0.187s
// Element-wise product on the host: thrust::transform applies
// thrust::multiplies to matching elements of vector1 and vector2 and writes
// the results to vector3. Only the transform call is timed.
int vect_size=100000000; thrust::host_vector<float> vector1(vect_size,(float)0.1); thrust::host_vector<float> vector2(vect_size,(float)0.2); thrust::host_vector<float> vector3(vect_size); start_t=clock(); thrust::transform(vector1.begin(),vector1.end(),vector2.begin(),vector3.begin(),thrust::multiplies<float>()); end_t=clock();
矩陣乘向量:0.110s
// Functor for thrust::transform: maps a row index to the dot product of that
// row of the row-major `matrix` with `vector`.
struct matrixXvect_func
{
    thrust::host_vector<float>* matrix;  // flat matrix storage, matrix_colnum values per row
    thrust::host_vector<float>* vector;  // right-hand-side vector, indexed by column
    int matrix_rownum;                   // set by the caller; not read by operator()
    int matrix_colnum;                   // row length and trip count of the loop below
    __host__ __device__
    float operator()(const int& idx) const{
        // Offset of the first element of row `idx` in the flat storage.
        const int row_start=idx*matrix_colnum;
        float acc=0;
        int c=0;
        while(c<matrix_colnum){
            acc+=(*matrix)[row_start+c]*(*vector)[c];
            ++c;
        }
        return acc;
    }
};
// Matrix-vector product on the host via thrust::transform: one functor call
// per row index computes that row's dot product with vector2.
int matrix1_rownum=2000;
int matrix1_colnum=50000;
int matrix1_size=matrix1_colnum*matrix1_rownum;
thrust::host_vector<float> vector1(matrix1_size,(float)0.1);
thrust::host_vector<float> vector2(matrix1_colnum,(float)0.2);
thrust::host_vector<float> vector3(matrix1_rownum);
start_t=clock();
// The functor holds pointers to the inputs plus the matrix dimensions.
matrixXvect_func fn;
fn.matrix=&vector1;
fn.vector=&vector2;
fn.matrix_rownum=matrix1_rownum;
fn.matrix_colnum=matrix1_colnum;
// The counting iterators supply row indices 0 .. matrix1_rownum-1; each result
// is written to the matching slot of vector3.
thrust::transform(
thrust::counting_iterator<int>(0),
thrust::counting_iterator<int>(0) + matrix1_rownum,
vector3.begin(),
fn
);
end_t=clock();
矩陣乘矩陣:0.655s
// Functor for thrust::transform: maps a flat index into the result matrix to
// the corresponding entry of matrix1 * matrix2 (both row-major).
struct matrixXmatrix_func
{
    thrust::host_vector<float>* matrix1;  // left operand, matrix1_colnum values per row
    thrust::host_vector<float>* matrix2;  // right operand, matrix2_colnum values per row
    int matrix1_rownum;                   // set by the caller; not read by operator()
    int matrix1_colnum;                   // shared dimension, trip count of the loop below
    int matrix2_rownum;                   // set by the caller; not read by operator()
    int matrix2_colnum;                   // row length of matrix2 and of the result
    __host__ __device__
    float operator()(const int& idx) const{
        // Split the flat output index into (row, column) of the result.
        const int out_row=idx/matrix2_colnum;
        const int out_col=idx%matrix2_colnum;
        const int lhs_start=out_row*matrix1_colnum;
        float acc=0;
        for(int k=0;k<matrix1_colnum;++k){
            const float lhs=(*matrix1)[lhs_start+k];
            const float rhs=(*matrix2)[k*matrix2_colnum+out_col];
            acc+=lhs*rhs;
        }
        return acc;
    }
};
// Matrix-matrix product on the host via thrust::transform: one functor call
// per element of the 200 x 200 result.
int matrix1_rownum=200;
int matrix1_colnum=5000;
int matrix1_size=matrix1_colnum*matrix1_rownum;
thrust::host_vector<float> vector1(matrix1_size,(float)0.1);
int matrix2_rownum=5000;
int matrix2_colnum=200;
int matrix2_size=matrix2_rownum*matrix2_colnum;
thrust::host_vector<float> vector2(matrix2_size,(float)0.2);
int matrix3_size=matrix1_rownum*matrix2_colnum;
thrust::host_vector<float> vector3(matrix3_size);
start_t=clock();
// The functor holds pointers to both operands plus their dimensions.
matrixXmatrix_func fn;
fn.matrix1=&vector1;
fn.matrix2=&vector2;
fn.matrix1_rownum=matrix1_rownum;
fn.matrix1_colnum=matrix1_colnum;
fn.matrix2_rownum=matrix2_rownum;
fn.matrix2_colnum=matrix2_colnum;
// The counting iterators supply flat output indices 0 .. matrix3_size-1.
thrust::transform(
thrust::counting_iterator<int>(0),
thrust::counting_iterator<int>(0) + matrix3_size,
vector3.begin(),
fn
);
end_t=clock();
C++ thrust(GPU)
創建全0向量:0.140s
// Construct a thrust::device_vector of 10^6 floats without an explicit fill
// value. Note the size is 10^6 here, not the 10^8 used in the host snippets.
int vect_size=1000000; thrust::device_vector<float> vector1(vect_size);
創建+填充向量:0.140s
// Construct a thrust::device_vector of 10^6 floats, each initialized to 0.1.
int vect_size=1000000; thrust::device_vector<float> vector1(vect_size,0.1);
CPU向量賦值:0.141s
// Copy-construct a device_vector from a host_vector of 10^6 floats; only the
// copy-construction is timed.
int vect_size=1000000; thrust::host_vector<float> vector1(vect_size,0.1); start_t=clock(); thrust::device_vector<float> vector2=vector1; end_t=clock();
填充向量:0.000s
// Fill an existing 10^6-element device_vector with 0.1; only thrust::fill is timed.
// NOTE(review): the reported 0.000s may only cover launching the fill, not its
// completion -- confirm whether a device synchronization is needed before end_t.
int vect_size=1000000; thrust::device_vector<float> vector(vect_size); start_t=clock(); thrust::fill(vector.begin(),vector.end(),(float)0.1); end_t=clock();
向量點乘:0.016s
// Dot product with thrust::device_vector: multiply element-wise into vector3,
// then sum vector3. Only the transform and the reduce are timed.
int vect_size=100000000;
thrust::device_vector<float> vector1(vect_size,(float)0.1);
thrust::device_vector<float> vector2(vect_size,(float)0.2);
thrust::device_vector<float> vector3(vect_size,(float)0.2);
start_t=clock();
thrust::transform(vector1.begin(),vector1.end(),vector2.begin(),vector3.begin(),thrust::multiplies<float>());
// The reduction has to add the products. Reducing with multiplies from an
// initial value of 0 always yields 0 instead of the dot product.
float sum=thrust::reduce(vector3.begin(),vector3.end(),(float)0,thrust::plus<float>());
end_t=clock();
向量相乘:0.000s
// Element-wise product with thrust::device_vector: thrust::transform applies
// thrust::multiplies to matching elements and writes the results to vector3.
// NOTE(review): the reported 0.000s may only cover launching the transform,
// not its completion -- confirm whether a device synchronization is needed
// before end_t.
int vect_size=100000000; thrust::device_vector<float> vector1(vect_size,(float)0.1); thrust::device_vector<float> vector2(vect_size,(float)0.2); thrust::device_vector<float> vector3(vect_size); start_t=clock(); thrust::transform(vector1.begin(),vector1.end(),vector2.begin(),vector3.begin(),thrust::multiplies<float>()); end_t=clock();
矩陣乘向量(實現1):0.530s
// Matrix-vector product with thrust::device_vector, one row at a time:
// multiply the row element-wise with vector2 into tmp, then sum tmp.
int matrix1_rownum=2000;
int matrix1_colnum=50000;
int matrix1_size=matrix1_colnum*matrix1_rownum;
thrust::device_vector<float> vector1(matrix1_size,(float)0.1);
thrust::device_vector<float> vector2(matrix1_colnum,(float)0.2);
// Scratch buffer for the products of a single row.
thrust::device_vector<float> tmp(matrix1_colnum);
thrust::device_vector<float> vector3(matrix1_rownum);
start_t=clock();
for(int row=0;row<matrix1_rownum;++row){
    thrust::transform(vector1.begin()+row*matrix1_colnum,vector1.begin()+(row+1)*matrix1_colnum,vector2.begin(),tmp.begin(),thrust::multiplies<float>());
    // The reduction has to add the products. Reducing with multiplies from an
    // initial value of 0 always yields 0 instead of the row's dot product.
    vector3[row]=thrust::reduce(tmp.begin(),tmp.end(),(float)0,thrust::plus<float>());
}
end_t=clock();
矩陣乘向量(實現2)CUBLAS,待試
矩陣乘矩陣CUBLAS,待試
Python
直接使用python的list實現上述功能實在太慢……而且由於無法指定float類型,其默認使用64位double類型來表示小數,使用10^8會超出list索引上限……故只使用10^7實驗,速度差距可以自行換算。
大致估算python的向量運算比c++慢50倍,矩陣運算慢1000倍。
初始化向量並賦值:1.51s
# Build a 10**7-element list by appending 0.1 one item at a time.
vector_size=10000000
vector=[]
for i in range(vector_size):
    vector.append(0.1)
向量點乘:1.75s
# Dot product of two 10**7-element lists; only the accumulation loop is timed.
vector_size=10000000
vector1=[]
for i in range(vector_size):
    vector1.append(0.1)
vector2=[]
for i in range(vector_size):
    vector2.append(0.1)
start_t=time.time()
sum=0
for i in range(vector_size):
    sum+=vector1[i]*vector2[i]
end_t=time.time()
向量相乘:2.39s
# Element-wise product of two 10**7-element lists into a preallocated third
# list; only the multiplication loop is timed.
vector_size=10000000
vector1=[]
for i in range(vector_size):
    vector1.append(0.1)
vector2=[]
for i in range(vector_size):
    vector2.append(0.1)
# Pre-size the output so the timed loop can assign by index.
vector3=[]
for i in range(vector_size):
    vector3.append(0.1)
start_t=time.time()
for i in range(vector_size):
    vector3[i]=vector1[i]*vector2[i]
end_t=time.time()
矩陣乘向量:3.06s
# Matrix-vector product with flat Python lists: vector1 is a row-major
# 2000 x 5000 matrix, vector2 has one entry per column, vector3 one per row.
matrix1_rownum=2000
matrix1_colnum=5000
matrix1_size=matrix1_rownum*matrix1_colnum
vector1=[]
for i in range(matrix1_size):
    vector1.append(0.1)
vector2=[]
for i in range(matrix1_colnum):
    vector2.append(0.1)
# The result starts at zero so each entry ends up as a pure dot product.
vector3=[]
for i in range(matrix1_rownum):
    vector3.append(0.0)
start_t=time.time()
for row in range(matrix1_rownum):
    for col in range(matrix1_colnum):
        # Accumulate; plain assignment would keep only the last column's product.
        vector3[row]+=vector1[row*matrix1_colnum+col]*vector2[col]
end_t=time.time()
矩陣相乘:11.37s
# Matrix-matrix product with flat Python lists: (200 x 500) times (500 x 200),
# both row-major, producing a 200 x 200 result in vector3.
matrix1_rownum=200
matrix1_colnum=500
matrix1_size=matrix1_rownum*matrix1_colnum
vector1=[]
for i in range(matrix1_size):
    vector1.append(0.1)
matrix2_rownum=500
matrix2_colnum=200
matrix2_size=matrix2_rownum*matrix2_colnum
vector2=[]
for i in range(matrix2_size):
    vector2.append(0.1)
matrix3_size=matrix1_rownum*matrix2_colnum
# The result starts at zero; a non-zero start value would be added to every
# entry by the += in the inner loop.
vector3=[]
for i in range(matrix3_size):
    vector3.append(0.0)
start_t=time.time()
for row in range(matrix1_rownum):
    for col in range(matrix2_colnum):
        for i in range(matrix1_colnum):
            vector3[row*matrix2_colnum+col]+=vector1[row*matrix1_colnum+i]*vector2[i*matrix2_colnum+col]
end_t=time.time()
當然實際進行向量運算沒人會拿python的list數據結構進行運算,這裡只是好奇定量測一下list到底有多慢……
Python numpy
創建全0向量:0.0s
# Create a 10**8-element array of zeros (numpy's default dtype is float64).
vector_size=100000000
vector=numpy.zeros(vector_size)
創建+填充向量:0.25s
# Create a 10**8-element array of zeros, then set every element to 0.01.
vector_size=100000000
vector=numpy.zeros(vector_size)
vector.fill(0.01)
向量點乘:0.125s(由於python是32位……內存原因,數據規模減半)
# Dot product of two 5*10**7-element arrays; only numpy.inner is timed.
vector_size=50000000
vector1=numpy.zeros(vector_size)
vector1.fill(0.01)
vector2=numpy.zeros(vector_size)
vector2.fill(0.02)
start_t=time.time()
sum=numpy.inner(vector1,vector2)
end_t=time.time()
向量相乘:0.234s
# Element-wise product of two 5*10**7-element arrays; numpy.multiply allocates
# and returns the result array, and only that call is timed.
vector_size=50000000
vector1=numpy.zeros(vector_size)
vector1.fill(0.01)
vector2=numpy.zeros(vector_size)
vector2.fill(0.02)
start_t=time.time()
vector3=numpy.multiply(vector1,vector2)
end_t=time.time()
矩陣乘向量:0.094s
# Matrix-vector product with numpy: a 2000 x 50000 matrix times a 50000 x 1
# column. The reshapes are inside the timed region together with numpy.dot.
matrix1_rownum=2000
matrix1_colnum=50000
matrix1_size=matrix1_rownum*matrix1_colnum
vector1=numpy.zeros(matrix1_size)
vector1.fill(0.01)
vector2=numpy.zeros(matrix1_colnum)
vector2.fill(0.02)
start_t=time.time()
vector1=vector1.reshape(matrix1_rownum,matrix1_colnum)
vector2=vector2.reshape(matrix1_colnum,1)
vector3=numpy.dot(vector1,vector2)
end_t=time.time()
矩陣乘矩陣:23.16s(numpy.dot出乎意料的慢,使用numpy.matrix類時間為11.73s,依舊很慢而且占用更大內存,在創建matrix對象時也要0.4s。注意:下面代碼的規模是2000×50000乘50000×1000,乘加次數約為前面C++測試(200×5000乘5000×200)的500倍,兩者耗時不能直接比較)
# Matrix-matrix product with numpy: (2000 x 50000) times (50000 x 1000).
# The reshapes are inside the timed region together with numpy.dot.
matrix1_rownum=2000
matrix1_colnum=50000
matrix1_size=matrix1_rownum*matrix1_colnum
vector1=numpy.zeros(matrix1_size)
vector1.fill(0.01)
matrix2_rownum=50000
matrix2_colnum=1000
matrix2_size=matrix2_rownum*matrix2_colnum
vector2=numpy.zeros(matrix2_size)
vector2.fill(0.02)
start_t=time.time()
vector1=vector1.reshape(matrix1_rownum,matrix1_colnum)
vector2=vector2.reshape(matrix2_rownum,matrix2_colnum)
vector3=numpy.dot(vector1,vector2)
end_t=time.time()
