0.前言
本系列文章記錄筆者關於c語言多線程編程的學習過程
平台及相關環境:Windows;MinGW64;DevC++;cmd命令行;4 CPUs
(硬件原因,沒有選擇Linux,原理應該差不多)
參考書籍:《並行程序設計導論》Peter S.Pacheco 著 鄧倩妮 等譯
以下程序理解不難,大部分不作注釋
1.問題描述
如果\(A=(a_{ij})\)是一個m*n的矩陣,\(x=(x_0,x_1,...,x_{n-1})^T\)是一個n維列向量,矩陣-向量的乘積\(Ax=y\) 是個m維的列向量。\(y=(y_0,y_1,...,y_{n-1})^T\)中的第i個元素\(y_i\)是矩陣A的第i行與x的點積:
2.串行程序
2.1 生成數據
為了與后面並行程序產生對比,利用隨機數生成大量數據進行計算。為了簡便,矩陣中的元素為整型,范圍為[0,9]。
//generate_data.c
#include<stdio.h>
#include<stdlib.h>
#include<time.h>
#define random(a,b) (rand()%(b-a+1)+a) //[a,b]
const int m = 20;
const int n = 20;
const int NUM_OF_MATRIX = 1000;
int main() {
srand((unsigned)time(NULL));
FILE *mA = fopen("matrix_A","w");
for(int i=0; i<NUM_OF_MATRIX; ++i) {
for(int j=0; j<m*n; ++j) {
fprintf(mA,"%d",(int)random(1,9));
}
fprintf(mA,"\n");
}
FILE *mx = fopen("matrix_x","w");
for(int i=0; i<NUM_OF_MATRIX; ++i) {
for(int j=0; j<n; ++j) {
fprintf(mx,"%d",(int)random(1,9));
}
fprintf(mx,"\n");
}
printf("Success");
}
得到matrix_A文件和matrix_x文件。(圖中只顯示部分)
2.2 串行計算
《程序設計導論》給出了計算例題的矩陣-乘法 串行程序偽代碼:
以下代碼為筆者設計的代碼:
//calculate.c
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<assert.h>
#include<sys/time.h>
#include<stdint.h>
#define MAXL 100000005
const int m = 20;
const int n = 20;
const int DEBUG_MODE = 0;
int **A,*x,*y;
char bufferA[MAXL],bufferx[MAXL];
void init_storage() {
A = (int**) malloc(m*sizeof(int*));
for(int i=0; i<m; ++i) {
A[i] = (int*) malloc(n*sizeof(int));
}
x = (int*) malloc(n*sizeof(int));
y = (int*) malloc(m*sizeof(int));
}
void print_matrix() {
printf("Matrix A(%d rows and %d columns):\n",m,n);
for(int i=0; i<m; ++i) {
for(int j=0; j<n; ++j) {
printf("%d ",A[i][j]);
}
printf("\n");
}
printf("Matrix x(%d rows and one column):\n",n);
for(int i=0; i<n; ++i) {
printf("%d\n",x[i]);
}
printf("A*x:\n");
for(int i=0; i<m; ++i) {
printf("%d\n",y[i]);
}
}
void matrix_multi() {
for(int i=0; i<m; ++i) {
y[i]=0;
for(int j=0; j<n; ++j) {
y[i]+=A[i][j]*x[j];
}
}
}
int64_t now() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec * 1000000 + tv.tv_usec;
}
void work(int argc, char* argv[]) {
FILE* mA = fopen(argv[1], "r");
FILE* mx = fopen(argv[2], "r");
FILE* result=fopen("result","w");
while(fgets(bufferA, sizeof bufferA, mA)!=NULL) {
if(DEBUG_MODE) printf("argv[1]:%s\n",bufferA);
int count = 0;
for(int i=0; i<m; ++i)
for(int j=0; j<n; ++j)
A[i][j]=bufferA[count++]-'0';
fgets(bufferx, sizeof bufferx, mx);
if(DEBUG_MODE) printf("argv[2]:%s\n",bufferx);
count=0;
for(int i=0; i<n; ++i)
x[i]=bufferx[count++]-'0';
matrix_multi();
if(DEBUG_MODE) print_matrix();
for(int i=0; i<m; ++i) {
fprintf(result,"%d ",y[i]);
}
fprintf(result,"\n");
}
}
int main(int argc, char* argv[]) {
assert(argc==3);
init_storage();
int64_t start = now();
work(argc,argv);
int64_t end = now();
double sec = (end-start)/1000000.0;
printf("%f sec\n", sec);
}
2.3 操作步驟
可直接在DevC++軟件中編譯,將calculate.cpp編譯生成calculate.exe,然后在cmd命令行傳參數
在result文件中可查看結果:
3.並行程序
3.1 設計思路
通過把工作分配給各個各個線程將程序並行化,一種分配方法是將線程外層的循環分塊,每個線程計算y的一部分。
只需要編寫每一個線程的代碼,確定每個線程計算哪一部分的y。為了簡化代碼,假設m與n都能被t(線程數量)整除,每個線程能分配到m/t行的運算數據,線程0處理第一部分的m/t行,線程1處理第二部分的m/t行,以此類推。所以第q個線程處理的矩陣行是:
第一行:\(q*\frac{m}{t}\)
最后一行:\((q+1)*\frac{m}{t}-1\)
偽代碼:
3.2 並行計算
//thread_calculate.c
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<assert.h>
#include<pthread.h>
#include<sys/time.h>
#include<stdint.h>
#define ll long long
#define MAXL 100000005
const int m = 1e3;
const int n = 1e3;
const int DEBUG_MODE = 0;
char bufferA[MAXL],bufferx[MAXL];
int **A,*x,*y;
int thread_count;
void init_storage() {
A = (int**) malloc(m*sizeof(int*));
for(int i=0; i<m; ++i) {
A[i] = (int*) malloc(n*sizeof(int));
}
x = (int*) malloc(n*sizeof(int));
y = (int*) malloc(m*sizeof(int));
}
void print_matrix() {
printf("Matrix A(%d rows and %d columns):\n",m,n);
for(int i=0; i<m; ++i) {
for(int j=0; j<n; ++j) {
printf("%d ",A[i][j]);
}
printf("\n");
}
printf("Matrix x(%d rows and one column):\n",n);
for(int i=0; i<n; ++i) {
printf("%d\n",x[i]);
}
printf("A*x:\n");
for(int i=0; i<m; ++i) {
printf("%d\n",y[i]);
}
}
int64_t now() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec * 1000000 + tv.tv_usec;
}
void* Pth_mat_vect(void* rank) {
ll my_rank = (ll) rank;
int local_m = m/thread_count;
int my_first_row=my_rank*local_m;
int my_last_row = (my_rank+1)*local_m-1;
for(int i=my_first_row; i<=my_last_row; ++i) {
y[i]=0;
for(int j=0; j<n; ++j) {
y[i]+=A[i][j]*x[j];
}
}
return NULL;
}
void work(int argc, char* argv[]) {
FILE* mA = fopen(argv[1], "r");
FILE* mx = fopen(argv[2], "r");
FILE* result=fopen("result","w");
ll thread;
pthread_t* thread_handles;
thread_count = strtol(argv[3],NULL,10);
while(fgets(bufferA, sizeof bufferA, mA)!=NULL) {
if(DEBUG_MODE) printf("argv[1]:%s\n",bufferA);
int count = 0;
for(int i=0; i<m; ++i)
for(int j=0; j<n; ++j)
A[i][j]=bufferA[count++]-'0';
fgets(bufferx, sizeof bufferx, mx);
if(DEBUG_MODE) printf("argv[2]:%s\n",bufferx);
count=0;
for(int i=0; i<n; ++i)
x[i]=bufferx[count++]-'0';
thread_handles = (pthread_t *)malloc(thread_count*sizeof(pthread_t));
for(thread = 0; thread<thread_count; thread++) {
pthread_create(&thread_handles[thread],NULL,
Pth_mat_vect,(void*) thread);
}
// printf("Hello from the main thread\n");
for(thread = 0; thread<thread_count; thread++) {
pthread_join(thread_handles[thread],NULL);
}
free(thread_handles);
if(DEBUG_MODE) print_matrix();
for(int i=0; i<m; ++i) {
fprintf(result,"%d ",y[i]);
}
fprintf(result,"\n");
}
}
int main(int argc, char* argv[]) {
assert(argc==4);
init_storage();
int64_t start = now();
work(argc,argv);
int64_t end = now();
double sec = (end-start)/1000000.0;
printf("%f sec\n", sec);
}
3.3 操作步驟
和2.3類似,這里多設置了一個參數即線程數量。
可以對比,使用多線程數量耗時竟然還長。筆者也有點慌。不急,先假設不是代碼思路的鍋,而是數據規模過小,導致創建多線程等所耗的時間要多得多。
因此,重新生成數據,擴大規模試一波:
4.改良設計
4.1測試
測試到這一步,后知后覺的筆者知道了前面並行計算的設計一定出了問題。反反復復的創建、銷毀多線程耗費了太多時間。
因此將矩陣數量設置為1,擴大矩陣規模,再進行測試:
終於讓筆者看到了希望,多線程不是假的。
4.2 改良
現在情況應該比較明朗了,創建一次多線程即可,每個線程函數要處理多個矩陣的某一部分計算。這也許需要預先保存多個矩陣。(可以理解為化在線為離線
了吧)
同時,前面的代碼測試的時間都包括了讀數據、寫數據等。為了更好的對比時間的花費,將上面的兩份代碼都轉化為離線的並且僅測試矩陣乘法部分。
4.3 串行計算(離線)
//calculate2.c
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<assert.h>
#include<sys/time.h>
#include<stdint.h>
#define MAXL 100000005
const int m = 20;
const int n = 20;
const int num_of_matrix = 1e5;
int ***A,**x,**y;
char bufferA[MAXL],bufferx[MAXL];
void init_storage() {
A = (int***) malloc(num_of_matrix*sizeof(int**));
for(int i=0; i<num_of_matrix; ++i) {
A[i] = (int**) malloc(m*sizeof(int*));
for(int j=0; j<m; ++j) {
A[i][j] = (int*)malloc(n*sizeof(int));
}
}
x = (int**) malloc(num_of_matrix*sizeof(int*));
for(int i=0; i<num_of_matrix; ++i) {
x[i] = (int*) malloc(n*sizeof(int));
}
y = (int**) malloc(num_of_matrix*sizeof(int*));
for(int i=0; i<num_of_matrix; ++i) {
y[i] = (int*) malloc(m*sizeof(int));
}
}
void matrix_multi(int k) {
for(int i=0; i<m; ++i) {
y[k][i]=0;
for(int j=0; j<n; ++j) {
y[k][i]+=A[k][i][j]*x[k][j];
}
}
}
int64_t now() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec * 1000000 + tv.tv_usec;
}
void work(int argc, char* argv[]) {
FILE* mA = fopen(argv[1], "r");
FILE* mx = fopen(argv[2], "r");
FILE* result=fopen("result","w");
int k = 0;
while(fgets(bufferA, sizeof bufferA, mA)!=NULL) {
int count = 0;
for(int i=0; i<m; ++i)
for(int j=0; j<n; ++j)
A[k][i][j]=bufferA[count++]-'0';
fgets(bufferx, sizeof bufferx, mx);
count=0;
for(int i=0; i<n; ++i)
x[k][i]=bufferx[count++]-'0';
k++;
}
int64_t start = now();
for(k=0;k<num_of_matrix;++k)
matrix_multi(k);
int64_t end = now();
double sec = (end-start)/1000000.0;
printf("%f ms\n", 1000*sec);
printf("k: %d\n",k);
for(k=0; k<num_of_matrix; ++k) {
for(int i=0; i<m; ++i) {
fprintf(result,"%d ",y[k][i]);
}
fprintf(result,"\n");
}
}
int main(int argc, char* argv[]) {
assert(argc==3);
init_storage();
work(argc,argv);
}
4.4 並行計算(離線)
//thread_calculate.c
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<assert.h>
#include<pthread.h>
#include<sys/time.h>
#include<stdint.h>
#define ll long long
#define MAXL 100000005
const int m = 20;
const int n = 20;
const int num_of_matrix = 1e5;
char bufferA[MAXL],bufferx[MAXL];
int ***A,**x,**y;
int thread_count;
void init_storage() {
A = (int***) malloc(num_of_matrix*sizeof(int**));
for(int i=0; i<num_of_matrix; ++i) {
A[i] = (int**) malloc(m*sizeof(int*));
for(int j=0; j<m; ++j) {
A[i][j] = (int*)malloc(n*sizeof(int));
}
}
x = (int**) malloc(num_of_matrix*sizeof(int*));
for(int i=0; i<num_of_matrix; ++i) {
x[i] = (int*) malloc(n*sizeof(int));
}
y = (int**) malloc(num_of_matrix*sizeof(int*));
for(int i=0; i<num_of_matrix; ++i) {
y[i] = (int*) malloc(m*sizeof(int));
}
}
int64_t now() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec * 1000000 + tv.tv_usec;
}
void* Pth_mat_vect(void* rank) {
ll my_rank = (ll) rank;
int local_m = m/thread_count;
int my_first_row=my_rank*local_m;
int my_last_row = (my_rank+1)*local_m-1;
for(int k=0; k<num_of_matrix; ++k) {
for(int i=my_first_row; i<=my_last_row; ++i) {
y[k][i]=0;
for(int j=0; j<n; ++j) {
y[k][i]+=A[k][i][j]*x[k][j];
}
}
}
return NULL;
}
void work(int argc, char* argv[]) {
FILE* mA = fopen(argv[1], "r");
FILE* mx = fopen(argv[2], "r");
FILE* result=fopen("result","w");
int k = 0;
while(fgets(bufferA, sizeof bufferA, mA)!=NULL) {
int count = 0;
for(int i=0; i<m; ++i)
for(int j=0; j<n; ++j)
A[k][i][j]=bufferA[count++]-'0';
fgets(bufferx, sizeof bufferx, mx);
count=0;
for(int i=0; i<n; ++i)
x[k][i]=bufferx[count++]-'0';
k++;
}
ll thread;
pthread_t* thread_handles;
thread_count = strtol(argv[3],NULL,10);
int64_t start = now();
thread_handles = (pthread_t *)malloc(thread_count*sizeof(pthread_t));
for(thread = 0; thread<thread_count; thread++) {
pthread_create(&thread_handles[thread],NULL,
Pth_mat_vect,(void*) thread);
}
for(thread = 0; thread<thread_count; thread++) {
pthread_join(thread_handles[thread],NULL);
}
int64_t end = now();
double sec = (end-start)/1000000.0;
printf("%f ms\n", 1000*sec);
free(thread_handles);
printf("k: %d\n",k);
for(int k=0; k<num_of_matrix; ++k) {
for(int i=0; i<m; ++i) {
fprintf(result,"%d ",y[k][i]);
}
fprintf(result,"\n");
}
}
int main(int argc, char* argv[]) {
assert(argc==4);
init_storage();
work(argc,argv);
}
4.5 耗時對比
可以發現多線程不是開玩笑的(廢話)。由於筆者電腦是有4個cpu,所以線程數為4的時候表現最好
5.后記
以上數據均為測試多次取平均值(有時候嫌計算麻煩取了中位數)
初入並行程序設計,不當之處在所難免