Implement a small program that computes pi, and use it to explore how to compile and link HIP + MPI + Fortran together.
There are three source files: a Fortran file, a HIP file (pi_hip.cpp), and a main file (main.cpp) that contains the MPI driver; the main file calls one function from the Fortran file and one from the HIP file. Each file is compiled separately: main.cpp with mpicc, the .f90 file with mpif90, and the HIP file with hipcc (in the working setup it is additionally built into a shared library, .so). Everything is finally linked together with mpif90.
File structure
mpi-pi.f90    compiled with mpif90
pi_hip.cpp    compiled with hipcc
main.cpp      compiled with mpicc
Makefile
The final link is done with mpif90.
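The cross-language glue is just a pair of declarations in main.cpp (the full sources are listed at the end); the comments below are explanatory additions, not part of the original file:

// Defined in pi_hip.cpp.  Both pi_hip.cpp and main.cpp are compiled as C++
// (by hipcc and mpicc respectively), so the mangled C++ symbol matches and
// no extern "C" is required.
void computePI(int nproc, int myid, float *sumHost, float step);

// Implemented in mpi-pi.f90 as "subroutine hello(nnn)".  With the usual
// gfortran convention the symbol is lowercased and gets a trailing
// underscore, and Fortran passes arguments by reference, hence the
// extern "C" linkage and the pointer parameter.
extern "C" void hello_(int *nnn);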
Compiling and linking
Method 1:
Compile each of the three files into its own .o file and link them together directly.
Makefile (build with make ex1, run with make run):

ex1:
        mpif90 -c mpi-pi.f90
        hipcc -O3 -std=c++11 -D_HIP_PLATFORM_HCC__ -c pi_hip.cpp
        mpicc -c -std=c++11 main.cpp
        mpif90 mpi-pi.o pi_hip.o main.o -lm -lstdc++ -o out-pi

run:
        mpirun -mca pml ucx -np 8 ./out-pi

clean:
        rm -fr *.o *.err *.out out-pi *.so
The final link with mpif90 fails with undefined references to HIP internals:
(.text+0x18835): undefined reference to `hip_impl::kernargs(bool)'
pi_hip.o: In function `std::vector<unsigned char, std::allocator<unsigned char> > hip_impl::make_kernarg<float*, int, float, float, int, int, float*, int, float, float, int, int>(void (*)(float*, int, float, float, int, int), std::tuple<float*, int, float, float, int, int>)':
Fix: add the HIP runtime library to the link line; the Makefile (still driven by make ex1 and make run) becomes:
HIPLIB=-L/opt/rocm/hip/lib -lhip_hcc

ex1:
        mpif90 -c mpi-pi.f90
        hipcc -O3 -std=c++11 -D_HIP_PLATFORM_HCC__ -c pi_hip.cpp
        mpicc -c -std=c++11 main.cpp
        mpif90 mpi-pi.o pi_hip.o main.o $(HIPLIB) -lm -lstdc++ -o out-pi

run:
        mpirun -mca pml ucx -np 8 ./out-pi

clean:
        rm -fr *.o *.err *.out out-pi *.so
The link now succeeds, but the program aborts at run time:
terminate called after throwing an instance of 'std::runtime_error'
what(): Missing metadata for __global__ function: _Z6cal_piPfiffii
[h16r4n18:86129] *** Process received signal ***
[h16r4n18:86129] Signal: Aborted (6)
[h16r4n18:86129] Signal code: (-6)
[h16r4n18:86129] [ 0] /lib64/libpthread.so.0(+0xf5e0)[0x2b1da8eb95e0]
[h16r4n18:86129] [ 1] /lib64/libc.so.6(gsignal+0x37)[0x2b1da90fb1f7]
[h16r4n18:86129] [ 2] /lib64/libc.so.6(abort+0x148)[0x2b1da90fc8e8]
[h16r4n18:86129] [ 3] /lib64/libstdc++.so.6(_ZN9__gnu_cxx27__verbose_terminate_handlerEv+0x165)[0x2b1da8064ac5]
[h16r4n18:86129] [ 4] /lib64/libstdc++.so.6(+0x5ea36)[0x2b1da8062a36]
[h16r4n18:86129] [ 5] /lib64/libstdc++.so.6(+0x5ea63)[0x2b1da8062a63]
[h16r4n18:86129] [ 6] /lib64/libstdc++.so.6(+0x5ec83)[0x2b1da8062c83]
[h16r4n18:86129] [ 7] ./out-pi[0x4239fd]
[h16r4n18:86129] [ 8] ./out-pi[0x40ba79]
[h16r4n18:86129] [ 9] ./out-pi[0x4243ea]
[h16r4n18:86129] [10] /lib64/libc.so.6(__libc_start_main+0xf5)[0x2b1da90e7c05]
[h16r4n18:86129] [11] ./out-pi[0x40a348]
[h16r4n18:86129] *** End of error message ***
Method 2:
First compile the HIP file (pi_hip.cpp) into a position-independent object file, let hipcc link it into a shared library libpi_hip.so, and then link the executable against that library together with the remaining object files. This works presumably because hipcc now performs the final link of the GPU code, so the kernel metadata that the runtime could not find in method 1 ends up properly embedded in libpi_hip.so; the $(HIPLIB) flags are also no longer needed on the mpif90 line, since the shared library itself pulls in the HIP runtime. Note that the executable is linked with -L./ -lpi_hip, so the loader must be able to find libpi_hip.so at run time (e.g. via LD_LIBRARY_PATH or an rpath).
Makefile (build with make ex2, run with make run):

ex2:
        make clean
        mpif90 -c mpi-pi.f90
        hipcc -c -fpic -O3 -std=c++11 -D_HIP_PLATFORM_HCC__ -o pi_hip.o pi_hip.cpp
        hipcc -shared pi_hip.o -o libpi_hip.so
        mpicc -c -std=c++11 main.cpp
        mpif90 -L./ -lpi_hip mpi-pi.o main.o -lm -lstdc++ -o out-pi

run:
        mpirun -mca pml ucx -np 8 ./out-pi

clean:
        rm -fr *.o *.err *.out out-pi *.so
The run now gives the correct result (the remaining deviation from pi comes from the single-precision accumulation):
Process 1 says "Hello, world!".
Process 5 says "Hello, world!".
COMMUNICATOR_MPI - Master process:
FORTRAN90/MPI version
An MPI example program.
The number of processes is 8
Process 0 says "Hello, world!".
Process 4 says "Hello, world!".
Process 7 says "Hello, world!".
Process 3 says "Hello, world!".
Process 6 says "Hello, world!".
Process 2 says "Hello, world!".
Number of processes in even communicator = 4
Sum of global ID's in even communicator = 12
COMMUNICATOR_MPI:
Normal end of execution.
Number of processes in odd communicator = 4
Sum of global ID's in odd communicator = 16
step = 0.0000001
myid = 5: partial pi = 0.3396083
step = 0.0000001
myid = 1: partial pi = 0.4824923
step = 0.0000001
myid = 4: partial pi = 0.3798115
step = 0.0000001
myid = 6: partial pi = 0.3013121
step = 0.0000001
myid = 2: partial pi = 0.4551676
step = 0.0000001
myid = 3: partial pi = 0.4195001
step = 0.0000001
myid = 7: partial pi = 0.2662676
step = 0.0000001
myid = 0: partial pi = 0.4974218
PI = 3.1415813
The source files are listed below.
mpi-pi.f90
subroutine hello( nnn )
!*****************************************************************************80
! mpif90 -o exe mpi-pi.f90
include "mpif.h"
integer ( kind = 4 ) even_comm_id
integer ( kind = 4 ) even_group_id
integer ( kind = 4 ) even_id
integer ( kind = 4 ) even_id_sum
integer ( kind = 4 ) even_p
integer ( kind = 4 ), allocatable :: even_rank(:)
integer ( kind = 4 ) i
integer ( kind = 4 ) id
integer ( kind = 4 ) ierr
integer ( kind = 4 ) j
integer ( kind = 4 ) odd_comm_id
integer ( kind = 4 ) odd_group_id
integer ( kind = 4 ) odd_id
integer ( kind = 4 ) odd_id_sum
integer ( kind = 4 ) odd_p
integer ( kind = 4 ) p
integer ( kind = 4 ), allocatable :: odd_rank(:)
integer nnn
integer ( kind = 4 ) world_group_id
!
! Initialize MPI.
!
! call MPI_Init ( ierr )
!
! Get the number of processes.
!
call MPI_Comm_size ( MPI_COMM_WORLD, p, ierr )
!
! Get the individual process ID.
!
call MPI_Comm_rank ( MPI_COMM_WORLD, id, ierr )
!
! Process 0 prints an introductory message.
!
if ( id == 0 ) then
write ( *, '(a)' ) 'COMMUNICATOR_MPI - Master process:'
write ( *, '(a)' ) ' FORTRAN90/MPI version'
write ( *, '(a)' ) ' An MPI example program.'
write ( *, '(a)' ) ' '
write ( *, '(a,i4)' ) ' The number of processes is ', p
write ( *, '(a)' ) ' '
end if
!
! Every process prints a hello.
!
! write ( *, '(a,i4,a)' ) ' Process ', id, ' says "Hello, world!".'
!
! Get a group identifier for MPI_COMM_WORLD.
!
call MPI_Comm_group ( MPI_COMM_WORLD, world_group_id, ierr )
!
! List the even processes, and create their group.
!
even_p = ( p + 1 ) / 2
allocate ( even_rank(1:even_p) )
j = 0
do i = 0, p - 1, 2
j = j + 1
even_rank(j) = i
end do
call MPI_Group_incl ( world_group_id, even_p, even_rank, even_group_id, ierr )
call MPI_Comm_create ( MPI_COMM_WORLD, even_group_id, even_comm_id, ierr )
!
! List the odd processes, and create their group.
!
odd_p = p / 2
allocate ( odd_rank(1:odd_p) )
j = 0
do i = 1, p - 1, 2
j = j + 1
odd_rank(j) = i
end do
call MPI_Group_incl ( world_group_id, odd_p, odd_rank, odd_group_id, ierr )
call MPI_Comm_create ( MPI_COMM_WORLD, odd_group_id, odd_comm_id, ierr )
!
! Try to get ID of each process in both groups.
! If a process is not in a communicator, set its ID to -1.
!
if ( mod ( id, 2 ) == 0 ) then
call MPI_Comm_rank ( even_comm_id, even_id, ierr )
odd_id = -1
else
call MPI_Comm_rank ( odd_comm_id, odd_id, ierr )
even_id = -1
end if
!
! Use MPI_Reduce to sum the global ID of each process in the even
! group.
! Assuming 4 processes: EVEN_SUM = 0 + 2 = 2
!
if ( even_id /= -1 ) then
call MPI_Reduce ( id, even_id_sum, 1, MPI_INTEGER, MPI_SUM, 0, &
even_comm_id, ierr )
end if
if ( even_id == 0 ) then
write ( *, '(a,i4)' ) &
' Number of processes in even communicator = ', even_p
write ( *, '(a,i4)' ) &
' Sum of global ID''s in even communicator = ', even_id_sum
end if
!
! Use MPI_Reduce to sum the global ID of each process in the odd group.
! Assuming 4 processes: ODD_SUM = 1 + 3 = 4
!
if ( odd_id /= -1 ) then
call MPI_Reduce ( id, odd_id_sum, 1, MPI_INTEGER, MPI_SUM, 0, &
odd_comm_id, ierr )
end if
if ( odd_id == 0 ) then
write ( *, '(a,i4)' ) &
' Number of processes in odd communicator = ', odd_p
write ( *, '(a,i4)' ) &
' Sum of global ID''s in odd communicator = ', odd_id_sum
end if
!
! Terminate MPI.
!
! call MPI_Finalize ( ierr )
!
! Free memory.
!
deallocate ( even_rank )
deallocate ( odd_rank )
!
! Terminate
!
if ( id == 0 ) then
write ( *, '(a)' ) ' '
write ( *, '(a)' ) 'COMMUNICATOR_MPI:'
write ( *, '(a)' ) ' Normal end of execution.'
write ( *, '(a)' ) ' '
end if
end subroutine hello
pi_hip.cpp
#include <stdio.h>
#include <stdlib.h>
#include <hip/hip_runtime.h>
#define NBIN 10000000 // Number of bins
#define NUM_BLOCK 13 // Number of thread blocks
#define NUM_THREAD 192 // Number of threads per block
__global__ void cal_pi(float *sum,int nbin,float step,float offset,int nthreads,int nblocks)
{
int i;
float x;
int idx = blockIdx.x*blockDim.x+threadIdx.x; // Sequential thread index across blocks
for (i=idx; i< nbin; i+=nthreads*nblocks) { // Interleaved bin assignment to threads
x = offset+(i+0.5)*step;
sum[idx] += 4.0/(1.0+x*x);
}
}
void computePI(int nproc,int myid, float *sumHost,float step)
{
int nbin;
float offset;
float *sumDev; // Pointers to device arrays
dim3 dimGrid(NUM_BLOCK,1,1); // Grid dimensions (only use 1D)
dim3 dimBlock(NUM_THREAD,1,1); // Block dimensions (only use 1D)
nbin = NBIN/nproc; // Number of bins per MPI process
offset = myid*step*nbin; // Quadrature-point offset
size_t size = NUM_BLOCK*NUM_THREAD*sizeof(float); //Array memory size
hipMalloc((void **) &sumDev,size); // Allocate array on device
hipMemset(sumDev,0,size); // Reset array in device to 0
// Calculate on device (launch the HIP kernel)
hipLaunchKernelGGL(cal_pi,dimGrid,dimBlock,0,0,sumDev,nbin,step,offset,NUM_THREAD,NUM_BLOCK);
// Retrieve the result from the device and store it in the host array
hipMemcpy(sumHost,sumDev,size,hipMemcpyDeviceToHost);
hipFree(sumDev);
}
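None of the HIP calls above check their return values, so a failed allocation, kernel launch, or copy would go unnoticed. Below is a minimal sketch of computePI with error checking, reusing the cal_pi kernel and the NBIN/NUM_BLOCK/NUM_THREAD macros already defined in pi_hip.cpp; the HIP_CHECK macro is an illustrative addition, not part of the original sources:

#define HIP_CHECK(cmd)                                              \
  do {                                                              \
    hipError_t e = (cmd);                                           \
    if (e != hipSuccess) {                                          \
      fprintf(stderr, "HIP error %s at %s:%d\n",                    \
              hipGetErrorString(e), __FILE__, __LINE__);            \
      exit(EXIT_FAILURE);                                           \
    }                                                               \
  } while (0)

void computePI(int nproc, int myid, float *sumHost, float step)
{
  dim3 dimGrid(NUM_BLOCK, 1, 1);
  dim3 dimBlock(NUM_THREAD, 1, 1);
  int nbin = NBIN / nproc;                       // bins handled by this rank
  float offset = myid * step * nbin;             // quadrature-point offset of this rank
  size_t size = NUM_BLOCK * NUM_THREAD * sizeof(float);
  float *sumDev;
  HIP_CHECK(hipMalloc((void **)&sumDev, size));  // per-thread partial sums on the device
  HIP_CHECK(hipMemset(sumDev, 0, size));
  hipLaunchKernelGGL(cal_pi, dimGrid, dimBlock, 0, 0,
                     sumDev, nbin, step, offset, NUM_THREAD, NUM_BLOCK);
  HIP_CHECK(hipGetLastError());                  // catch kernel-launch errors
  HIP_CHECK(hipMemcpy(sumHost, sumDev, size, hipMemcpyDeviceToHost));
  HIP_CHECK(hipFree(sumDev));
}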
main.cpp
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#define NBIN 10000000 // Number of bins
#define NUM_BLOCK 13 // Number of thread blocks
#define NUM_THREAD 192 // Number of threads per block
// Prototype of the host wrapper around the HIP kernel, defined in pi_hip.cpp
void computePI(int nproc,int myid,float *sumHost,float step);
extern "C" { void hello_(int *nnn ); }
#define hello_f hello_
int main(int argc,char **argv) {
int myid,nproc,tid, nbin, nnn;
float pi=0.0, pig, step;
float *sumHost; // Pointers to host arrays
MPI_Init(&argc,&argv);
nnn = 1;//add by wangwu
hello_f(&nnn);//calling fortran mpi hello, just a test
MPI_Comm_rank(MPI_COMM_WORLD,&myid); // My MPI rank
MPI_Comm_size(MPI_COMM_WORLD,&nproc); // Number of MPI processes
size_t size = NUM_BLOCK*NUM_THREAD*sizeof(float); //Array memory size
sumHost = (float *)malloc(size); // Allocate array on host
nbin = NBIN/nproc; // Number of bins per MPI process
step = 1.0/(float)(nbin*nproc); // Step size with redefined number of bins
computePI(nproc,myid,sumHost,step);
// Reduction over GPU threads
for(tid=0; tid<NUM_THREAD*NUM_BLOCK; tid++) pi += sumHost[tid];
if (myid==0) printf("on DCUs: HIP CPP for pi\n");
// printf("step = %11.7f\n", step);
pi *=step;
free(sumHost);
printf("myid = %d: partial pi = %11.7f\n",myid, pi);
// Reduction over MPI processes
MPI_Allreduce(&pi,&pig,1,MPI_FLOAT,MPI_SUM,MPI_COMM_WORLD);
if (myid==0) printf("PI = %11.7f\n",pig);
MPI_Finalize();
return 0;
}
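Note that main.cpp never selects a GPU, so with 8 ranks on one node every rank implicitly uses device 0. On a node with several DCUs/GPUs one would usually bind each rank to a device right after MPI_Comm_rank. A minimal sketch under the assumption of a round-robin rank-to-device mapping (bindRankToDevice is a made-up helper, not part of the original sources); it belongs in pi_hip.cpp so that hipcc compiles it:

// Hypothetical helper in pi_hip.cpp: map an MPI rank to a device, round-robin.
void bindRankToDevice(int myid)
{
  int ndev = 0;
  if (hipGetDeviceCount(&ndev) == hipSuccess && ndev > 0) {
    hipSetDevice(myid % ndev);   // rank i uses device i mod ndev
  }
}

// In main.cpp, declare and call it once the rank is known:
//   void bindRankToDevice(int myid);
//   ...
//   MPI_Comm_rank(MPI_COMM_WORLD, &myid);
//   bindRankToDevice(myid);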