Example: MPI ping-pong programs in C and f77
/* pong.c Generic Benchmark code
* Dave Turner - Ames Lab - July of 1994+++
*
* Most Unix timers can't be trusted for very short times, so take this
* into account when looking at the results. This code also only times
* a single message passing event for each size, so the results may vary
* between runs. For more accurate measurements, grab NetPIPE from
* http://www.scl.ameslab.gov/ .
*/
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
/* Benchmark parameters. */
enum {
    BUF_DOUBLES = 132000,   /* capacity of each message buffer, in doubles */
    MIN_BYTES   = 8,        /* smallest message size timed                 */
    MAX_BYTES   = 1048576   /* largest message size timed                  */
};

/*
 * Prepare the buffers for one measurement: the first `count` elements of
 * `a` become 0, 1, 2, ... and the same range of `b` is zeroed.
 */
static void
reset_buffers (double *a, double *b, int count)
{
    int i;

    for (i = 0; i < count; i++) {
        a[i] = (double) i;
        b[i] = 0.0;
    }
}

/*
 * Mark a received message as "seen" by adding 1.0 to its first element and,
 * when the message holds more than one element, to its last element too.
 */
static void
bump_ends (double *buf, int last)
{
    buf[0] += 1.0;
    if (last != 0)
        buf[last] += 1.0;
}

/*
 * Return non-zero when the first and last elements of `buf` carry the
 * values expected after one round trip (1.0 and last + 1).
 */
static int
ends_ok (const double *buf, int last)
{
    return buf[0] == 1.0 && buf[last] == last + 1;
}

/* Print the first/last element mismatch message for the buffer `name`. */
static void
report_ends (char name, const double *buf, int last)
{
    printf("ERROR - %c[0] = %f %c[%d] = %f\n",
           name, buf[0], name, last, buf[last]);
}

/*
 * Check that every element strictly between index 0 and index `last`
 * still holds its own index, printing one line per mismatch.
 */
static void
check_interior (char name, const double *buf, int last)
{
    int i;

    /* Upper bound is `last`, so element last-1 is verified as well. */
    for (i = 1; i < last; i++)
        if (buf[i] != (double) i)
            printf("ERROR - %c[%d] = %f\n", name, i, buf[i]);
}

/*
 * On rank 0, print the result for one message size and fold it into the
 * running maximum rate / minimum latency.  `usec` is the round-trip time
 * in microseconds, so bytes/usec is reported as MB/sec.  Other ranks
 * print nothing.
 */
static void
report_timing (int myproc, int size, double usec,
               double *max_rate, double *min_latency)
{
    double rate;

    if (myproc != 0)
        return;

    if (usec > 0.000001) {
        rate = 2.0 * size / usec;
        printf(" %7d bytes took %9.0f usec (%8.3f MB/sec)\n",
               size, usec, rate);
        if (rate > *max_rate)
            *max_rate = rate;
        if (usec / 2 < *min_latency)
            *min_latency = usec / 2;
    } else {
        printf(" %7d bytes took less than the timer accuracy\n", size);
    }
}

/*
 * Two-process MPI ping-pong benchmark.
 *
 * For each message size from MIN_BYTES to MAX_BYTES (doubling each step)
 * a single round trip is timed in three ways: blocking send/receive,
 * pre-posted non-blocking receive, and both ranks sending at once.
 * Rank 0 prints the per-size results and a final summary.
 *
 * Returns 0 on success and 1 when the buffers cannot be allocated or the
 * job was not started with exactly two processes.  A corrupted message in
 * the blocking test aborts the whole MPI job.
 */
int
main (int argc, char **argv)
{
    int myproc, size, other_proc, nprocs, count, last;
    double t0, t1, usec;
    double *a, *b;
    double max_rate = 0.0, min_latency = 10e6;
    MPI_Request request, request_a, request_b;
    MPI_Status status;

#if defined (_CRAYT3E)
    a = (double *) shmalloc (BUF_DOUBLES * sizeof (double));
    b = (double *) shmalloc (BUF_DOUBLES * sizeof (double));
#else
    a = (double *) malloc (BUF_DOUBLES * sizeof (double));
    b = (double *) malloc (BUF_DOUBLES * sizeof (double));
#endif
    if (a == NULL || b == NULL) {
        fprintf(stderr, "pong: unable to allocate message buffers\n");
        return 1;
    }
    reset_buffers(a, b, BUF_DOUBLES);

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
    if (nprocs != 2) {
        /* Every rank takes this path, so a normal finalize is safe. */
        if (myproc == 0)
            fprintf(stderr, "pong: requires exactly 2 processes, got %d\n",
                    nprocs);
        MPI_Finalize();
        return 1;
    }
    other_proc = (myproc + 1) % 2;
    printf("Hello from %d of %d\n", myproc, nprocs);
    MPI_Barrier(MPI_COMM_WORLD);

    /* Timer accuracy test: spin until MPI_Wtime() advances. */
    t0 = MPI_Wtime();
    t1 = MPI_Wtime();
    while (t1 == t0)
        t1 = MPI_Wtime();
    if (myproc == 0)
        printf("Timer accuracy of ~%f usecs\n\n", (t1 - t0) * 1000000);

    /* Communications between nodes
     * - Blocking sends and recvs
     * - No guarantee of prepost, so might pass through comm buffer
     */
    for (size = MIN_BYTES; size <= MAX_BYTES; size *= 2) {
        count = size / 8;       /* message length in doubles */
        last = count - 1;
        reset_buffers(a, b, count);

        MPI_Barrier(MPI_COMM_WORLD);
        t0 = MPI_Wtime();
        if (myproc == 0) {
            MPI_Send(a, count, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD);
            MPI_Recv(b, count, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD,
                     &status);
        } else {
            MPI_Recv(b, count, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD,
                     &status);
            bump_ends(b, last);
            MPI_Send(b, count, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD);
        }
        t1 = MPI_Wtime();
        usec = 1.e6 * (t1 - t0);
        MPI_Barrier(MPI_COMM_WORLD);

        if (!ends_ok(b, last)) {
            report_ends('b', b, last);
            fflush(stdout);
            /* Abort the whole job so the peer is not left in a barrier. */
            MPI_Abort(MPI_COMM_WORLD, 1);
        }
        check_interior('b', b, last);
        report_timing(myproc, size, usec, &max_rate, &min_latency);
    }

    /* Async communications
     * - Prepost receives to guarantee bypassing the comm buffer
     */
    MPI_Barrier(MPI_COMM_WORLD);
    if (myproc == 0) printf("\n Asynchronous ping-pong\n\n");
    for (size = MIN_BYTES; size <= MAX_BYTES; size *= 2) {
        count = size / 8;
        last = count - 1;
        reset_buffers(a, b, count);

        MPI_Irecv(b, count, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD,
                  &request);
        MPI_Barrier(MPI_COMM_WORLD);
        t0 = MPI_Wtime();
        if (myproc == 0) {
            MPI_Send(a, count, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD);
            MPI_Wait(&request, &status);
        } else {
            MPI_Wait(&request, &status);
            bump_ends(b, last);
            MPI_Send(b, count, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD);
        }
        t1 = MPI_Wtime();
        usec = 1.e6 * (t1 - t0);
        MPI_Barrier(MPI_COMM_WORLD);

        if (!ends_ok(b, last))
            report_ends('b', b, last);
        check_interior('b', b, last);
        report_timing(myproc, size, usec, &max_rate, &min_latency);
    }

    /* Bidirectional communications
     * - Prepost receives to guarantee bypassing the comm buffer
     * - Both ranks send `a`, bump what arrives in `b`, send it back, and
     *   receive the returned copy into `a`
     * NOTE(review): `a` is the source of the first send while request_a
     * (a receive into `a`) is already pending; confirm this is acceptable
     * for the MPI implementation in use or switch to a third buffer.
     */
    MPI_Barrier(MPI_COMM_WORLD);
    if (myproc == 0) printf("\n Bi-directional asynchronous ping-pong\n\n");
    for (size = MIN_BYTES; size <= MAX_BYTES; size *= 2) {
        count = size / 8;
        last = count - 1;
        reset_buffers(a, b, count);

        MPI_Irecv(b, count, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD,
                  &request_b);
        MPI_Irecv(a, count, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD,
                  &request_a);
        MPI_Barrier(MPI_COMM_WORLD);
        t0 = MPI_Wtime();
        MPI_Send(a, count, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD);
        MPI_Wait(&request_b, &status);
        bump_ends(b, last);
        MPI_Send(b, count, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD);
        MPI_Wait(&request_a, &status);
        t1 = MPI_Wtime();
        usec = 1.e6 * (t1 - t0);
        MPI_Barrier(MPI_COMM_WORLD);

        if (!ends_ok(a, last))
            report_ends('a', a, last);
        check_interior('a', a, last);
        report_timing(myproc, size, usec, &max_rate, &min_latency);
    }

    if (myproc == 0)
        printf("\n Max rate = %f MB/sec Min latency = %f usec\n",
               max_rate, min_latency);
    MPI_Finalize();

#if !defined (_CRAYT3E)
    /* Buffers obtained from shmalloc are left untouched, as before. */
    free(a);
    free(b);
#endif
    return 0;
}
*-----------------------------------------------------------------------
* Filename: mpi_pong.f
*
* Two-process MPI ping-pong timing test.  For message sizes from 8
* bytes up to 1048576 bytes (doubling each step) a single round trip
* is timed with blocking sends/receives, with a pre-posted
* non-blocking receive, and with both ranks sending at once.  Rank 0
* prints each result and, at the end, the best rate and the lowest
* latency seen.
*
* Arrays are 1-based: a(i) is initialised to i, and the rank that
* bounces a message adds 1.0 to its first and last elements, so a
* message that came back intact has first element 2.0, last element
* last + 1, and every interior element equal to its own index.
*-----------------------------------------------------------------------
      PROGRAM Pong
      IMPLICIT NONE
      INTEGER ierr, myproc, nprocs
      INTEGER size, other_proc, i, last
      DOUBLE PRECISION t0, t1, time
      DOUBLE PRECISION max_rate, min_latency
      DOUBLE PRECISION a(132000), b(132000)
*-----------------------------------------------------------------------
* Init MPI
*-----------------------------------------------------------------------
      INCLUDE "mpif.h"
      INTEGER status(MPI_STATUS_SIZE)
      INTEGER request, request_a, request_b
      CALL MPI_Init(ierr)
      CALL MPI_Comm_Rank(MPI_COMM_WORLD,myproc,ierr)
      CALL MPI_Comm_Size(MPI_COMM_WORLD,nprocs,ierr)
*-----------------------------------------------------------------------
* Initialise the summary values and the message buffers
*-----------------------------------------------------------------------
      min_latency = 10e6
      max_rate = 0.0
      DO i = 1,132000
         a(i) = i
         b(i) = .0
      ENDDO
*     Every rank takes this branch together, so finalize before STOP.
      IF( nprocs .NE. 2 )THEN
         IF( myproc .EQ. 0 )
     &      PRINT*,'pong requires exactly 2 processes'
         CALL MPI_Finalize(ierr)
         STOP
      ENDIF
      other_proc = MOD(myproc + 1, 2)
      PRINT*,'Hello from ',myproc,' of ',nprocs
      CALL MPI_Barrier(MPI_COMM_WORLD, ierr)
*-----------------------------------------------------------------------
* Timer accuracy test: spin until MPI_Wtime() advances
*-----------------------------------------------------------------------
      t0 = MPI_Wtime()
   10 t1 = MPI_Wtime()
      IF( t1 .EQ. t0) GOTO 10
      IF( myproc .EQ. 0 )THEN
         PRINT*,'Timer accuracy of ',(t1-t0)*1000000, 'usecs'
      ENDIF
*-----------------------------------------------------------------------
* Communications between nodes
* - Blocking sends and receives
* - No guarantee of prepost, so data might pass through comm buffer
*-----------------------------------------------------------------------
      size = 8
   20 CONTINUE
      DO i = 1, size/8
         a(i) = i
         b(i) = 0.0
      ENDDO
      last = size/8
      CALL MPI_Barrier(MPI_COMM_WORLD, ierr)
      t0 = MPI_Wtime()
      IF( myproc .EQ. 0 )THEN
         CALL MPI_Send(a, size/8, MPI_DOUBLE_PRECISION, other_proc,
     &                 0, MPI_COMM_WORLD, ierr)
         CALL MPI_Recv(b, size/8, MPI_DOUBLE_PRECISION, other_proc,
     &                 0, MPI_COMM_WORLD, status, ierr)
      ELSE
         CALL MPI_Recv(b, size/8, MPI_DOUBLE_PRECISION, other_proc,
     &                 0, MPI_COMM_WORLD, status, ierr)
         b(1) = b(1) + 1.0
         IF( last .NE. 1 )THEN
            b(last) = b(last) + 1.0
         ENDIF
         CALL MPI_Send(b, size/8, MPI_DOUBLE_PRECISION, other_proc,
     &                 0, MPI_COMM_WORLD, ierr)
      ENDIF
      t1 = MPI_Wtime()
      time = 1e6 * (t1 - t0)
      CALL MPI_Barrier(MPI_COMM_WORLD, ierr)
*     Abort the whole job on corruption so the peer rank is not left
*     waiting in a later barrier.
      IF( (b(1) .NE. 2.0) .OR. (b(last) .NE. last + 1) )THEN
         PRINT*,'ERROR - b[1] = ',b(1),' b[',last,'] = ',b(last)
         CALL MPI_Abort(MPI_COMM_WORLD, 1, ierr)
         STOP
      ENDIF
      DO i = 2, last - 1
         IF( b(i) .NE. i ) PRINT*,'ERROR - b[',i,'] = ',b(i)
      ENDDO
*     time is in microseconds, so bytes/time is printed as MB/sec.
      IF( (myproc .EQ. 0) .AND. (time .GT. 0.000001) )THEN
         PRINT 1020,size,' bytes took ',time,' usec (',2.0*size/time,
     &              ' MB/sec)'
*        F10.3 leaves room for rates of 100 MB/sec and above.
 1020    FORMAT(I8, A, F15.2, A, F10.3, A )
         IF( 2*size/time .GT. max_rate ) max_rate = 2 * size / time
         IF( time / 2 .LT. min_latency ) min_latency = time / 2
      ELSE
         IF( myproc .EQ. 0 )
     &      PRINT*,size,' bytes took less than the timer accuracy'
      ENDIF
      size = size * 2
      IF( size .LE. 1048576 ) GOTO 20
*-----------------------------------------------------------------------
* Async communications
* - Prepost receives to guarantee bypassing the comm buffer
*-----------------------------------------------------------------------
      CALL MPI_Barrier(MPI_COMM_WORLD, ierr)
      IF( myproc .EQ. 0 )THEN
         PRINT*,' '
         PRINT*,'Asynchronous ping-pong'
         PRINT*,' '
      ENDIF
      size = 8
   30 CONTINUE
      DO i = 1, size/8
         a(i) = i
         b(i) = 0.0
      ENDDO
      last = size/8
*     The Fortran binding of MPI_Irecv takes ierr as its last argument.
      CALL MPI_Irecv(b, size/8, MPI_DOUBLE_PRECISION, other_proc,
     &               0, MPI_COMM_WORLD, request, ierr)
      CALL MPI_Barrier(MPI_COMM_WORLD, ierr)
      t0 = MPI_Wtime()
      IF( myproc .EQ. 0 )THEN
         CALL MPI_Send(a, size/8, MPI_DOUBLE_PRECISION, other_proc,
     &                 0, MPI_COMM_WORLD, ierr)
         CALL MPI_Wait(request, status, ierr)
      ELSE
         CALL MPI_Wait(request, status, ierr)
         b(1) = b(1) + 1.0
         IF( last .NE. 1 ) b(last) = b(last) + 1.0
         CALL MPI_Send(b, size/8, MPI_DOUBLE_PRECISION, other_proc,
     &                 0, MPI_COMM_WORLD, ierr)
      ENDIF
      t1 = MPI_Wtime()
      time = 1e6 * (t1 - t0)
      CALL MPI_Barrier(MPI_COMM_WORLD, ierr)
      IF( (b(1) .NE. 2.0) .OR. (b(last) .NE. last + 1) )THEN
         PRINT*,'ERROR - b[1] = ',b(1),' b[',last,'] = ',b(last)
         CALL MPI_Abort(MPI_COMM_WORLD, 1, ierr)
         STOP
      ENDIF
      DO i = 2, last - 1
         IF( b(i) .NE. i ) PRINT*,'ERROR - b[',i,'] = ',b(i)
      ENDDO
      IF( (myproc .EQ. 0) .AND. (time .GT. 0.000001) )THEN
         PRINT 1030,size,' bytes took ',time,' usec (',2.0*size/time,
     &              ' MB/sec)'
 1030    FORMAT(I8, A, F15.2, A, F10.3, A )
         IF( 2*size/time .GT. max_rate ) max_rate = 2 * size / time
         IF( time / 2 .LT. min_latency ) min_latency = time / 2
      ELSE IF( myproc .EQ. 0 )THEN
         PRINT*,size,' bytes took less than the timer accuracy'
      ENDIF
      size = size * 2
      IF( size .LE. 1048576 ) GOTO 30
*-----------------------------------------------------------------------
* Bi-directional asynchronous ping-pong
* - Both ranks send a, bump what arrives in b, send b back, and
*   receive the returned copy into a
* NOTE(review): a is the source of the first send while request_a
* (a receive into a) is already pending; confirm this is acceptable
* for the MPI implementation in use.
*-----------------------------------------------------------------------
      CALL MPI_Barrier(MPI_COMM_WORLD, ierr)
      IF( myproc .EQ. 0 )THEN
         PRINT*,' '
         PRINT*,'Bi-directional asynchronous ping-pong'
         PRINT*,' '
      ENDIF
      size = 8
   40 CONTINUE
      DO i = 1, size/8
         a(i) = i
         b(i) = 0.0
      ENDDO
      last = size/8
      CALL MPI_Irecv(b, size/8, MPI_DOUBLE_PRECISION, other_proc,
     &               0, MPI_COMM_WORLD, request_b, ierr)
      CALL MPI_Irecv(a, size/8, MPI_DOUBLE_PRECISION, other_proc,
     &               0, MPI_COMM_WORLD, request_a, ierr)
      CALL MPI_Barrier(MPI_COMM_WORLD, ierr)
      t0 = MPI_Wtime()
      CALL MPI_Send(a, size/8, MPI_DOUBLE_PRECISION, other_proc,
     &              0, MPI_COMM_WORLD, ierr)
      CALL MPI_Wait(request_b, status, ierr)
      b(1) = b(1) + 1.0
      IF( last .NE. 1 )b(last) = b(last) + 1.0
      CALL MPI_Send(b, size/8, MPI_DOUBLE_PRECISION, other_proc,
     &              0, MPI_COMM_WORLD, ierr)
      CALL MPI_Wait(request_a, status, ierr)
      t1 = MPI_Wtime()
      time = 1e6 * (t1 - t0)
      CALL MPI_Barrier(MPI_COMM_WORLD, ierr)
*     Verify a, the buffer that received the returned message; b only
*     holds the locally modified copy.
      IF( (a(1) .NE. 2.0) .OR. (a(last) .NE. last + 1) )THEN
         PRINT*,'ERROR - a[1] = ',a(1),' a[',last,'] = ',a(last)
         CALL MPI_Abort(MPI_COMM_WORLD, 1, ierr)
         STOP
      ENDIF
      DO i = 2, last - 1
         IF( a(i) .NE. i ) PRINT*,'ERROR - a[',i,'] = ',a(i)
      ENDDO
      IF( (myproc .EQ. 0) .AND. (time .GT. 0.000001) )THEN
         PRINT 1040,size,' bytes took ',time,' usec (',2.0*size/time,
     &              ' MB/sec)'
 1040    FORMAT(I8, A, F15.2, A, F10.3, A )
         IF( 2*size/time .GT. max_rate ) max_rate = 2 * size / time
         IF( time / 2 .LT. min_latency ) min_latency = time / 2
      ELSE
         IF( myproc .EQ. 0 )
     &      PRINT*,size,' bytes took less than the timer accuracy'
      ENDIF
      size = size * 2
      IF( size .LE. 1048576 ) GOTO 40
*-----------------------------------------------------------------------
* Max rate, Min latency
*-----------------------------------------------------------------------
      IF( myproc .EQ. 0 )THEN
         PRINT*,' '
         PRINT 1050,'Max rate = ',max_rate,' MB/sec Min latency = ',
     &              min_latency
 1050    FORMAT(A, F10.3, A, F25.12)
         PRINT*,' '
      ENDIF
*-----------------------------------------------------------------------
* Leave MPI
*-----------------------------------------------------------------------
      CALL MPI_Finalize(ierr)
      END
Grimm mprun -np 2 -h grimm grimm pong
[1] 2728
starting pong on grimm
pong on grimm
---------- Done handshaking ----------
Timer accuracy of ~975.966454 usecs
8 bytes took less than the timer accuracy
16 bytes took 977 usec ( 0.033 MB/sec)
32 bytes took 977 usec ( 0.066 MB/sec)
64 bytes took less than the timer accuracy
128 bytes took 976 usec ( 0.262 MB/sec)
256 bytes took 977 usec ( 0.524 MB/sec)
512 bytes took 1953 usec ( 0.524 MB/sec)
1024 bytes took 977 usec ( 2.096 MB/sec)
2048 bytes took 977 usec ( 4.192 MB/sec)
4096 bytes took 1953 usec ( 4.195 MB/sec)
8192 bytes took 977 usec ( 16.769 MB/sec)
16384 bytes took 976 usec ( 33.571 MB/sec)
32768 bytes took 2929 usec ( 22.375 MB/sec)
65536 bytes took 3907 usec ( 33.547 MB/sec)
131072 bytes took 4883 usec ( 53.684 MB/sec)
262144 bytes took 7813 usec ( 67.105 MB/sec)
524288 bytes took 15625 usec ( 67.109 MB/sec)
1048576 bytes took 32227 usec ( 65.074 MB/sec)
Asynchronous ping-pong
8 bytes took 977 usec ( 0.016 MB/sec)
16 bytes took 977 usec ( 0.033 MB/sec)
32 bytes took 977 usec ( 0.066 MB/sec)
64 bytes took 976 usec ( 0.131 MB/sec)
128 bytes took less than the timer accuracy
256 bytes took 977 usec ( 0.524 MB/sec)
512 bytes took 977 usec ( 1.048 MB/sec)
1024 bytes took 977 usec ( 2.096 MB/sec)
2048 bytes took 977 usec ( 4.192 MB/sec)
4096 bytes took 976 usec ( 8.394 MB/sec)
8192 bytes took 977 usec ( 16.769 MB/sec)
16384 bytes took 1954 usec ( 16.770 MB/sec)
32768 bytes took 2929 usec ( 22.375 MB/sec)
65536 bytes took 1953 usec ( 67.113 MB/sec)
131072 bytes took 3907 usec ( 67.095 MB/sec)
262144 bytes took 6836 usec ( 76.696 MB/sec)
524288 bytes took 17578 usec ( 59.653 MB/sec)
1048576 bytes took 33204 usec ( 63.160 MB/sec)
Bi-directional asynchronous ping-pong
8 bytes took 1953 usec ( 0.008 MB/sec)
16 bytes took 976 usec ( 0.033 MB/sec)
32 bytes took 977 usec ( 0.066 MB/sec)
64 bytes took 976 usec ( 0.131 MB/sec)
128 bytes took 976 usec ( 0.262 MB/sec)
256 bytes took less than the timer accuracy
512 bytes took 977 usec ( 1.048 MB/sec)
1024 bytes took 977 usec ( 2.096 MB/sec)
2048 bytes took less than the timer accuracy
4096 bytes took 976 usec ( 8.394 MB/sec)
8192 bytes took less than the timer accuracy
16384 bytes took 1953 usec ( 16.778 MB/sec)
32768 bytes took 2930 usec ( 22.367 MB/sec)
65536 bytes took 3906 usec ( 33.556 MB/sec)
131072 bytes took 6835 usec ( 38.353 MB/sec)
262144 bytes took 10742 usec ( 48.808 MB/sec)
524288 bytes took 27343 usec ( 38.349 MB/sec)
1048576 bytes took 65429 usec ( 32.052 MB/sec)
Max rate = 76.695845 MB/sec Min latency = 487.983227 usec
Ames Laboratory |
Condensed Matter Physics |
Disclaimer |
ISU Physics