Example: MPI ping-pong programs in C and f77
Download mpi_pong.c
Download mpi_pong.f

/*                  pong.c Generic Benchmark code
 *               Dave Turner - Ames Lab - July of 1994+++
 *
 *  Most Unix timers can't be trusted for very short times, so take this
 *  into account when looking at the results.  This code also only times
 *  a single message passing event for each size, so the results may vary
 *  between runs.  For more accurate measurements, grab NetPIPE from
 *  http://www.scl.ameslab.gov/ .
 */

#include "mpi.h"

#include <stdio.h>
#include <stdlib.h>

/*
 * Buffer capacity (in doubles) and the largest message size (in bytes).
 * Message lengths below are computed as size / 8, so the largest message
 * is 131072 doubles, which fits in the 132000-element buffers.
 */
enum { PONG_BUF_DOUBLES = 132000, PONG_MAX_BYTES = 1048576 };

/* Allocate one message buffer; returns NULL when the allocation fails. */
static double *
pong_alloc (void)
{
#if defined (_CRAYT3E)
   return (double *) shmalloc (PONG_BUF_DOUBLES * sizeof (double));
#else
   return malloc (PONG_BUF_DOUBLES * sizeof (double));
#endif
}

/* Release a buffer obtained from pong_alloc(); NULL is accepted. */
static void
pong_free (double *buf)
{
#if defined (_CRAYT3E)
   /* NOTE(review): assumes shfree() is the matching release call for
    * shmalloc() on this platform -- confirm against the Cray headers. */
   if (buf != NULL) {
      shfree (buf);
   }
#else
   free (buf);
#endif
}

/* Reset the first `count` elements: a[i] = i (payload), b[i] = 0. */
static void
pong_fill (double *a, double *b, int count)
{
   int i;

   for (i = 0; i < count; i++) {
      a[i] = (double) i;
      b[i] = 0.0;
   }
}

/*
 * Mark a received message before it is echoed back: add one to the first
 * element and, for messages longer than one element, to the last one.
 */
static void
pong_stamp (double *buf, int last)
{
   buf[0] += 1.0;
   if (last != 0) {
      buf[last] += 1.0;
   }
}

/*
 * Check the two stamped elements of a buffer.  Prints an error line and
 * returns 0 when buf[0] != 1 or buf[last] != last + 1; returns 1 otherwise.
 */
static int
pong_ends_ok (const char *name, const double *buf, int last)
{
   if (buf[0] != 1.0 || buf[last] != last + 1) {
      printf("ERROR - %s[0] = %f %s[%d] = %f\n",
             name, buf[0], name, last, buf[last]);
      return 0;
   }
   return 1;
}

/*
 * Check every unstamped element, i.e. indices 1 .. last-1, against its
 * expected value i and print one error line per mismatch.
 */
static void
pong_check_interior (const char *name, const double *buf, int last)
{
   int i;

   for (i = 1; i < last; i++) {
      if (buf[i] != (double) i) {
         printf("ERROR - %s[%d] = %f\n", name, i, buf[i]);
      }
   }
}

/*
 * Print the result line for one message size and fold it into the running
 * maximum rate / minimum latency.  `usec` is the elapsed time in
 * microseconds; the rate is computed from 2 * size bytes.
 */
static void
pong_report (int size, double usec, double *max_rate, double *min_latency)
{
   if (usec > 0.000001) {
      double rate = 2.0 * size / usec;

      printf(" %7d bytes took %9.0f usec (%8.3f MB/sec)\n",
             size, usec, rate);
      if (rate > *max_rate) {
         *max_rate = rate;
      }
      if (usec / 2 < *min_latency) {
         *min_latency = usec / 2;
      }
   } else {
      printf(" %7d bytes took less than the timer accuracy\n", size);
   }
}

/*
 * Ping-pong benchmark between exactly two MPI processes.
 *
 * Runs three passes over message sizes 8 .. 1048576 bytes (doubling each
 * step): blocking send/recv, preposted asynchronous receive, and
 * bi-directional asynchronous exchange.  Each size is timed once with
 * MPI_Wtime(); rank 0 prints the per-size results and a final summary.
 *
 * Returns EXIT_SUCCESS on completion, EXIT_FAILURE when the buffers cannot
 * be allocated or the job was not started with two processes.
 */
int
main (int argc, char **argv)
{
   int myproc, size, other_proc, nprocs, last;
   double t0, t1, elapsed_usec;
   double *a, *b;
   double max_rate = 0.0, min_latency = 10e6;
   MPI_Request request, request_a, request_b;
   MPI_Status status;

   a = pong_alloc ();
   b = pong_alloc ();
   if (a == NULL || b == NULL) {
      /* MPI is not initialised yet, so a plain return is the right exit. */
      fprintf(stderr, "pong: could not allocate message buffers\n");
      pong_free (a);
      pong_free (b);
      return EXIT_FAILURE;
   }
   pong_fill (a, b, PONG_BUF_DOUBLES);

   MPI_Init(&argc, &argv);
   MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
   MPI_Comm_rank(MPI_COMM_WORLD, &myproc);

   if (nprocs != 2) {
      /* Every rank takes this branch, so a collective shutdown is safe. */
      if (myproc == 0) {
         fprintf(stderr, "pong: needs exactly 2 processes, got %d\n", nprocs);
      }
      MPI_Finalize();
      pong_free (a);
      pong_free (b);
      return EXIT_FAILURE;
   }
   other_proc = (myproc + 1) % 2;

   printf("Hello from %d of %d\n", myproc, nprocs);
   MPI_Barrier(MPI_COMM_WORLD);

/* Timer accuracy test: spin until MPI_Wtime() returns a new value */

   t0 = MPI_Wtime();
   t1 = MPI_Wtime();

   while (t1 == t0) {
      t1 = MPI_Wtime();
   }

   if (myproc == 0) {
      printf("Timer accuracy of ~%f usecs\n\n", (t1 - t0) * 1000000);
   }

/* Communications between nodes 
 *   - Blocking sends and recvs
 *   - No guarantee of prepost, so might pass through comm buffer
 */

   for (size = 8; size <= PONG_MAX_BYTES; size *= 2) {
      pong_fill (a, b, size / 8);
      last = size / 8 - 1;

      MPI_Barrier(MPI_COMM_WORLD);
      t0 = MPI_Wtime();

      if (myproc == 0) {

         MPI_Send(a, size/8, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD);
         MPI_Recv(b, size/8, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD, &status);

      } else {

         MPI_Recv(b, size/8, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD, &status);
         pong_stamp (b, last);
         MPI_Send(b, size/8, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD);

      }

      t1 = MPI_Wtime();
      elapsed_usec = 1.e6 * (t1 - t0);
      MPI_Barrier(MPI_COMM_WORLD);

      if (!pong_ends_ok ("b", b, last)) {
         /* Only one rank may see the corruption; abort the whole job so
          * the peer is not left waiting in the next barrier. */
         MPI_Abort(MPI_COMM_WORLD, 1);
      }
      pong_check_interior ("b", b, last);

      if (myproc == 0) {
         pong_report (size, elapsed_usec, &max_rate, &min_latency);
      }
   }

/* Async communications
 *   - Prepost receives to guarantee bypassing the comm buffer
 */

   MPI_Barrier(MPI_COMM_WORLD);
   if (myproc == 0) {
      printf("\n  Asynchronous ping-pong\n\n");
   }

   for (size = 8; size <= PONG_MAX_BYTES; size *= 2) {
      pong_fill (a, b, size / 8);
      last = size / 8 - 1;

      MPI_Irecv(b, size/8, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD, &request);
      MPI_Barrier(MPI_COMM_WORLD);
      t0 = MPI_Wtime();

      if (myproc == 0) {

         MPI_Send(a, size/8, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD);
         MPI_Wait(&request, &status);

      } else {

         MPI_Wait(&request, &status);
         pong_stamp (b, last);
         MPI_Send(b, size/8, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD);

      }

      t1 = MPI_Wtime();
      elapsed_usec = 1.e6 * (t1 - t0);
      MPI_Barrier(MPI_COMM_WORLD);

      /* This pass reports verification errors but keeps running. */
      (void) pong_ends_ok ("b", b, last);
      pong_check_interior ("b", b, last);

      if (myproc == 0) {
         pong_report (size, elapsed_usec, &max_rate, &min_latency);
      }
   }

/* Bidirectional communications
 *   - Prepost receives to guarantee bypassing the comm buffer
 *   - Both ranks run the same sequence: send a, receive it into b,
 *     stamp b, send b back, receive the echo into a
 */

   MPI_Barrier(MPI_COMM_WORLD);
   if (myproc == 0) {
      printf("\n  Bi-directional asynchronous ping-pong\n\n");
   }

   for (size = 8; size <= PONG_MAX_BYTES; size *= 2) {
      pong_fill (a, b, size / 8);
      last = size / 8 - 1;

      /* Posting order matters: the peer's first message lands in b,
       * its second (the echo) in a. */
      MPI_Irecv(b, size/8, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD, &request_b);
      MPI_Irecv(a, size/8, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD, &request_a);
      MPI_Barrier(MPI_COMM_WORLD);

      t0 = MPI_Wtime();

      /* NOTE(review): a is sent here while request_a (a receive into a)
       * is still pending; confirm this is acceptable for the MPI
       * implementation in use or switch to a separate echo buffer. */
      MPI_Send(a, size/8, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD);
      MPI_Wait(&request_b, &status);

      pong_stamp (b, last);

      MPI_Send(b, size/8, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD);
      MPI_Wait(&request_a, &status);

      t1 = MPI_Wtime();
      elapsed_usec = 1.e6 * (t1 - t0);
      MPI_Barrier(MPI_COMM_WORLD);

      /* This pass reports verification errors but keeps running. */
      (void) pong_ends_ok ("a", a, last);
      pong_check_interior ("a", a, last);

      if (myproc == 0) {
         pong_report (size, elapsed_usec, &max_rate, &min_latency);
      }
   }

   if (myproc == 0) {
      printf("\n Max rate = %f MB/sec  Min latency = %f usec\n",
               max_rate, min_latency);
   }

   MPI_Finalize();

   pong_free (a);
   pong_free (b);
   return EXIT_SUCCESS;
}

*-----------------------------------------------------------------------
*     Filename: mpi_pong.f
*-----------------------------------------------------------------------

      PROGRAM Pong
*-----------------------------------------------------------------------
*     MPI ping-pong benchmark between exactly two processes.
*
*     Three passes over message sizes 8 .. 1048576 bytes (doubling each
*     step): blocking send/receive, preposted asynchronous receive, and
*     bi-directional asynchronous exchange.  Each size is timed once
*     with MPI_Wtime; process 0 prints the results and a final summary.
*     Message lengths are computed as size/8 elements.
*-----------------------------------------------------------------------
      IMPLICIT NONE

      INTEGER ierr, myproc, nprocs
      INTEGER size, other_proc, i, last
      DOUBLE PRECISION t0, t1, time
      DOUBLE PRECISION max_rate, min_latency
      DOUBLE PRECISION a(132000), b(132000)


*-----------------------------------------------------------------------
*       Init MPI
*-----------------------------------------------------------------------

      INCLUDE "mpif.h"


      INTEGER status(MPI_STATUS_SIZE)
      INTEGER request, request_a, request_b

      CALL MPI_Init(ierr)
      CALL MPI_Comm_Rank(MPI_COMM_WORLD,myproc,ierr)
      CALL MPI_Comm_Size(MPI_COMM_WORLD,nprocs,ierr)

*-----------------------------------------------------------------------
*       Initialise the summary values and the message buffers
*-----------------------------------------------------------------------
      min_latency = 10e6
      max_rate = 0.0

      DO i = 1,132000
         a(i) = i
         b(i) = .0
      ENDDO

*     Every process takes this branch together, so MPI can be shut
*     down cleanly before stopping.
      IF( nprocs .NE. 2 )THEN
         IF( myproc .EQ. 0 ) PRINT*,'pong needs exactly 2 processes'
         CALL MPI_Finalize(ierr)
         STOP
      ENDIF

      other_proc = MOD(myproc + 1, 2)

      PRINT*,'Hello from ',myproc,' of ',nprocs
      CALL MPI_Barrier(MPI_COMM_WORLD, ierr)

*-----------------------------------------------------------------------
*       Timer accuracy test: spin until MPI_Wtime returns a new value
*-----------------------------------------------------------------------

      t0 = MPI_Wtime()

   10 t1 = MPI_Wtime()
      IF( t1 .EQ. t0) GOTO 10

      IF( myproc .EQ. 0 )THEN
        PRINT*,'Timer accuracy of ',(t1-t0)*1000000, 'usecs'
      ENDIF

*-----------------------------------------------------------------------
* Communications between nodes
*   - Blocking sends and receives
*   - No guarantee of prepost, so data might pass through comm buffer
*-----------------------------------------------------------------------

      size = 8

   20 CONTINUE
        DO i = 1, size/8
           a(i) = i
           b(i) = 0.0
        ENDDO

        last = size/8

        CALL MPI_Barrier(MPI_COMM_WORLD, ierr)
        t0 = MPI_Wtime()

        IF( myproc .EQ. 0 )THEN

          CALL MPI_Send(a, size/8, MPI_DOUBLE_PRECISION, other_proc,
     &                  0, MPI_COMM_WORLD, ierr)

          CALL MPI_Recv(b, size/8, MPI_DOUBLE_PRECISION, other_proc,
     &                  0, MPI_COMM_WORLD, status, ierr)

        ELSE

          CALL MPI_Recv(b, size/8, MPI_DOUBLE_PRECISION, other_proc,
     &                  0, MPI_COMM_WORLD, status, ierr)

*         Stamp the first and last element before echoing back
          b(1) = b(1) + 1.0
          IF( last .NE. 1 )THEN
             b(last) = b(last) + 1.0
          ENDIF

          CALL MPI_Send(b, size/8, MPI_DOUBLE_PRECISION, other_proc,
     &                  0, MPI_COMM_WORLD, ierr)

        ENDIF

        t1 = MPI_Wtime()
        time = 1e6 * (t1 - t0)
        CALL MPI_Barrier(MPI_COMM_WORLD, ierr)

*       Abort the whole job on a bad endpoint so that the other
*       process is not left waiting in the next barrier.
        IF( (b(1) .NE. 2.0) .OR. (b(last) .NE. last + 1) )THEN
           PRINT*,'ERROR - b[1] = ',b(1),' b[',last,'] = ',b(last)
           CALL MPI_Abort(MPI_COMM_WORLD, 1, ierr)
           STOP
        ENDIF
        DO i = 2, last - 1
          IF( b(i) .NE. i ) PRINT*,'ERROR - b[',i,'] = ',b(i)
        ENDDO

        IF( (myproc .EQ. 0) .AND. (time .GT. 0.000001) )THEN
          PRINT 1020,size,' bytes took ',time,' usec (',2.0*size/time,
     &                  ' MB/sec)'
 1020     FORMAT(I8, A, F15.2, A, F10.3, A )
          IF( 2*size/time .GT. max_rate ) max_rate = 2 * size / time
          IF( time / 2 .LT. min_latency ) min_latency = time / 2
        ELSE
          IF( myproc .EQ. 0 )
     &       PRINT*,size,' bytes took less than the timer accuracy'
        ENDIF

        size = size * 2
      IF( size .LE. 1048576 ) GOTO 20


*-----------------------------------------------------------------------
* Async communications
*   - Prepost receives to guarantee bypassing the comm buffer
*-----------------------------------------------------------------------
      CALL MPI_Barrier(MPI_COMM_WORLD, ierr)

      IF( myproc .EQ. 0 )THEN
         PRINT*,' '
         PRINT*,'Asynchronous ping-pong'
         PRINT*,' '
      ENDIF

      size = 8

   30 CONTINUE
        DO i = 1, size/8
          a(i) = i
          b(i) = 0.0
        ENDDO

        last = size/8

        CALL MPI_Irecv(b, size/8, MPI_DOUBLE_PRECISION, other_proc,
     &                 0, MPI_COMM_WORLD, request, ierr)

        CALL MPI_Barrier(MPI_COMM_WORLD, ierr)
        t0 = MPI_Wtime()

        IF( myproc .EQ. 0 )THEN

          CALL MPI_Send(a, size/8, MPI_DOUBLE_PRECISION, other_proc,
     &                  0, MPI_COMM_WORLD, ierr)

          CALL MPI_Wait(request, status, ierr)

        ELSE

          CALL MPI_Wait(request, status, ierr)

*         Stamp the first and last element before echoing back
          b(1) = b(1) + 1.0
          IF( last .NE. 1 ) b(last) = b(last) + 1.0

          CALL MPI_Send(b, size/8, MPI_DOUBLE_PRECISION, other_proc,
     &                  0, MPI_COMM_WORLD, ierr)

        ENDIF

        t1 = MPI_Wtime()
        time = 1e6 * (t1 - t0)
        CALL MPI_Barrier(MPI_COMM_WORLD, ierr)

        IF( (b(1) .NE. 2.0) .OR. (b(last) .NE. last + 1) )THEN
          PRINT*,'ERROR - b[1] = ',b(1),' b[',last,'] = ',b(last)
          CALL MPI_Abort(MPI_COMM_WORLD, 1, ierr)
          STOP
        ENDIF
        DO i = 2, last - 1
          IF( b(i) .NE. i ) PRINT*,'ERROR - b[',i,'] = ',b(i)
        ENDDO

        IF( (myproc .EQ. 0) .AND. (time .GT. 0.000001) )THEN
          PRINT 1030,size,' bytes took ',time,' usec (',2.0*size/time,
     &             ' MB/sec)'
 1030     FORMAT(I8, A, F15.2, A, F10.3, A )
          IF( 2*size/time .GT. max_rate ) max_rate = 2 * size / time
          IF( time / 2 .LT. min_latency ) min_latency = time / 2
        ELSE IF( myproc .EQ. 0 )THEN
          PRINT*,size,' bytes took less than the timer accuracy'
        ENDIF

        size = size * 2
      IF( size .LE. 1048576 ) GOTO 30

*-----------------------------------------------------------------------
*         Bi-directional asynchronous ping-pong
*   - Both processes run the same sequence: send a, receive it into b,
*     stamp b, send b back, receive the echo into a
*-----------------------------------------------------------------------
      CALL MPI_Barrier(MPI_COMM_WORLD, ierr)

      IF( myproc .EQ. 0 )THEN
        PRINT*,' '
        PRINT*,'Bi-directional asynchronous ping-pong'
        PRINT*,' '
      ENDIF

      size = 8

   40 CONTINUE
        DO i = 1, size/8
          a(i) = i
          b(i) = 0.0
        ENDDO

        last = size/8

*       Posting order matters: the first incoming message lands in b,
*       the second (the echo) in a.
        CALL MPI_Irecv(b, size/8, MPI_DOUBLE_PRECISION, other_proc,
     &                 0, MPI_COMM_WORLD, request_b, ierr)

        CALL MPI_Irecv(a, size/8, MPI_DOUBLE_PRECISION, other_proc,
     &                 0, MPI_COMM_WORLD, request_a, ierr)

        CALL MPI_Barrier(MPI_COMM_WORLD, ierr)

        t0 = MPI_Wtime()

*       NOTE(review): a is sent while request_a (a receive into a) is
*       still pending; confirm this is acceptable for the MPI library
*       in use or switch to a separate echo buffer.
        CALL MPI_Send(a, size/8, MPI_DOUBLE_PRECISION, other_proc,
     &                0, MPI_COMM_WORLD, ierr)

        CALL MPI_Wait(request_b, status, ierr)

        b(1) = b(1) + 1.0
        IF( last .NE. 1 )b(last) = b(last) + 1.0

        CALL MPI_Send(b, size/8, MPI_DOUBLE_PRECISION, other_proc,
     &                0, MPI_COMM_WORLD, ierr)

        CALL MPI_Wait(request_a, status, ierr)

        t1 = MPI_Wtime()
        time = 1e6 * (t1 - t0)
        CALL MPI_Barrier(MPI_COMM_WORLD, ierr)

*       Verify a, the buffer that holds the echoed message, so that
*       the complete round trip is checked.
        IF( (a(1) .NE. 2.0) .OR. (a(last) .NE. last + 1) )THEN
          PRINT*,'ERROR - a[1] = ',a(1),' a[',last,'] = ',a(last)
          CALL MPI_Abort(MPI_COMM_WORLD, 1, ierr)
          STOP
        ENDIF
        DO i = 2, last - 1
          IF( a(i) .NE. i ) PRINT*,'ERROR - a[',i,'] = ',a(i)
        ENDDO

        IF( (myproc .EQ. 0) .AND. (time .GT. 0.000001) )THEN
          PRINT 1040,size,' bytes took ',time,' usec (',2.0*size/time,
     &              ' MB/sec)'
 1040     FORMAT(I8, A, F15.2, A, F10.3, A )
          IF( 2*size/time .GT. max_rate ) max_rate = 2 * size / time
          IF( time / 2 .LT. min_latency ) min_latency = time / 2
        ELSE
          IF( myproc .EQ. 0 )
     &      PRINT*,size,' bytes took less than the timer accuracy'
        ENDIF

        size = size * 2
      IF( size .LE. 1048576 ) GOTO 40

*-----------------------------------------------------------------------
*       Max rate, Min latency
*-----------------------------------------------------------------------

      IF( myproc .EQ. 0 )THEN
        PRINT*,' '
        PRINT 1050,'Max rate = ',max_rate,' MB/sec Min latency = ',
     &             min_latency
 1050   FORMAT(A, F12.3, A, F25.12)
        PRINT*,' '
      ENDIF

*-----------------------------------------------------------------------
*       Leave MPI
*-----------------------------------------------------------------------

        CALL MPI_Finalize(ierr)

        END

Output from MP_Lite run of pong.c on a dual-processor Compaq DS20

Grimm  mprun -np 2 -h grimm grimm pong
[1] 2728
 starting pong on grimm 
 pong on grimm 
----------  Done handshaking  ----------
Timer accuracy of ~975.966454 usecs

       8 bytes took less than the timer accuracy
      16 bytes took       977 usec (   0.033 MB/sec)
      32 bytes took       977 usec (   0.066 MB/sec)
      64 bytes took less than the timer accuracy
     128 bytes took       976 usec (   0.262 MB/sec)
     256 bytes took       977 usec (   0.524 MB/sec)
     512 bytes took      1953 usec (   0.524 MB/sec)
    1024 bytes took       977 usec (   2.096 MB/sec)
    2048 bytes took       977 usec (   4.192 MB/sec)
    4096 bytes took      1953 usec (   4.195 MB/sec)
    8192 bytes took       977 usec (  16.769 MB/sec)
   16384 bytes took       976 usec (  33.571 MB/sec)
   32768 bytes took      2929 usec (  22.375 MB/sec)
   65536 bytes took      3907 usec (  33.547 MB/sec)
  131072 bytes took      4883 usec (  53.684 MB/sec)
  262144 bytes took      7813 usec (  67.105 MB/sec)
  524288 bytes took     15625 usec (  67.109 MB/sec)
 1048576 bytes took     32227 usec (  65.074 MB/sec)

  Asynchronous ping-pong

       8 bytes took       977 usec (   0.016 MB/sec)
      16 bytes took       977 usec (   0.033 MB/sec)
      32 bytes took       977 usec (   0.066 MB/sec)
      64 bytes took       976 usec (   0.131 MB/sec)
     128 bytes took less than the timer accuracy
     256 bytes took       977 usec (   0.524 MB/sec)
     512 bytes took       977 usec (   1.048 MB/sec)
    1024 bytes took       977 usec (   2.096 MB/sec)
    2048 bytes took       977 usec (   4.192 MB/sec)
    4096 bytes took       976 usec (   8.394 MB/sec)
    8192 bytes took       977 usec (  16.769 MB/sec)
   16384 bytes took      1954 usec (  16.770 MB/sec)
   32768 bytes took      2929 usec (  22.375 MB/sec)
   65536 bytes took      1953 usec (  67.113 MB/sec)
  131072 bytes took      3907 usec (  67.095 MB/sec)
  262144 bytes took      6836 usec (  76.696 MB/sec)
  524288 bytes took     17578 usec (  59.653 MB/sec)
 1048576 bytes took     33204 usec (  63.160 MB/sec)

  Bi-directional asynchronous ping-pong

       8 bytes took      1953 usec (   0.008 MB/sec)
      16 bytes took       976 usec (   0.033 MB/sec)
      32 bytes took       977 usec (   0.066 MB/sec)
      64 bytes took       976 usec (   0.131 MB/sec)
     128 bytes took       976 usec (   0.262 MB/sec)
     256 bytes took less than the timer accuracy
     512 bytes took       977 usec (   1.048 MB/sec)
    1024 bytes took       977 usec (   2.096 MB/sec)
    2048 bytes took less than the timer accuracy
    4096 bytes took       976 usec (   8.394 MB/sec)
    8192 bytes took less than the timer accuracy
   16384 bytes took      1953 usec (  16.778 MB/sec)
   32768 bytes took      2930 usec (  22.367 MB/sec)
   65536 bytes took      3906 usec (  33.556 MB/sec)
  131072 bytes took      6835 usec (  38.353 MB/sec)
  262144 bytes took     10742 usec (  48.808 MB/sec)
  524288 bytes took     27343 usec (  38.349 MB/sec)
 1048576 bytes took     65429 usec (  32.052 MB/sec)

 Max rate = 76.695845 MB/sec  Min latency = 487.983227 usec


Links to more advanced topics

This ping-pong method is a very good way of determining the raw characteristics of the network. For small messages, the latency dominates, while for large messages the bandwidth is the limiting factor.

However, the ping-pong program above is not very sophisticated. It is just a quick-and-dirty method of testing the network. Since timer accuracies can be as bad as 1 ms under Unix, measuring a single ping-pong event is not enough. The Netpipe software at the link below is a better tool for providing a more complete profile of the point-to-point characteristics of a network, and works for a variety of message-passing protocols such as raw TCP, MPI, MP_Lite, PVM, and TCGMSG.

Netpipe


Ames Laboratory | Condensed Matter Physics | Disclaimer | ISU Physics