/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2010 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */
/* This test measures the performance of many rma operations to a single
   target process.
   It uses a number of operations (put or accumulate) to different
   locations in the target window.
   This is one of the ways that RMA may be used, and is used in the
   reference implementation of the graph500 benchmark.
*/
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_COUNT    65536*4
#define MAX_RMA_SIZE 16
#define MAX_RUNS     8       /* timed repetitions per configuration; value assumed */
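/* Each timed case issues many small RMA operations inside a single
   synchronization epoch (sketch of what the Run* routines below do):

       <open sync>                    fence / lock / post+start
       for (i=0; i<cnt; i++)
           put or accumulate sz ints at offset i*sz in the target window
       <close sync>                   fence / unlock / complete(+wait)
*/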
typedef enum { SYNC_NONE=0,
               SYNC_ALL=-1, SYNC_FENCE=1, SYNC_LOCK=2, SYNC_PSCW=4 } sync_t;
typedef enum { RMA_NONE=0, RMA_ALL=-1, RMA_PUT=1, RMA_ACC=2, RMA_GET=4 } rma_t;
/* Note GET not yet implemented */
sync_t syncChoice = SYNC_ALL;
rma_t rmaChoice = RMA_ALL;

typedef struct {
    double startOp, endOp, endSync;
} timing;
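/* One timing record per repetition: startOp is taken just before the loop
   of RMA calls, endOp just after it, and endSync after the closing
   synchronization call, separating issue cost from completion cost. */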
static int verbose = 1;
static int barrierSync = 0;
static double tickThreshold = 0.0;
void PrintResults( int cnt, timing t[] );
void RunAccFence( MPI_Win win, int destRank, int cnt, int sz, timing t[] );
void RunAccLock( MPI_Win win, int destRank, int cnt, int sz, timing t[] );
void RunPutFence( MPI_Win win, int destRank, int cnt, int sz, timing t[] );
void RunPutLock( MPI_Win win, int destRank, int cnt, int sz, timing t[] );
void RunAccPSCW( MPI_Win win, int destRank, int cnt, int sz,
                 MPI_Group exposureGroup, MPI_Group accessGroup, timing t[] );
void RunPutPSCW( MPI_Win win, int destRank, int cnt, int sz,
                 MPI_Group exposureGroup, MPI_Group accessGroup, timing t[] );
int main( int argc, char *argv[] )
{
    int arraysize, i, cnt, sz, maxCount=MAX_COUNT, *arraybuffer;
    int wrank, wsize, destRank, srcRank;
    MPI_Win win;
    MPI_Group wgroup, accessGroup, exposureGroup;
    timing t[MAX_RUNS];
    int maxSz = MAX_RMA_SIZE;

    MPI_Init( &argc, &argv );

    /* Determine clock accuracy */
    tickThreshold = 10.0 * MPI_Wtick();
    MPI_Allreduce( MPI_IN_PLACE, &tickThreshold, 1, MPI_DOUBLE, MPI_MAX,
                   MPI_COMM_WORLD );
    for (i=1; i<argc; i++) {
        if (strcmp( argv[i], "-put" ) == 0) {
            if (rmaChoice == RMA_ALL) rmaChoice = RMA_NONE;
            rmaChoice |= RMA_PUT;
        }
        else if (strcmp( argv[i], "-acc" ) == 0) {
            if (rmaChoice == RMA_ALL) rmaChoice = RMA_NONE;
            rmaChoice |= RMA_ACC;
        }
        else if (strcmp( argv[i], "-fence" ) == 0) {
            if (syncChoice == SYNC_ALL) syncChoice = SYNC_NONE;
            syncChoice |= SYNC_FENCE;
        }
        else if (strcmp( argv[i], "-lock" ) == 0) {
            if (syncChoice == SYNC_ALL) syncChoice = SYNC_NONE;
            syncChoice |= SYNC_LOCK;
        }
        else if (strcmp( argv[i], "-pscw" ) == 0) {
            if (syncChoice == SYNC_ALL) syncChoice = SYNC_NONE;
            syncChoice |= SYNC_PSCW;
        }
        else if (strcmp( argv[i], "-maxsz" ) == 0) {
            i++;
            maxSz = atoi( argv[i] );
        }
        else if (strcmp( argv[i], "-maxcount" ) == 0) {
            i++;
            maxCount = atoi( argv[i] );
        }
        else if (strcmp( argv[i], "-barrier" ) == 0) {
            barrierSync = 1;
        }
        else {
            fprintf( stderr, "Unrecognized argument %s\n", argv[i] );
            fprintf( stderr, "%s [ -put ] [ -acc ] [ -lock ] [ -fence ] [ -pscw ] [ -barrier ] [ -maxsz msgsize ] [ -maxcount count ]\n", argv[0] );
            MPI_Abort( MPI_COMM_WORLD, 1 );
        }
    }
    MPI_Comm_rank( MPI_COMM_WORLD, &wrank );
    MPI_Comm_size( MPI_COMM_WORLD, &wsize );
    destRank = wrank + 1;
    while (destRank >= wsize) destRank = destRank - wsize;
    srcRank = wrank - 1;
    if (srcRank < 0) srcRank += wsize;

    /* Create groups for PSCW */
    MPI_Comm_group( MPI_COMM_WORLD, &wgroup );
    MPI_Group_incl( wgroup, 1, &destRank, &accessGroup );
    MPI_Group_incl( wgroup, 1, &srcRank, &exposureGroup );
    MPI_Group_free( &wgroup );
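    /* Ring pattern: this rank accesses the window of destRank (the next
       rank), so the access group contains destRank, and it is the target
       of srcRank (the previous rank), so the exposure group contains
       srcRank.  Every rank is therefore both origin and target. */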
    arraysize = maxSz * MAX_COUNT;
    arraybuffer = (int*)malloc( arraysize * sizeof(int) );
    if (!arraybuffer) {
        fprintf( stderr, "Unable to allocate %d words\n", arraysize );
        MPI_Abort( MPI_COMM_WORLD, 1 );
    }

    MPI_Win_create( arraybuffer, arraysize*sizeof(int), (int)sizeof(int),
                    MPI_INFO_NULL, MPI_COMM_WORLD, &win );

    /* FIXME: we need a test on performance consistency.
       The test needs to have both a relative growth limit and
       an absolute limit.
    */

    if (maxCount > MAX_COUNT) {
        fprintf( stderr, "MaxCount must not exceed %d\n", MAX_COUNT );
        MPI_Abort( MPI_COMM_WORLD, 1 );
    }
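    /* Each enabled combination below sweeps the transfer size sz (doubling
       from 1 up to maxSz ints) and, for each size, the operation count cnt
       (doubling from 1 up to maxCount), printing one result line per case. */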
    if ((syncChoice & SYNC_FENCE) && (rmaChoice & RMA_ACC)) {
        for (sz=1; sz<=maxSz; sz = sz + sz) {
            if (wrank == 0 && verbose)
                printf( "Accumulate with fence, %d elements\n", sz );
            cnt = 1;
            while (cnt <= maxCount) {
                RunAccFence( win, destRank, cnt, sz, t );
                if (wrank == 0)
                    PrintResults( cnt, t );
                cnt = 2 * cnt;
            }
        }
    }
    if ((syncChoice & SYNC_LOCK) && (rmaChoice & RMA_ACC)) {
        for (sz=1; sz<=maxSz; sz = sz + sz) {
            if (wrank == 0 && verbose)
                printf( "Accumulate with lock, %d elements\n", sz );
            cnt = 1;
            while (cnt <= maxCount) {
                RunAccLock( win, destRank, cnt, sz, t );
                if (wrank == 0)
                    PrintResults( cnt, t );
                cnt = 2 * cnt;
            }
        }
    }
    if ((syncChoice & SYNC_FENCE) && (rmaChoice & RMA_PUT)) {
        for (sz=1; sz<=maxSz; sz = sz + sz) {
            if (wrank == 0 && verbose)
                printf( "Put with fence, %d elements\n", sz );
            cnt = 1;
            while (cnt <= maxCount) {
                RunPutFence( win, destRank, cnt, sz, t );
                if (wrank == 0)
                    PrintResults( cnt, t );
                cnt = 2 * cnt;
            }
        }
    }
    if ((syncChoice & SYNC_LOCK) && (rmaChoice & RMA_PUT)) {
        for (sz=1; sz<=maxSz; sz = sz + sz) {
            if (wrank == 0 && verbose)
                printf( "Put with lock, %d elements\n", sz );
            cnt = 1;
            while (cnt <= maxCount) {
                RunPutLock( win, destRank, cnt, sz, t );
                if (wrank == 0)
                    PrintResults( cnt, t );
                cnt = 2 * cnt;
            }
        }
    }
    if ((syncChoice & SYNC_PSCW) && (rmaChoice & RMA_PUT)) {
        for (sz=1; sz<=maxSz; sz = sz + sz) {
            if (wrank == 0 && verbose)
                printf( "Put with pscw, %d elements\n", sz );
            cnt = 1;
            while (cnt <= maxCount) {
                RunPutPSCW( win, destRank, cnt, sz,
                            exposureGroup, accessGroup, t );
                if (wrank == 0)
                    PrintResults( cnt, t );
                cnt = 2 * cnt;
            }
        }
    }
    if ((syncChoice & SYNC_PSCW) && (rmaChoice & RMA_ACC)) {
        for (sz=1; sz<=maxSz; sz = sz + sz) {
            if (wrank == 0 && verbose)
                printf( "Accumulate with pscw, %d elements\n", sz );
            cnt = 1;
            while (cnt <= maxCount) {
                RunAccPSCW( win, destRank, cnt, sz,
                            exposureGroup, accessGroup, t );
                if (wrank == 0)
                    PrintResults( cnt, t );
                cnt = 2 * cnt;
            }
        }
    }
    MPI_Win_free( &win );
    free( arraybuffer );
    MPI_Group_free( &accessGroup );
    MPI_Group_free( &exposureGroup );

    MPI_Finalize();
    return 0;
}
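/* Each Run* routine below performs MAX_RUNS timed repetitions.  A repetition
   starts with a barrier so all ranks enter together, opens the chosen
   synchronization epoch, issues cnt puts or accumulates of sz MPI_INTs to
   consecutive window offsets, and then closes the epoch; the timestamps
   recorded in t[] are reported by PrintResults. */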
void RunAccFence( MPI_Win win, int destRank, int cnt, int sz, timing t[] )
{
    int k, i, j;
    int *one = (int *)malloc( sz * sizeof(int) );  /* origin buffer: each op sends sz ints */

    for (i=0; i<sz; i++) one[i] = 1;
    for (k=0; k<MAX_RUNS; k++) {
        MPI_Barrier( MPI_COMM_WORLD );
        MPI_Win_fence( 0, win );
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i=0; i<cnt; i++) {
            MPI_Accumulate( one, sz, MPI_INT, destRank,
                            j, sz, MPI_INT, MPI_SUM, win );
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync) MPI_Barrier( MPI_COMM_WORLD );
        MPI_Win_fence( 0, win );
        t[k].endSync = MPI_Wtime();
    }
    free( one );
}
void RunAccLock( MPI_Win win, int destRank, int cnt, int sz, timing t[] )
{
    int k, i, j;
    int *one = (int *)malloc( sz * sizeof(int) );  /* origin buffer: each op sends sz ints */

    for (i=0; i<sz; i++) one[i] = 1;
    for (k=0; k<MAX_RUNS; k++) {
        MPI_Barrier( MPI_COMM_WORLD );
        MPI_Win_lock( MPI_LOCK_SHARED, destRank, 0, win );
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i=0; i<cnt; i++) {
            MPI_Accumulate( one, sz, MPI_INT, destRank,
                            j, sz, MPI_INT, MPI_SUM, win );
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync) MPI_Barrier( MPI_COMM_WORLD );
        MPI_Win_unlock( destRank, win );
        t[k].endSync = MPI_Wtime();
    }
    free( one );
}
void RunPutFence( MPI_Win win, int destRank, int cnt, int sz, timing t[] )
{
    int k, i, j;
    int *one = (int *)malloc( sz * sizeof(int) );  /* origin buffer: each op sends sz ints */

    for (i=0; i<sz; i++) one[i] = 1;
    for (k=0; k<MAX_RUNS; k++) {
        MPI_Barrier( MPI_COMM_WORLD );
        MPI_Win_fence( 0, win );
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i=0; i<cnt; i++) {
            MPI_Put( one, sz, MPI_INT, destRank,
                     j, sz, MPI_INT, win );
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync) MPI_Barrier( MPI_COMM_WORLD );
        MPI_Win_fence( 0, win );
        t[k].endSync = MPI_Wtime();
    }
    free( one );
}
void RunPutLock( MPI_Win win, int destRank, int cnt, int sz, timing t[] )
{
    int k, i, j;
    int *one = (int *)malloc( sz * sizeof(int) );  /* origin buffer: each op sends sz ints */

    for (i=0; i<sz; i++) one[i] = 1;
    for (k=0; k<MAX_RUNS; k++) {
        MPI_Barrier( MPI_COMM_WORLD );
        MPI_Win_lock( MPI_LOCK_SHARED, destRank, 0, win );
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i=0; i<cnt; i++) {
            MPI_Put( one, sz, MPI_INT, destRank, j, sz, MPI_INT, win );
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync) MPI_Barrier( MPI_COMM_WORLD );
        MPI_Win_unlock( destRank, win );
        t[k].endSync = MPI_Wtime();
    }
    free( one );
}
void RunPutPSCW( MPI_Win win, int destRank, int cnt, int sz,
                 MPI_Group exposureGroup, MPI_Group accessGroup, timing t[] )
{
    int k, i, j;
    int *one = (int *)malloc( sz * sizeof(int) );  /* origin buffer: each op sends sz ints */

    for (i=0; i<sz; i++) one[i] = 1;
    for (k=0; k<MAX_RUNS; k++) {
        MPI_Barrier( MPI_COMM_WORLD );
        MPI_Win_post( exposureGroup, 0, win );
        MPI_Win_start( accessGroup, 0, win );
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i=0; i<cnt; i++) {
            MPI_Put( one, sz, MPI_INT, destRank, j, sz, MPI_INT, win );
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync) MPI_Barrier( MPI_COMM_WORLD );
        MPI_Win_complete( win );
        MPI_Win_wait( win );
        t[k].endSync = MPI_Wtime();
    }
    free( one );
}
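/* In the PSCW runs, MPI_Win_complete closes this rank's access epoch (the
   issued operations are complete at the origin when it returns) and
   MPI_Win_wait closes the exposure epoch opened by MPI_Win_post, so
   endSync captures both sides of the handshake. */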
void RunAccPSCW( MPI_Win win, int destRank, int cnt, int sz,
                 MPI_Group exposureGroup, MPI_Group accessGroup, timing t[] )
{
    int k, i, j;
    int *one = (int *)malloc( sz * sizeof(int) );  /* origin buffer: each op sends sz ints */

    for (i=0; i<sz; i++) one[i] = 1;
    for (k=0; k<MAX_RUNS; k++) {
        MPI_Barrier( MPI_COMM_WORLD );
        MPI_Win_post( exposureGroup, 0, win );
        MPI_Win_start( accessGroup, 0, win );
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i=0; i<cnt; i++) {
            MPI_Accumulate( one, sz, MPI_INT, destRank,
                            j, sz, MPI_INT, MPI_SUM, win );
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync) MPI_Barrier( MPI_COMM_WORLD );
        MPI_Win_complete( win );
        MPI_Win_wait( win );
        t[k].endSync = MPI_Wtime();
    }
    free( one );
}
void PrintResults( int cnt, timing t[] )
{
    int k;
    long rate = 0;
    double d1 = 0, d2 = 0, tOp, tSync;
    double minD1 = 1e10, minD2 = 1e10;

    for (k=0; k<MAX_RUNS; k++) {
        tOp = t[k].endOp - t[k].startOp;
        tSync = t[k].endSync - t[k].endOp;
        d1 += tOp;
        d2 += tSync;
        if (tOp < minD1) minD1 = tOp;
        if (tSync < minD2) minD2 = tSync;
    }

    /* Use the minimum times because they are more stable - if timing
       accuracy is an issue, use the min over multiple trials */
    d1 = minD1;
    d2 = minD2;
    /* To report averages instead: d1 = d1 / MAX_RUNS; d2 = d2 / MAX_RUNS; */

    if (d2 > 0) rate = (long)( cnt / d2 );
    /* count, op, sync, op/each, sync/each, rate */
    printf( "%d\t%e\t%e\t%e\t%e\t%ld\n", cnt,
            d1, d2,
            d1 / cnt, d2 / cnt, rate );
}
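/* Example invocations (binary name illustrative):
 *     mpiexec -n 2 ./manyrma                      run all RMA/sync combinations
 *     mpiexec -n 2 ./manyrma -put -fence          only MPI_Put with fence sync
 *     mpiexec -n 2 ./manyrma -acc -lock -maxsz 4 -maxcount 1024 -barrier
 */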