/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 * (C) 2010 by Argonne National Laboratory.
 *     See COPYRIGHT in top-level directory.
 */

/* This test measures the performance of many RMA operations to a single
   target process.
   It uses a number of operations (put or accumulate) to different
   locations in the target window.
   This is one of the ways that RMA may be used, and is used in the
   reference implementation of the graph500 benchmark.
*/

#include "mpi.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
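
/* Example invocation (illustrative only; the launcher and binary name are
 * assumptions, not part of this test):
 *     mpiexec -n 2 ./manyrma -acc -fence -maxsz 8 -maxcount 4096
 * With no options, every RMA operation is run with every synchronization
 * style. */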

#define MAX_COUNT 65536*4
#define MAX_RMA_SIZE 16
#define MAX_RUNS 8      /* timed repetitions per configuration; the value 8 is an assumption */

typedef enum { SYNC_NONE = 0,
    SYNC_ALL = -1, SYNC_FENCE = 1, SYNC_LOCK = 2, SYNC_PSCW = 4
} sync_t;
typedef enum { RMA_NONE = 0, RMA_ALL = -1, RMA_PUT = 1, RMA_ACC = 2, RMA_GET = 4 } rma_t;
/* Note GET not yet implemented */
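/* Both choices are bit flags: the *_ALL values are -1 (all bits set), and each
 * command-line option ORs in one bit, so operations and synchronization styles
 * can be combined freely. */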
sync_t syncChoice = SYNC_ALL;
rma_t rmaChoice = RMA_ALL;

typedef struct {
    double startOp, endOp, endSync;
} timing;

static int verbose = 1;
static int barrierSync = 0;
static double tickThreshold = 0.0;

void PrintResults(int cnt, timing t[]);
void RunAccFence(MPI_Win win, int destRank, int cnt, int sz, timing t[]);
void RunAccLock(MPI_Win win, int destRank, int cnt, int sz, timing t[]);
void RunPutFence(MPI_Win win, int destRank, int cnt, int sz, timing t[]);
void RunPutLock(MPI_Win win, int destRank, int cnt, int sz, timing t[]);
void RunAccPSCW(MPI_Win win, int destRank, int cnt, int sz,
                MPI_Group exposureGroup, MPI_Group accessGroup, timing t[]);
void RunPutPSCW(MPI_Win win, int destRank, int cnt, int sz,
                MPI_Group exposureGroup, MPI_Group accessGroup, timing t[]);

int main(int argc, char *argv[])
{
    int arraysize, i, cnt, sz, maxCount = MAX_COUNT, *arraybuffer;
    int wrank, wsize, destRank, srcRank;
    MPI_Win win;
    MPI_Group wgroup, accessGroup, exposureGroup;
    int maxSz = MAX_RMA_SIZE;
    timing t[MAX_RUNS];

    MPI_Init(&argc, &argv);

    /* Determine clock accuracy */
    tickThreshold = 10.0 * MPI_Wtick();
    MPI_Allreduce(MPI_IN_PLACE, &tickThreshold, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
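    /* Taking the MAX across ranks means every process uses the coarsest clock
     * granularity; measured intervals below this threshold are unreliable. */
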
    for (i = 1; i < argc; i++) {
        if (strcmp(argv[i], "-put") == 0) {
            if (rmaChoice == RMA_ALL)
                rmaChoice = RMA_NONE;
            rmaChoice |= RMA_PUT;
        }
        else if (strcmp(argv[i], "-acc") == 0) {
            if (rmaChoice == RMA_ALL)
                rmaChoice = RMA_NONE;
            rmaChoice |= RMA_ACC;
        }
        else if (strcmp(argv[i], "-fence") == 0) {
            if (syncChoice == SYNC_ALL)
                syncChoice = SYNC_NONE;
            syncChoice |= SYNC_FENCE;
        }
        else if (strcmp(argv[i], "-lock") == 0) {
            if (syncChoice == SYNC_ALL)
                syncChoice = SYNC_NONE;
            syncChoice |= SYNC_LOCK;
        }
        else if (strcmp(argv[i], "-pscw") == 0) {
            if (syncChoice == SYNC_ALL)
                syncChoice = SYNC_NONE;
            syncChoice |= SYNC_PSCW;
        }
        else if (strcmp(argv[i], "-maxsz") == 0) {
            i++;
            maxSz = atoi(argv[i]);
        }
        else if (strcmp(argv[i], "-maxcount") == 0) {
            i++;
            maxCount = atoi(argv[i]);
        }
        else if (strcmp(argv[i], "-barrier") == 0) {
            barrierSync = 1;
        }
        else {
            fprintf(stderr, "Unrecognized argument %s\n", argv[i]);
            fprintf(stderr,
                    "%s [ -put ] [ -acc ] [ -lock ] [ -fence ] [ -pscw ] [ -barrier ] [ -maxsz msgsize ]\n",
                    argv[0]);
            MPI_Abort(MPI_COMM_WORLD, 1);
        }
    }

    MPI_Comm_rank(MPI_COMM_WORLD, &wrank);
    MPI_Comm_size(MPI_COMM_WORLD, &wsize);
    destRank = wrank + 1;
    while (destRank >= wsize)
        destRank = destRank - wsize;
    srcRank = wrank - 1;
    if (srcRank < 0)
        srcRank = srcRank + wsize;

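    /* The ranks form a ring: each rank targets the next higher rank and is
     * targeted by the next lower one, which supplies the PSCW groups below. */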
    /* Create groups for PSCW */
    MPI_Comm_group(MPI_COMM_WORLD, &wgroup);
    MPI_Group_incl(wgroup, 1, &destRank, &accessGroup);
    MPI_Group_incl(wgroup, 1, &srcRank, &exposureGroup);
    MPI_Group_free(&wgroup);

    arraysize = maxSz * MAX_COUNT;
    arraybuffer = (int *) malloc(arraysize * sizeof(int));
    if (!arraybuffer) {
        fprintf(stderr, "Unable to allocate %d words\n", arraysize);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    MPI_Win_create(arraybuffer, arraysize * sizeof(int), (int) sizeof(int),
                   MPI_INFO_NULL, MPI_COMM_WORLD, &win);

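    /* The displacement unit is sizeof(int), so target displacements in the
     * Run* routines below count ints, not bytes. */
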
    /* FIXME: we need a test on performance consistency.
     * The test needs to have both a relative growth limit and
     * an absolute limit. */

    if (maxCount > MAX_COUNT) {
        fprintf(stderr, "MaxCount must not exceed %d\n", MAX_COUNT);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

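    /* For each enabled (operation, synchronization) pair, the element count sz
     * doubles up to maxSz and the operation count cnt doubles up to maxCount. */
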
    if ((syncChoice & SYNC_FENCE) && (rmaChoice & RMA_ACC)) {
        for (sz = 1; sz <= maxSz; sz = sz + sz) {
            if (wrank == 0)
                printf("Accumulate with fence, %d elements\n", sz);
            cnt = 1;
            while (cnt <= maxCount) {
                RunAccFence(win, destRank, cnt, sz, t);
                if (wrank == 0)
                    PrintResults(cnt, t);
                cnt = 2 * cnt;
            }
        }
    }

    if ((syncChoice & SYNC_LOCK) && (rmaChoice & RMA_ACC)) {
        for (sz = 1; sz <= maxSz; sz = sz + sz) {
            if (wrank == 0)
                printf("Accumulate with lock, %d elements\n", sz);
            cnt = 1;
            while (cnt <= maxCount) {
                RunAccLock(win, destRank, cnt, sz, t);
                if (wrank == 0)
                    PrintResults(cnt, t);
                cnt = 2 * cnt;
            }
        }
    }

    if ((syncChoice & SYNC_FENCE) && (rmaChoice & RMA_PUT)) {
        for (sz = 1; sz <= maxSz; sz = sz + sz) {
            if (wrank == 0)
                printf("Put with fence, %d elements\n", sz);
            cnt = 1;
            while (cnt <= maxCount) {
                RunPutFence(win, destRank, cnt, sz, t);
                if (wrank == 0)
                    PrintResults(cnt, t);
                cnt = 2 * cnt;
            }
        }
    }

    if ((syncChoice & SYNC_LOCK) && (rmaChoice & RMA_PUT)) {
        for (sz = 1; sz <= maxSz; sz = sz + sz) {
            if (wrank == 0)
                printf("Put with lock, %d elements\n", sz);
            cnt = 1;
            while (cnt <= maxCount) {
                RunPutLock(win, destRank, cnt, sz, t);
                if (wrank == 0)
                    PrintResults(cnt, t);
                cnt = 2 * cnt;
            }
        }
    }

    if ((syncChoice & SYNC_PSCW) && (rmaChoice & RMA_PUT)) {
        for (sz = 1; sz <= maxSz; sz = sz + sz) {
            if (wrank == 0)
                printf("Put with pscw, %d elements\n", sz);
            cnt = 1;
            while (cnt <= maxCount) {
                RunPutPSCW(win, destRank, cnt, sz, exposureGroup, accessGroup, t);
                if (wrank == 0)
                    PrintResults(cnt, t);
                cnt = 2 * cnt;
            }
        }
    }

    if ((syncChoice & SYNC_PSCW) && (rmaChoice & RMA_ACC)) {
        for (sz = 1; sz <= maxSz; sz = sz + sz) {
            if (wrank == 0)
                printf("Accumulate with pscw, %d elements\n", sz);
            cnt = 1;
            while (cnt <= maxCount) {
                RunAccPSCW(win, destRank, cnt, sz, exposureGroup, accessGroup, t);
                if (wrank == 0)
                    PrintResults(cnt, t);
                cnt = 2 * cnt;
            }
        }
    }

    MPI_Group_free(&accessGroup);
    MPI_Group_free(&exposureGroup);
    MPI_Win_free(&win);
    free(arraybuffer);

    MPI_Finalize();
    return 0;
}

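/* Each Run* routine below performs MAX_RUNS timed repetitions: startOp..endOp
 * covers issuing the cnt operations, and endOp..endSync covers the closing
 * synchronization call that completes them.  With -barrier, an extra barrier
 * before the closing call keeps other ranks' lateness out of the sync time. */
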
void RunAccFence(MPI_Win win, int destRank, int cnt, int sz, timing t[])
{
    int k, i, j;
    int one[MAX_RMA_SIZE] = { 1 };      /* origin buffer: sz elements are read, so a scalar is not enough */

    for (k = 0; k < MAX_RUNS; k++) {
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_fence(0, win);
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i = 0; i < cnt; i++) {
            MPI_Accumulate(one, sz, MPI_INT, destRank, j, sz, MPI_INT, MPI_SUM, win);
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync)
            MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_fence(0, win);
        t[k].endSync = MPI_Wtime();
    }
}

void RunAccLock(MPI_Win win, int destRank, int cnt, int sz, timing t[])
{
    int k, i, j;
    int one[MAX_RMA_SIZE] = { 1 };      /* origin buffer: sz elements are read */

    for (k = 0; k < MAX_RUNS; k++) {
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_lock(MPI_LOCK_SHARED, destRank, 0, win);
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i = 0; i < cnt; i++) {
            MPI_Accumulate(one, sz, MPI_INT, destRank, j, sz, MPI_INT, MPI_SUM, win);
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync)
            MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_unlock(destRank, win);
        t[k].endSync = MPI_Wtime();
    }
}

void RunPutFence(MPI_Win win, int destRank, int cnt, int sz, timing t[])
{
    int k, i, j;
    int one[MAX_RMA_SIZE] = { 1 };      /* origin buffer: sz elements are read */

    for (k = 0; k < MAX_RUNS; k++) {
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_fence(0, win);
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i = 0; i < cnt; i++) {
            MPI_Put(one, sz, MPI_INT, destRank, j, sz, MPI_INT, win);
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync)
            MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_fence(0, win);
        t[k].endSync = MPI_Wtime();
    }
}

void RunPutLock(MPI_Win win, int destRank, int cnt, int sz, timing t[])
{
    int k, i, j;
    int one[MAX_RMA_SIZE] = { 1 };      /* origin buffer: sz elements are read */

    for (k = 0; k < MAX_RUNS; k++) {
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_lock(MPI_LOCK_SHARED, destRank, 0, win);
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i = 0; i < cnt; i++) {
            MPI_Put(one, sz, MPI_INT, destRank, j, sz, MPI_INT, win);
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync)
            MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_unlock(destRank, win);
        t[k].endSync = MPI_Wtime();
    }
}

void RunPutPSCW(MPI_Win win, int destRank, int cnt, int sz,
                MPI_Group exposureGroup, MPI_Group accessGroup, timing t[])
{
    int k, i, j;
    int one[MAX_RMA_SIZE] = { 1 };      /* origin buffer: sz elements are read */

    for (k = 0; k < MAX_RUNS; k++) {
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_post(exposureGroup, 0, win);
        MPI_Win_start(accessGroup, 0, win);
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i = 0; i < cnt; i++) {
            MPI_Put(one, sz, MPI_INT, destRank, j, sz, MPI_INT, win);
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync)
            MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_complete(win);
        MPI_Win_wait(win);
        t[k].endSync = MPI_Wtime();
    }
}

void RunAccPSCW(MPI_Win win, int destRank, int cnt, int sz,
                MPI_Group exposureGroup, MPI_Group accessGroup, timing t[])
{
    int k, i, j;
    int one[MAX_RMA_SIZE] = { 1 };      /* origin buffer: sz elements are read */

    for (k = 0; k < MAX_RUNS; k++) {
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_post(exposureGroup, 0, win);
        MPI_Win_start(accessGroup, 0, win);
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i = 0; i < cnt; i++) {
            MPI_Accumulate(one, sz, MPI_INT, destRank, j, sz, MPI_INT, MPI_SUM, win);
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync)
            MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_complete(win);
        MPI_Win_wait(win);
        t[k].endSync = MPI_Wtime();
    }
}

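/* Print one tab-separated line per operation count: total op time, total sync
 * time, per-operation times, and the rate; the minimum over MAX_RUNS is used
 * because it is the most stable statistic. */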
void PrintResults(int cnt, timing t[])
{
    int k;
    long rate;
    double tOp, tSync, d1 = 0, d2 = 0;
    double minD1 = 1e10, minD2 = 1e10;
    for (k = 0; k < MAX_RUNS; k++) {
        tOp = t[k].endOp - t[k].startOp;
        tSync = t[k].endSync - t[k].endOp;
        d1 += tOp;
        d2 += tSync;
        if (tOp < minD1) minD1 = tOp;
        if (tSync < minD2) minD2 = tSync;
    }
    /* Use the minimum times because they are more stable - if timing
     * accuracy is an issue, use the min over multiple trials */
    d1 = minD1;
    d2 = minD2;
    /* d1 = d1 / MAX_RUNS; d2 = d2 / MAX_RUNS; */
    if (d2 < tickThreshold)
        d2 = tickThreshold;     /* guard (assumed): avoid dividing by a time below clock resolution */
    rate = (long) (cnt / d2);
    /* count, op, sync, op/each, sync/each, rate */
    printf("%d\t%e\t%e\t%e\t%e\t%ld\n", cnt, d1, d2, d1 / cnt, d2 / cnt, rate);
}