/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 * (C) 2010 by Argonne National Laboratory.
 *     See COPYRIGHT in top-level directory.
 */

/* This test measures the performance of many RMA operations to a single
   target process.
   It uses a number of operations (put or accumulate) to different
   locations in the target window.
   This is one of the ways that RMA may be used, and is used in the
   reference implementation of the graph500 benchmark.
*/

#include "mpi.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
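
/* Example invocation (illustrative only; the launcher and binary name are
 * assumptions, not part of this test):
 *     mpiexec -n 2 ./manyrma -acc -fence -maxsz 8 -maxcount 4096
 * With no options, every RMA operation is run with every synchronization
 * style. */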

#define MAX_COUNT 65536*4
#define MAX_RMA_SIZE 16
#define MAX_RUNS 8      /* timed repetitions per configuration; the value 8 is an assumption */

typedef enum { SYNC_NONE = 0,
    SYNC_ALL = -1, SYNC_FENCE = 1, SYNC_LOCK = 2, SYNC_PSCW = 4
} sync_t;
typedef enum { RMA_NONE = 0, RMA_ALL = -1, RMA_PUT = 1, RMA_ACC = 2, RMA_GET = 4 } rma_t;
/* Note GET not yet implemented */
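/* Both choices are bit flags: the *_ALL values are -1 (all bits set), and each
 * command-line option ORs in one bit, so operations and synchronization styles
 * can be combined freely. */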
sync_t syncChoice = SYNC_ALL;
rma_t rmaChoice = RMA_ALL;

typedef struct {
    double startOp, endOp, endSync;
} timing;

static int verbose = 1;
static int barrierSync = 0;
static double tickThreshold = 0.0;

void PrintResults(int cnt, timing t[]);
void RunAccFence(MPI_Win win, int destRank, int cnt, int sz, timing t[]);
void RunAccLock(MPI_Win win, int destRank, int cnt, int sz, timing t[]);
void RunPutFence(MPI_Win win, int destRank, int cnt, int sz, timing t[]);
void RunPutLock(MPI_Win win, int destRank, int cnt, int sz, timing t[]);
void RunAccPSCW(MPI_Win win, int destRank, int cnt, int sz,
                MPI_Group exposureGroup, MPI_Group accessGroup, timing t[]);
void RunPutPSCW(MPI_Win win, int destRank, int cnt, int sz,
                MPI_Group exposureGroup, MPI_Group accessGroup, timing t[]);

int main(int argc, char *argv[])
{
    int arraysize, i, cnt, sz, maxCount = MAX_COUNT, *arraybuffer;
    int wrank, wsize, destRank, srcRank;
    MPI_Win win;
    MPI_Group wgroup, accessGroup, exposureGroup;
    int maxSz = MAX_RMA_SIZE;
    timing t[MAX_RUNS];

    MPI_Init(&argc, &argv);

    /* Determine clock accuracy */
    tickThreshold = 10.0 * MPI_Wtick();
    MPI_Allreduce(MPI_IN_PLACE, &tickThreshold, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
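    /* Taking the MAX across ranks means every process uses the coarsest clock
     * granularity; measured intervals below this threshold are unreliable. */
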
    for (i = 1; i < argc; i++) {
        if (strcmp(argv[i], "-put") == 0) {
            if (rmaChoice == RMA_ALL)
                rmaChoice = RMA_NONE;
            rmaChoice |= RMA_PUT;
        }
        else if (strcmp(argv[i], "-acc") == 0) {
            if (rmaChoice == RMA_ALL)
                rmaChoice = RMA_NONE;
            rmaChoice |= RMA_ACC;
        }
        else if (strcmp(argv[i], "-fence") == 0) {
            if (syncChoice == SYNC_ALL)
                syncChoice = SYNC_NONE;
            syncChoice |= SYNC_FENCE;
        }
        else if (strcmp(argv[i], "-lock") == 0) {
            if (syncChoice == SYNC_ALL)
                syncChoice = SYNC_NONE;
            syncChoice |= SYNC_LOCK;
        }
        else if (strcmp(argv[i], "-pscw") == 0) {
            if (syncChoice == SYNC_ALL)
                syncChoice = SYNC_NONE;
            syncChoice |= SYNC_PSCW;
        }
        else if (strcmp(argv[i], "-maxsz") == 0) {
            i++;
            maxSz = atoi(argv[i]);
        }
        else if (strcmp(argv[i], "-maxcount") == 0) {
            i++;
            maxCount = atoi(argv[i]);
        }
        else if (strcmp(argv[i], "-barrier") == 0) {
            barrierSync = 1;
        }
        else {
            fprintf(stderr, "Unrecognized argument %s\n", argv[i]);
            fprintf(stderr,
                    "%s [ -put ] [ -acc ] [ -lock ] [ -fence ] [ -pscw ] [ -barrier ] [ -maxsz msgsize ]\n",
                    argv[0]);
            MPI_Abort(MPI_COMM_WORLD, 1);
        }
    }

    MPI_Comm_rank(MPI_COMM_WORLD, &wrank);
    MPI_Comm_size(MPI_COMM_WORLD, &wsize);
    destRank = wrank + 1;
    while (destRank >= wsize)
        destRank = destRank - wsize;
    srcRank = wrank - 1;
    if (srcRank < 0)
        srcRank = srcRank + wsize;

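    /* The ranks form a ring: each rank targets the next higher rank and is
     * targeted by the next lower one, which supplies the PSCW groups below. */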
    /* Create groups for PSCW */
    MPI_Comm_group(MPI_COMM_WORLD, &wgroup);
    MPI_Group_incl(wgroup, 1, &destRank, &accessGroup);
    MPI_Group_incl(wgroup, 1, &srcRank, &exposureGroup);
    MPI_Group_free(&wgroup);

    arraysize = maxSz * MAX_COUNT;
    arraybuffer = (int *) malloc(arraysize * sizeof(int));
    if (!arraybuffer) {
        fprintf(stderr, "Unable to allocate %d words\n", arraysize);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    MPI_Win_create(arraybuffer, arraysize * sizeof(int), (int) sizeof(int),
                   MPI_INFO_NULL, MPI_COMM_WORLD, &win);

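    /* The displacement unit is sizeof(int), so target displacements in the
     * Run* routines below count ints, not bytes. */
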
    /* FIXME: we need a test on performance consistency.
     * The test needs to have both a relative growth limit and
     * an absolute limit. */

    if (maxCount > MAX_COUNT) {
        fprintf(stderr, "MaxCount must not exceed %d\n", MAX_COUNT);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

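    /* For each enabled (operation, synchronization) pair, the element count sz
     * doubles up to maxSz and the operation count cnt doubles up to maxCount. */
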
    if ((syncChoice & SYNC_FENCE) && (rmaChoice & RMA_ACC)) {
        for (sz = 1; sz <= maxSz; sz = sz + sz) {
            if (wrank == 0)
                printf("Accumulate with fence, %d elements\n", sz);
            cnt = 1;
            while (cnt <= maxCount) {
                RunAccFence(win, destRank, cnt, sz, t);
                if (wrank == 0)
                    PrintResults(cnt, t);
                cnt = 2 * cnt;
            }
        }
    }

    if ((syncChoice & SYNC_LOCK) && (rmaChoice & RMA_ACC)) {
        for (sz = 1; sz <= maxSz; sz = sz + sz) {
            if (wrank == 0)
                printf("Accumulate with lock, %d elements\n", sz);
            cnt = 1;
            while (cnt <= maxCount) {
                RunAccLock(win, destRank, cnt, sz, t);
                if (wrank == 0)
                    PrintResults(cnt, t);
                cnt = 2 * cnt;
            }
        }
    }

    if ((syncChoice & SYNC_FENCE) && (rmaChoice & RMA_PUT)) {
        for (sz = 1; sz <= maxSz; sz = sz + sz) {
            if (wrank == 0)
                printf("Put with fence, %d elements\n", sz);
            cnt = 1;
            while (cnt <= maxCount) {
                RunPutFence(win, destRank, cnt, sz, t);
                if (wrank == 0)
                    PrintResults(cnt, t);
                cnt = 2 * cnt;
            }
        }
    }

    if ((syncChoice & SYNC_LOCK) && (rmaChoice & RMA_PUT)) {
        for (sz = 1; sz <= maxSz; sz = sz + sz) {
            if (wrank == 0)
                printf("Put with lock, %d elements\n", sz);
            cnt = 1;
            while (cnt <= maxCount) {
                RunPutLock(win, destRank, cnt, sz, t);
                if (wrank == 0)
                    PrintResults(cnt, t);
                cnt = 2 * cnt;
            }
        }
    }

    if ((syncChoice & SYNC_PSCW) && (rmaChoice & RMA_PUT)) {
        for (sz = 1; sz <= maxSz; sz = sz + sz) {
            if (wrank == 0)
                printf("Put with pscw, %d elements\n", sz);
            cnt = 1;
            while (cnt <= maxCount) {
                RunPutPSCW(win, destRank, cnt, sz, exposureGroup, accessGroup, t);
                if (wrank == 0)
                    PrintResults(cnt, t);
                cnt = 2 * cnt;
            }
        }
    }

    if ((syncChoice & SYNC_PSCW) && (rmaChoice & RMA_ACC)) {
        for (sz = 1; sz <= maxSz; sz = sz + sz) {
            if (wrank == 0)
                printf("Accumulate with pscw, %d elements\n", sz);
            cnt = 1;
            while (cnt <= maxCount) {
                RunAccPSCW(win, destRank, cnt, sz, exposureGroup, accessGroup, t);
                if (wrank == 0)
                    PrintResults(cnt, t);
                cnt = 2 * cnt;
            }
        }
    }

    MPI_Group_free(&accessGroup);
    MPI_Group_free(&exposureGroup);
    MPI_Win_free(&win);
    free(arraybuffer);

    MPI_Finalize();
    return 0;
}

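/* Each Run* routine below performs MAX_RUNS timed repetitions: startOp..endOp
 * covers issuing the cnt operations, and endOp..endSync covers the closing
 * synchronization call that completes them.  With -barrier, an extra barrier
 * before the closing call keeps other ranks' lateness out of the sync time. */
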
void RunAccFence(MPI_Win win, int destRank, int cnt, int sz, timing t[])
{
    int k, i, j;
    int one[MAX_RMA_SIZE] = { 1 };      /* origin buffer: sz elements are read, so a scalar is not enough */

    for (k = 0; k < MAX_RUNS; k++) {
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_fence(0, win);
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i = 0; i < cnt; i++) {
            MPI_Accumulate(one, sz, MPI_INT, destRank, j, sz, MPI_INT, MPI_SUM, win);
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync)
            MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_fence(0, win);
        t[k].endSync = MPI_Wtime();
    }
}

void RunAccLock(MPI_Win win, int destRank, int cnt, int sz, timing t[])
{
    int k, i, j;
    int one[MAX_RMA_SIZE] = { 1 };      /* origin buffer: sz elements are read */

    for (k = 0; k < MAX_RUNS; k++) {
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_lock(MPI_LOCK_SHARED, destRank, 0, win);
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i = 0; i < cnt; i++) {
            MPI_Accumulate(one, sz, MPI_INT, destRank, j, sz, MPI_INT, MPI_SUM, win);
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync)
            MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_unlock(destRank, win);
        t[k].endSync = MPI_Wtime();
    }
}

void RunPutFence(MPI_Win win, int destRank, int cnt, int sz, timing t[])
{
    int k, i, j;
    int one[MAX_RMA_SIZE] = { 1 };      /* origin buffer: sz elements are read */

    for (k = 0; k < MAX_RUNS; k++) {
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_fence(0, win);
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i = 0; i < cnt; i++) {
            MPI_Put(one, sz, MPI_INT, destRank, j, sz, MPI_INT, win);
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync)
            MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_fence(0, win);
        t[k].endSync = MPI_Wtime();
    }
}

void RunPutLock(MPI_Win win, int destRank, int cnt, int sz, timing t[])
{
    int k, i, j;
    int one[MAX_RMA_SIZE] = { 1 };      /* origin buffer: sz elements are read */

    for (k = 0; k < MAX_RUNS; k++) {
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_lock(MPI_LOCK_SHARED, destRank, 0, win);
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i = 0; i < cnt; i++) {
            MPI_Put(one, sz, MPI_INT, destRank, j, sz, MPI_INT, win);
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync)
            MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_unlock(destRank, win);
        t[k].endSync = MPI_Wtime();
    }
}

void RunPutPSCW(MPI_Win win, int destRank, int cnt, int sz,
                MPI_Group exposureGroup, MPI_Group accessGroup, timing t[])
{
    int k, i, j;
    int one[MAX_RMA_SIZE] = { 1 };      /* origin buffer: sz elements are read */

    for (k = 0; k < MAX_RUNS; k++) {
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_post(exposureGroup, 0, win);
        MPI_Win_start(accessGroup, 0, win);
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i = 0; i < cnt; i++) {
            MPI_Put(one, sz, MPI_INT, destRank, j, sz, MPI_INT, win);
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync)
            MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_complete(win);
        MPI_Win_wait(win);
        t[k].endSync = MPI_Wtime();
    }
}

void RunAccPSCW(MPI_Win win, int destRank, int cnt, int sz,
                MPI_Group exposureGroup, MPI_Group accessGroup, timing t[])
{
    int k, i, j;
    int one[MAX_RMA_SIZE] = { 1 };      /* origin buffer: sz elements are read */

    for (k = 0; k < MAX_RUNS; k++) {
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_post(exposureGroup, 0, win);
        MPI_Win_start(accessGroup, 0, win);
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i = 0; i < cnt; i++) {
            MPI_Accumulate(one, sz, MPI_INT, destRank, j, sz, MPI_INT, MPI_SUM, win);
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync)
            MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_complete(win);
        MPI_Win_wait(win);
        t[k].endSync = MPI_Wtime();
    }
}

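/* Print one tab-separated line per operation count: total op time, total sync
 * time, per-operation times, and the rate; the minimum over MAX_RUNS is used
 * because it is the most stable statistic. */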
void PrintResults(int cnt, timing t[])
{
    int k;
    long rate;
    double tOp, tSync, d1 = 0, d2 = 0;
    double minD1 = 1e10, minD2 = 1e10;
    for (k = 0; k < MAX_RUNS; k++) {
        tOp = t[k].endOp - t[k].startOp;
        tSync = t[k].endSync - t[k].endOp;
        d1 += tOp;
        d2 += tSync;
        if (tOp < minD1) minD1 = tOp;
        if (tSync < minD2) minD2 = tSync;
    }
    /* Use the minimum times because they are more stable - if timing
     * accuracy is an issue, use the min over multiple trials */
    d1 = minD1;
    d2 = minD2;
    /* d1 = d1 / MAX_RUNS; d2 = d2 / MAX_RUNS; */
    if (d2 < tickThreshold)
        d2 = tickThreshold;     /* guard (assumed): avoid dividing by a time below clock resolution */
    rate = (long) (cnt / d2);
    /* count, op, sync, op/each, sync/each, rate */
    printf("%d\t%e\t%e\t%e\t%e\t%ld\n", cnt, d1, d2, d1 / cnt, d2 / cnt, rate);
}