/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2010 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */
/* This test measures the performance of many rma operations to a single
   target process.
   It uses a number of operations (put or accumulate) to different
   locations in the target window.
   This is one of the ways that RMA may be used, and is used in the
   reference implementation of the graph500 benchmark.
*/
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_COUNT    65536*4
#define MAX_RMA_SIZE 16
#define MAX_RUNS     8       /* timed repetitions per configuration; value assumed */
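/* Each timed case issues many small RMA operations inside a single
   synchronization epoch (sketch of what the Run* routines below do):

       <open sync>                    fence / lock / post+start
       for (i=0; i<cnt; i++)
           put or accumulate sz ints at offset i*sz in the target window
       <close sync>                   fence / unlock / complete(+wait)
*/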
typedef enum { SYNC_NONE=0,
               SYNC_ALL=-1, SYNC_FENCE=1, SYNC_LOCK=2, SYNC_PSCW=4 } sync_t;
typedef enum { RMA_NONE=0, RMA_ALL=-1, RMA_PUT=1, RMA_ACC=2, RMA_GET=4 } rma_t;
/* Note GET not yet implemented */
sync_t syncChoice = SYNC_ALL;
rma_t rmaChoice = RMA_ALL;

typedef struct {
    double startOp, endOp, endSync;
} timing;
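/* One timing record per repetition: startOp is taken just before the loop
   of RMA calls, endOp just after it, and endSync after the closing
   synchronization call, separating issue cost from completion cost. */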
static int verbose = 1;
static int barrierSync = 0;
static double tickThreshold = 0.0;
void PrintResults( int cnt, timing t[] );
void RunAccFence( MPI_Win win, int destRank, int cnt, int sz, timing t[] );
void RunAccLock( MPI_Win win, int destRank, int cnt, int sz, timing t[] );
void RunPutFence( MPI_Win win, int destRank, int cnt, int sz, timing t[] );
void RunPutLock( MPI_Win win, int destRank, int cnt, int sz, timing t[] );
void RunAccPSCW( MPI_Win win, int destRank, int cnt, int sz,
                 MPI_Group exposureGroup, MPI_Group accessGroup, timing t[] );
void RunPutPSCW( MPI_Win win, int destRank, int cnt, int sz,
                 MPI_Group exposureGroup, MPI_Group accessGroup, timing t[] );
int main( int argc, char *argv[] )
{
    int arraysize, i, cnt, sz, maxCount=MAX_COUNT, *arraybuffer;
    int wrank, wsize, destRank, srcRank;
    MPI_Win win;
    MPI_Group wgroup, accessGroup, exposureGroup;
    timing t[MAX_RUNS];
    int maxSz = MAX_RMA_SIZE;

    MPI_Init( &argc, &argv );

    /* Determine clock accuracy */
    tickThreshold = 10.0 * MPI_Wtick();
    MPI_Allreduce( MPI_IN_PLACE, &tickThreshold, 1, MPI_DOUBLE, MPI_MAX,
                   MPI_COMM_WORLD );
    for (i=1; i<argc; i++) {
        if (strcmp( argv[i], "-put" ) == 0) {
            if (rmaChoice == RMA_ALL) rmaChoice = RMA_NONE;
            rmaChoice |= RMA_PUT;
        }
        else if (strcmp( argv[i], "-acc" ) == 0) {
            if (rmaChoice == RMA_ALL) rmaChoice = RMA_NONE;
            rmaChoice |= RMA_ACC;
        }
        else if (strcmp( argv[i], "-fence" ) == 0) {
            if (syncChoice == SYNC_ALL) syncChoice = SYNC_NONE;
            syncChoice |= SYNC_FENCE;
        }
        else if (strcmp( argv[i], "-lock" ) == 0) {
            if (syncChoice == SYNC_ALL) syncChoice = SYNC_NONE;
            syncChoice |= SYNC_LOCK;
        }
        else if (strcmp( argv[i], "-pscw" ) == 0) {
            if (syncChoice == SYNC_ALL) syncChoice = SYNC_NONE;
            syncChoice |= SYNC_PSCW;
        }
        else if (strcmp( argv[i], "-maxsz" ) == 0) {
            i++;
            maxSz = atoi( argv[i] );
        }
        else if (strcmp( argv[i], "-maxcount" ) == 0) {
            i++;
            maxCount = atoi( argv[i] );
        }
        else if (strcmp( argv[i], "-barrier" ) == 0) {
            barrierSync = 1;
        }
        else {
            fprintf( stderr, "Unrecognized argument %s\n", argv[i] );
            fprintf( stderr, "%s [ -put ] [ -acc ] [ -lock ] [ -fence ] [ -pscw ] [ -barrier ] [ -maxsz msgsize ] [ -maxcount count ]\n", argv[0] );
            MPI_Abort( MPI_COMM_WORLD, 1 );
        }
    }
    MPI_Comm_rank( MPI_COMM_WORLD, &wrank );
    MPI_Comm_size( MPI_COMM_WORLD, &wsize );
    destRank = wrank + 1;
    while (destRank >= wsize) destRank = destRank - wsize;
    srcRank = wrank - 1;
    if (srcRank < 0) srcRank += wsize;

    /* Create groups for PSCW */
    MPI_Comm_group( MPI_COMM_WORLD, &wgroup );
    MPI_Group_incl( wgroup, 1, &destRank, &accessGroup );
    MPI_Group_incl( wgroup, 1, &srcRank, &exposureGroup );
    MPI_Group_free( &wgroup );
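    /* Ring pattern: this rank accesses the window of destRank (the next
       rank), so the access group contains destRank, and it is the target
       of srcRank (the previous rank), so the exposure group contains
       srcRank.  Every rank is therefore both origin and target. */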
    arraysize = maxSz * MAX_COUNT;
    arraybuffer = (int*)malloc( arraysize * sizeof(int) );
    if (!arraybuffer) {
        fprintf( stderr, "Unable to allocate %d words\n", arraysize );
        MPI_Abort( MPI_COMM_WORLD, 1 );
    }

    MPI_Win_create( arraybuffer, arraysize*sizeof(int), (int)sizeof(int),
                    MPI_INFO_NULL, MPI_COMM_WORLD, &win );

    /* FIXME: we need a test on performance consistency.
       The test needs to have both a relative growth limit and
       an absolute limit.
    */

    if (maxCount > MAX_COUNT) {
        fprintf( stderr, "MaxCount must not exceed %d\n", MAX_COUNT );
        MPI_Abort( MPI_COMM_WORLD, 1 );
    }
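    /* Each enabled combination below sweeps the transfer size sz (doubling
       from 1 up to maxSz ints) and, for each size, the operation count cnt
       (doubling from 1 up to maxCount), printing one result line per case. */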
    if ((syncChoice & SYNC_FENCE) && (rmaChoice & RMA_ACC)) {
        for (sz=1; sz<=maxSz; sz = sz + sz) {
            if (wrank == 0 && verbose)
                printf( "Accumulate with fence, %d elements\n", sz );
            cnt = 1;
            while (cnt <= maxCount) {
                RunAccFence( win, destRank, cnt, sz, t );
                if (wrank == 0)
                    PrintResults( cnt, t );
                cnt = 2 * cnt;
            }
        }
    }
    if ((syncChoice & SYNC_LOCK) && (rmaChoice & RMA_ACC)) {
        for (sz=1; sz<=maxSz; sz = sz + sz) {
            if (wrank == 0 && verbose)
                printf( "Accumulate with lock, %d elements\n", sz );
            cnt = 1;
            while (cnt <= maxCount) {
                RunAccLock( win, destRank, cnt, sz, t );
                if (wrank == 0)
                    PrintResults( cnt, t );
                cnt = 2 * cnt;
            }
        }
    }
    if ((syncChoice & SYNC_FENCE) && (rmaChoice & RMA_PUT)) {
        for (sz=1; sz<=maxSz; sz = sz + sz) {
            if (wrank == 0 && verbose)
                printf( "Put with fence, %d elements\n", sz );
            cnt = 1;
            while (cnt <= maxCount) {
                RunPutFence( win, destRank, cnt, sz, t );
                if (wrank == 0)
                    PrintResults( cnt, t );
                cnt = 2 * cnt;
            }
        }
    }
    if ((syncChoice & SYNC_LOCK) && (rmaChoice & RMA_PUT)) {
        for (sz=1; sz<=maxSz; sz = sz + sz) {
            if (wrank == 0 && verbose)
                printf( "Put with lock, %d elements\n", sz );
            cnt = 1;
            while (cnt <= maxCount) {
                RunPutLock( win, destRank, cnt, sz, t );
                if (wrank == 0)
                    PrintResults( cnt, t );
                cnt = 2 * cnt;
            }
        }
    }
    if ((syncChoice & SYNC_PSCW) && (rmaChoice & RMA_PUT)) {
        for (sz=1; sz<=maxSz; sz = sz + sz) {
            if (wrank == 0 && verbose)
                printf( "Put with pscw, %d elements\n", sz );
            cnt = 1;
            while (cnt <= maxCount) {
                RunPutPSCW( win, destRank, cnt, sz,
                            exposureGroup, accessGroup, t );
                if (wrank == 0)
                    PrintResults( cnt, t );
                cnt = 2 * cnt;
            }
        }
    }
    if ((syncChoice & SYNC_PSCW) && (rmaChoice & RMA_ACC)) {
        for (sz=1; sz<=maxSz; sz = sz + sz) {
            if (wrank == 0 && verbose)
                printf( "Accumulate with pscw, %d elements\n", sz );
            cnt = 1;
            while (cnt <= maxCount) {
                RunAccPSCW( win, destRank, cnt, sz,
                            exposureGroup, accessGroup, t );
                if (wrank == 0)
                    PrintResults( cnt, t );
                cnt = 2 * cnt;
            }
        }
    }
    MPI_Win_free( &win );
    free( arraybuffer );
    MPI_Group_free( &accessGroup );
    MPI_Group_free( &exposureGroup );

    MPI_Finalize();
    return 0;
}
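/* Each Run* routine below performs MAX_RUNS timed repetitions.  A repetition
   starts with a barrier so all ranks enter together, opens the chosen
   synchronization epoch, issues cnt puts or accumulates of sz MPI_INTs to
   consecutive window offsets, and then closes the epoch; the timestamps
   recorded in t[] are reported by PrintResults. */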
void RunAccFence( MPI_Win win, int destRank, int cnt, int sz, timing t[] )
{
    int k, i, j;
    int *one = (int *)malloc( sz * sizeof(int) );  /* origin buffer: each op sends sz ints */

    for (i=0; i<sz; i++) one[i] = 1;
    for (k=0; k<MAX_RUNS; k++) {
        MPI_Barrier( MPI_COMM_WORLD );
        MPI_Win_fence( 0, win );
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i=0; i<cnt; i++) {
            MPI_Accumulate( one, sz, MPI_INT, destRank,
                            j, sz, MPI_INT, MPI_SUM, win );
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync) MPI_Barrier( MPI_COMM_WORLD );
        MPI_Win_fence( 0, win );
        t[k].endSync = MPI_Wtime();
    }
    free( one );
}
void RunAccLock( MPI_Win win, int destRank, int cnt, int sz, timing t[] )
{
    int k, i, j;
    int *one = (int *)malloc( sz * sizeof(int) );  /* origin buffer: each op sends sz ints */

    for (i=0; i<sz; i++) one[i] = 1;
    for (k=0; k<MAX_RUNS; k++) {
        MPI_Barrier( MPI_COMM_WORLD );
        MPI_Win_lock( MPI_LOCK_SHARED, destRank, 0, win );
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i=0; i<cnt; i++) {
            MPI_Accumulate( one, sz, MPI_INT, destRank,
                            j, sz, MPI_INT, MPI_SUM, win );
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync) MPI_Barrier( MPI_COMM_WORLD );
        MPI_Win_unlock( destRank, win );
        t[k].endSync = MPI_Wtime();
    }
    free( one );
}
void RunPutFence( MPI_Win win, int destRank, int cnt, int sz, timing t[] )
{
    int k, i, j;
    int *one = (int *)malloc( sz * sizeof(int) );  /* origin buffer: each op sends sz ints */

    for (i=0; i<sz; i++) one[i] = 1;
    for (k=0; k<MAX_RUNS; k++) {
        MPI_Barrier( MPI_COMM_WORLD );
        MPI_Win_fence( 0, win );
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i=0; i<cnt; i++) {
            MPI_Put( one, sz, MPI_INT, destRank,
                     j, sz, MPI_INT, win );
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync) MPI_Barrier( MPI_COMM_WORLD );
        MPI_Win_fence( 0, win );
        t[k].endSync = MPI_Wtime();
    }
    free( one );
}
void RunPutLock( MPI_Win win, int destRank, int cnt, int sz, timing t[] )
{
    int k, i, j;
    int *one = (int *)malloc( sz * sizeof(int) );  /* origin buffer: each op sends sz ints */

    for (i=0; i<sz; i++) one[i] = 1;
    for (k=0; k<MAX_RUNS; k++) {
        MPI_Barrier( MPI_COMM_WORLD );
        MPI_Win_lock( MPI_LOCK_SHARED, destRank, 0, win );
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i=0; i<cnt; i++) {
            MPI_Put( one, sz, MPI_INT, destRank, j, sz, MPI_INT, win );
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync) MPI_Barrier( MPI_COMM_WORLD );
        MPI_Win_unlock( destRank, win );
        t[k].endSync = MPI_Wtime();
    }
    free( one );
}
void RunPutPSCW( MPI_Win win, int destRank, int cnt, int sz,
                 MPI_Group exposureGroup, MPI_Group accessGroup, timing t[] )
{
    int k, i, j;
    int *one = (int *)malloc( sz * sizeof(int) );  /* origin buffer: each op sends sz ints */

    for (i=0; i<sz; i++) one[i] = 1;
    for (k=0; k<MAX_RUNS; k++) {
        MPI_Barrier( MPI_COMM_WORLD );
        MPI_Win_post( exposureGroup, 0, win );
        MPI_Win_start( accessGroup, 0, win );
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i=0; i<cnt; i++) {
            MPI_Put( one, sz, MPI_INT, destRank, j, sz, MPI_INT, win );
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync) MPI_Barrier( MPI_COMM_WORLD );
        MPI_Win_complete( win );
        MPI_Win_wait( win );
        t[k].endSync = MPI_Wtime();
    }
    free( one );
}
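/* In the PSCW runs, MPI_Win_complete closes this rank's access epoch (the
   issued operations are complete at the origin when it returns) and
   MPI_Win_wait closes the exposure epoch opened by MPI_Win_post, so
   endSync captures both sides of the handshake. */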
void RunAccPSCW( MPI_Win win, int destRank, int cnt, int sz,
                 MPI_Group exposureGroup, MPI_Group accessGroup, timing t[] )
{
    int k, i, j;
    int *one = (int *)malloc( sz * sizeof(int) );  /* origin buffer: each op sends sz ints */

    for (i=0; i<sz; i++) one[i] = 1;
    for (k=0; k<MAX_RUNS; k++) {
        MPI_Barrier( MPI_COMM_WORLD );
        MPI_Win_post( exposureGroup, 0, win );
        MPI_Win_start( accessGroup, 0, win );
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i=0; i<cnt; i++) {
            MPI_Accumulate( one, sz, MPI_INT, destRank,
                            j, sz, MPI_INT, MPI_SUM, win );
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync) MPI_Barrier( MPI_COMM_WORLD );
        MPI_Win_complete( win );
        MPI_Win_wait( win );
        t[k].endSync = MPI_Wtime();
    }
    free( one );
}
void PrintResults( int cnt, timing t[] )
{
    int k;
    long rate = 0;
    double d1 = 0, d2 = 0, tOp, tSync;
    double minD1 = 1e10, minD2 = 1e10;

    for (k=0; k<MAX_RUNS; k++) {
        tOp = t[k].endOp - t[k].startOp;
        tSync = t[k].endSync - t[k].endOp;
        d1 += tOp;
        d2 += tSync;
        if (tOp < minD1) minD1 = tOp;
        if (tSync < minD2) minD2 = tSync;
    }

    /* Use the minimum times because they are more stable - if timing
       accuracy is an issue, use the min over multiple trials */
    d1 = minD1;
    d2 = minD2;
    /* To report averages instead: d1 = d1 / MAX_RUNS; d2 = d2 / MAX_RUNS; */

    if (d2 > 0) rate = (long)( cnt / d2 );
    /* count, op, sync, op/each, sync/each, rate */
    printf( "%d\t%e\t%e\t%e\t%e\t%ld\n", cnt,
            d1, d2,
            d1 / cnt, d2 / cnt, rate );
}
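/* Example invocations (binary name illustrative):
 *     mpiexec -n 2 ./manyrma                      run all RMA/sync combinations
 *     mpiexec -n 2 ./manyrma -put -fence          only MPI_Put with fence sync
 *     mpiexec -n 2 ./manyrma -acc -lock -maxsz 4 -maxcount 1024 -barrier
 */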