/*************************************************************************
 *                                                                       *
 *        N A S   P A R A L L E L   B E N C H M A R K S  3.3             *
 *                                                                       *
 *************************************************************************
 *                                                                       *
 *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.     *
 *   It is described in NAS Technical Report 95-020.                     *
 *                                                                       *
 *   Permission to use, copy, distribute and modify this software        *
 *   for any purpose with or without fee is hereby granted.  We          *
 *   request, however, that all derived work reference the NAS           *
 *   Parallel Benchmarks 3.3. This software is provided "as is"          *
 *   without express or implied warranty.                                *
 *                                                                       *
 *   Information on NPB 3.3, including the technical report, the         *
 *   original specifications, source code, results and information       *
 *   on how to submit new results, is available at:                      *
 *                                                                       *
 *          http://www.nas.nasa.gov/Software/NPB                         *
 *                                                                       *
 *   Send comments or suggestions to  npb@nas.nasa.gov                   *
 *   Send bug reports to              npb-bugs@nas.nasa.gov              *
 *                                                                       *
 *          NAS Parallel Benchmarks Group                                *
 *          NASA Ames Research Center                                    *
 *          Moffett Field, CA  94035-1000                                *
 *                                                                       *
 *          E-mail:  npb@nas.nasa.gov                                    *
 *          Fax:     (650) 604-3957                                      *
 *                                                                       *
 *************************************************************************/
#include "nas_common.h"
#include "simgrid/instr.h" //TRACE_
/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF PROCESSORS.
 * THE LARGEST VERIFIED NUMBER IS 1024. INCREASE max_procs AT YOUR PERIL */

#define MAX_ITERATIONS  10
#define TEST_ARRAY_SIZE 5
/* Typedef: if necessary, change the size of int here by changing the int type to, say, long */
typedef int  INT_TYPE;
typedef long INT_TYPE2;
#define MP_KEY_TYPE MPI_INT
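
/* Added note (not in the original source): INT_TYPE only needs to hold key
 * values (at most max_key-1 = 2^27-1 for class D), but key *ranks* can reach
 * total_keys*min_procs = 2^29*4 = 2^31 for class D, which overflows a signed
 * 32-bit int. Rank-valued quantities therefore use INT_TYPE2 (long). */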
/* Some global info, gathered in a struct and passed around explicitly */
typedef struct {
  int my_rank, comm_size;
  INT_TYPE *key_buff_ptr_global,              /* used by full_verify to get */
           total_local_keys,                  /* copies of rank info        */
           total_lesser_keys;
  int passed_verification;
  /* These are the three main arrays; their size (size_of_buffers) is computed in main() */
  INT_TYPE *key_array, *key_buff1, *key_buff2,
           *bucket_size,                      /* Top 5 elements for         */
           *bucket_size_totals,               /* partial verification vals  */
           *bucket_ptrs, *process_bucket_distrib_ptr1, *process_bucket_distrib_ptr2;
  int send_count[1024], recv_count[1024], send_displ[1024], recv_displ[1024];
  /* Partial verif info */
  INT_TYPE2 test_index_array[TEST_ARRAY_SIZE], test_rank_array[TEST_ARRAY_SIZE];
} global_data;
/* Expected index and rank values of the partial verification test keys, per class */
INT_TYPE2
  S_test_index_array[TEST_ARRAY_SIZE] = {48427,17148,23627,62548,4431},
  S_test_rank_array[TEST_ARRAY_SIZE]  = {0,18,346,64917,65463},
  W_test_index_array[TEST_ARRAY_SIZE] = {357773,934767,875723,898999,404505},
  W_test_rank_array[TEST_ARRAY_SIZE]  = {1249,11698,1039987,1043896,1048018},
  A_test_index_array[TEST_ARRAY_SIZE] = {2112377,662041,5336171,3642833,4250760},
  A_test_rank_array[TEST_ARRAY_SIZE]  = {104,17523,123928,8288932,8388264},
  B_test_index_array[TEST_ARRAY_SIZE] = {41869,812306,5102857,18232239,26860214},
  B_test_rank_array[TEST_ARRAY_SIZE]  = {33422937,10244,59149,33135281,99},
  C_test_index_array[TEST_ARRAY_SIZE] = {44172927,72999161,74326391,129606274,21736814},
  C_test_rank_array[TEST_ARRAY_SIZE]  = {61147,882988,266290,133997595,133525895},
  D_test_index_array[TEST_ARRAY_SIZE] = {1317351170,995930646,1157283250,1503301535,1453734525},
  D_test_rank_array[TEST_ARRAY_SIZE]  = {1,36538729,1978098519,2145192618,2147425337};
void full_verify( global_data* gd );
/************ returns parallel random number seq seed ************/
/*
 * Create a random number sequence of total length nn residing on np
 * processors. Each processor therefore has a subsequence of length nn/np.
 * This routine returns the random number that starts the subsequence of
 * processor kn, to be used as the seed of that processor's generator.
 */
static double find_my_seed( int kn,    /* my processor rank, 0<=kn<num procs  */
                            int np,    /* np = num procs                      */
                            long nn,   /* total num of ran numbers, all procs */
                            double s,  /* ran num seed, for ex.: 314159265.00 */
                            double a ) /* ran num gen mult, try 1220703125.00 */
{
  long i, mq, nq, kk, ik;
  double t1, t2, t3, an;

  nq = nn / np;
  for( mq=0; nq>1; mq++, nq/=2 );

  t1 = a;
  for( i=1; i<=mq; i++ )
    t2 = randlc( &t1, &t1 );
  an = t1;

  kk = kn;
  t1 = s;
  t2 = an;
  for( i=1; i<=100; i++ ){
    ik = kk / 2;
    if( 2*ik != kk )
      t3 = randlc( &t1, &t2 );
    if( ik == 0 )
      break;
    t3 = randlc( &t2, &t2 );
    kk = ik;
  }

  return t1;
}
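
/* Explanatory sketch (added; randlc is assumed to behave as in the NPB
 * sources): randlc(&x, &a) advances the linear congruential generator
 *     x <- a*x mod 2^46
 * and returns x*2^-46. Since nn and np are powers of two, the first loop
 * computes mq = log2(nn/np), the squaring loop yields an = a^(nn/np), and
 * the binary-exponentiation loop over the bits of kn produces
 *     t1 = s * an^kn = s * a^(kn*nn/np)  (mod 2^46),
 * i.e. the generator state after skipping the kn*nn/np random numbers that
 * belong to the lower-ranked processes. */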
static void create_seq( global_data* gd, double seed, double a )
{
  double x;
  INT_TYPE i, k;

  k = max_key/4;

  for (i=0; i<num_keys; i++){
    x = randlc(&seed, &a);
    x += randlc(&seed, &a);
    x += randlc(&seed, &a);
    x += randlc(&seed, &a);

    gd->key_array[i] = k*x;
  }
}
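
/* Added note: each key is the scaled sum of four consecutive uniform
 * variates in [0,1), so x in [0,4) is roughly Gaussian around 2 and the
 * keys k*x = (max_key/4)*x cluster around max_key/2. The sparse tails are
 * why main() grows size_of_buffers: edge processes own wide key ranges. */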
void full_verify( global_data* gd )
{
  MPI_Status status;
  MPI_Request request;
  INT_TYPE i, j;
  INT_TYPE k, last_local_key;

  /* Now, finally, sort the keys: */
  for( i=0; i<gd->total_local_keys; i++ )
    gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]] - gd->total_lesser_keys] = gd->key_buff2[i];
  last_local_key = (gd->total_local_keys < 1) ? 0 : (gd->total_local_keys - 1);

  /* Send largest key value to next processor */
  if( gd->my_rank > 0 )
    MPI_Irecv( &k, 1, MP_KEY_TYPE, gd->my_rank-1, 1000, MPI_COMM_WORLD, &request );
  if( gd->my_rank < gd->comm_size-1 )
    MPI_Send( &gd->key_array[last_local_key], 1, MP_KEY_TYPE, gd->my_rank+1, 1000, MPI_COMM_WORLD );
  if( gd->my_rank > 0 )
    MPI_Wait( &request, &status );

  /* Confirm that neighbor's greatest key value is not greater than my least key value */
  j = 0;
  if( gd->my_rank > 0 && gd->total_local_keys > 0 )
    if( k > gd->key_array[0] )
      j++;

  /* Confirm keys correctly sorted: count incorrectly sorted keys, if any */
  for( i=1; i<gd->total_local_keys; i++ )
    if( gd->key_array[i-1] > gd->key_array[i] )
      j++;

  if( j != 0 )
    printf( "Processor %d: Full_verify: number of keys out of sort: %d\n", gd->my_rank, (int)j );
  else
    gd->passed_verification++;
}
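
/* Added note on the boundary exchange above: every rank posts the
 * nonblocking MPI_Irecv for its left neighbor's largest key before doing
 * the blocking MPI_Send to its right neighbor, so each send can complete
 * into a pre-posted receive and the chain cannot deadlock. */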
static void rank( global_data* gd, int iteration )
{
  INT_TYPE i, k;
  INT_TYPE shift = max_key_log_2 - num_bucket_log_2;
  INT_TYPE key;
  INT_TYPE2 bucket_sum_accumulator, j, m;
  INT_TYPE local_bucket_sum_accumulator;
  INT_TYPE min_key_val, max_key_val;
  INT_TYPE *key_buff_ptr;

  /* Iteration alteration of keys */
  if(gd->my_rank == 0){
    gd->key_array[iteration] = iteration;
    gd->key_array[iteration+MAX_ITERATIONS] = max_key - iteration;
  }
  /* Initialize */
  for( i=0; i<num_buckets+TEST_ARRAY_SIZE; i++ ){
    gd->bucket_size[i] = 0;
    gd->bucket_size_totals[i] = 0;
    gd->process_bucket_distrib_ptr1[i] = 0;
    gd->process_bucket_distrib_ptr2[i] = 0;
  }
  /* Determine where the partial verify test keys are, load into top of array bucket_size */
  for( i=0; i<TEST_ARRAY_SIZE; i++ )
    if( (gd->test_index_array[i]/num_keys) == gd->my_rank )
      gd->bucket_size[num_buckets+i] = gd->key_array[gd->test_index_array[i] % num_keys];

  /* Determine the number of keys in each bucket */
  for( i=0; i<num_keys; i++ )
    gd->bucket_size[gd->key_array[i] >> shift]++;

  /* Accumulative bucket sizes are the bucket pointers */
  gd->bucket_ptrs[0] = 0;
  for( i=1; i<num_buckets; i++ )
    gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];
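
  /* Added example: the loop above is an exclusive prefix sum turning counts
   * into starting offsets. With bucket_size = {3, 1, 4, 2} it yields
   * bucket_ptrs = {0, 3, 4, 8}, so bucket i will occupy
   * key_buff1[bucket_ptrs[i] .. bucket_ptrs[i]+bucket_size[i]-1]. */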
  /* Sort into appropriate bucket */
  for( i=0; i<num_keys; i++ ) {
    key = gd->key_array[i];
    gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key;
  }
  /* Get the bucket size totals for the entire problem. These will be used to determine the redistribution of keys */
  MPI_Allreduce(gd->bucket_size, gd->bucket_size_totals, num_buckets+TEST_ARRAY_SIZE, MP_KEY_TYPE, MPI_SUM,
                MPI_COMM_WORLD);
  /* Determine redistribution of keys: accumulate the bucket size totals until this sum surpasses num_keys (which is
     the average number of keys per processor). Then all keys in these buckets go to processor 0.
     Continue accumulating until the sum surpasses 2*num_keys: all keys in these buckets go to processor 1, and so
     on. This algorithm guarantees that all processors have work ranking; no processor is left idle.
     The number of buckets that optimizes load balancing (i.e., yields as even a distribution of keys over processors
     as possible) would be larger, but more buckets also mean more computation per processor, so 1024 buckets turned
     out to be the best compromise on the machines tested.
     Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the numbers of the first and last bucket which each
     processor owns after the redistribution is done. */
  bucket_sum_accumulator = 0;
  local_bucket_sum_accumulator = 0;
  gd->send_displ[0] = 0;
  gd->process_bucket_distrib_ptr1[0] = 0;
  for( i=0, j=0; i<num_buckets; i++ ) {
    bucket_sum_accumulator += gd->bucket_size_totals[i];
    local_bucket_sum_accumulator += gd->bucket_size[i];
    if( bucket_sum_accumulator >= (j+1)*num_keys ) {
      gd->send_count[j] = local_bucket_sum_accumulator;
      if( j != 0 ) {
        gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1];
        gd->process_bucket_distrib_ptr1[j] = gd->process_bucket_distrib_ptr2[j-1]+1;
      }
      gd->process_bucket_distrib_ptr2[j++] = i;
      local_bucket_sum_accumulator = 0;
    }
  }
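
  /* Added example (hypothetical numbers): with 2 processes, num_keys = 8 and
   * global bucket totals {5, 4, 3, 4}, the accumulator first reaches
   * 9 >= 1*8 at bucket 1, so buckets 0-1 go to process 0; it then reaches
   * 16 >= 2*8 at bucket 3, so buckets 2-3 go to process 1. send_count[p]
   * counts the *local* keys falling into process p's bucket range. */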
  /* When nprocs approaches num_buckets, it is quite possible that the last few processors are not assigned any
   * buckets, so set their counts explicitly to avoid errors. */
  while( j < gd->comm_size ) {
    gd->send_count[j] = 0;
    gd->process_bucket_distrib_ptr1[j] = 1;
    gd->process_bucket_distrib_ptr2[j++] = 0;
  }
  /* This is the redistribution section: first find out how many keys
     each processor will send to every other processor: */
  MPI_Alltoall( gd->send_count, 1, MPI_INT, gd->recv_count, 1, MPI_INT, MPI_COMM_WORLD );

  /* Determine the receive array displacements for the buckets */
  gd->recv_displ[0] = 0;
  for( i=1; i<gd->comm_size; i++ )
    gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];

  /* Now send the keys to respective processors */
  MPI_Alltoallv(gd->key_buff1, gd->send_count, gd->send_displ, MP_KEY_TYPE, gd->key_buff2, gd->recv_count,
                gd->recv_displ, MP_KEY_TYPE, MPI_COMM_WORLD );
  /* The starting and ending bucket numbers on each processor are multiplied by the interval size of the buckets to
   * obtain the smallest possible min and greatest possible max value of any key on each processor */
  min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift;
  max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift) - 1;
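
  /* Added example: for class W, shift = 16-10 = 6, so each bucket covers 64
   * consecutive key values; a process owning buckets 3..5 gets
   * min_key_val = 3<<6 = 192 and max_key_val = (6<<6)-1 = 383. */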
  /* Clear the work array */
  for( i=0; i<max_key_val-min_key_val+1; i++ )
    gd->key_buff1[i] = 0;
  /* Determine the total number of keys on all other processors holding keys of lesser value */
  m = 0;
  for( k=0; k<gd->my_rank; k++ )
    for( i=gd->process_bucket_distrib_ptr1[k]; i<=gd->process_bucket_distrib_ptr2[k]; i++ )
      m += gd->bucket_size_totals[i]; /* m has total # of lesser keys */

  /* Determine total number of keys on this processor */
  j = 0;
  for( i=gd->process_bucket_distrib_ptr1[gd->my_rank]; i<=gd->process_bucket_distrib_ptr2[gd->my_rank]; i++ )
    j += gd->bucket_size_totals[i]; /* j has total # of local keys */
  /* Ranking of all keys occurs in this section: */
  /* shift it backwards so no subtractions are necessary in loop */
  key_buff_ptr = gd->key_buff1 - min_key_val;

  /* In this section, the keys themselves are used as their own indexes to determine how many of each there are:
     their individual population */
  for( i=0; i<j; i++ )
    key_buff_ptr[gd->key_buff2[i]]++; /* Now they have individual key population */
  /* To obtain ranks of each key, successively add the individual key population, not forgetting the total of
     lesser keys, m.
     NOTE: Since the total of lesser keys would be subtracted later in verification, it is no longer added to the
     first key population here, but it is still needed during the partial verify test. This is to ensure that a
     32-bit key_buff can still be used for class D. */
  /* key_buff_ptr[min_key_val] += m; */
  for( i=min_key_val; i<max_key_val; i++ )
    key_buff_ptr[i+1] += key_buff_ptr[i];
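
  /* Added example: if the local populations of key values v, v+1, v+2 are
   * {2, 0, 3}, the running sum turns them into {2, 2, 5}; key_buff_ptr[v]
   * is then the number of local keys <= v, and adding m (the lesser keys
   * on lower-ranked processes) would give the global rank. */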
  /* This is the partial verify test section */
  /* Observe that test_rank_array vals are shifted differently for different cases */
  for( i=0; i<TEST_ARRAY_SIZE; i++ ){
    k = gd->bucket_size_totals[i+num_buckets]; /* Keys were hidden here */
    if( min_key_val <= k && k <= max_key_val ){
      /* Add the total of lesser keys, m, here */
      INT_TYPE2 key_rank = key_buff_ptr[k-1] + m;
      int failed = 0;

      switch( class ) {
      case 'S':
        if( i <= 2 ) {
          if( key_rank != gd->test_rank_array[i]+iteration ) failed = 1;
          else gd->passed_verification++;
        } else {
          if( key_rank != gd->test_rank_array[i]-iteration ) failed = 1;
          else gd->passed_verification++;
        }
        break;
      case 'W':
        if( i < 2 ) {
          if( key_rank != gd->test_rank_array[i]+(iteration-2) ) failed = 1;
          else gd->passed_verification++;
        } else {
          if( key_rank != gd->test_rank_array[i]-iteration ) failed = 1;
          else gd->passed_verification++;
        }
        break;
      case 'A':
        if( i <= 2 ) {
          if( key_rank != gd->test_rank_array[i]+(iteration-1) ) failed = 1;
          else gd->passed_verification++;
        } else {
          if( key_rank != gd->test_rank_array[i]-(iteration-1) ) failed = 1;
          else gd->passed_verification++;
        }
        break;
      case 'B':
        if( i == 1 || i == 2 || i == 4 ) {
          if( key_rank != gd->test_rank_array[i]+iteration ) failed = 1;
          else gd->passed_verification++;
        } else {
          if( key_rank != gd->test_rank_array[i]-iteration ) failed = 1;
          else gd->passed_verification++;
        }
        break;
      case 'C':
        if( i <= 2 ) {
          if( key_rank != gd->test_rank_array[i]+iteration ) failed = 1;
          else gd->passed_verification++;
        } else {
          if( key_rank != gd->test_rank_array[i]-iteration ) failed = 1;
          else gd->passed_verification++;
        }
        break;
      case 'D':
        if( i < 2 ) {
          if( key_rank != gd->test_rank_array[i]+iteration ) failed = 1;
          else gd->passed_verification++;
        } else {
          if( key_rank != gd->test_rank_array[i]-iteration ) failed = 1;
          else gd->passed_verification++;
        }
        break;
      }
      if( failed == 1 )
        printf( "Failed partial verification: iteration %d, processor %d, test key %d\n",
                iteration, gd->my_rank, (int)i );
    }
  }
  /* Make copies of rank info for use by full_verify: these variables in rank are local; making them global slows
   * down the code, probably because the compiler cannot keep them in registers */
  if( iteration == MAX_ITERATIONS ) {
    gd->key_buff_ptr_global = key_buff_ptr;
    gd->total_local_keys = j;
    gd->total_lesser_keys = 0; /* no longer set to 'm', see note above */
  }
}
int main( int argc, char **argv )
{
  int i, iteration, itemp;
  double timecounter, maxtime;

  global_data* gd = malloc(sizeof(global_data));

  MPI_Init( &argc, &argv );
  MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
  MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );

  get_info(argc, argv, &nprocs, &class);
  check_info(IS, nprocs, class);
  /* Initialize the verification arrays if a valid class */
  for( i=0; i<TEST_ARRAY_SIZE; i++ )
    switch( class ) {
    case 'S':
      total_keys_log2 = 16; max_key_log_2 = 11; num_bucket_log_2 = 9;
      gd->test_index_array[i] = S_test_index_array[i];
      gd->test_rank_array[i] = S_test_rank_array[i];
      break;
    case 'A':
      total_keys_log2 = 23; max_key_log_2 = 19; num_bucket_log_2 = 10;
      gd->test_index_array[i] = A_test_index_array[i];
      gd->test_rank_array[i] = A_test_rank_array[i];
      break;
    case 'W':
      total_keys_log2 = 20; max_key_log_2 = 16; num_bucket_log_2 = 10;
      gd->test_index_array[i] = W_test_index_array[i];
      gd->test_rank_array[i] = W_test_rank_array[i];
      break;
    case 'B':
      total_keys_log2 = 25; max_key_log_2 = 21; num_bucket_log_2 = 10;
      gd->test_index_array[i] = B_test_index_array[i];
      gd->test_rank_array[i] = B_test_rank_array[i];
      break;
    case 'C':
      total_keys_log2 = 27; max_key_log_2 = 23; num_bucket_log_2 = 10;
      gd->test_index_array[i] = C_test_index_array[i];
      gd->test_rank_array[i] = C_test_rank_array[i];
      break;
    case 'D':
      total_keys_log2 = 29; max_key_log_2 = 27; num_bucket_log_2 = 10;
      min_procs = 4; /* class D requires at least 4 processes */
      gd->test_index_array[i] = D_test_index_array[i];
      gd->test_rank_array[i] = D_test_rank_array[i];
      break;
    }
  total_keys = (1 << total_keys_log2);
  max_key = (1 << max_key_log_2);
  num_buckets = (1 << num_bucket_log_2);
  num_keys = (total_keys/nprocs*min_procs);
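
  /* Added example: class A (total_keys_log2 = 23) on 16 processes gives
   * total_keys = 8388608, num_keys = 8388608/16*1 = 524288 keys per
   * process, with max_key = 2^19 and num_buckets = 2^10. */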
  /* On larger numbers of processors, since the keys are (roughly) gaussian distributed, the first and last
   * processor sort keys in a large interval, requiring array sizes to be larger. Note that for large nprocs,
   * num_keys is, however, a small number. The required array size also depends on the bucket size used. The
   * following values are validated for the 1024-bucket setup. */
  if (nprocs < 256)
    size_of_buffers = 3*num_keys/2;
  else if (nprocs < 512)
    size_of_buffers = 5*num_keys/2;
  else if (nprocs < 1024)
    size_of_buffers = 4*num_keys/2;
  else
    size_of_buffers = 13*num_keys/2;
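
  /* Added note: e.g. for 16 processes (nprocs < 256) each of the three key
   * arrays gets 3*num_keys/2 slots, i.e. 50% slack over the average load
   * to absorb the skewed key intervals of the edge processes. */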
  gd->key_array = (INT_TYPE*)malloc(size_of_buffers*sizeof(INT_TYPE));
  gd->key_buff1 = (INT_TYPE*)malloc(size_of_buffers*sizeof(INT_TYPE));
  gd->key_buff2 = (INT_TYPE*)malloc(size_of_buffers*sizeof(INT_TYPE));
  gd->bucket_size = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE));        /* Top 5 elements for */
  gd->bucket_size_totals = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE)); /* part. ver. vals    */
  gd->bucket_ptrs = (INT_TYPE*)malloc(num_buckets*sizeof(INT_TYPE));
  gd->process_bucket_distrib_ptr1 = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE));
  gd->process_bucket_distrib_ptr2 = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE));
  /* Printout initial NPB info */
  if( gd->my_rank == 0 ){
    printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
    printf( " Size: %ld (class %c)\n", (long)total_keys*min_procs, class );
    printf( " Iterations: %d\n", MAX_ITERATIONS );
    printf( " Number of processes: %d\n", gd->comm_size );
  }
  /* Check that actual and compiled number of processors agree */
  if( gd->comm_size != nprocs ) {
    if( gd->my_rank == 0 )
      printf( "\n ERROR: compiled for %d processes\n"
              " Number of active processes: %d\n"
              " Exiting program!\n\n", nprocs, gd->comm_size );
    MPI_Finalize();
    exit( 1 );
  }
  /* Check to see whether total number of processes is within bounds.
     This could in principle be checked in setparams.c, but it is more convenient to do it here */
  if( gd->comm_size < min_procs || gd->comm_size > max_procs ) {
    if( gd->my_rank == 0 )
      printf( "\n ERROR: number of processes %d not within range %d-%d"
              "\n Exiting program!\n\n", gd->comm_size, min_procs, max_procs );
    MPI_Finalize();
    exit( 1 );
  }
  /* Generate random number sequence and subsequent keys on all procs */
  create_seq(gd, find_my_seed( gd->my_rank, gd->comm_size, 4*(long)total_keys*min_procs,
                               314159265.00,    /* Random number gen seed */
                               1220703125.00 ), /* Random number gen mult */
             1220703125.00 );                   /* Random number gen mult */
  /* Do one iteration for free (i.e., untimed) to guarantee initialization of
     all data and code pages and respective tables */
  rank(gd, 1);

  /* Start verification counter */
  gd->passed_verification = 0;

  if( gd->my_rank == 0 && class != 'S' ) printf( "\n iteration\n" );
  /* Initialize timer */
  timer_clear( 0 );

  /* Tag all traced operations of this rank with its rank number */
  char smpi_category[100];
  snprintf (smpi_category, 100, "%d", gd->my_rank);
  TRACE_smpi_set_category (smpi_category);
  /* This is the main iteration */
  timer_start( 0 );
  for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ ) {
    if( gd->my_rank == 0 && class != 'S' ) printf( " %d\n", iteration );
    rank(gd, iteration );
  }
  TRACE_smpi_set_category (NULL);
  /* Stop timer, obtain time for processors */
  timer_stop( 0 );
  timecounter = timer_read(0);

  /* End of timing, obtain maximum time of all processors */
  MPI_Reduce( &timecounter, &maxtime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD );
  /* This tests that keys are in sequence: sorting of last ranked key seq occurs here, but is an untimed operation */
  full_verify(gd);
  /* Obtain verification counter sum */
  itemp = gd->passed_verification;
  MPI_Reduce( &itemp, &gd->passed_verification, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD );
  /* The final printout */
  if( gd->my_rank == 0 ) {
    if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
      gd->passed_verification = 0;
    c_print_results("IS", class, (int)(total_keys), min_procs, 0, MAX_ITERATIONS, nprocs, gd->comm_size, maxtime,
                    ((double) (MAX_ITERATIONS)*total_keys*min_procs)/maxtime/1000000., "keys ranked",
                    gd->passed_verification);