/*************************************************************************
 *                                                                       *
 *        N A S   P A R A L L E L   B E N C H M A R K S  3.3             *
 *                                                                       *
 *************************************************************************
 *                                                                       *
 *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.     *
 *   It is described in NAS Technical Report 95-020.                     *
 *                                                                       *
 *   Permission to use, copy, distribute and modify this software        *
 *   for any purpose with or without fee is hereby granted.  We          *
 *   request, however, that all derived work reference the NAS           *
 *   Parallel Benchmarks 3.3. This software is provided "as is"          *
 *   without express or implied warranty.                                *
 *                                                                       *
 *   Information on NPB 3.3, including the technical report, the        *
 *   original specifications, source code, results and information       *
 *   on how to submit new results, is available at:                      *
 *                                                                       *
 *          http://www.nas.nasa.gov/Software/NPB                         *
 *                                                                       *
 *   Send comments or suggestions to  npb@nas.nasa.gov                   *
 *   Send bug reports to              npb-bugs@nas.nasa.gov              *
 *                                                                       *
 *         NAS Parallel Benchmarks Group                                 *
 *         NASA Ames Research Center                                     *
 *         Moffett Field, CA  94035-1000                                 *
 *                                                                       *
 *         E-mail: npb@nas.nasa.gov                                      *
 *         Fax:    (650) 604-3957                                        *
 *                                                                       *
 *************************************************************************/
#include "nas_common.h"
#include "simgrid/instr.h" /* TRACE_ */
/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF PROCESSORS.
 * THE LARGEST VERIFIED NUMBER IS 1024. INCREASE max_procs AT YOUR PERIL. */

/* Problem-size parameters; set per class in main() */
int  nprocs, min_procs = 1, max_procs = 1024;
char class;
int  total_keys_log2, max_key_log_2, num_bucket_log_2;
long total_keys, max_key, num_buckets, num_keys, size_of_buffers;

#define MAX_ITERATIONS 10
#define TEST_ARRAY_SIZE 5
/* Typedef: if necessary, change the size of the key type here by changing int to,
   say, long; MP_KEY_TYPE below must then change to the matching MPI datatype */
typedef int  INT_TYPE;
typedef long INT_TYPE2;
#define MP_KEY_TYPE MPI_INT
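
/* Illustrative guard (an addition, assuming a C11 compiler): if INT_TYPE is ever
   widened without updating MP_KEY_TYPE, the MPI calls would silently truncate
   keys; a compile-time check makes the pairing explicit. */
_Static_assert(sizeof(INT_TYPE) == sizeof(int), "MP_KEY_TYPE (MPI_INT) must match INT_TYPE");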
int my_rank, comm_size;

/* Some global info */
INT_TYPE *key_buff_ptr_global,       /* used by full_verify to get */
         total_local_keys,           /* copies of rank info        */
         total_lesser_keys;

int passed_verification;
/* These are the three main arrays; they are sized to size_of_buffers, computed in main() */
INT_TYPE *key_array, *key_buff1, *key_buff2,
         *bucket_size,              /* Top 5 elements for  */
         *bucket_size_totals,       /* partial verify vals */
         *bucket_ptrs, *process_bucket_distrib_ptr1, *process_bucket_distrib_ptr2;
int send_count[1024], recv_count[1024], send_displ[1024], recv_displ[1024];
/* Partial verification info */
INT_TYPE2 test_index_array[TEST_ARRAY_SIZE],
          test_rank_array[TEST_ARRAY_SIZE];

INT_TYPE2 S_test_index_array[TEST_ARRAY_SIZE] = {48427,17148,23627,62548,4431},
          S_test_rank_array[TEST_ARRAY_SIZE]  = {0,18,346,64917,65463},

          W_test_index_array[TEST_ARRAY_SIZE] = {357773,934767,875723,898999,404505},
          W_test_rank_array[TEST_ARRAY_SIZE]  = {1249,11698,1039987,1043896,1048018},

          A_test_index_array[TEST_ARRAY_SIZE] = {2112377,662041,5336171,3642833,4250760},
          A_test_rank_array[TEST_ARRAY_SIZE]  = {104,17523,123928,8288932,8388264},

          B_test_index_array[TEST_ARRAY_SIZE] = {41869,812306,5102857,18232239,26860214},
          B_test_rank_array[TEST_ARRAY_SIZE]  = {33422937,10244,59149,33135281,99},

          C_test_index_array[TEST_ARRAY_SIZE] = {44172927,72999161,74326391,129606274,21736814},
          C_test_rank_array[TEST_ARRAY_SIZE]  = {61147,882988,266290,133997595,133525895},

          D_test_index_array[TEST_ARRAY_SIZE] = {1317351170,995930646,1157283250,1503301535,1453734525},
          D_test_rank_array[TEST_ARRAY_SIZE]  = {1,36538729,1978098519,2145192618,2147425337};
void full_verify( global_data* gd );
/************ returns parallel random number seq seed ************/
/*
 * Create a random number sequence of total length nn residing on np processors.
 * Each processor therefore holds a subsequence of length nn/np. This routine
 * returns the first random number of the subsequence belonging to processor kn,
 * which is then used as the seed for processor kn's generator.
 */
static double find_my_seed( int kn,    /* my processor rank, 0<=kn<=num procs */
                            int np,    /* np = num procs                      */
                            long nn,   /* total num of ran numbers, all procs */
                            double s,  /* Ran num seed, for ex.: 314159265.00 */
                            double a ) /* Ran num gen mult, try 1220703125.00 */
{
    long i, ik, kk = kn, mq, nq = nn/np;
    double t1, t2, t3 = 0, an;
    for( mq=0; nq>1; mq++,nq/=2 );  /* mq = log2(nn/np) */
    t1 = a;
    for( i=1; i<=mq; i++ )          /* square a mq times: an = a^(nn/np) */
        t2 = randlc( &t1, &t1 );
    an = t1;
    t1 = s;
    t2 = an;
    for( i=1; i<=100; i++ ){        /* advance s by kn*(nn/np) steps via binary expansion of kn */
        ik = kk / 2;
        if( 2*ik != kk ) t3 = randlc( &t1, &t2 );
        if( ik == 0 ) break;
        t3 = randlc( &t2, &t2 );
        kk = ik;
    }
    an = t3; /* added to silence paranoid compilers */
    return t1;
}
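
/* Worked example (an illustrative addition, not benchmark code): with
   s=314159265.0, a=1220703125.0, nn=2^19 and np=4, each processor owns
   nn/np = 2^17 numbers, and find_my_seed(2,4,nn,s,a) returns the generator
   state after 2*2^17 randlc steps, i.e. a^(2^18)*s mod 2^46, so processor 2
   starts exactly where processor 1's subsequence ends. */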
static void create_seq( global_data* gd, double seed, double a )
{
    double x;
    INT_TYPE i, k = max_key/4;

    for (i=0; i<num_keys; i++){
        x = randlc(&seed, &a);      /* the sum of four uniform randoms is */
        x += randlc(&seed, &a);     /* approximately gaussian in [0,4)    */
        x += randlc(&seed, &a);
        x += randlc(&seed, &a);

        gd->key_array[i] = k*x;     /* scale to a key in [0,max_key) */
    }
}
void full_verify( global_data* gd )
{
    MPI_Status  status;
    MPI_Request request;
    INT_TYPE    i, j;
    INT_TYPE    k, last_local_key;

    /* Now, finally, sort the keys: */
    for( i=0; i<gd->total_local_keys; i++ )
        gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]] - gd->total_lesser_keys] = gd->key_buff2[i];
    last_local_key = (gd->total_local_keys<1)? 0 : (gd->total_local_keys-1);

    /* Send largest key value to next processor */
    if( gd->my_rank > 0 )
        MPI_Irecv( &k, 1, MP_KEY_TYPE, gd->my_rank-1, 1000, MPI_COMM_WORLD, &request );
    if( gd->my_rank < gd->comm_size-1 )
        MPI_Send( &gd->key_array[last_local_key], 1, MP_KEY_TYPE, gd->my_rank+1, 1000, MPI_COMM_WORLD );
    if( gd->my_rank > 0 )
        MPI_Wait( &request, &status );

    /* Confirm that neighbor's greatest key value is not greater than my least key value */
    j = 0;
    if( gd->my_rank > 0 && gd->total_local_keys > 0 )
        if( k > gd->key_array[0] )
            j++;

    /* Confirm keys correctly sorted: count incorrectly sorted keys, if any */
    for( i=1; i<gd->total_local_keys; i++ )
        if( gd->key_array[i-1] > gd->key_array[i] )
            j++;

    if( j != 0 )
        printf( "Processor %d: Full_verify: number of keys out of sort: %d\n", gd->my_rank, (int)j );
    else
        gd->passed_verification++;
}
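
/* Note on the scatter step above (an illustrative addition): after rank(),
   key_buff_ptr_global[k] holds the number of local keys <= k, so the
   pre-decrement hands out a distinct zero-based slot to every copy of k.
   E.g. for local keys {5,3,5}, the counts are ptr[3]=1 and ptr[5]=3, and
   processing 5,3,5 places them in slots 2,0,1: the sorted order 3,5,5. */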
static void rank( global_data* gd, int iteration )
{
    INT_TYPE i, k, key;
    INT_TYPE shift = max_key_log_2 - num_bucket_log_2;
    INT_TYPE2 bucket_sum_accumulator, j, m;
    INT_TYPE local_bucket_sum_accumulator;
    INT_TYPE min_key_val, max_key_val;
    INT_TYPE *key_buff_ptr;

    /* Iteration alteration of keys */
    if(gd->my_rank == 0){
        gd->key_array[iteration] = iteration;
        gd->key_array[iteration+MAX_ITERATIONS] = max_key - iteration;
    }

    /* Initialize */
    for( i=0; i<num_buckets+TEST_ARRAY_SIZE; i++ ){
        gd->bucket_size[i] = 0;
        gd->bucket_size_totals[i] = 0;
        gd->process_bucket_distrib_ptr1[i] = 0;
        gd->process_bucket_distrib_ptr2[i] = 0;
    }
    /* Determine where the partial verify test keys are, load into top of array bucket_size */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        if( (gd->test_index_array[i]/num_keys) == gd->my_rank )
            gd->bucket_size[num_buckets+i] = gd->key_array[gd->test_index_array[i] % num_keys];

    /* Determine the number of keys in each bucket */
    for( i=0; i<num_keys; i++ )
        gd->bucket_size[gd->key_array[i] >> shift]++;

    /* Accumulated bucket sizes are the bucket pointers */
    gd->bucket_ptrs[0] = 0;
    for( i=1; i<num_buckets; i++ )
        gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];

    /* Sort into appropriate bucket */
    for( i=0; i<num_keys; i++ ) {
        key = gd->key_array[i];
        gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key;
    }
    /* Get the bucket size totals for the entire problem. These will be used to
       determine the redistribution of keys. */
    MPI_Allreduce(gd->bucket_size, gd->bucket_size_totals, num_buckets+TEST_ARRAY_SIZE,
                  MP_KEY_TYPE, MPI_SUM, MPI_COMM_WORLD);
    /* Determine the redistribution of keys: accumulate the bucket size totals until
       the sum surpasses num_keys (the average number of keys per processor); all keys
       in those buckets go to processor 0. Continue accumulating until the sum
       surpasses 2*num_keys; all keys in those buckets go to processor 1, and so on.
       This algorithm guarantees that every processor has work in the ranking step;
       none is left idle. Load balancing improves with more buckets (a more even
       distribution of keys becomes possible), but more buckets also means more
       computation per processor, so the optimum turned out to be 1024 buckets on the
       machines tested. Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the
       numbers of the first and last bucket each processor owns after the
       redistribution is done. */
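
    /* Worked example (an illustrative addition): with 4 processors, num_keys=256
       and bucket totals 100,120,90,110,..., the running total reaches 310 >= 1*256
       at bucket 2, so buckets 0-2 go to processor 0; accumulation then continues
       until the total first reaches 2*256 = 512, and those buckets go to
       processor 1, etc. */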
    bucket_sum_accumulator = 0;
    local_bucket_sum_accumulator = 0;
    gd->send_displ[0] = 0;
    gd->process_bucket_distrib_ptr1[0] = 0;
    for( i=0, j=0; i<num_buckets; i++ ) {
        bucket_sum_accumulator += gd->bucket_size_totals[i];
        local_bucket_sum_accumulator += gd->bucket_size[i];
        if( bucket_sum_accumulator >= (j+1)*num_keys ) {
            gd->send_count[j] = local_bucket_sum_accumulator;
            if( j != 0 ) {
                gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1];
                gd->process_bucket_distrib_ptr1[j] = gd->process_bucket_distrib_ptr2[j-1]+1;
            }
            gd->process_bucket_distrib_ptr2[j++] = i;
            local_bucket_sum_accumulator = 0;
        }
    }
    /* When nprocs approaches num_buckets, it is quite possible that the last few
       processors are not assigned any buckets, so set their counts properly to
       avoid trouble downstream. */
    while( j < gd->comm_size ) {
        gd->send_count[j] = 0;
        gd->process_bucket_distrib_ptr1[j] = 1;
        j++;
    }
    /* This is the redistribution section: first find out how many keys each
       processor will send to every other processor: */
    MPI_Alltoall( gd->send_count, 1, MPI_INT, gd->recv_count, 1, MPI_INT, MPI_COMM_WORLD );
    /* Determine the receive array displacements for the buckets */
    gd->recv_displ[0] = 0;
    for( i=1; i<gd->comm_size; i++ )
        gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];
    /* Now send the keys to respective processors */
    MPI_Alltoallv(gd->key_buff1, gd->send_count, gd->send_displ, MP_KEY_TYPE,
                  gd->key_buff2, gd->recv_count, gd->recv_displ, MP_KEY_TYPE, MPI_COMM_WORLD );
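
    /* Illustrative walk-through (an addition): on rank 0 with comm_size=2 and
       send_count={3,5}, send_displ is {0,3}; the MPI_Alltoall above tells every
       rank how many keys arrive from each peer, recv_displ is the prefix sum of
       those counts, and MPI_Alltoallv then lands each peer's keys as one
       contiguous block of key_buff2. */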
    /* The starting and ending bucket numbers on each processor are multiplied by
       the interval size of the buckets to obtain the smallest possible min and
       greatest possible max value of any key on each processor. */
    min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift;
    max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift) - 1;
    /* Clear the work array */
    for( i=0; i<max_key_val-min_key_val+1; i++ )
        gd->key_buff1[i] = 0;
    /* Determine the total number of keys on all other processors holding keys of lesser value */
    m = 0;
    for( k=0; k<gd->my_rank; k++ )
        for( i=gd->process_bucket_distrib_ptr1[k]; i<=gd->process_bucket_distrib_ptr2[k]; i++ )
            m += gd->bucket_size_totals[i];  /* m has total # of lesser keys */
    /* Determine total number of keys on this processor */
    j = 0;
    for( i=gd->process_bucket_distrib_ptr1[gd->my_rank]; i<=gd->process_bucket_distrib_ptr2[gd->my_rank]; i++ )
        j += gd->bucket_size_totals[i];  /* j has total # of local keys */
    /* Ranking of all keys occurs in this section: */
    /* shift it backwards so no subtractions are necessary in loop */
    key_buff_ptr = gd->key_buff1 - min_key_val;

    /* In this section, the keys themselves are used as their own indexes to
       determine how many of each there are: their individual population */
    for( i=0; i<j; i++ )
        key_buff_ptr[gd->key_buff2[i]]++;  /* Now they have individual key population */
    /* To obtain the rank of each key, successively add up the individual key
       populations, not forgetting the total of lesser keys, m.
       NOTE: Since the total of lesser keys would be subtracted later in
       verification, it is no longer added to the first key population here, but it
       is still needed during the partial verify test. This is to ensure that a
       32-bit key_buff can still be used for class D. */
    /* key_buff_ptr[min_key_val] += m; */
    for( i=min_key_val; i<max_key_val; i++ )
        key_buff_ptr[i+1] += key_buff_ptr[i];
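
    /* Worked example (an illustrative addition): with local keys {5,3,5,7} and
       min_key_val=3, the populations over 3..7 are {1,0,2,0,1}; the prefix sum
       turns them into {1,1,3,3,4}, so key_buff_ptr[k-1]+m counts every key
       smaller than k, e.g. key 5 gets rank key_buff_ptr[4]+m = 1+m. */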
    /* This is the partial verify test section.
       Observe that test_rank_array vals are shifted differently for different cases. */
    for( i=0; i<TEST_ARRAY_SIZE; i++ ){
        k = gd->bucket_size_totals[i+num_buckets];  /* Keys were hidden here */
        if( min_key_val <= k && k <= max_key_val ){
            /* Add the total of lesser keys, m, here */
            INT_TYPE2 key_rank = key_buff_ptr[k-1] + m;
            int failed = 0;

            switch( class ) {
                case 'S':
                    if( i <= 2 ) {
                        if( key_rank != gd->test_rank_array[i]+iteration ) failed = 1;
                        else gd->passed_verification++;
                    } else {
                        if( key_rank != gd->test_rank_array[i]-iteration ) failed = 1;
                        else gd->passed_verification++;
                    }
                    break;
                case 'W':
                    if( i < 2 ) {
                        if( key_rank != gd->test_rank_array[i]+(iteration-2) ) failed = 1;
                        else gd->passed_verification++;
                    } else {
                        if( key_rank != gd->test_rank_array[i]-iteration ) failed = 1;
                        else gd->passed_verification++;
                    }
                    break;
                case 'A':
                    if( i <= 2 ) {
                        if( key_rank != gd->test_rank_array[i]+(iteration-1) ) failed = 1;
                        else gd->passed_verification++;
                    } else {
                        if( key_rank != gd->test_rank_array[i]-(iteration-1) ) failed = 1;
                        else gd->passed_verification++;
                    }
                    break;
                case 'B':
                    if( i == 1 || i == 2 || i == 4 ) {
                        if( key_rank != gd->test_rank_array[i]+iteration ) failed = 1;
                        else gd->passed_verification++;
                    } else {
                        if( key_rank != gd->test_rank_array[i]-iteration ) failed = 1;
                        else gd->passed_verification++;
                    }
                    break;
                case 'C':
                    if( i <= 2 ) {
                        if( key_rank != gd->test_rank_array[i]+iteration ) failed = 1;
                        else gd->passed_verification++;
                    } else {
                        if( key_rank != gd->test_rank_array[i]-iteration ) failed = 1;
                        else gd->passed_verification++;
                    }
                    break;
                case 'D':
                    if( i < 2 ) {
                        if( key_rank != gd->test_rank_array[i]+iteration ) failed = 1;
                        else gd->passed_verification++;
                    } else {
                        if( key_rank != gd->test_rank_array[i]-iteration ) failed = 1;
                        else gd->passed_verification++;
                    }
                    break;
            }
            if( failed == 1 )
                printf( "Failed partial verification: iteration %d, processor %d, test key %d\n",
                        iteration, gd->my_rank, (int)i );
        }
    }
    /* Make copies of rank info for use by full_verify: these variables in rank are
       local; making them global slows down the code, probably since they then
       cannot be kept in registers by the compiler. */
    if( iteration == MAX_ITERATIONS ) {
        gd->key_buff_ptr_global = key_buff_ptr;
        gd->total_local_keys    = j;
        gd->total_lesser_keys   = 0;  /* no longer set to 'm', see note above */
    }
}
int main( int argc, char **argv )
{
    int i, iteration, itemp;
    double timecounter, maxtime;

    global_data* gd = malloc(sizeof(global_data));

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
    MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );
    get_info(argc, argv, &nprocs, &class);
    check_info(IS, nprocs, class);

    /* Initialize the verification arrays if a valid class */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        switch( class ) {
            case 'S':
                total_keys_log2 = 16;
                max_key_log_2 = 11;
                num_bucket_log_2 = 9;
                gd->test_index_array[i] = S_test_index_array[i];
                gd->test_rank_array[i]  = S_test_rank_array[i];
                break;
            case 'A':
                total_keys_log2 = 23;
                max_key_log_2 = 19;
                num_bucket_log_2 = 10;
                gd->test_index_array[i] = A_test_index_array[i];
                gd->test_rank_array[i]  = A_test_rank_array[i];
                break;
            case 'W':
                total_keys_log2 = 20;
                max_key_log_2 = 16;
                num_bucket_log_2 = 10;
                gd->test_index_array[i] = W_test_index_array[i];
                gd->test_rank_array[i]  = W_test_rank_array[i];
                break;
            case 'B':
                total_keys_log2 = 25;
                max_key_log_2 = 21;
                num_bucket_log_2 = 10;
                gd->test_index_array[i] = B_test_index_array[i];
                gd->test_rank_array[i]  = B_test_rank_array[i];
                break;
            case 'C':
                total_keys_log2 = 27;
                max_key_log_2 = 23;
                num_bucket_log_2 = 10;
                gd->test_index_array[i] = C_test_index_array[i];
                gd->test_rank_array[i]  = C_test_rank_array[i];
                break;
            case 'D':
                total_keys_log2 = 29;
                max_key_log_2 = 27;
                num_bucket_log_2 = 10;
                min_procs = 4;  /* class D needs at least 4 processes */
                gd->test_index_array[i] = D_test_index_array[i];
                gd->test_rank_array[i]  = D_test_rank_array[i];
                break;
        }
    total_keys  = (1 << total_keys_log2);
    max_key     = (1 << max_key_log_2);
    num_buckets = (1 << num_bucket_log_2);
    num_keys    = (total_keys/nprocs*min_procs);
    /* On larger numbers of processors, since the keys are (roughly) gaussian
       distributed, the first and last processor sort keys from a large interval,
       requiring larger array sizes. Note that for large nprocs, num_keys is
       nevertheless a small number. The required array size also depends on the
       bucket size used. The following values are validated for the 1024-bucket
       setup. */
    if (nprocs < 256)
        size_of_buffers = 3*num_keys/2;
    else if (nprocs < 512)
        size_of_buffers = 5*num_keys/2;
    else if (nprocs < 1024)
        size_of_buffers = 4*num_keys/2;
    else
        size_of_buffers = 13*num_keys/2;
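
    /* Sizing example (an illustrative addition, assuming class A with
       min_procs=1): on 64 processes, num_keys = 2^23/64 = 131072 and
       size_of_buffers = 3*131072/2 = 196608, i.e. 50% slack over the average
       per-process key count to absorb the heavier load of the gaussian tails. */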
    gd->key_array   = (INT_TYPE*)malloc(size_of_buffers*sizeof(INT_TYPE));
    gd->key_buff1   = (INT_TYPE*)malloc(size_of_buffers*sizeof(INT_TYPE));
    gd->key_buff2   = (INT_TYPE*)malloc(size_of_buffers*sizeof(INT_TYPE));
    gd->bucket_size        = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE)); /* Top 5 elements for  */
    gd->bucket_size_totals = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE)); /* partial verify vals */
    gd->bucket_ptrs = (INT_TYPE*)malloc(num_buckets*sizeof(INT_TYPE));
    gd->process_bucket_distrib_ptr1 = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE));
    gd->process_bucket_distrib_ptr2 = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE));
    /* send_count, recv_count, send_displ and recv_displ are fixed-size arrays
       (dimensioned for max_procs = 1024) declared at file scope above. */
    /* Printout initial NPB info */
    if( gd->my_rank == 0 ){
        printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
        printf( " Size: %ld (class %c)\n", (long)total_keys*min_procs, class );
        printf( " Iterations: %d\n", MAX_ITERATIONS );
        printf( " Number of processes: %d\n", gd->comm_size );
    }
    /* Check that actual and compiled number of processors agree */
    if( gd->comm_size != nprocs ) {
        if( gd->my_rank == 0 )
            printf( "\n ERROR: compiled for %d processes\n"
                    " Number of active processes: %d\n"
                    " Exiting program!\n\n", nprocs, gd->comm_size );
        MPI_Finalize();
        exit( 1 );
    }
    /* Check that the total number of processes is within bounds. This could in
       principle be checked in setparams.c, but it is more convenient to do it here. */
    if( gd->comm_size < min_procs || gd->comm_size > max_procs ) {
        if( gd->my_rank == 0 )
            printf( "\n ERROR: number of processes %d not within range %d-%d"
                    "\n Exiting program!\n\n", gd->comm_size, min_procs, max_procs );
        MPI_Finalize();
        exit( 1 );
    }
    /* Generate random number sequence and subsequent keys on all procs */
    create_seq(gd, find_my_seed( gd->my_rank, gd->comm_size,
                                 4*(long)total_keys*min_procs, /* 4 randlc calls per key */
                                 314159265.00,                 /* Random number gen seed */
                                 1220703125.00 ),              /* Random number gen mult */
               1220703125.00 );                                /* Random number gen mult */
    /* Do one iteration for free (i.e., untimed) to guarantee initialization of all
       data and code pages and respective tables */
    rank( gd, 1 );

    /* Start verification counter */
    gd->passed_verification = 0;

    if( gd->my_rank == 0 && class != 'S' ) printf( "\n iteration\n" );
    /* Initialize and start timer */
    timer_clear( 0 );
    timer_start( 0 );

    /* Tag this rank's events for SMPI tracing */
    char smpi_category[100];
    snprintf (smpi_category, 100, "%d", gd->my_rank);
    TRACE_smpi_set_category (smpi_category);
    /* This is the main iteration */
    for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ ) {
        if( gd->my_rank == 0 && class != 'S' ) printf( " %d\n", iteration );
        rank( gd, iteration );
    }

    TRACE_smpi_set_category (NULL);
    /* Stop timer, obtain time for processors */
    timer_stop( 0 );
    timecounter = timer_read( 0 );

    /* End of timing, obtain maximum time of all processors */
    MPI_Reduce( &timecounter, &maxtime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD );
    /* This tests that keys are in sequence: sorting of the last ranked key
       sequence occurs here, but is an untimed operation */
    full_verify( gd );
    /* Obtain verification counter sum */
    itemp = gd->passed_verification;
    MPI_Reduce( &itemp, &gd->passed_verification, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD );
    /* The final printout */
    if( gd->my_rank == 0 ) {
        if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
            gd->passed_verification = 0;
        c_print_results("IS", class, (int)(total_keys), min_procs, 0, MAX_ITERATIONS,
                        nprocs, gd->comm_size, maxtime,
                        ((double)(MAX_ITERATIONS)*total_keys*min_procs)/maxtime/1000000.,
                        "keys ranked", gd->passed_verification);
    }

    MPI_Finalize();
    free(gd);
    return 0;
}