/*************************************************************************
 *                                                                       *
 *        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3       *
 *                                                                       *
 *                                  I S                                  *
 *                                                                       *
 *************************************************************************
 *                                                                       *
 *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.     *
 *   It is described in NAS Technical Report 95-020.                     *
 *                                                                       *
 *   Permission to use, copy, distribute and modify this software        *
 *   for any purpose with or without fee is hereby granted.  We          *
 *   request, however, that all derived work reference the NAS           *
 *   Parallel Benchmarks 3.3. This software is provided "as is"          *
 *   without express or implied warranty.                                *
 *                                                                       *
 *   Information on NPB 3.3, including the technical report, the        *
 *   original specifications, source code, results and information       *
 *   on how to submit new results, is available at:                      *
 *                                                                       *
 *          http://www.nas.nasa.gov/Software/NPB                         *
 *                                                                       *
 *   Send comments or suggestions to  npb@nas.nasa.gov                   *
 *   Send bug reports to              npb-bugs@nas.nasa.gov              *
 *                                                                       *
 *         NAS Parallel Benchmarks Group                                 *
 *         NASA Ames Research Center                                     *
 *         Mail Stop: T27A-1                                             *
 *         Moffett Field, CA   94035-1000                                *
 *                                                                       *
 *         E-mail:  npb@nas.nasa.gov                                     *
 *         Fax:     (650) 604-3957                                       *
 *                                                                       *
 *         Author: M. Yarrow                                             *
 *                                                                       *
 *************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

#include "nas_common.h"
#include "simgrid/instr.h" /* for TRACE_smpi_set_category() */
/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF PROCESSORS.
 * THE LARGEST VERIFIED NUMBER IS 1024. INCREASE max_procs AT YOUR PERIL */
#define MAX_ITERATIONS 10
#define TEST_ARRAY_SIZE 5

/* Typedefs: if necessary, enlarge the key range by changing INT_TYPE to, say,
   long (and MP_KEY_TYPE to the matching MPI datatype) */
typedef int  INT_TYPE;
typedef long INT_TYPE2;
#define MP_KEY_TYPE MPI_INT

/* Problem parameters, set in main() from the benchmark class */
static char class;
static int  nprocs, total_keys_log2, max_key_log_2, num_bucket_log_2;
static int  min_procs = 1, max_procs = 1024; /* largest verified count, see note above */
static INT_TYPE total_keys, max_key, num_buckets, num_keys;
static long size_of_buffers;
/* All per-process benchmark state is gathered in one structure (passed around as gd) */
typedef struct global_data {
  int my_rank, comm_size;

  /* Some global info */
  INT_TYPE *key_buff_ptr_global;   /* used by full_verify to get  */
  INT_TYPE total_local_keys;       /* copies of rank info         */
  INT_TYPE total_lesser_keys;

  int passed_verification;

  /* These are the three main arrays. See size_of_buffers in main() */
  INT_TYPE *key_array, *key_buff1, *key_buff2,
           *bucket_size,           /* Top 5 elements for */
           *bucket_size_totals,    /* part. ver. vals    */
           *bucket_ptrs, *process_bucket_distrib_ptr1, *process_bucket_distrib_ptr2;
  int send_count[1024], recv_count[1024], send_displ[1024], recv_displ[1024]; /* sized for max_procs */

  /* Partial verif info */
  INT_TYPE2 test_index_array[TEST_ARRAY_SIZE],
            test_rank_array[TEST_ARRAY_SIZE];
} global_data;
/* Expected index/rank values for the partial verification test, per class */
static INT_TYPE2
  S_test_index_array[TEST_ARRAY_SIZE] = {48427,17148,23627,62548,4431},
  S_test_rank_array[TEST_ARRAY_SIZE]  = {0,18,346,64917,65463},

  W_test_index_array[TEST_ARRAY_SIZE] = {357773,934767,875723,898999,404505},
  W_test_rank_array[TEST_ARRAY_SIZE]  = {1249,11698,1039987,1043896,1048018},

  A_test_index_array[TEST_ARRAY_SIZE] = {2112377,662041,5336171,3642833,4250760},
  A_test_rank_array[TEST_ARRAY_SIZE]  = {104,17523,123928,8288932,8388264},

  B_test_index_array[TEST_ARRAY_SIZE] = {41869,812306,5102857,18232239,26860214},
  B_test_rank_array[TEST_ARRAY_SIZE]  = {33422937,10244,59149,33135281,99},

  C_test_index_array[TEST_ARRAY_SIZE] = {44172927,72999161,74326391,129606274,21736814},
  C_test_rank_array[TEST_ARRAY_SIZE]  = {61147,882988,266290,133997595,133525895},

  D_test_index_array[TEST_ARRAY_SIZE] = {1317351170,995930646,1157283250,1503301535,1453734525},
  D_test_rank_array[TEST_ARRAY_SIZE]  = {1,36538729,1978098519,2145192618,2147425337};
void full_verify( global_data* gd );
/************ returns parallel random number seq seed ************/
/*
 * Create a random number sequence of total length nn residing on np processors.
 * Each processor therefore owns a subsequence of length nn/np. This routine
 * returns the first random number of the subsequence belonging to processor
 * rank kn, which is then used to seed processor kn's generator.
 */
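/* Example: with np = 4 processes and nn = 2^22 numbers, each process owns
 * nq = 2^20 of them, so process kn must start from the generator state
 * reached after kn*2^20 steps. The binary method below reaches that state
 * in O(log nn) randlc() calls instead of kn*2^20 sequential ones. */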
static double find_my_seed( int kn,    /* my processor rank, 0<=kn<=num procs */
                            int np,    /* np = num procs                      */
                            long nn,   /* total num of ran numbers, all procs */
                            double s,  /* Ran num seed, for ex.: 314159265.00 */
                            double a ) /* Ran num gen mult, try 1220703125.00 */
{
  double t1, t2, t3, an;
  long mq, nq, kk, ik, i;

  /* mq = number of halvings bringing the per-processor count nq down to 1 */
  nq = nn / np;
  for( mq=0; nq>1; mq++, nq/=2 )
    ;

  /* square the multiplier mq times: a jump of one whole subsequence */
  t1 = a;
  for( i=1; i<=mq; i++ )
    t2 = randlc( &t1, &t1 );
  an = t1;

  /* advance the seed s by kn subsequences via binary exponentiation */
  kk = kn;  t1 = s;  t2 = an;
  for( i=1; i<=100; i++ ){
    ik = kk / 2;
    if( 2*ik != kk )
      t3 = randlc( &t1, &t2 );
    if( ik == 0 )
      break;
    t3 = randlc( &t2, &t2 );
    kk = ik;
  }
  an = t3; // added to silence paranoid compilers

  return t1;
}
static void create_seq( global_data* gd, double seed, double a )
{
  double x;
  INT_TYPE i, k = max_key/4;

  for (i=0; i<num_keys; i++){
    /* sum of four uniform deviates, scaled by max_key/4 */
    x = randlc(&seed, &a);
    x += randlc(&seed, &a);
    x += randlc(&seed, &a);
    x += randlc(&seed, &a);
    gd->key_array[i] = k*x;
  }
}
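/* Each key above is the sum of four uniform random fractions scaled by
 * max_key/4, so keys follow a bell-shaped, roughly Gaussian distribution
 * centered on max_key/2; this is why the buffer sizing in main() gives the
 * processes owning the extreme key intervals extra headroom. */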
void full_verify( global_data* gd )
{
  MPI_Status  status;
  MPI_Request request;

  INT_TYPE i, j;
  INT_TYPE k, last_local_key;

  /* Now, finally, sort the keys: */
  for( i=0; i<gd->total_local_keys; i++ )
    gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]] - gd->total_lesser_keys] = gd->key_buff2[i];
  last_local_key = (gd->total_local_keys<1)? 0 : (gd->total_local_keys-1);

  /* Send largest key value to next processor */
  if( gd->my_rank > 0 )
    MPI_Irecv( &k, 1, MP_KEY_TYPE, gd->my_rank-1, 1000, MPI_COMM_WORLD, &request );
  if( gd->my_rank < gd->comm_size-1 )
    MPI_Send( &gd->key_array[last_local_key], 1, MP_KEY_TYPE, gd->my_rank+1, 1000, MPI_COMM_WORLD );
  if( gd->my_rank > 0 )
    MPI_Wait( &request, &status );

  /* Confirm that neighbor's greatest key value is not greater than my least key value */
  j = 0;
  if( gd->my_rank > 0 && gd->total_local_keys > 0 )
    if( k > gd->key_array[0] )
      j++;

  /* Confirm keys correctly sorted: count incorrectly sorted keys, if any */
  for( i=1; i<gd->total_local_keys; i++ )
    if( gd->key_array[i-1] > gd->key_array[i] )
      j++;

  if( j != 0 )
    printf( "Processor %d: Full_verify: number of keys out of sort: %d\n", gd->my_rank, (int)j );
  else
    gd->passed_verification++;
}
static void rank( global_data* gd, int iteration )
{
  INT_TYPE i, k, key;
  INT_TYPE shift = max_key_log_2 - num_bucket_log_2;
  INT_TYPE2 bucket_sum_accumulator, j, m;
  INT_TYPE local_bucket_sum_accumulator;
  INT_TYPE min_key_val, max_key_val;
  INT_TYPE *key_buff_ptr;
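  /* Example: for class C, max_key_log_2 = 23 and num_bucket_log_2 = 10, so
     shift = 13: key k falls into bucket (k >> 13), i.e. each of the 1024
     buckets covers 2^13 consecutive key values. */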
  /* Iteration alteration of keys */
  if(gd->my_rank == 0){
    gd->key_array[iteration] = iteration;
    gd->key_array[iteration+MAX_ITERATIONS] = max_key - iteration;
  }

  /* Initialize */
  for( i=0; i<num_buckets+TEST_ARRAY_SIZE; i++ ){
    gd->bucket_size[i] = 0;
    gd->bucket_size_totals[i] = 0;
    gd->process_bucket_distrib_ptr1[i] = 0;
    gd->process_bucket_distrib_ptr2[i] = 0;
  }
  /* Determine where the partial verify test keys are, load into top of array bucket_size */
  for( i=0; i<TEST_ARRAY_SIZE; i++ )
    if( (gd->test_index_array[i]/num_keys) == gd->my_rank )
      gd->bucket_size[num_buckets+i] = gd->key_array[gd->test_index_array[i] % num_keys];

  /* Determine the number of keys in each bucket */
  for( i=0; i<num_keys; i++ )
    gd->bucket_size[gd->key_array[i] >> shift]++;
  /* Accumulative bucket sizes are the bucket pointers */
  gd->bucket_ptrs[0] = 0;
  for( i=1; i<num_buckets; i++ )
    gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];
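  /* Example: bucket_size = {3,1,4,...} yields bucket_ptrs = {0,3,4,8,...}
     (an exclusive prefix sum), so bucket i's keys will occupy
     key_buff1[bucket_ptrs[i] .. bucket_ptrs[i]+bucket_size[i]-1]. */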
  /* Sort into appropriate bucket */
  for( i=0; i<num_keys; i++ ) {
    key = gd->key_array[i];
    gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key;
  }
  /* Get the bucket size totals for the entire problem. These will be used to determine the redistribution of keys */
  MPI_Allreduce(gd->bucket_size, gd->bucket_size_totals, num_buckets+TEST_ARRAY_SIZE, MP_KEY_TYPE, MPI_SUM,
                MPI_COMM_WORLD );
  /* Determine redistribution of keys: accumulate the bucket size totals until
     the sum surpasses num_keys (the average number of keys per processor);
     all keys in these buckets go to processor 0. Continue accumulating until
     the sum surpasses 2*num_keys: those buckets go to processor 1, and so on.
     This algorithm guarantees that every processor gets ranking work; none is
     left idle. Fewer buckets would balance the load less evenly, while more
     buckets mean more computation per processor; 1024 buckets turned out to
     be the best trade-off on the machines tested.
     Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the numbers of
     the first and last bucket each processor owns after the redistribution. */
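  /* Worked example: 4 processes, 8 buckets, num_keys = 100 (400 keys total).
     If bucket_size_totals = {70,50,30,60,40,50,40,60}, the running sum crosses
     100 at bucket 1, 200 at bucket 3, 300 at bucket 5 and 400 at bucket 7, so
     process 0 receives buckets 0-1, process 1 buckets 2-3, process 2 buckets
     4-5 and process 3 buckets 6-7. */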
  bucket_sum_accumulator = 0;
  local_bucket_sum_accumulator = 0;
  gd->send_displ[0] = 0;
  gd->process_bucket_distrib_ptr1[0] = 0;
  for( i=0, j=0; i<num_buckets; i++ ) {
    bucket_sum_accumulator += gd->bucket_size_totals[i];
    local_bucket_sum_accumulator += gd->bucket_size[i];
    if( bucket_sum_accumulator >= (j+1)*num_keys ) {
      gd->send_count[j] = local_bucket_sum_accumulator;
      if( j != 0 ) {
        gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1];
        gd->process_bucket_distrib_ptr1[j] = gd->process_bucket_distrib_ptr2[j-1]+1;
      }
      gd->process_bucket_distrib_ptr2[j++] = i;
      local_bucket_sum_accumulator = 0;
    }
  }
  /* When nprocs approaches num_buckets, it is quite possible that the last few
     processors don't get any buckets, so set their counts explicitly to avoid
     any fallout. */
  while( j < gd->comm_size ) {
    gd->send_count[j] = 0;
    gd->process_bucket_distrib_ptr1[j] = 1;
    gd->process_bucket_distrib_ptr2[j++] = 0;
  }
  /* This is the redistribution section: first find out how many keys
     each processor will send to every other processor: */
  MPI_Alltoall( gd->send_count, 1, MPI_INT, gd->recv_count, 1, MPI_INT, MPI_COMM_WORLD );

  /* Determine the receive array displacements for the buckets */
  gd->recv_displ[0] = 0;
  for( i=1; i<gd->comm_size; i++ )
    gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];

  /* Now send the keys to respective processors */
  MPI_Alltoallv(gd->key_buff1, gd->send_count, gd->send_displ, MP_KEY_TYPE, gd->key_buff2, gd->recv_count,
                gd->recv_displ, MP_KEY_TYPE, MPI_COMM_WORLD );
  /* The starting and ending bucket numbers on each processor are multiplied by the interval size of the buckets to
     obtain the smallest possible min and greatest possible max value of any key on each processor */
  min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift;
  max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1;
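  /* Example: a process assigned buckets 5..8 with shift = 13 gets
     min_key_val = 5<<13 and max_key_val = (9<<13)-1, so the work array
     cleared below needs max_key_val-min_key_val+1 = 4*2^13 entries. */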
  /* Clear the work array */
  for( i=0; i<max_key_val-min_key_val+1; i++ )
    gd->key_buff1[i] = 0;

  /* Determine the total number of keys on all other processors holding keys of lesser value */
  m = 0;
  for( k=0; k<gd->my_rank; k++ )
    for( i= gd->process_bucket_distrib_ptr1[k]; i<=gd->process_bucket_distrib_ptr2[k]; i++ )
      m += gd->bucket_size_totals[i]; /* m has total # of lesser keys */
  /* Determine total number of keys on this processor */
  j = 0;
  for( i= gd->process_bucket_distrib_ptr1[gd->my_rank]; i<=gd->process_bucket_distrib_ptr2[gd->my_rank]; i++ )
    j += gd->bucket_size_totals[i]; /* j has total # of local keys */
  /* Ranking of all keys occurs in this section: */
  /* shift it backwards so no subtractions are necessary in loop */
  key_buff_ptr = gd->key_buff1 - min_key_val;

  /* In this section, the keys themselves are used as their own indexes to determine how many of each there are:
     their individual population */
  for( i=0; i<j; i++ )
    key_buff_ptr[gd->key_buff2[i]]++; /* Now they have individual key population */
  /* To obtain ranks of each key, successively add the individual key population, not forgetting the total of
     lesser keys, m.
     NOTE: Since the total of lesser keys is subtracted later in verification, it is no longer added to the first
     key population here, but it is still needed during the partial verify test. This ensures that a 32-bit
     key_buff can still be used for class D. */
  /* key_buff_ptr[min_key_val] += m; */
  for( i=min_key_val; i<max_key_val; i++ )
    key_buff_ptr[i+1] += key_buff_ptr[i];
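  /* Example: if the local counts for keys min_key_val..min_key_val+2 were
     {2,0,3}, the running sum turns them into {2,2,5}: key_buff_ptr[v] is now
     the number of local keys <= v, and key_buff_ptr[k-1] + m (used below) is
     the global count of keys smaller than k. */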
  /* This is the partial verify test section */
  /* Observe that test_rank_array vals are shifted differently for different cases */
  for( i=0; i<TEST_ARRAY_SIZE; i++ ){
    k = gd->bucket_size_totals[i+num_buckets]; /* Keys were hidden here */
    if( min_key_val <= k && k <= max_key_val ){
      /* Add the total of lesser keys, m, here */
      INT_TYPE2 key_rank = key_buff_ptr[k-1] + m;
      INT_TYPE2 offset;

      /* Each class shifts the reference ranks by an iteration-dependent
         offset whose sign depends on which test key it is */
      switch( class ){
        case 'S': offset = (i <= 2) ? iteration   : -iteration;     break;
        case 'W': offset = (i <  2) ? iteration-2 : -iteration;     break;
        case 'A': offset = (i <= 2) ? iteration-1 : -(iteration-1); break;
        case 'B': offset = (i == 1 || i == 2 || i == 4) ? iteration : -iteration; break;
        case 'C': offset = (i <= 2) ? iteration   : -iteration;     break;
        case 'D': offset = (i <  2) ? iteration   : -iteration;     break;
        default:  offset = 0; break; /* unreachable: class is validated in main() */
      }

      if( key_rank != gd->test_rank_array[i]+offset )
        printf( "Failed partial verification: iteration %d, processor %d, test key %d\n",
                iteration, gd->my_rank, (int)i );
      else
        gd->passed_verification++;
    }
  }
  /* Make copies of rank info for use by full_verify: these variables in rank are local; making them global slows
     down the code, probably since they cannot be kept in registers by the compiler */
  if( iteration == MAX_ITERATIONS ) {
    gd->key_buff_ptr_global = key_buff_ptr;
    gd->total_local_keys = j;
    gd->total_lesser_keys = 0; /* no longer set to 'm', see note above */
  }
}
int main( int argc, char **argv )
{
  int i, iteration, itemp;
  double timecounter, maxtime;

  global_data* gd = malloc(sizeof(global_data));

  MPI_Init( &argc, &argv );
  MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
  MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );

  get_info(argc, argv, &nprocs, &class);
  check_info(IS, nprocs, class);
  /* Initialize the verification arrays if a valid class */
  for( i=0; i<TEST_ARRAY_SIZE; i++ )
    switch( class ){
      case 'S':
        total_keys_log2 = 16;  max_key_log_2 = 11;  num_bucket_log_2 = 9;   min_procs = 1;
        gd->test_index_array[i] = S_test_index_array[i];
        gd->test_rank_array[i]  = S_test_rank_array[i];
        break;
      case 'A':
        total_keys_log2 = 23;  max_key_log_2 = 19;  num_bucket_log_2 = 10;  min_procs = 1;
        gd->test_index_array[i] = A_test_index_array[i];
        gd->test_rank_array[i]  = A_test_rank_array[i];
        break;
      case 'W':
        total_keys_log2 = 20;  max_key_log_2 = 16;  num_bucket_log_2 = 10;  min_procs = 1;
        gd->test_index_array[i] = W_test_index_array[i];
        gd->test_rank_array[i]  = W_test_rank_array[i];
        break;
      case 'B':
        total_keys_log2 = 25;  max_key_log_2 = 21;  num_bucket_log_2 = 10;  min_procs = 1;
        gd->test_index_array[i] = B_test_index_array[i];
        gd->test_rank_array[i]  = B_test_rank_array[i];
        break;
      case 'C':
        total_keys_log2 = 27;  max_key_log_2 = 23;  num_bucket_log_2 = 10;  min_procs = 1;
        gd->test_index_array[i] = C_test_index_array[i];
        gd->test_rank_array[i]  = C_test_rank_array[i];
        break;
      case 'D':
        total_keys_log2 = 29;  max_key_log_2 = 27;  num_bucket_log_2 = 10;
        min_procs = 4; /* class D requires at least 4 processes */
        gd->test_index_array[i] = D_test_index_array[i];
        gd->test_rank_array[i]  = D_test_rank_array[i];
        break;
    }
  total_keys = (1 << total_keys_log2);
  max_key = (1 << max_key_log_2);
  num_buckets = (1 << num_bucket_log_2);
  num_keys = (total_keys/nprocs*min_procs);
  /* On larger numbers of processors, since the keys are (roughly) Gaussian distributed, the first and last
     processors sort keys spanning a large interval, requiring larger array sizes. Note that for large nprocs,
     num_keys itself is small. The required array size also depends on the bucket count; the following values
     are validated for the 1024-bucket setup. */
  if (nprocs < 256)
    size_of_buffers = 3*num_keys/2;
  else if (nprocs < 512)
    size_of_buffers = 5*num_keys/2;
  else if (nprocs < 1024)
    size_of_buffers = 4*num_keys/2;
  else
    size_of_buffers = 13*num_keys/2;
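  /* Example: class B on 64 processes gives num_keys = 2^25/64 = 524288, so
     each of the three main arrays below holds 3*num_keys/2 = 786432 entries. */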
  gd->key_array   = (INT_TYPE*)malloc(size_of_buffers*sizeof(INT_TYPE));
  gd->key_buff1   = (INT_TYPE*)malloc(size_of_buffers*sizeof(INT_TYPE));
  gd->key_buff2   = (INT_TYPE*)malloc(size_of_buffers*sizeof(INT_TYPE));
  gd->bucket_size = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE));        /* Top 5 elements for */
  gd->bucket_size_totals = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE)); /* part. ver. vals    */
  gd->bucket_ptrs = (INT_TYPE*)malloc(num_buckets*sizeof(INT_TYPE));
  gd->process_bucket_distrib_ptr1 = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE));
  gd->process_bucket_distrib_ptr2 = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE));
  // int send_count[max_procs], recv_count[max_procs],
  //     send_displ[max_procs], recv_displ[max_procs];
  /* Printout initial NPB info */
  if( gd->my_rank == 0 ){
    printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
    printf( " Size:  %ld  (class %c)\n", (long)total_keys*min_procs, class);
    printf( " Iterations:   %d\n", MAX_ITERATIONS );
    printf( " Number of processes:  %d\n", gd->comm_size );
  }
  /* Check that actual and compiled number of processors agree */
  if( gd->comm_size != nprocs) {
    if( gd->my_rank == 0 )
      printf( "\n ERROR: compiled for %d processes\n"
              " Number of active processes: %d\n"
              " Exiting program!\n\n", nprocs, gd->comm_size );
    MPI_Finalize();
    exit( 1 );
  }

  /* Check that the total number of processes is within bounds.
     This could in principle be checked in setparams.c, but it is more convenient to do it here */
  if( gd->comm_size < min_procs || gd->comm_size > max_procs){
    if( gd->my_rank == 0 )
      printf( "\n ERROR: number of processes %d not within range %d-%d"
              "\n Exiting program!\n\n", gd->comm_size, min_procs, max_procs);
    MPI_Finalize();
    exit( 1 );
  }
  /* Generate random number sequence and subsequent keys on all procs */
  create_seq(gd, find_my_seed( gd->my_rank, gd->comm_size, 4*(long)total_keys*min_procs,
                               314159265.00,      /* Random number gen seed */
                               1220703125.00 ),   /* Random number gen mult */
             1220703125.00 );                     /* Random number gen mult */
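  /* The sequence length passed to find_my_seed() is 4*total_keys*min_procs
     because create_seq() consumes four random numbers per generated key. */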
  /* Do one iteration for free (i.e., untimed) to guarantee initialization of
     all data and code pages and respective tables */
  rank( gd, 1 );

  /* Start verification counter */
  gd->passed_verification = 0;
  if( gd->my_rank == 0 && class != 'S' ) printf( "\n iteration\n" );

  /* Initialize timer */
  timer_clear( 0 );

  /* Start timer */
  timer_start( 0 );

  /* Tag this process's SMPI trace events with its rank as category */
  char smpi_category[100];
  snprintf (smpi_category, 100, "%d", gd->my_rank);
  TRACE_smpi_set_category (smpi_category);
  /* This is the main iteration */
  for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ ) {
    if( gd->my_rank == 0 && class != 'S' ) printf( " %d\n", iteration );
    rank(gd, iteration );
  }
  TRACE_smpi_set_category (NULL);
  /* Stop timer, obtain time for processors */
  timer_stop( 0 );
  timecounter = timer_read(0);

  /* End of timing, obtain maximum time of all processors */
  MPI_Reduce( &timecounter, &maxtime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD );
  /* This tests that keys are in sequence: sorting of last ranked key seq occurs here, but is an untimed operation */
  full_verify( gd );

  /* Obtain verification counter sum */
  itemp = gd->passed_verification;
  MPI_Reduce( &itemp, &gd->passed_verification, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD );
  /* The final printout */
  if( gd->my_rank == 0 ) {
    if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
      gd->passed_verification = 0;
    c_print_results("IS", class, (int)(total_keys), min_procs, 0, MAX_ITERATIONS, nprocs, gd->comm_size, maxtime,
                    ((double) (MAX_ITERATIONS)*total_keys*min_procs)/maxtime/1000000., "keys ranked",
                    gd->passed_verification);