/*************************************************************************
 *                                                                       *
 *        N A S   P A R A L L E L   B E N C H M A R K S  3.3             *
 *                                                                       *
 *************************************************************************
 *                                                                       *
 *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.     *
 *   It is described in NAS Technical Report 95-020.                     *
 *                                                                       *
 *   Permission to use, copy, distribute and modify this software        *
 *   for any purpose with or without fee is hereby granted.  We          *
 *   request, however, that all derived work reference the NAS           *
 *   Parallel Benchmarks 3.3. This software is provided "as is"          *
 *   without express or implied warranty.                                *
 *                                                                       *
 *   Information on NPB 3.3, including the technical report, the        *
 *   original specifications, source code, results and information       *
 *   on how to submit new results, is available at:                      *
 *                                                                       *
 *          http://www.nas.nasa.gov/Software/NPB                         *
 *                                                                       *
 *   Send comments or suggestions to  npb@nas.nasa.gov                   *
 *   Send bug reports to              npb-bugs@nas.nasa.gov              *
 *                                                                       *
 *         NAS Parallel Benchmarks Group                                 *
 *         NASA Ames Research Center                                     *
 *         Moffett Field, CA  94035-1000                                 *
 *                                                                       *
 *         E-mail: npb@nas.nasa.gov                                      *
 *         Fax:    (650) 604-3957                                        *
 *                                                                       *
 *************************************************************************/
#include "nas_common.h"
#include "simgrid/instr.h" /* TRACE_ */
/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF PROCESSORS.
 * THE LARGEST VERIFIED NUMBER IS 1024. INCREASE max_procs AT YOUR PERIL. */

/* Problem-size parameters; set per class in main() */
int  nprocs, min_procs = 1, max_procs = 1024;
char class;
int  total_keys_log2, max_key_log_2, num_bucket_log_2;
long total_keys, max_key, num_buckets, num_keys, size_of_buffers;

#define MAX_ITERATIONS 10
#define TEST_ARRAY_SIZE 5
/* Typedef: if necessary, change the size of the key type here by changing int to,
   say, long; MP_KEY_TYPE below must then change to the matching MPI datatype */
typedef int  INT_TYPE;
typedef long INT_TYPE2;
#define MP_KEY_TYPE MPI_INT
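
/* Illustrative guard (an addition, assuming a C11 compiler): if INT_TYPE is ever
   widened without updating MP_KEY_TYPE, the MPI calls would silently truncate
   keys; a compile-time check makes the pairing explicit. */
_Static_assert(sizeof(INT_TYPE) == sizeof(int), "MP_KEY_TYPE (MPI_INT) must match INT_TYPE");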
int my_rank, comm_size;

/* Some global info */
INT_TYPE *key_buff_ptr_global,       /* used by full_verify to get */
         total_local_keys,           /* copies of rank info        */
         total_lesser_keys;

int passed_verification;
/* These are the three main arrays; they are sized to size_of_buffers, computed in main() */
INT_TYPE *key_array, *key_buff1, *key_buff2,
         *bucket_size,              /* Top 5 elements for  */
         *bucket_size_totals,       /* partial verify vals */
         *bucket_ptrs, *process_bucket_distrib_ptr1, *process_bucket_distrib_ptr2;
int send_count[1024], recv_count[1024], send_displ[1024], recv_displ[1024];
/* Partial verification info */
INT_TYPE2 test_index_array[TEST_ARRAY_SIZE],
          test_rank_array[TEST_ARRAY_SIZE];

INT_TYPE2 S_test_index_array[TEST_ARRAY_SIZE] = {48427,17148,23627,62548,4431},
          S_test_rank_array[TEST_ARRAY_SIZE]  = {0,18,346,64917,65463},

          W_test_index_array[TEST_ARRAY_SIZE] = {357773,934767,875723,898999,404505},
          W_test_rank_array[TEST_ARRAY_SIZE]  = {1249,11698,1039987,1043896,1048018},

          A_test_index_array[TEST_ARRAY_SIZE] = {2112377,662041,5336171,3642833,4250760},
          A_test_rank_array[TEST_ARRAY_SIZE]  = {104,17523,123928,8288932,8388264},

          B_test_index_array[TEST_ARRAY_SIZE] = {41869,812306,5102857,18232239,26860214},
          B_test_rank_array[TEST_ARRAY_SIZE]  = {33422937,10244,59149,33135281,99},

          C_test_index_array[TEST_ARRAY_SIZE] = {44172927,72999161,74326391,129606274,21736814},
          C_test_rank_array[TEST_ARRAY_SIZE]  = {61147,882988,266290,133997595,133525895},

          D_test_index_array[TEST_ARRAY_SIZE] = {1317351170,995930646,1157283250,1503301535,1453734525},
          D_test_rank_array[TEST_ARRAY_SIZE]  = {1,36538729,1978098519,2145192618,2147425337};
void full_verify( global_data* gd );
/************ returns parallel random number seq seed ************/
/*
 * Create a random number sequence of total length nn residing on np processors.
 * Each processor therefore holds a subsequence of length nn/np. This routine
 * returns the first random number of the subsequence belonging to processor kn,
 * which is then used as the seed for processor kn's generator.
 */
static double find_my_seed( int kn,    /* my processor rank, 0<=kn<=num procs */
                            int np,    /* np = num procs                      */
                            long nn,   /* total num of ran numbers, all procs */
                            double s,  /* Ran num seed, for ex.: 314159265.00 */
                            double a ) /* Ran num gen mult, try 1220703125.00 */
{
    long i, ik, kk = kn, mq, nq = nn/np;
    double t1, t2, t3 = 0, an;
    for( mq=0; nq>1; mq++,nq/=2 );  /* mq = log2(nn/np) */
    t1 = a;
    for( i=1; i<=mq; i++ )          /* square a mq times: an = a^(nn/np) */
        t2 = randlc( &t1, &t1 );
    an = t1;
    t1 = s;
    t2 = an;
    for( i=1; i<=100; i++ ){        /* advance s by kn*(nn/np) steps via binary expansion of kn */
        ik = kk / 2;
        if( 2*ik != kk ) t3 = randlc( &t1, &t2 );
        if( ik == 0 ) break;
        t3 = randlc( &t2, &t2 );
        kk = ik;
    }
    an = t3; /* added to silence paranoid compilers */
    return t1;
}
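
/* Worked example (an illustrative addition, not benchmark code): with
   s=314159265.0, a=1220703125.0, nn=2^19 and np=4, each processor owns
   nn/np = 2^17 numbers, and find_my_seed(2,4,nn,s,a) returns the generator
   state after 2*2^17 randlc steps, i.e. a^(2^18)*s mod 2^46, so processor 2
   starts exactly where processor 1's subsequence ends. */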
static void create_seq( global_data* gd, double seed, double a )
{
    double x;
    INT_TYPE i, k = max_key/4;

    for (i=0; i<num_keys; i++){
        x = randlc(&seed, &a);      /* the sum of four uniform randoms is */
        x += randlc(&seed, &a);     /* approximately gaussian in [0,4)    */
        x += randlc(&seed, &a);
        x += randlc(&seed, &a);

        gd->key_array[i] = k*x;     /* scale to a key in [0,max_key) */
    }
}
void full_verify( global_data* gd )
{
    MPI_Status  status;
    MPI_Request request;
    INT_TYPE    i, j;
    INT_TYPE    k, last_local_key;

    /* Now, finally, sort the keys: */
    for( i=0; i<gd->total_local_keys; i++ )
        gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]] - gd->total_lesser_keys] = gd->key_buff2[i];
    last_local_key = (gd->total_local_keys<1)? 0 : (gd->total_local_keys-1);

    /* Send largest key value to next processor */
    if( gd->my_rank > 0 )
        MPI_Irecv( &k, 1, MP_KEY_TYPE, gd->my_rank-1, 1000, MPI_COMM_WORLD, &request );
    if( gd->my_rank < gd->comm_size-1 )
        MPI_Send( &gd->key_array[last_local_key], 1, MP_KEY_TYPE, gd->my_rank+1, 1000, MPI_COMM_WORLD );
    if( gd->my_rank > 0 )
        MPI_Wait( &request, &status );

    /* Confirm that neighbor's greatest key value is not greater than my least key value */
    j = 0;
    if( gd->my_rank > 0 && gd->total_local_keys > 0 )
        if( k > gd->key_array[0] )
            j++;

    /* Confirm keys correctly sorted: count incorrectly sorted keys, if any */
    for( i=1; i<gd->total_local_keys; i++ )
        if( gd->key_array[i-1] > gd->key_array[i] )
            j++;

    if( j != 0 )
        printf( "Processor %d: Full_verify: number of keys out of sort: %d\n", gd->my_rank, (int)j );
    else
        gd->passed_verification++;
}
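
/* Note on the scatter step above (an illustrative addition): after rank(),
   key_buff_ptr_global[k] holds the number of local keys <= k, so the
   pre-decrement hands out a distinct zero-based slot to every copy of k.
   E.g. for local keys {5,3,5}, the counts are ptr[3]=1 and ptr[5]=3, and
   processing 5,3,5 places them in slots 2,0,1: the sorted order 3,5,5. */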
static void rank( global_data* gd, int iteration )
{
    INT_TYPE i, k, key;
    INT_TYPE shift = max_key_log_2 - num_bucket_log_2;
    INT_TYPE2 bucket_sum_accumulator, j, m;
    INT_TYPE local_bucket_sum_accumulator;
    INT_TYPE min_key_val, max_key_val;
    INT_TYPE *key_buff_ptr;

    /* Iteration alteration of keys */
    if(gd->my_rank == 0){
        gd->key_array[iteration] = iteration;
        gd->key_array[iteration+MAX_ITERATIONS] = max_key - iteration;
    }

    /* Initialize */
    for( i=0; i<num_buckets+TEST_ARRAY_SIZE; i++ ){
        gd->bucket_size[i] = 0;
        gd->bucket_size_totals[i] = 0;
        gd->process_bucket_distrib_ptr1[i] = 0;
        gd->process_bucket_distrib_ptr2[i] = 0;
    }
    /* Determine where the partial verify test keys are, load into top of array bucket_size */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        if( (gd->test_index_array[i]/num_keys) == gd->my_rank )
            gd->bucket_size[num_buckets+i] = gd->key_array[gd->test_index_array[i] % num_keys];

    /* Determine the number of keys in each bucket */
    for( i=0; i<num_keys; i++ )
        gd->bucket_size[gd->key_array[i] >> shift]++;

    /* Accumulated bucket sizes are the bucket pointers */
    gd->bucket_ptrs[0] = 0;
    for( i=1; i<num_buckets; i++ )
        gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];

    /* Sort into appropriate bucket */
    for( i=0; i<num_keys; i++ ) {
        key = gd->key_array[i];
        gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key;
    }
    /* Get the bucket size totals for the entire problem. These will be used to
       determine the redistribution of keys. */
    MPI_Allreduce(gd->bucket_size, gd->bucket_size_totals, num_buckets+TEST_ARRAY_SIZE,
                  MP_KEY_TYPE, MPI_SUM, MPI_COMM_WORLD);
    /* Determine the redistribution of keys: accumulate the bucket size totals until
       the sum surpasses num_keys (the average number of keys per processor); all keys
       in those buckets go to processor 0. Continue accumulating until the sum
       surpasses 2*num_keys; all keys in those buckets go to processor 1, and so on.
       This algorithm guarantees that every processor has work in the ranking step;
       none is left idle. Load balancing improves with more buckets (a more even
       distribution of keys becomes possible), but more buckets also means more
       computation per processor, so the optimum turned out to be 1024 buckets on the
       machines tested. Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the
       numbers of the first and last bucket each processor owns after the
       redistribution is done. */
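
    /* Worked example (an illustrative addition): with 4 processors, num_keys=256
       and bucket totals 100,120,90,110,..., the running total reaches 310 >= 1*256
       at bucket 2, so buckets 0-2 go to processor 0; accumulation then continues
       until the total first reaches 2*256 = 512, and those buckets go to
       processor 1, etc. */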
    bucket_sum_accumulator = 0;
    local_bucket_sum_accumulator = 0;
    gd->send_displ[0] = 0;
    gd->process_bucket_distrib_ptr1[0] = 0;
    for( i=0, j=0; i<num_buckets; i++ ) {
        bucket_sum_accumulator += gd->bucket_size_totals[i];
        local_bucket_sum_accumulator += gd->bucket_size[i];
        if( bucket_sum_accumulator >= (j+1)*num_keys ) {
            gd->send_count[j] = local_bucket_sum_accumulator;
            if( j != 0 ) {
                gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1];
                gd->process_bucket_distrib_ptr1[j] = gd->process_bucket_distrib_ptr2[j-1]+1;
            }
            gd->process_bucket_distrib_ptr2[j++] = i;
            local_bucket_sum_accumulator = 0;
        }
    }
    /* When nprocs approaches num_buckets, it is quite possible that the last few
       processors are not assigned any buckets, so set their counts properly to
       avoid trouble downstream. */
    while( j < gd->comm_size ) {
        gd->send_count[j] = 0;
        gd->process_bucket_distrib_ptr1[j] = 1;
        j++;
    }
    /* This is the redistribution section: first find out how many keys each
       processor will send to every other processor: */
    MPI_Alltoall( gd->send_count, 1, MPI_INT, gd->recv_count, 1, MPI_INT, MPI_COMM_WORLD );
    /* Determine the receive array displacements for the buckets */
    gd->recv_displ[0] = 0;
    for( i=1; i<gd->comm_size; i++ )
        gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];
    /* Now send the keys to respective processors */
    MPI_Alltoallv(gd->key_buff1, gd->send_count, gd->send_displ, MP_KEY_TYPE,
                  gd->key_buff2, gd->recv_count, gd->recv_displ, MP_KEY_TYPE, MPI_COMM_WORLD );
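
    /* Illustrative walk-through (an addition): on rank 0 with comm_size=2 and
       send_count={3,5}, send_displ is {0,3}; the MPI_Alltoall above tells every
       rank how many keys arrive from each peer, recv_displ is the prefix sum of
       those counts, and MPI_Alltoallv then lands each peer's keys as one
       contiguous block of key_buff2. */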
    /* The starting and ending bucket numbers on each processor are multiplied by
       the interval size of the buckets to obtain the smallest possible min and
       greatest possible max value of any key on each processor. */
    min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift;
    max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift) - 1;
    /* Clear the work array */
    for( i=0; i<max_key_val-min_key_val+1; i++ )
        gd->key_buff1[i] = 0;
    /* Determine the total number of keys on all other processors holding keys of lesser value */
    m = 0;
    for( k=0; k<gd->my_rank; k++ )
        for( i=gd->process_bucket_distrib_ptr1[k]; i<=gd->process_bucket_distrib_ptr2[k]; i++ )
            m += gd->bucket_size_totals[i];  /* m has total # of lesser keys */
    /* Determine total number of keys on this processor */
    j = 0;
    for( i=gd->process_bucket_distrib_ptr1[gd->my_rank]; i<=gd->process_bucket_distrib_ptr2[gd->my_rank]; i++ )
        j += gd->bucket_size_totals[i];  /* j has total # of local keys */
    /* Ranking of all keys occurs in this section: */
    /* shift it backwards so no subtractions are necessary in loop */
    key_buff_ptr = gd->key_buff1 - min_key_val;

    /* In this section, the keys themselves are used as their own indexes to
       determine how many of each there are: their individual population */
    for( i=0; i<j; i++ )
        key_buff_ptr[gd->key_buff2[i]]++;  /* Now they have individual key population */
    /* To obtain the rank of each key, successively add up the individual key
       populations, not forgetting the total of lesser keys, m.
       NOTE: Since the total of lesser keys would be subtracted later in
       verification, it is no longer added to the first key population here, but it
       is still needed during the partial verify test. This is to ensure that a
       32-bit key_buff can still be used for class D. */
    /* key_buff_ptr[min_key_val] += m; */
    for( i=min_key_val; i<max_key_val; i++ )
        key_buff_ptr[i+1] += key_buff_ptr[i];
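
    /* Worked example (an illustrative addition): with local keys {5,3,5,7} and
       min_key_val=3, the populations over 3..7 are {1,0,2,0,1}; the prefix sum
       turns them into {1,1,3,3,4}, so key_buff_ptr[k-1]+m counts every key
       smaller than k, e.g. key 5 gets rank key_buff_ptr[4]+m = 1+m. */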
    /* This is the partial verify test section.
       Observe that test_rank_array vals are shifted differently for different cases. */
    for( i=0; i<TEST_ARRAY_SIZE; i++ ){
        k = gd->bucket_size_totals[i+num_buckets];  /* Keys were hidden here */
        if( min_key_val <= k && k <= max_key_val ){
            /* Add the total of lesser keys, m, here */
            INT_TYPE2 key_rank = key_buff_ptr[k-1] + m;
            int failed = 0;

            switch( class ) {
                case 'S':
                    if( i <= 2 ) {
                        if( key_rank != gd->test_rank_array[i]+iteration ) failed = 1;
                        else gd->passed_verification++;
                    } else {
                        if( key_rank != gd->test_rank_array[i]-iteration ) failed = 1;
                        else gd->passed_verification++;
                    }
                    break;
                case 'W':
                    if( i < 2 ) {
                        if( key_rank != gd->test_rank_array[i]+(iteration-2) ) failed = 1;
                        else gd->passed_verification++;
                    } else {
                        if( key_rank != gd->test_rank_array[i]-iteration ) failed = 1;
                        else gd->passed_verification++;
                    }
                    break;
                case 'A':
                    if( i <= 2 ) {
                        if( key_rank != gd->test_rank_array[i]+(iteration-1) ) failed = 1;
                        else gd->passed_verification++;
                    } else {
                        if( key_rank != gd->test_rank_array[i]-(iteration-1) ) failed = 1;
                        else gd->passed_verification++;
                    }
                    break;
                case 'B':
                    if( i == 1 || i == 2 || i == 4 ) {
                        if( key_rank != gd->test_rank_array[i]+iteration ) failed = 1;
                        else gd->passed_verification++;
                    } else {
                        if( key_rank != gd->test_rank_array[i]-iteration ) failed = 1;
                        else gd->passed_verification++;
                    }
                    break;
                case 'C':
                    if( i <= 2 ) {
                        if( key_rank != gd->test_rank_array[i]+iteration ) failed = 1;
                        else gd->passed_verification++;
                    } else {
                        if( key_rank != gd->test_rank_array[i]-iteration ) failed = 1;
                        else gd->passed_verification++;
                    }
                    break;
                case 'D':
                    if( i < 2 ) {
                        if( key_rank != gd->test_rank_array[i]+iteration ) failed = 1;
                        else gd->passed_verification++;
                    } else {
                        if( key_rank != gd->test_rank_array[i]-iteration ) failed = 1;
                        else gd->passed_verification++;
                    }
                    break;
            }
            if( failed == 1 )
                printf( "Failed partial verification: iteration %d, processor %d, test key %d\n",
                        iteration, gd->my_rank, (int)i );
        }
    }
    /* Make copies of rank info for use by full_verify: these variables in rank are
       local; making them global slows down the code, probably since they then
       cannot be kept in registers by the compiler. */
    if( iteration == MAX_ITERATIONS ) {
        gd->key_buff_ptr_global = key_buff_ptr;
        gd->total_local_keys    = j;
        gd->total_lesser_keys   = 0;  /* no longer set to 'm', see note above */
    }
}
int main( int argc, char **argv )
{
    int i, iteration, itemp;
    double timecounter, maxtime;

    global_data* gd = malloc(sizeof(global_data));

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
    MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );
    get_info(argc, argv, &nprocs, &class);
    check_info(IS, nprocs, class);

    /* Initialize the verification arrays if a valid class */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        switch( class ) {
            case 'S':
                total_keys_log2 = 16;
                max_key_log_2 = 11;
                num_bucket_log_2 = 9;
                gd->test_index_array[i] = S_test_index_array[i];
                gd->test_rank_array[i]  = S_test_rank_array[i];
                break;
            case 'A':
                total_keys_log2 = 23;
                max_key_log_2 = 19;
                num_bucket_log_2 = 10;
                gd->test_index_array[i] = A_test_index_array[i];
                gd->test_rank_array[i]  = A_test_rank_array[i];
                break;
            case 'W':
                total_keys_log2 = 20;
                max_key_log_2 = 16;
                num_bucket_log_2 = 10;
                gd->test_index_array[i] = W_test_index_array[i];
                gd->test_rank_array[i]  = W_test_rank_array[i];
                break;
            case 'B':
                total_keys_log2 = 25;
                max_key_log_2 = 21;
                num_bucket_log_2 = 10;
                gd->test_index_array[i] = B_test_index_array[i];
                gd->test_rank_array[i]  = B_test_rank_array[i];
                break;
            case 'C':
                total_keys_log2 = 27;
                max_key_log_2 = 23;
                num_bucket_log_2 = 10;
                gd->test_index_array[i] = C_test_index_array[i];
                gd->test_rank_array[i]  = C_test_rank_array[i];
                break;
            case 'D':
                total_keys_log2 = 29;
                max_key_log_2 = 27;
                num_bucket_log_2 = 10;
                min_procs = 4;  /* class D needs at least 4 processes */
                gd->test_index_array[i] = D_test_index_array[i];
                gd->test_rank_array[i]  = D_test_rank_array[i];
                break;
        }
    total_keys  = (1 << total_keys_log2);
    max_key     = (1 << max_key_log_2);
    num_buckets = (1 << num_bucket_log_2);
    num_keys    = (total_keys/nprocs*min_procs);
    /* On larger numbers of processors, since the keys are (roughly) gaussian
       distributed, the first and last processor sort keys from a large interval,
       requiring larger array sizes. Note that for large nprocs, num_keys is
       nevertheless a small number. The required array size also depends on the
       bucket size used. The following values are validated for the 1024-bucket
       setup. */
    if (nprocs < 256)
        size_of_buffers = 3*num_keys/2;
    else if (nprocs < 512)
        size_of_buffers = 5*num_keys/2;
    else if (nprocs < 1024)
        size_of_buffers = 4*num_keys/2;
    else
        size_of_buffers = 13*num_keys/2;
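
    /* Sizing example (an illustrative addition, assuming class A with
       min_procs=1): on 64 processes, num_keys = 2^23/64 = 131072 and
       size_of_buffers = 3*131072/2 = 196608, i.e. 50% slack over the average
       per-process key count to absorb the heavier load of the gaussian tails. */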
    gd->key_array   = (INT_TYPE*)malloc(size_of_buffers*sizeof(INT_TYPE));
    gd->key_buff1   = (INT_TYPE*)malloc(size_of_buffers*sizeof(INT_TYPE));
    gd->key_buff2   = (INT_TYPE*)malloc(size_of_buffers*sizeof(INT_TYPE));
    gd->bucket_size        = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE)); /* Top 5 elements for  */
    gd->bucket_size_totals = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE)); /* partial verify vals */
    gd->bucket_ptrs = (INT_TYPE*)malloc(num_buckets*sizeof(INT_TYPE));
    gd->process_bucket_distrib_ptr1 = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE));
    gd->process_bucket_distrib_ptr2 = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE));
    /* send_count, recv_count, send_displ and recv_displ are fixed-size arrays
       (dimensioned for max_procs = 1024) declared at file scope above. */
    /* Printout initial NPB info */
    if( gd->my_rank == 0 ){
        printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
        printf( " Size: %ld (class %c)\n", (long)total_keys*min_procs, class );
        printf( " Iterations: %d\n", MAX_ITERATIONS );
        printf( " Number of processes: %d\n", gd->comm_size );
    }
    /* Check that actual and compiled number of processors agree */
    if( gd->comm_size != nprocs ) {
        if( gd->my_rank == 0 )
            printf( "\n ERROR: compiled for %d processes\n"
                    " Number of active processes: %d\n"
                    " Exiting program!\n\n", nprocs, gd->comm_size );
        MPI_Finalize();
        exit( 1 );
    }
    /* Check that the total number of processes is within bounds. This could in
       principle be checked in setparams.c, but it is more convenient to do it here. */
    if( gd->comm_size < min_procs || gd->comm_size > max_procs ) {
        if( gd->my_rank == 0 )
            printf( "\n ERROR: number of processes %d not within range %d-%d"
                    "\n Exiting program!\n\n", gd->comm_size, min_procs, max_procs );
        MPI_Finalize();
        exit( 1 );
    }
    /* Generate random number sequence and subsequent keys on all procs */
    create_seq(gd, find_my_seed( gd->my_rank, gd->comm_size,
                                 4*(long)total_keys*min_procs, /* 4 randlc calls per key */
                                 314159265.00,                 /* Random number gen seed */
                                 1220703125.00 ),              /* Random number gen mult */
               1220703125.00 );                                /* Random number gen mult */
    /* Do one iteration for free (i.e., untimed) to guarantee initialization of all
       data and code pages and respective tables */
    rank( gd, 1 );

    /* Start verification counter */
    gd->passed_verification = 0;

    if( gd->my_rank == 0 && class != 'S' ) printf( "\n iteration\n" );
    /* Initialize and start timer */
    timer_clear( 0 );
    timer_start( 0 );

    /* Tag this rank's events for SMPI tracing */
    char smpi_category[100];
    snprintf (smpi_category, 100, "%d", gd->my_rank);
    TRACE_smpi_set_category (smpi_category);
    /* This is the main iteration */
    for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ ) {
        if( gd->my_rank == 0 && class != 'S' ) printf( " %d\n", iteration );
        rank( gd, iteration );
    }

    TRACE_smpi_set_category (NULL);
    /* Stop timer, obtain time for processors */
    timer_stop( 0 );
    timecounter = timer_read( 0 );

    /* End of timing, obtain maximum time of all processors */
    MPI_Reduce( &timecounter, &maxtime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD );
    /* This tests that keys are in sequence: sorting of the last ranked key
       sequence occurs here, but is an untimed operation */
    full_verify( gd );
    /* Obtain verification counter sum */
    itemp = gd->passed_verification;
    MPI_Reduce( &itemp, &gd->passed_verification, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD );
    /* The final printout */
    if( gd->my_rank == 0 ) {
        if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
            gd->passed_verification = 0;
        c_print_results("IS", class, (int)(total_keys), min_procs, 0, MAX_ITERATIONS,
                        nprocs, gd->comm_size, maxtime,
                        ((double)(MAX_ITERATIONS)*total_keys*min_procs)/maxtime/1000000.,
                        "keys ranked", gd->passed_verification);
    }

    MPI_Finalize();
    free(gd);
    return 0;
}