/*************************************************************************
 *                                                                       *
 *        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3       *
 *                                                                       *
 *                                  I S                                  *
 *                                                                       *
 *************************************************************************
 *                                                                       *
 *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.     *
 *   It is described in NAS Technical Report 95-020.                     *
 *                                                                       *
 *   Permission to use, copy, distribute and modify this software        *
 *   for any purpose with or without fee is hereby granted.  We          *
 *   request, however, that all derived work reference the NAS           *
 *   Parallel Benchmarks 3.3. This software is provided "as is"          *
 *   without express or implied warranty.                                *
 *                                                                       *
 *   Information on NPB 3.3, including the technical report, the        *
 *   original specifications, source code, results and information       *
 *   on how to submit new results, is available at:                      *
 *                                                                       *
 *          http://www.nas.nasa.gov/Software/NPB                         *
 *                                                                       *
 *   Send comments or suggestions to  npb@nas.nasa.gov                   *
 *   Send bug reports to              npb-bugs@nas.nasa.gov              *
 *                                                                       *
 *         NAS Parallel Benchmarks Group                                 *
 *         NASA Ames Research Center                                     *
 *         Mail Stop: T27A-1                                             *
 *         Moffett Field, CA   94035-1000                                *
 *                                                                       *
 *         E-mail:  npb@nas.nasa.gov                                     *
 *         Fax:     (650) 604-3957                                       *
 *                                                                       *
 *         Author: M. Yarrow                                             *
 *                                                                       *
 *************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

#include "nas_common.h"
#include "simgrid/instr.h" /* for TRACE_smpi_set_category() */
/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF PROCESSORS.
 * THE LARGEST VERIFIED NUMBER IS 1024. INCREASE max_procs AT YOUR PERIL */
#define MAX_ITERATIONS 10
#define TEST_ARRAY_SIZE 5

/* Typedefs: if necessary, enlarge the key range by changing INT_TYPE to, say,
   long (and MP_KEY_TYPE to the matching MPI datatype) */
typedef int  INT_TYPE;
typedef long INT_TYPE2;
#define MP_KEY_TYPE MPI_INT

/* Problem parameters, set in main() from the benchmark class */
static char class;
static int  nprocs, total_keys_log2, max_key_log_2, num_bucket_log_2;
static int  min_procs = 1, max_procs = 1024; /* largest verified count, see note above */
static INT_TYPE total_keys, max_key, num_buckets, num_keys;
static long size_of_buffers;
/* All per-process benchmark state is gathered in one structure (passed around as gd) */
typedef struct global_data {
  int my_rank, comm_size;

  /* Some global info */
  INT_TYPE *key_buff_ptr_global;   /* used by full_verify to get  */
  INT_TYPE total_local_keys;       /* copies of rank info         */
  INT_TYPE total_lesser_keys;

  int passed_verification;

  /* These are the three main arrays. See size_of_buffers in main() */
  INT_TYPE *key_array, *key_buff1, *key_buff2,
           *bucket_size,           /* Top 5 elements for */
           *bucket_size_totals,    /* part. ver. vals    */
           *bucket_ptrs, *process_bucket_distrib_ptr1, *process_bucket_distrib_ptr2;
  int send_count[1024], recv_count[1024], send_displ[1024], recv_displ[1024]; /* sized for max_procs */

  /* Partial verif info */
  INT_TYPE2 test_index_array[TEST_ARRAY_SIZE],
            test_rank_array[TEST_ARRAY_SIZE];
} global_data;
/* Expected index/rank values for the partial verification test, per class */
static INT_TYPE2
  S_test_index_array[TEST_ARRAY_SIZE] = {48427,17148,23627,62548,4431},
  S_test_rank_array[TEST_ARRAY_SIZE]  = {0,18,346,64917,65463},

  W_test_index_array[TEST_ARRAY_SIZE] = {357773,934767,875723,898999,404505},
  W_test_rank_array[TEST_ARRAY_SIZE]  = {1249,11698,1039987,1043896,1048018},

  A_test_index_array[TEST_ARRAY_SIZE] = {2112377,662041,5336171,3642833,4250760},
  A_test_rank_array[TEST_ARRAY_SIZE]  = {104,17523,123928,8288932,8388264},

  B_test_index_array[TEST_ARRAY_SIZE] = {41869,812306,5102857,18232239,26860214},
  B_test_rank_array[TEST_ARRAY_SIZE]  = {33422937,10244,59149,33135281,99},

  C_test_index_array[TEST_ARRAY_SIZE] = {44172927,72999161,74326391,129606274,21736814},
  C_test_rank_array[TEST_ARRAY_SIZE]  = {61147,882988,266290,133997595,133525895},

  D_test_index_array[TEST_ARRAY_SIZE] = {1317351170,995930646,1157283250,1503301535,1453734525},
  D_test_rank_array[TEST_ARRAY_SIZE]  = {1,36538729,1978098519,2145192618,2147425337};
void full_verify( global_data* gd );
/************ returns parallel random number seq seed ************/
/*
 * Create a random number sequence of total length nn residing on np processors.
 * Each processor therefore owns a subsequence of length nn/np. This routine
 * returns the first random number of the subsequence belonging to processor
 * rank kn, which is then used to seed processor kn's generator.
 */
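/* Example: with np = 4 processes and nn = 2^22 numbers, each process owns
 * nq = 2^20 of them, so process kn must start from the generator state
 * reached after kn*2^20 steps. The binary method below reaches that state
 * in O(log nn) randlc() calls instead of kn*2^20 sequential ones. */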
static double find_my_seed( int kn,    /* my processor rank, 0<=kn<=num procs */
                            int np,    /* np = num procs                      */
                            long nn,   /* total num of ran numbers, all procs */
                            double s,  /* Ran num seed, for ex.: 314159265.00 */
                            double a ) /* Ran num gen mult, try 1220703125.00 */
{
  double t1, t2, t3, an;
  long mq, nq, kk, ik, i;

  /* mq = number of halvings bringing the per-processor count nq down to 1 */
  nq = nn / np;
  for( mq=0; nq>1; mq++, nq/=2 )
    ;

  /* square the multiplier mq times: a jump of one whole subsequence */
  t1 = a;
  for( i=1; i<=mq; i++ )
    t2 = randlc( &t1, &t1 );
  an = t1;

  /* advance the seed s by kn subsequences via binary exponentiation */
  kk = kn;  t1 = s;  t2 = an;
  for( i=1; i<=100; i++ ){
    ik = kk / 2;
    if( 2*ik != kk )
      t3 = randlc( &t1, &t2 );
    if( ik == 0 )
      break;
    t3 = randlc( &t2, &t2 );
    kk = ik;
  }
  an = t3; // added to silence paranoid compilers

  return t1;
}
static void create_seq( global_data* gd, double seed, double a )
{
  double x;
  INT_TYPE i, k = max_key/4;

  for (i=0; i<num_keys; i++){
    /* sum of four uniform deviates, scaled by max_key/4 */
    x = randlc(&seed, &a);
    x += randlc(&seed, &a);
    x += randlc(&seed, &a);
    x += randlc(&seed, &a);
    gd->key_array[i] = k*x;
  }
}
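/* Each key above is the sum of four uniform random fractions scaled by
 * max_key/4, so keys follow a bell-shaped, roughly Gaussian distribution
 * centered on max_key/2; this is why the buffer sizing in main() gives the
 * processes owning the extreme key intervals extra headroom. */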
void full_verify( global_data* gd )
{
  MPI_Status  status;
  MPI_Request request;

  INT_TYPE i, j;
  INT_TYPE k, last_local_key;

  /* Now, finally, sort the keys: */
  for( i=0; i<gd->total_local_keys; i++ )
    gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]] - gd->total_lesser_keys] = gd->key_buff2[i];
  last_local_key = (gd->total_local_keys<1)? 0 : (gd->total_local_keys-1);

  /* Send largest key value to next processor */
  if( gd->my_rank > 0 )
    MPI_Irecv( &k, 1, MP_KEY_TYPE, gd->my_rank-1, 1000, MPI_COMM_WORLD, &request );
  if( gd->my_rank < gd->comm_size-1 )
    MPI_Send( &gd->key_array[last_local_key], 1, MP_KEY_TYPE, gd->my_rank+1, 1000, MPI_COMM_WORLD );
  if( gd->my_rank > 0 )
    MPI_Wait( &request, &status );

  /* Confirm that neighbor's greatest key value is not greater than my least key value */
  j = 0;
  if( gd->my_rank > 0 && gd->total_local_keys > 0 )
    if( k > gd->key_array[0] )
      j++;

  /* Confirm keys correctly sorted: count incorrectly sorted keys, if any */
  for( i=1; i<gd->total_local_keys; i++ )
    if( gd->key_array[i-1] > gd->key_array[i] )
      j++;

  if( j != 0 )
    printf( "Processor %d: Full_verify: number of keys out of sort: %d\n", gd->my_rank, (int)j );
  else
    gd->passed_verification++;
}
static void rank( global_data* gd, int iteration )
{
  INT_TYPE i, k, key;
  INT_TYPE shift = max_key_log_2 - num_bucket_log_2;
  INT_TYPE2 bucket_sum_accumulator, j, m;
  INT_TYPE local_bucket_sum_accumulator;
  INT_TYPE min_key_val, max_key_val;
  INT_TYPE *key_buff_ptr;
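  /* Example: for class C, max_key_log_2 = 23 and num_bucket_log_2 = 10, so
     shift = 13: key k falls into bucket (k >> 13), i.e. each of the 1024
     buckets covers 2^13 consecutive key values. */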
  /* Iteration alteration of keys */
  if(gd->my_rank == 0){
    gd->key_array[iteration] = iteration;
    gd->key_array[iteration+MAX_ITERATIONS] = max_key - iteration;
  }

  /* Initialize */
  for( i=0; i<num_buckets+TEST_ARRAY_SIZE; i++ ){
    gd->bucket_size[i] = 0;
    gd->bucket_size_totals[i] = 0;
    gd->process_bucket_distrib_ptr1[i] = 0;
    gd->process_bucket_distrib_ptr2[i] = 0;
  }
  /* Determine where the partial verify test keys are, load into top of array bucket_size */
  for( i=0; i<TEST_ARRAY_SIZE; i++ )
    if( (gd->test_index_array[i]/num_keys) == gd->my_rank )
      gd->bucket_size[num_buckets+i] = gd->key_array[gd->test_index_array[i] % num_keys];

  /* Determine the number of keys in each bucket */
  for( i=0; i<num_keys; i++ )
    gd->bucket_size[gd->key_array[i] >> shift]++;
  /* Accumulative bucket sizes are the bucket pointers */
  gd->bucket_ptrs[0] = 0;
  for( i=1; i<num_buckets; i++ )
    gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];
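  /* Example: bucket_size = {3,1,4,...} yields bucket_ptrs = {0,3,4,8,...}
     (an exclusive prefix sum), so bucket i's keys will occupy
     key_buff1[bucket_ptrs[i] .. bucket_ptrs[i]+bucket_size[i]-1]. */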
  /* Sort into appropriate bucket */
  for( i=0; i<num_keys; i++ ) {
    key = gd->key_array[i];
    gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key;
  }
  /* Get the bucket size totals for the entire problem. These will be used to determine the redistribution of keys */
  MPI_Allreduce(gd->bucket_size, gd->bucket_size_totals, num_buckets+TEST_ARRAY_SIZE, MP_KEY_TYPE, MPI_SUM,
                MPI_COMM_WORLD );
  /* Determine redistribution of keys: accumulate the bucket size totals until
     the sum surpasses num_keys (the average number of keys per processor);
     all keys in these buckets go to processor 0. Continue accumulating until
     the sum surpasses 2*num_keys: those buckets go to processor 1, and so on.
     This algorithm guarantees that every processor gets ranking work; none is
     left idle. Fewer buckets would balance the load less evenly, while more
     buckets mean more computation per processor; 1024 buckets turned out to
     be the best trade-off on the machines tested.
     Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the numbers of
     the first and last bucket each processor owns after the redistribution. */
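  /* Worked example: 4 processes, 8 buckets, num_keys = 100 (400 keys total).
     If bucket_size_totals = {70,50,30,60,40,50,40,60}, the running sum crosses
     100 at bucket 1, 200 at bucket 3, 300 at bucket 5 and 400 at bucket 7, so
     process 0 receives buckets 0-1, process 1 buckets 2-3, process 2 buckets
     4-5 and process 3 buckets 6-7. */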
  bucket_sum_accumulator = 0;
  local_bucket_sum_accumulator = 0;
  gd->send_displ[0] = 0;
  gd->process_bucket_distrib_ptr1[0] = 0;
  for( i=0, j=0; i<num_buckets; i++ ) {
    bucket_sum_accumulator += gd->bucket_size_totals[i];
    local_bucket_sum_accumulator += gd->bucket_size[i];
    if( bucket_sum_accumulator >= (j+1)*num_keys ) {
      gd->send_count[j] = local_bucket_sum_accumulator;
      if( j != 0 ) {
        gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1];
        gd->process_bucket_distrib_ptr1[j] = gd->process_bucket_distrib_ptr2[j-1]+1;
      }
      gd->process_bucket_distrib_ptr2[j++] = i;
      local_bucket_sum_accumulator = 0;
    }
  }
  /* When nprocs approaches num_buckets, it is quite possible that the last few
     processors don't get any buckets, so set their counts explicitly to avoid
     any fallout. */
  while( j < gd->comm_size ) {
    gd->send_count[j] = 0;
    gd->process_bucket_distrib_ptr1[j] = 1;
    gd->process_bucket_distrib_ptr2[j++] = 0;
  }
  /* This is the redistribution section: first find out how many keys
     each processor will send to every other processor: */
  MPI_Alltoall( gd->send_count, 1, MPI_INT, gd->recv_count, 1, MPI_INT, MPI_COMM_WORLD );

  /* Determine the receive array displacements for the buckets */
  gd->recv_displ[0] = 0;
  for( i=1; i<gd->comm_size; i++ )
    gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];

  /* Now send the keys to respective processors */
  MPI_Alltoallv(gd->key_buff1, gd->send_count, gd->send_displ, MP_KEY_TYPE, gd->key_buff2, gd->recv_count,
                gd->recv_displ, MP_KEY_TYPE, MPI_COMM_WORLD );
  /* The starting and ending bucket numbers on each processor are multiplied by the interval size of the buckets to
     obtain the smallest possible min and greatest possible max value of any key on each processor */
  min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift;
  max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1;
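  /* Example: a process assigned buckets 5..8 with shift = 13 gets
     min_key_val = 5<<13 and max_key_val = (9<<13)-1, so the work array
     cleared below needs max_key_val-min_key_val+1 = 4*2^13 entries. */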
  /* Clear the work array */
  for( i=0; i<max_key_val-min_key_val+1; i++ )
    gd->key_buff1[i] = 0;

  /* Determine the total number of keys on all other processors holding keys of lesser value */
  m = 0;
  for( k=0; k<gd->my_rank; k++ )
    for( i= gd->process_bucket_distrib_ptr1[k]; i<=gd->process_bucket_distrib_ptr2[k]; i++ )
      m += gd->bucket_size_totals[i]; /* m has total # of lesser keys */
  /* Determine total number of keys on this processor */
  j = 0;
  for( i= gd->process_bucket_distrib_ptr1[gd->my_rank]; i<=gd->process_bucket_distrib_ptr2[gd->my_rank]; i++ )
    j += gd->bucket_size_totals[i]; /* j has total # of local keys */
  /* Ranking of all keys occurs in this section: */
  /* shift it backwards so no subtractions are necessary in loop */
  key_buff_ptr = gd->key_buff1 - min_key_val;

  /* In this section, the keys themselves are used as their own indexes to determine how many of each there are:
     their individual population */
  for( i=0; i<j; i++ )
    key_buff_ptr[gd->key_buff2[i]]++; /* Now they have individual key population */
  /* To obtain ranks of each key, successively add the individual key population, not forgetting the total of
     lesser keys, m.
     NOTE: Since the total of lesser keys is subtracted later in verification, it is no longer added to the first
     key population here, but it is still needed during the partial verify test. This ensures that a 32-bit
     key_buff can still be used for class D. */
  /* key_buff_ptr[min_key_val] += m; */
  for( i=min_key_val; i<max_key_val; i++ )
    key_buff_ptr[i+1] += key_buff_ptr[i];
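  /* Example: if the local counts for keys min_key_val..min_key_val+2 were
     {2,0,3}, the running sum turns them into {2,2,5}: key_buff_ptr[v] is now
     the number of local keys <= v, and key_buff_ptr[k-1] + m (used below) is
     the global count of keys smaller than k. */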
  /* This is the partial verify test section */
  /* Observe that test_rank_array vals are shifted differently for different cases */
  for( i=0; i<TEST_ARRAY_SIZE; i++ ){
    k = gd->bucket_size_totals[i+num_buckets]; /* Keys were hidden here */
    if( min_key_val <= k && k <= max_key_val ){
      /* Add the total of lesser keys, m, here */
      INT_TYPE2 key_rank = key_buff_ptr[k-1] + m;
      INT_TYPE2 offset;

      /* Each class shifts the reference ranks by an iteration-dependent
         offset whose sign depends on which test key it is */
      switch( class ){
        case 'S': offset = (i <= 2) ? iteration   : -iteration;     break;
        case 'W': offset = (i <  2) ? iteration-2 : -iteration;     break;
        case 'A': offset = (i <= 2) ? iteration-1 : -(iteration-1); break;
        case 'B': offset = (i == 1 || i == 2 || i == 4) ? iteration : -iteration; break;
        case 'C': offset = (i <= 2) ? iteration   : -iteration;     break;
        case 'D': offset = (i <  2) ? iteration   : -iteration;     break;
        default:  offset = 0; break; /* unreachable: class is validated in main() */
      }

      if( key_rank != gd->test_rank_array[i]+offset )
        printf( "Failed partial verification: iteration %d, processor %d, test key %d\n",
                iteration, gd->my_rank, (int)i );
      else
        gd->passed_verification++;
    }
  }
  /* Make copies of rank info for use by full_verify: these variables in rank are local; making them global slows
     down the code, probably since they cannot be kept in registers by the compiler */
  if( iteration == MAX_ITERATIONS ) {
    gd->key_buff_ptr_global = key_buff_ptr;
    gd->total_local_keys = j;
    gd->total_lesser_keys = 0; /* no longer set to 'm', see note above */
  }
}
int main( int argc, char **argv )
{
  int i, iteration, itemp;
  double timecounter, maxtime;

  global_data* gd = malloc(sizeof(global_data));

  MPI_Init( &argc, &argv );
  MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
  MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );

  get_info(argc, argv, &nprocs, &class);
  check_info(IS, nprocs, class);
  /* Initialize the verification arrays if a valid class */
  for( i=0; i<TEST_ARRAY_SIZE; i++ )
    switch( class ){
      case 'S':
        total_keys_log2 = 16;  max_key_log_2 = 11;  num_bucket_log_2 = 9;   min_procs = 1;
        gd->test_index_array[i] = S_test_index_array[i];
        gd->test_rank_array[i]  = S_test_rank_array[i];
        break;
      case 'A':
        total_keys_log2 = 23;  max_key_log_2 = 19;  num_bucket_log_2 = 10;  min_procs = 1;
        gd->test_index_array[i] = A_test_index_array[i];
        gd->test_rank_array[i]  = A_test_rank_array[i];
        break;
      case 'W':
        total_keys_log2 = 20;  max_key_log_2 = 16;  num_bucket_log_2 = 10;  min_procs = 1;
        gd->test_index_array[i] = W_test_index_array[i];
        gd->test_rank_array[i]  = W_test_rank_array[i];
        break;
      case 'B':
        total_keys_log2 = 25;  max_key_log_2 = 21;  num_bucket_log_2 = 10;  min_procs = 1;
        gd->test_index_array[i] = B_test_index_array[i];
        gd->test_rank_array[i]  = B_test_rank_array[i];
        break;
      case 'C':
        total_keys_log2 = 27;  max_key_log_2 = 23;  num_bucket_log_2 = 10;  min_procs = 1;
        gd->test_index_array[i] = C_test_index_array[i];
        gd->test_rank_array[i]  = C_test_rank_array[i];
        break;
      case 'D':
        total_keys_log2 = 29;  max_key_log_2 = 27;  num_bucket_log_2 = 10;
        min_procs = 4; /* class D requires at least 4 processes */
        gd->test_index_array[i] = D_test_index_array[i];
        gd->test_rank_array[i]  = D_test_rank_array[i];
        break;
    }
  total_keys = (1 << total_keys_log2);
  max_key = (1 << max_key_log_2);
  num_buckets = (1 << num_bucket_log_2);
  num_keys = (total_keys/nprocs*min_procs);
  /* On larger numbers of processors, since the keys are (roughly) Gaussian distributed, the first and last
     processors sort keys spanning a large interval, requiring larger array sizes. Note that for large nprocs,
     num_keys itself is small. The required array size also depends on the bucket count; the following values
     are validated for the 1024-bucket setup. */
  if (nprocs < 256)
    size_of_buffers = 3*num_keys/2;
  else if (nprocs < 512)
    size_of_buffers = 5*num_keys/2;
  else if (nprocs < 1024)
    size_of_buffers = 4*num_keys/2;
  else
    size_of_buffers = 13*num_keys/2;
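  /* Example: class B on 64 processes gives num_keys = 2^25/64 = 524288, so
     each of the three main arrays below holds 3*num_keys/2 = 786432 entries. */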
  gd->key_array   = (INT_TYPE*)malloc(size_of_buffers*sizeof(INT_TYPE));
  gd->key_buff1   = (INT_TYPE*)malloc(size_of_buffers*sizeof(INT_TYPE));
  gd->key_buff2   = (INT_TYPE*)malloc(size_of_buffers*sizeof(INT_TYPE));
  gd->bucket_size = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE));        /* Top 5 elements for */
  gd->bucket_size_totals = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE)); /* part. ver. vals    */
  gd->bucket_ptrs = (INT_TYPE*)malloc(num_buckets*sizeof(INT_TYPE));
  gd->process_bucket_distrib_ptr1 = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE));
  gd->process_bucket_distrib_ptr2 = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE));
  // int send_count[max_procs], recv_count[max_procs],
  //     send_displ[max_procs], recv_displ[max_procs];
  /* Printout initial NPB info */
  if( gd->my_rank == 0 ){
    printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
    printf( " Size:  %ld  (class %c)\n", (long)total_keys*min_procs, class);
    printf( " Iterations:   %d\n", MAX_ITERATIONS );
    printf( " Number of processes:  %d\n", gd->comm_size );
  }
  /* Check that actual and compiled number of processors agree */
  if( gd->comm_size != nprocs) {
    if( gd->my_rank == 0 )
      printf( "\n ERROR: compiled for %d processes\n"
              " Number of active processes: %d\n"
              " Exiting program!\n\n", nprocs, gd->comm_size );
    MPI_Finalize();
    exit( 1 );
  }

  /* Check that the total number of processes is within bounds.
     This could in principle be checked in setparams.c, but it is more convenient to do it here */
  if( gd->comm_size < min_procs || gd->comm_size > max_procs){
    if( gd->my_rank == 0 )
      printf( "\n ERROR: number of processes %d not within range %d-%d"
              "\n Exiting program!\n\n", gd->comm_size, min_procs, max_procs);
    MPI_Finalize();
    exit( 1 );
  }
  /* Generate random number sequence and subsequent keys on all procs */
  create_seq(gd, find_my_seed( gd->my_rank, gd->comm_size, 4*(long)total_keys*min_procs,
                               314159265.00,      /* Random number gen seed */
                               1220703125.00 ),   /* Random number gen mult */
             1220703125.00 );                     /* Random number gen mult */
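  /* The sequence length passed to find_my_seed() is 4*total_keys*min_procs
     because create_seq() consumes four random numbers per generated key. */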
  /* Do one iteration for free (i.e., untimed) to guarantee initialization of
     all data and code pages and respective tables */
  rank( gd, 1 );

  /* Start verification counter */
  gd->passed_verification = 0;
  if( gd->my_rank == 0 && class != 'S' ) printf( "\n iteration\n" );

  /* Initialize timer */
  timer_clear( 0 );

  /* Start timer */
  timer_start( 0 );

  /* Tag this process's SMPI trace events with its rank as category */
  char smpi_category[100];
  snprintf (smpi_category, 100, "%d", gd->my_rank);
  TRACE_smpi_set_category (smpi_category);
  /* This is the main iteration */
  for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ ) {
    if( gd->my_rank == 0 && class != 'S' ) printf( " %d\n", iteration );
    rank(gd, iteration );
  }
  TRACE_smpi_set_category (NULL);
  /* Stop timer, obtain time for processors */
  timer_stop( 0 );
  timecounter = timer_read(0);

  /* End of timing, obtain maximum time of all processors */
  MPI_Reduce( &timecounter, &maxtime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD );
  /* This tests that keys are in sequence: sorting of last ranked key seq occurs here, but is an untimed operation */
  full_verify( gd );

  /* Obtain verification counter sum */
  itemp = gd->passed_verification;
  MPI_Reduce( &itemp, &gd->passed_verification, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD );
  /* The final printout */
  if( gd->my_rank == 0 ) {
    if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
      gd->passed_verification = 0;
    c_print_results("IS", class, (int)(total_keys), min_procs, 0, MAX_ITERATIONS, nprocs, gd->comm_size, maxtime,
                    ((double) (MAX_ITERATIONS)*total_keys*min_procs)/maxtime/1000000., "keys ranked",
                    gd->passed_verification);