1 /* selector for collective algorithms based on openmpi's default coll_tuned_decision_fixed selector */
3 /* Copyright (c) 2009, 2010. The SimGrid Team.
4 * All rights reserved. */
6 /* This program is free software; you can redistribute it and/or modify it
7 * under the terms of the license (GNU LGPL) which comes with this package. */
9 #include "colls_private.h"
/* Select an allreduce algorithm from the total message size (dsize*count)
 * and the commutativity of `op`, following Open MPI's fixed decision rules.
 * NOTE(review): this copy is truncated — several call argument lists and the
 * closing braces are missing; verify against the upstream file. */
12 int smpi_coll_tuned_allreduce_ompi(void *sbuf, void *rbuf, int count,
13 MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
15 size_t dsize, block_dsize;
16 int comm_size = smpi_comm_size(comm);
17 const size_t intermediate_message = 10000; /* bytes: below this use recursive doubling */
20 * Decision function based on MX results from the Grig cluster at UTK.
22 * Currently, linear, recursive doubling, and nonoverlapping algorithms
23 * can handle both commutative and non-commutative operations.
24 * Ring algorithm does not support non-commutative operations.
26 dsize = smpi_datatype_size(dtype);
27 block_dsize = dsize * count;
/* Small messages: recursive doubling works for any operation. */
29 if (block_dsize < intermediate_message) {
30 return (smpi_coll_tuned_allreduce_rdb (sbuf, rbuf,
/* Ring-style algorithms require a commutative op and count > comm_size. */
35 if( smpi_op_is_commute(op) && (count > comm_size) ) {
36 const size_t segment_size = 1 << 20; /* 1 MB */
37 if ((comm_size * segment_size >= block_dsize)) {
38 //FIXME: ok, these are not the right algorithms, try to find closer ones
39 // lr is a good match for allreduce_ring (difference is mainly the use of sendrecv)
40 return smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype,
43 // return (smpi_coll_tuned_allreduce_intra_ring_segmented (sbuf, rbuf,
44 return (smpi_coll_tuned_allreduce_ompi_ring_segmented (sbuf, rbuf,
/* Fallback (non-commutative op or small count): reduce followed by bcast. */
51 return (smpi_coll_tuned_allreduce_redbcast(sbuf, rbuf, count,
/* Select an alltoall algorithm from the per-block send size and the
 * communicator size.
 * NOTE(review): argument lists are truncated in this copy — the trailing
 * rbuf/rcount/rdtype/comm arguments of each call are missing; verify. */
57 int smpi_coll_tuned_alltoall_ompi( void *sbuf, int scount,
59 void* rbuf, int rcount,
63 int communicator_size;
64 size_t dsize, block_dsize;
65 communicator_size = smpi_comm_size(comm);
67 /* Decision function based on measurement on Grig cluster at
68 the University of Tennessee (2GB MX) up to 64 nodes.
69 Has better performance for messages of intermediate sizes than the old one */
70 /* determine block size */
71 dsize = smpi_datatype_size(sdtype);
72 block_dsize = dsize * scount;
/* Tiny blocks (<200 B) on larger communicators: Bruck's algorithm. */
74 if ((block_dsize < 200) && (communicator_size > 12)) {
75 return smpi_coll_tuned_alltoall_bruck(sbuf, scount, sdtype,
/* Intermediate blocks (<3000 B): simple isend/irecv exchange. */
79 } else if (block_dsize < 3000) {
80 return smpi_coll_tuned_alltoall_simple(sbuf, scount, sdtype,
/* Large blocks: pairwise exchange. */
85 return smpi_coll_tuned_alltoall_pair (sbuf, scount, sdtype,
/* alltoallv selector: no tuned decision yet — unconditionally delegates to
 * the Bruck implementation.
 * NOTE(review): the sdtype parameter line and the final call arguments are
 * missing in this truncated copy. */
90 int smpi_coll_tuned_alltoallv_ompi(void *sbuf, int *scounts, int *sdisps,
92 void *rbuf, int *rcounts, int *rdisps,
97 /* For starters, just keep the original algorithm. */
98 return smpi_coll_tuned_alltoallv_bruck(sbuf, scounts, sdisps, sdtype,
99 rbuf, rcounts, rdisps,rdtype,
/* Select a barrier algorithm: dedicated two-process variant, recursive
 * doubling when the communicator size is a power of two, Bruck otherwise.
 * NOTE(review): `module` is used but not declared in the visible lines, and
 * the loop body is missing lines (e.g. where `has_one` is updated) — this
 * copy appears truncated; confirm against the upstream file. */
104 void smpi_coll_tuned_barrier_ompi(MPI_Comm comm)
105 { int communicator_size = smpi_comm_size(comm);
107 if( 2 == communicator_size )
108 return smpi_coll_tuned_barrier_intra_two_procs(comm, module);
109 * Basic optimisation. If we have a power of 2 number of nodes
110 * the use the recursive doubling algorithm, otherwise
111 * bruck is the one we want.
/* Scan the bits of communicator_size; a second set bit means it is not a
 * power of two, so fall back to Bruck. */
113 bool has_one = false;
114 for( ; communicator_size > 0; communicator_size >>= 1 ) {
115 if( communicator_size & 0x1 ) {
117 return smpi_coll_tuned_barrier_intra_bruck(comm, module);
122 return smpi_coll_tuned_barrier_intra_recursivedoubling(comm, module);
/* Select a broadcast algorithm from message size and communicator size.
 * The a_pNN/b_pNN pairs are linear-fit coefficients (communicator_size as a
 * function of message_size) separating the pipeline segment-size regimes.
 * NOTE(review): this copy is truncated (argument lists cut, braces missing),
 * and everything after the flattree_pipeline return is unreachable as
 * written — in the upstream file that "gige" decision block is disabled;
 * confirm before relying on it. */
125 int smpi_coll_tuned_bcast_ompi(void *buff, int count,
126 MPI_Datatype datatype, int root,
130 /* Decision function based on MX results for
131 messages up to 36MB and communicator sizes up to 64 nodes */
132 const size_t small_message_size = 2048;
133 const size_t intermediate_message_size = 370728;
134 const double a_p16 = 3.2118e-6; /* [1 / byte] */
135 const double b_p16 = 8.7936;
136 const double a_p64 = 2.3679e-6; /* [1 / byte] */
137 const double b_p64 = 1.1787;
138 const double a_p128 = 1.6134e-6; /* [1 / byte] */
139 const double b_p128 = 2.1102;
141 int communicator_size;
143 size_t message_size, dsize;
145 communicator_size = smpi_comm_size(comm);
147 /* else we need data size for decision function */
148 dsize = smpi_datatype_size(datatype);
149 message_size = dsize * (unsigned long)count; /* needed for decision */
151 /* Handle messages of small and intermediate size, and
152 single-element broadcasts */
153 if ((message_size < small_message_size) || (count <= 1)) {
154 /* Binomial without segmentation */
155 return smpi_coll_tuned_bcast_binomial_tree (buff, count, datatype,
158 } else if (message_size < intermediate_message_size) {
159 // SplittedBinary with 1KB segments
160 return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype,
164 //Handle large message sizes
165 else if (communicator_size < (a_p128 * message_size + b_p128)) {
166 //Pipeline with 128KB segments
167 //segsize = 1024 << 7;
168 return smpi_coll_tuned_bcast_ompi_pipeline (buff, count, datatype,
172 } else if (communicator_size < 13) {
173 // Split Binary with 8KB segments
174 return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype,
177 } else if (communicator_size < (a_p64 * message_size + b_p64)) {
178 // Pipeline with 64KB segments
179 //segsize = 1024 << 6;
180 return smpi_coll_tuned_bcast_ompi_pipeline (buff, count, datatype,
184 } else if (communicator_size < (a_p16 * message_size + b_p16)) {
185 //Pipeline with 16KB segments
186 //segsize = 1024 << 4;
187 return smpi_coll_tuned_bcast_ompi_pipeline (buff, count, datatype,
/* Final fallback of the MX-based decision: flat-tree pipeline. */
192 /* Pipeline with 8KB segments */
193 //segsize = 1024 << 3;
194 return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype,
/* ---- Alternative decision table below; dead code after the return above.
 * NOTE(review): uses `segsize`/`module`, neither declared in the visible
 * lines — presumably from the disabled upstream variant; verify. ---- */
198 /* this is based on gige measurements */
200 if (communicator_size < 4) {
201 return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
203 if (communicator_size == 4) {
204 if (message_size < 524288) segsize = 0;
205 else segsize = 16384;
206 return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
208 if (communicator_size <= 8 && message_size < 4096) {
209 return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
211 if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
213 return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
215 if (message_size >= 524288) {
217 return smpi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, module, segsize);
220 /* once tested can swap this back in */
221 /* return smpi_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
222 return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
/* Select a reduce algorithm. Non-commutative ops are restricted to linear
 * or in-order binary tree; commutative ops choose among linear, binomial,
 * binary and pipeline variants via the a1..a3/b1..b3 linear-fit thresholds
 * (communicator_size vs. message_size).
 * NOTE(review): truncated copy — braces and some segsize assignments are
 * missing, and the tail after line 300 references `fanout`, `segsize`,
 * `max_requests` and `module`, none declared in the visible lines;
 * presumably a disabled alternative decision block upstream — verify. */
226 int smpi_coll_tuned_reduce_ompi( void *sendbuf, void *recvbuf,
227 int count, MPI_Datatype datatype,
232 int communicator_size=0;
234 size_t message_size, dsize;
235 const double a1 = 0.6016 / 1024.0; /* [1/B] */
236 const double b1 = 1.3496;
237 const double a2 = 0.0410 / 1024.0; /* [1/B] */
238 const double b2 = 9.7128;
239 const double a3 = 0.0422 / 1024.0; /* [1/B] */
240 const double b3 = 1.1614;
241 //const double a4 = 0.0033 / 1024.0; /* [1/B] */
242 //const double b4 = 1.6761;
244 //const int max_requests = 0; /* no limit on # of outstanding requests */
246 communicator_size = smpi_comm_size(comm);
248 /* need data size for decision function */
249 dsize=smpi_datatype_size(datatype);
250 message_size = dsize * count; /* needed for decision */
253 * If the operation is non commutative we currently have choice of linear
254 * or in-order binary tree algorithm.
256 if( !smpi_op_is_commute(op) ) {
257 if ((communicator_size < 12) && (message_size < 2048)) {
258 return smpi_coll_tuned_reduce_ompi_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm/*, module*/);
260 return smpi_coll_tuned_reduce_ompi_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
/* Commutative operation: size-based selection below. */
264 if ((communicator_size < 8) && (message_size < 512)){
266 return smpi_coll_tuned_reduce_ompi_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm);
267 } else if (((communicator_size < 8) && (message_size < 20480)) ||
268 (message_size < 2048) || (count <= 1)) {
271 return smpi_coll_tuned_reduce_ompi_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
272 segsize, max_requests*/);
273 } else if (communicator_size > (a1 * message_size + b1)) {
276 return smpi_coll_tuned_reduce_ompi_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
277 segsize, max_requests*/);
278 } else if (communicator_size > (a2 * message_size + b2)) {
281 return smpi_coll_tuned_reduce_ompi_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
282 segsize, max_requests*/);
283 } else if (communicator_size > (a3 * message_size + b3)) {
286 return smpi_coll_tuned_reduce_ompi_binary( sendbuf, recvbuf, count, datatype, op, root,
287 comm/*, module, segsize, max_requests*/);
289 /*if (communicator_size > (a4 * message_size + b4)) {
296 return smpi_coll_tuned_reduce_ompi_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
297 segsize, max_requests*/);
300 /* for small messages use linear algorithm */
301 if (message_size <= 4096) {
303 fanout = communicator_size - 1;
304 /* when linear implemented or taken from basic put here, right now using chain as a linear system */
305 /* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
306 return smpi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
307 /* return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
309 if (message_size < 524288) {
310 if (message_size <= 65536 ) {
315 fanout = communicator_size/2;
317 /* later swap this for a binary tree */
319 return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, module,
320 segsize, fanout, max_requests);
323 return smpi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
324 segsize, max_requests);
/* reduce_scatter selector — note the opening token is "/*int": this whole
 * function appears to be commented out (it also calls ompi_/OPAL_ symbols
 * instead of the smpi_ wrappers used elsewhere in this file).
 * NOTE(review): confirm the closing comment delimiter exists in the full
 * file; //-style notes are used below to avoid nesting block comments. */
328 /*int smpi_coll_tuned_reduce_scatter_ompi( void *sbuf, void *rbuf,
335 int comm_size, i, pow2;
336 size_t total_message_size, dsize;
337 const double a = 0.0012;
338 const double b = 8.0;
339 const size_t small_message_size = 12 * 1024;
340 const size_t large_message_size = 256 * 1024;
341 bool zerocounts = false;
343 OPAL_OUTPUT((smpi_coll_tuned_stream, "smpi_coll_tuned_reduce_scatter_ompi"));
345 comm_size = smpi_comm_size(comm);
346 // We need data size for decision function
347 ompi_datatype_type_size(dtype, &dsize);
348 total_message_size = 0;
// Sum the receive counts; any zero count forces the nonoverlapping variant.
349 for (i = 0; i < comm_size; i++) {
350 total_message_size += rcounts[i];
351 if (0 == rcounts[i]) {
// Non-commutative ops and zero counts only work with nonoverlapping.
356 if( !ompi_op_is_commute(op) || (zerocounts)) {
357 return smpi_coll_tuned_reduce_scatter_intra_nonoverlapping (sbuf, rbuf, rcounts,
362 total_message_size *= dsize;
364 // compute the nearest power of 2
365 for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);
// Recursive halving for small totals, power-of-two sizes with medium
// totals, or large communicators relative to message size; ring otherwise.
367 if ((total_message_size <= small_message_size) ||
368 ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
369 (comm_size >= a * total_message_size + b)) {
371 smpi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
375 return smpi_coll_tuned_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
380 return smpi_coll_tuned_reduce_scatter(sbuf, rbuf, rcounts,
/* Select an allgather algorithm from total gathered size and whether the
 * communicator size is a power of two. Two decision tables exist: the
 * MX/Grig-based one, and (under USE_MPICH2_DECISION) the MPICH-2 one.
 * NOTE(review): the #if line for the first table and several closing braces
 * are missing in this truncated copy; verify the preprocessor structure. */
386 int smpi_coll_tuned_allgather_ompi(void *sbuf, int scount,
388 void* rbuf, int rcount,
393 int communicator_size, pow2_size;
394 size_t dsize, total_dsize;
396 communicator_size = smpi_comm_size(comm);
398 /* Special case for 2 processes */
399 if (communicator_size == 2) {
400 return smpi_coll_tuned_allgather_pair (sbuf, scount, sdtype,
401 rbuf, rcount, rdtype,
405 /* Determine complete data size */
406 dsize=smpi_datatype_size(sdtype);
407 total_dsize = dsize * scount * communicator_size;
/* Smallest power of two >= communicator_size. */
409 for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<=1);
411 /* Decision based on MX 2Gb results from Grig cluster at
412 The University of Tennesse, Knoxville
413 - if total message size is less than 50KB use either bruck or
414 recursive doubling for non-power of two and power of two nodes,
416 - else use ring and neighbor exchange algorithms for odd and even
417 number of nodes, respectively.
419 if (total_dsize < 50000) {
420 if (pow2_size == communicator_size) {
421 return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype,
422 rbuf, rcount, rdtype,
425 return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype,
426 rbuf, rcount, rdtype,
/* Large totals: ring (odd/even split with neighborexchange is disabled). */
430 //if (communicator_size % 2) {
431 return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
432 rbuf, rcount, rdtype,
435 return smpi_coll_tuned_allgather_intra_neighborexchange(sbuf, scount, sdtype,
436 rbuf, rcount, rdtype,
441 #if defined(USE_MPICH2_DECISION)
442 /* Decision as in MPICH-2
443 presented in Thakur et.al. "Optimization of Collective Communication
444 Operations in MPICH", International Journal of High Performance Computing
445 Applications, Vol. 19, No. 1, 49-66 (2005)
446 - for power-of-two processes and small and medium size messages
447 (up to 512KB) use recursive doubling
448 - for non-power-of-two processes and small messages (80KB) use bruck,
449 - for everything else use ring.
451 if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
452 return smpi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
453 rbuf, rcount, rdtype,
455 } else if (total_dsize <= 81920) {
456 return smpi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
457 rbuf, rcount, rdtype,
460 return smpi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
461 rbuf, rcount, rdtype,
463 #endif /* defined(USE_MPICH2_DECISION) */
/* Select an allgatherv algorithm, mirroring the allgather decision but
 * summing the per-rank receive counts for the total size.
 * NOTE(review): no `total_dsize = 0;` initializer (nor the declaration of
 * `i`) is visible before the accumulation loop — likely lost in truncation;
 * verify, otherwise the sum reads an uninitialized value (UB). */
466 int smpi_coll_tuned_allgatherv_ompi(void *sbuf, int scount,
468 void* rbuf, int *rcounts,
475 int communicator_size;
476 size_t dsize, total_dsize;
478 communicator_size = smpi_comm_size(comm);
480 /* Special case for 2 processes */
481 if (communicator_size == 2) {
482 return smpi_coll_tuned_allgatherv_pair(sbuf, scount, sdtype,
483 rbuf, rcounts, rdispls, rdtype,
487 /* Determine complete data size */
488 dsize=smpi_datatype_size(sdtype);
490 for (i = 0; i < communicator_size; i++) {
491 total_dsize += dsize * rcounts[i];
494 /* Decision based on allgather decision. */
495 if (total_dsize < 50000) {
496 /* return smpi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype,
497 rbuf, rcounts, rdispls, rdtype,
499 return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
500 rbuf, rcounts, rdispls, rdtype,
/* Large totals: ring either way (odd/even neighborexchange split disabled). */
504 // if (communicator_size % 2) {
505 return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
506 rbuf, rcounts, rdispls, rdtype,
509 return smpi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
510 rbuf, rcounts, rdispls, rdtype,
/* Select a gather algorithm from the per-rank block size and communicator
 * size: synchronized linear for large/intermediate blocks, binomial for
 * large communicators or many small blocks, basic linear otherwise.
 * NOTE(review): the two back-to-back block_size computations (lines 545-549)
 * were presumably guarded by an `if (rank == root) ... else ...` lost in
 * truncation — as shown, the second simply overwrites the first; verify.
 * This block also calls ompi_/OPAL_ symbols unlike the smpi_ wrappers used
 * elsewhere in the file. */
516 int smpi_coll_tuned_gather_ompi(void *sbuf, int scount,
518 void* rbuf, int rcount,
524 const int large_segment_size = 32768;
525 const int small_segment_size = 1024;
527 const size_t large_block_size = 92160;
528 const size_t intermediate_block_size = 6000;
529 const size_t small_block_size = 1024;
531 const int large_communicator_size = 60;
532 const int small_communicator_size = 10;
534 int communicator_size, rank;
535 size_t dsize, block_size;
537 OPAL_OUTPUT((smpi_coll_tuned_stream,
538 "smpi_coll_tuned_gather_ompi"));
540 communicator_size = smpi_comm_size(comm);
541 rank = ompi_comm_rank(comm);
543 // Determine block size
545 ompi_datatype_type_size(rdtype, &dsize);
546 block_size = dsize * rcount;
548 ompi_datatype_type_size(sdtype, &dsize);
549 block_size = dsize * scount;
// Large blocks: linear with synchronization (large segments presumably).
552 if (block_size > large_block_size) {
553 return smpi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
554 rbuf, rcount, rdtype,
558 } else if (block_size > intermediate_block_size) {
559 return smpi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
560 rbuf, rcount, rdtype,
// Big communicators, or medium ones with small blocks: binomial tree.
564 } else if ((communicator_size > large_communicator_size) ||
565 ((communicator_size > small_communicator_size) &&
566 (block_size < small_block_size))) {
567 return smpi_coll_tuned_gather_intra_binomial (sbuf, scount, sdtype,
568 rbuf, rcount, rdtype,
572 // Otherwise, use basic linear
573 return smpi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype,
574 rbuf, rcount, rdtype,
/* Select a scatter algorithm: binomial tree for larger communicators with
 * small blocks, basic linear otherwise.
 * NOTE(review): the function runs past the end of this excerpt (the final
 * call's trailing arguments and closing brace are not visible), and the two
 * block_size computations (lines 597-601) were presumably root/non-root
 * branches lost in truncation — as shown the second overwrites the first. */
578 int smpi_coll_tuned_scatter_ompi(void *sbuf, int scount,
580 void* rbuf, int rcount,
582 int root, MPI_Comm comm,
585 const size_t small_block_size = 300;
586 const int small_comm_size = 10;
587 int communicator_size, rank;
588 size_t dsize, block_size;
590 OPAL_OUTPUT((smpi_coll_tuned_stream,
591 "smpi_coll_tuned_scatter_ompi"));
593 communicator_size = smpi_comm_size(comm);
594 rank = ompi_comm_rank(comm);
595 // Determine block size
597 ompi_datatype_type_size(sdtype, &dsize);
598 block_size = dsize * scount;
600 ompi_datatype_type_size(rdtype, &dsize);
601 block_size = dsize * rcount;
// Many ranks with small blocks: binomial tree; otherwise basic linear.
604 if ((communicator_size > small_comm_size) &&
605 (block_size < small_block_size)) {
606 return smpi_coll_tuned_scatter_intra_binomial (sbuf, scount, sdtype,
607 rbuf, rcount, rdtype,
610 return smpi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
611 rbuf, rcount, rdtype,