Another attempt at fixing the Flag2 job of Jenkins

[simgrid.git] / src / smpi / colls / smpi_openmpi_selector.cpp
diff --git a/src/smpi/colls/smpi_openmpi_selector.cpp b/src/smpi/colls/smpi_openmpi_selector.cpp

index 3461709ae97c37fd951ce08dbfd8a6a7893d6798..51602c6134d58163de6828219d541c1b46484bd3 100644 (file)
--- a/src/smpi/colls/smpi_openmpi_selector.cpp
+++ b/src/smpi/colls/smpi_openmpi_selector.cpp
@@ -1,6 +1,7 @@
-/* selector for collective algorithms based on openmpi's default coll_tuned_decision_fixed selector */
+/* selector for collective algorithms based on openmpi's default coll_tuned_decision_fixed selector
+ * Updated 02/2022                                                          */
  
-/* Copyright (c) 2009-2019. The SimGrid Team.
+/* Copyright (c) 2009-2023. The SimGrid Team.
   * All rights reserved.                                                     */
  
  /* This program is free software; you can redistribute it and/or modify it
@@ -8,606 +9,1330 @@
  
  #include "colls_private.hpp"
  
-namespace simgrid{
-namespace smpi{
+#include <memory>
  
-int Coll_allreduce_ompi::allreduce(const void *sbuf, void *rbuf, int count,
-                        MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
-{
-    size_t dsize, block_dsize;
-    int comm_size = comm->size();
-    const size_t intermediate_message = 10000;
+namespace simgrid::smpi {
  
-    /**
-     * Decision function based on MX results from the Grig cluster at UTK.
+int allreduce__ompi(const void *sbuf, void *rbuf, int count,
+                    MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
+{
+    size_t total_dsize = dtype->size() * (ptrdiff_t)count;
+    int communicator_size = comm->size();
+    int alg = 1;
+    int(*funcs[]) (const void*, void*, int, MPI_Datatype, MPI_Op, MPI_Comm)={
+        &allreduce__redbcast,
+        &allreduce__redbcast,
+        &allreduce__rdb,
+        &allreduce__lr,
+        &allreduce__ompi_ring_segmented,
+        &allreduce__rab_rdb
+    };
+
+    /** Algorithms:
+     *  {1, "basic_linear"},
+     *  {2, "nonoverlapping"},
+     *  {3, "recursive_doubling"},
+     *  {4, "ring"},
+     *  {5, "segmented_ring"},
+     *  {6, "rabenseifner"},
       *
-     * Currently, linear, recursive doubling, and nonoverlapping algorithms
-     * can handle both commutative and non-commutative operations.
-     * Ring algorithm does not support non-commutative operations.
+     * Currently, ring, segmented ring, and rabenseifner do not support
+     * non-commutative operations.
       */
-    dsize = dtype->size();
-    block_dsize = dsize * count;
-
-    if (block_dsize < intermediate_message) {
-        return (Coll_allreduce_rdb::allreduce (sbuf, rbuf,
-                                                                   count, dtype,
-                                                                   op, comm));
-    }
-
-    if( ((op==MPI_OP_NULL) || op->is_commutative()) && (count > comm_size) ) {
-        const size_t segment_size = 1 << 20; /* 1 MB */
-        if ((comm_size * segment_size >= block_dsize)) {
-            //FIXME: ok, these are not the right algorithms, try to find closer ones
-            // lr is a good match for allreduce_ring (difference is mainly the use of sendrecv)
-            return Coll_allreduce_lr::allreduce(sbuf, rbuf, count, dtype,
-                                              op, comm);
+    if ((op != MPI_OP_NULL) && not op->is_commutative()) {
+        if (communicator_size < 4) {
+            if (total_dsize < 131072) {
+                alg = 3;
+            } else {
+                alg = 1;
+            }
+        } else if (communicator_size < 8) {
+            alg = 3;
+        } else if (communicator_size < 16) {
+            if (total_dsize < 1048576) {
+                alg = 3;
+            } else {
+                alg = 2;
+            }
+        } else if (communicator_size < 128) {
+            alg = 3;
+        } else if (communicator_size < 256) {
+            if (total_dsize < 131072) {
+                alg = 2;
+            } else if (total_dsize < 524288) {
+                alg = 3;
+            } else {
+                alg = 2;
+            }
+        } else if (communicator_size < 512) {
+            if (total_dsize < 4096) {
+                alg = 2;
+            } else if (total_dsize < 524288) {
+                alg = 3;
+            } else {
+                alg = 2;
+            }
          } else {
-           return (Coll_allreduce_ompi_ring_segmented::allreduce (sbuf, rbuf,
-                                                                    count, dtype,
-                                                                    op, comm
-                                                                    /*segment_size*/));
+            if (total_dsize < 2048) {
+                alg = 2;
+            } else {
+                alg = 3;
+            }
+        }
+    } else {
+        if (communicator_size < 4) {
+            if (total_dsize < 8) {
+                alg = 4;
+            } else if (total_dsize < 4096) {
+                alg = 3;
+            } else if (total_dsize < 8192) {
+                alg = 4;
+            } else if (total_dsize < 16384) {
+                alg = 3;
+            } else if (total_dsize < 65536) {
+                alg = 4;
+            } else if (total_dsize < 262144) {
+                alg = 5;
+            } else {
+                alg = 6;
+            }
+        } else if (communicator_size < 8) {
+            if (total_dsize < 16) {
+                alg = 4;
+            } else if (total_dsize < 8192) {
+                alg = 3;
+            } else {
+                alg = 6;
+            }
+        } else if (communicator_size < 16) {
+            if (total_dsize < 8192) {
+                alg = 3;
+            } else {
+                alg = 6;
+            }
+        } else if (communicator_size < 32) {
+            if (total_dsize < 64) {
+                alg = 5;
+            } else if (total_dsize < 4096) {
+                alg = 3;
+            } else {
+                alg = 6;
+            }
+        } else if (communicator_size < 64) {
+            if (total_dsize < 128) {
+                alg = 5;
+            } else {
+                alg = 6;
+            }
+        } else if (communicator_size < 128) {
+            if (total_dsize < 262144) {
+                alg = 3;
+            } else {
+                alg = 6;
+            }
+        } else if (communicator_size < 256) {
+            if (total_dsize < 131072) {
+                alg = 2;
+            } else if (total_dsize < 262144) {
+                alg = 3;
+            } else {
+                alg = 6;
+            }
+        } else if (communicator_size < 512) {
+            if (total_dsize < 4096) {
+                alg = 2;
+            } else {
+                alg = 6;
+            }
+        } else if (communicator_size < 2048) {
+            if (total_dsize < 2048) {
+                alg = 2;
+            } else if (total_dsize < 16384) {
+                alg = 3;
+            } else {
+                alg = 6;
+            }
+        } else if (communicator_size < 4096) {
+            if (total_dsize < 2048) {
+                alg = 2;
+            } else if (total_dsize < 4096) {
+                alg = 5;
+            } else if (total_dsize < 16384) {
+                alg = 3;
+            } else {
+                alg = 6;
+            }
+        } else {
+            if (total_dsize < 2048) {
+                alg = 2;
+            } else if (total_dsize < 16384) {
+                alg = 5;
+            } else if (total_dsize < 32768) {
+                alg = 3;
+            } else {
+                alg = 6;
+            }
          }
      }
-
-    return (Coll_allreduce_redbcast::allreduce(sbuf, rbuf, count,
-                                                            dtype, op, comm));
+    return funcs[alg-1](sbuf, rbuf, count, dtype, op, comm);
  }
  
  
  
-int Coll_alltoall_ompi::alltoall(const void *sbuf, int scount,
-                                             MPI_Datatype sdtype,
-                                             void* rbuf, int rcount,
-                                             MPI_Datatype rdtype,
-                                             MPI_Comm comm)
+int alltoall__ompi(const void *sbuf, int scount,
+                   MPI_Datatype sdtype,
+                   void* rbuf, int rcount,
+                   MPI_Datatype rdtype,
+                   MPI_Comm comm)
  {
-    int communicator_size;
-    size_t dsize, block_dsize;
-    communicator_size = comm->size();
+    int alg = 1;
+    size_t dsize, total_dsize;
+    int communicator_size = comm->size();
  
-    /* Decision function based on measurement on Grig cluster at
-       the University of Tennessee (2GB MX) up to 64 nodes.
-       Has better performance for messages of intermediate sizes than the old one */
-    /* determine block size */
-    dsize = sdtype->size();
-    block_dsize = dsize * scount;
-
-    if ((block_dsize < 200) && (communicator_size > 12)) {
-        return Coll_alltoall_bruck::alltoall(sbuf, scount, sdtype,
-                                                    rbuf, rcount, rdtype,
-                                                    comm);
-
-    } else if (block_dsize < 3000) {
-        return Coll_alltoall_basic_linear::alltoall(sbuf, scount, sdtype,
-                                                           rbuf, rcount, rdtype,
-                                                           comm);
+    if (MPI_IN_PLACE != sbuf) {
+        dsize = sdtype->size();
+    } else {
+        dsize = rdtype->size();
+    }
+    total_dsize = dsize * (ptrdiff_t)scount;
+    int (*funcs[])(const void *, int, MPI_Datatype, void*, int, MPI_Datatype, MPI_Comm) = {
+        &alltoall__basic_linear,
+        &alltoall__pair,
+        &alltoall__bruck,
+        &alltoall__basic_linear,
+        &alltoall__basic_linear
+    };
+    /** Algorithms:
+     *  {1, "linear"},
+     *  {2, "pairwise"},
+     *  {3, "modified_bruck"},
+     *  {4, "linear_sync"},
+     *  {5, "two_proc"},
+     */
+    if (communicator_size == 2) {
+        if (total_dsize < 2) {
+            alg = 2;
+        } else if (total_dsize < 4) {
+            alg = 5;
+        } else if (total_dsize < 16) {
+            alg = 2;
+        } else if (total_dsize < 64) {
+            alg = 5;
+        } else if (total_dsize < 256) {
+            alg = 2;
+        } else if (total_dsize < 4096) {
+            alg = 5;
+        } else if (total_dsize < 32768) {
+            alg = 2;
+        } else if (total_dsize < 262144) {
+            alg = 4;
+        } else if (total_dsize < 1048576) {
+            alg = 5;
+        } else {
+            alg = 2;
+        }
+    } else if (communicator_size < 8) {
+        if (total_dsize < 8192) {
+            alg = 4;
+        } else if (total_dsize < 16384) {
+            alg = 1;
+        } else if (total_dsize < 65536) {
+            alg = 4;
+        } else if (total_dsize < 524288) {
+            alg = 1;
+        } else if (total_dsize < 1048576) {
+            alg = 2;
+        } else {
+            alg = 1;
+        }
+    } else if (communicator_size < 16) {
+        if (total_dsize < 262144) {
+            alg = 4;
+        } else {
+            alg = 1;
+        }
+    } else if (communicator_size < 32) {
+        if (total_dsize < 4) {
+            alg = 4;
+        } else if (total_dsize < 512) {
+            alg = 3;
+        } else if (total_dsize < 8192) {
+            alg = 4;
+        } else if (total_dsize < 32768) {
+            alg = 1;
+        } else if (total_dsize < 262144) {
+            alg = 4;
+        } else if (total_dsize < 524288) {
+            alg = 1;
+        } else {
+            alg = 4;
+        }
+    } else if (communicator_size < 64) {
+        if (total_dsize < 512) {
+            alg = 3;
+        } else if (total_dsize < 524288) {
+            alg = 1;
+        } else {
+            alg = 4;
+        }
+    } else if (communicator_size < 128) {
+        if (total_dsize < 1024) {
+            alg = 3;
+        } else if (total_dsize < 2048) {
+            alg = 1;
+        } else if (total_dsize < 4096) {
+            alg = 4;
+        } else if (total_dsize < 262144) {
+            alg = 1;
+        } else {
+            alg = 2;
+        }
+    } else if (communicator_size < 256) {
+        if (total_dsize < 1024) {
+            alg = 3;
+        } else if (total_dsize < 2048) {
+            alg = 4;
+        } else if (total_dsize < 262144) {
+            alg = 1;
+        } else {
+            alg = 2;
+        }
+    } else if (communicator_size < 512) {
+        if (total_dsize < 1024) {
+            alg = 3;
+        } else if (total_dsize < 8192) {
+            alg = 4;
+        } else if (total_dsize < 32768) {
+            alg = 1;
+        } else {
+            alg = 2;
+        }
+    } else if (communicator_size < 1024) {
+        if (total_dsize < 512) {
+            alg = 3;
+        } else if (total_dsize < 8192) {
+            alg = 4;
+        } else if (total_dsize < 16384) {
+            alg = 1;
+        } else if (total_dsize < 131072) {
+            alg = 4;
+        } else if (total_dsize < 262144) {
+            alg = 1;
+        } else {
+            alg = 2;
+        }
+    } else if (communicator_size < 2048) {
+        if (total_dsize < 512) {
+            alg = 3;
+        } else if (total_dsize < 1024) {
+            alg = 4;
+        } else if (total_dsize < 2048) {
+            alg = 1;
+        } else if (total_dsize < 16384) {
+            alg = 4;
+        } else if (total_dsize < 262144) {
+            alg = 1;
+        } else {
+            alg = 4;
+        }
+    } else if (communicator_size < 4096) {
+        if (total_dsize < 1024) {
+            alg = 3;
+        } else if (total_dsize < 4096) {
+            alg = 4;
+        } else if (total_dsize < 8192) {
+            alg = 1;
+        } else if (total_dsize < 131072) {
+            alg = 4;
+        } else {
+            alg = 1;
+        }
+    } else {
+        if (total_dsize < 2048) {
+            alg = 3;
+        } else if (total_dsize < 8192) {
+            alg = 4;
+        } else if (total_dsize < 16384) {
+            alg = 1;
+        } else if (total_dsize < 32768) {
+            alg = 4;
+        } else if (total_dsize < 65536) {
+            alg = 1;
+        } else {
+            alg = 4;
+        }
      }
  
-    return Coll_alltoall_ring::alltoall (sbuf, scount, sdtype,
-                                                    rbuf, rcount, rdtype,
-                                                    comm);
+    return funcs[alg-1](sbuf, scount, sdtype,
+                          rbuf, rcount, rdtype, comm);
  }
  
-int Coll_alltoallv_ompi::alltoallv(const void *sbuf, const int *scounts, const int *sdisps,
-                                              MPI_Datatype sdtype,
-                                              void *rbuf, const int *rcounts, const int *rdisps,
-                                              MPI_Datatype rdtype,
-                                              MPI_Comm  comm
-                                              )
+int alltoallv__ompi(const void *sbuf, const int *scounts, const int *sdisps,
+                    MPI_Datatype sdtype,
+                    void *rbuf, const int *rcounts, const int *rdisps,
+                    MPI_Datatype rdtype,
+                    MPI_Comm  comm
+                    )
  {
-    /* For starters, just keep the original algorithm. */
-    return Coll_alltoallv_ring::alltoallv(sbuf, scounts, sdisps, sdtype,
-                                                        rbuf, rcounts, rdisps,rdtype,
-                                                        comm);
-}
-
-
-int Coll_barrier_ompi::barrier(MPI_Comm  comm)
-{    int communicator_size = comm->size();
-
-    if( 2 == communicator_size )
-        return Coll_barrier_ompi_two_procs::barrier(comm);
-/*     * Basic optimisation. If we have a power of 2 number of nodes*/
-/*     * the use the recursive doubling algorithm, otherwise*/
-/*     * bruck is the one we want.*/
-    {
-        int has_one = 0;
-        for( ; communicator_size > 0; communicator_size >>= 1 ) {
-            if( communicator_size & 0x1 ) {
-                if( has_one )
-                    return Coll_barrier_ompi_bruck::barrier(comm);
-                has_one = 1;
-            }
-        }
+    int communicator_size = comm->size();
+    int alg = 1;
+    int (*funcs[])(const void *, const int*, const int*, MPI_Datatype, void*, const int*, const int*, MPI_Datatype, MPI_Comm) = {
+        &alltoallv__ompi_basic_linear,
+        &alltoallv__pair
+    };
+   /** Algorithms:
+     *  {1, "basic_linear"},
+     *  {2, "pairwise"},
+     *
+     * We can only optimize based on communicator size
+     */
+    if (communicator_size < 4) {
+        alg = 2;
+    } else if (communicator_size < 64) {
+        alg = 1;
+    } else if (communicator_size < 128) {
+        alg = 2;
+    } else if (communicator_size < 256) {
+        alg = 1;
+    } else if (communicator_size < 1024) {
+        alg = 2;
+    } else {
+        alg = 1;
      }
-    return Coll_barrier_ompi_recursivedoubling::barrier(comm);
+    return funcs[alg-1](sbuf, scounts, sdisps, sdtype,
+                           rbuf, rcounts, rdisps,rdtype,
+                           comm);
  }
  
-int Coll_bcast_ompi::bcast(void *buff, int count,
-                                          MPI_Datatype datatype, int root,
-                                          MPI_Comm  comm
-                                          )
+int barrier__ompi(MPI_Comm  comm)
  {
-    /* Decision function based on MX results for
-       messages up to 36MB and communicator sizes up to 64 nodes */
-    const size_t small_message_size = 2048;
-    const size_t intermediate_message_size = 370728;
-    const double a_p16  = 3.2118e-6; /* [1 / byte] */
-    const double b_p16  = 8.7936;
-    const double a_p64  = 2.3679e-6; /* [1 / byte] */
-    const double b_p64  = 1.1787;
-    const double a_p128 = 1.6134e-6; /* [1 / byte] */
-    const double b_p128 = 2.1102;
-
-    int communicator_size;
-    //int segsize = 0;
-    size_t message_size, dsize;
-
-    communicator_size = comm->size();
-
-    /* else we need data size for decision function */
-    dsize = datatype->size();
-    message_size = dsize * (unsigned long)count;   /* needed for decision */
-
-    /* Handle messages of small and intermediate size, and
-       single-element broadcasts */
-    if ((message_size < small_message_size) || (count <= 1)) {
-        /* Binomial without segmentation */
-        return  Coll_bcast_binomial_tree::bcast (buff, count, datatype,
-                                                      root, comm);
-
-    } else if (message_size < intermediate_message_size) {
-        // SplittedBinary with 1KB segments
-        return Coll_bcast_ompi_split_bintree::bcast(buff, count, datatype,
-                                                         root, comm);
-
+    int communicator_size = comm->size();
+    int alg = 1;
+    int (*funcs[])(MPI_Comm) = {
+        &barrier__ompi_basic_linear,
+        &barrier__ompi_basic_linear,
+        &barrier__ompi_recursivedoubling,
+        &barrier__ompi_bruck,
+        &barrier__ompi_two_procs,
+        &barrier__ompi_tree
+    };
+    /** Algorithms:
+     *  {1, "linear"},
+     *  {2, "double_ring"},
+     *  {3, "recursive_doubling"},
+     *  {4, "bruck"},
+     *  {5, "two_proc"},
+     *  {6, "tree"},
+     *
+     * We can only optimize based on communicator size
+     */
+    if (communicator_size < 4) {
+        alg = 3;
+    } else if (communicator_size < 8) {
+        alg = 1;
+    } else if (communicator_size < 64) {
+        alg = 3;
+    } else if (communicator_size < 256) {
+        alg = 4;
+    } else if (communicator_size < 512) {
+        alg = 6;
+    } else if (communicator_size < 1024) {
+        alg = 4;
+    } else if (communicator_size < 4096) {
+        alg = 6;
+    } else {
+        alg = 4;
      }
-     //Handle large message sizes
-    else if (communicator_size < (a_p128 * message_size + b_p128)) {
-        //Pipeline with 128KB segments
-        //segsize = 1024  << 7;
-        return Coll_bcast_ompi_pipeline::bcast (buff, count, datatype,
-                                                     root, comm);
-
-
-    } else if (communicator_size < 13) {
-        // Split Binary with 8KB segments
-        return Coll_bcast_ompi_split_bintree::bcast(buff, count, datatype,
-                                                         root, comm);
-
-    } else if (communicator_size < (a_p64 * message_size + b_p64)) {
-        // Pipeline with 64KB segments
-        //segsize = 1024 << 6;
-        return Coll_bcast_ompi_pipeline::bcast (buff, count, datatype,
-                                                     root, comm);
  
+    return funcs[alg-1](comm);
+}
  
-    } else if (communicator_size < (a_p16 * message_size + b_p16)) {
-        //Pipeline with 16KB segments
-        //segsize = 1024 << 4;
-        return Coll_bcast_ompi_pipeline::bcast (buff, count, datatype,
-                                                     root, comm);
+int bcast__ompi(void *buff, int count, MPI_Datatype datatype, int root, MPI_Comm  comm)
+{
+    int alg = 1;
+    size_t total_dsize, dsize;
  
+    int communicator_size = comm->size();
  
+    dsize = datatype->size();
+    total_dsize = dsize * (unsigned long)count;
+    int (*funcs[])(void*, int, MPI_Datatype, int, MPI_Comm) = {
+        &bcast__NTSL,
+        &bcast__ompi_pipeline,
+        &bcast__ompi_pipeline,
+        &bcast__ompi_split_bintree,
+        &bcast__NTSB,
+        &bcast__binomial_tree,
+        &bcast__mvapich2_knomial_intra_node,
+        &bcast__scatter_rdb_allgather,
+        &bcast__scatter_LR_allgather,
+    };
+    /** Algorithms:
+     *  {1, "basic_linear"},
+     *  {2, "chain"},
+     *  {3, "pipeline"},
+     *  {4, "split_binary_tree"},
+     *  {5, "binary_tree"},
+     *  {6, "binomial"},
+     *  {7, "knomial"},
+     *  {8, "scatter_allgather"},
+     *  {9, "scatter_allgather_ring"},
+     */
+    if (communicator_size < 4) {
+        if (total_dsize < 32) {
+            alg = 3;
+        } else if (total_dsize < 256) {
+            alg = 5;
+        } else if (total_dsize < 512) {
+            alg = 3;
+        } else if (total_dsize < 1024) {
+            alg = 7;
+        } else if (total_dsize < 32768) {
+            alg = 1;
+        } else if (total_dsize < 131072) {
+            alg = 5;
+        } else if (total_dsize < 262144) {
+            alg = 2;
+        } else if (total_dsize < 524288) {
+            alg = 1;
+        } else if (total_dsize < 1048576) {
+            alg = 6;
+        } else {
+            alg = 5;
+        }
+    } else if (communicator_size < 8) {
+        if (total_dsize < 64) {
+            alg = 5;
+        } else if (total_dsize < 128) {
+            alg = 6;
+        } else if (total_dsize < 2048) {
+            alg = 5;
+        } else if (total_dsize < 8192) {
+            alg = 6;
+        } else if (total_dsize < 1048576) {
+            alg = 1;
+        } else {
+            alg = 2;
+        }
+    } else if (communicator_size < 16) {
+        if (total_dsize < 8) {
+            alg = 7;
+        } else if (total_dsize < 64) {
+            alg = 5;
+        } else if (total_dsize < 4096) {
+            alg = 7;
+        } else if (total_dsize < 16384) {
+            alg = 5;
+        } else if (total_dsize < 32768) {
+            alg = 6;
+        } else {
+            alg = 1;
+        }
+    } else if (communicator_size < 32) {
+        if (total_dsize < 4096) {
+            alg = 7;
+        } else if (total_dsize < 1048576) {
+            alg = 6;
+        } else {
+            alg = 8;
+        }
+    } else if (communicator_size < 64) {
+        if (total_dsize < 2048) {
+            alg = 6;
+        } else {
+            alg = 7;
+        }
+    } else if (communicator_size < 128) {
+        alg = 7;
+    } else if (communicator_size < 256) {
+        if (total_dsize < 2) {
+            alg = 6;
+        } else if (total_dsize < 16384) {
+            alg = 5;
+        } else if (total_dsize < 32768) {
+            alg = 1;
+        } else if (total_dsize < 65536) {
+            alg = 5;
+        } else {
+            alg = 7;
+        }
+    } else if (communicator_size < 1024) {
+        if (total_dsize < 16384) {
+            alg = 7;
+        } else if (total_dsize < 32768) {
+            alg = 4;
+        } else {
+            alg = 7;
+        }
+    } else if (communicator_size < 2048) {
+        if (total_dsize < 524288) {
+            alg = 7;
+        } else {
+            alg = 8;
+        }
+    } else if (communicator_size < 4096) {
+        if (total_dsize < 262144) {
+            alg = 7;
+        } else {
+            alg = 8;
+        }
+    } else {
+        if (total_dsize < 8192) {
+            alg = 7;
+        } else if (total_dsize < 16384) {
+            alg = 5;
+        } else if (total_dsize < 262144) {
+            alg = 7;
+        } else {
+            alg = 8;
+        }
      }
-    /* Pipeline with 8KB segments */
-    //segsize = 1024 << 3;
-    return Coll_bcast_flattree_pipeline::bcast (buff, count, datatype,
-                                                 root, comm
-                                                 /*segsize*/);
-#if 0
-    /* this is based on gige measurements */
-
-    if (communicator_size  < 4) {
-        return Coll_bcast_intra_basic_linear::bcast (buff, count, datatype, root, comm, module);
-    }
-    if (communicator_size == 4) {
-        if (message_size < 524288) segsize = 0;
-        else segsize = 16384;
-        return Coll_bcast_intra_bintree::bcast (buff, count, datatype, root, comm, module, segsize);
-    }
-    if (communicator_size <= 8 && message_size < 4096) {
-        return Coll_bcast_intra_basic_linear::bcast (buff, count, datatype, root, comm, module);
-    }
-    if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
-        segsize = 16384;
-        return  Coll_bcast_intra_bintree::bcast (buff, count, datatype, root, comm, module, segsize);
-    }
-    if (message_size >= 524288) {
-        segsize = 16384;
-        return Coll_bcast_intra_pipeline::bcast (buff, count, datatype, root, comm, module, segsize);
-    }
-    segsize = 0;
-    /* once tested can swap this back in */
-    /* return Coll_bcast_intra_bmtree::bcast (buff, count, datatype, root, comm, segsize); */
-    return Coll_bcast_intra_bintree::bcast (buff, count, datatype, root, comm, module, segsize);
-#endif  /* 0 */
+    return funcs[alg-1](buff, count, datatype, root, comm);
  }
  
-int Coll_reduce_ompi::reduce(const void *sendbuf, void *recvbuf,
-                                            int count, MPI_Datatype  datatype,
-                                            MPI_Op   op, int root,
-                                            MPI_Comm   comm
-                                            )
+int reduce__ompi(const void *sendbuf, void *recvbuf,
+                 int count, MPI_Datatype  datatype,
+                 MPI_Op   op, int root,
+                 MPI_Comm   comm)
  {
-    int communicator_size=0;
-    //int segsize = 0;
-    size_t message_size, dsize;
-    const double a1 =  0.6016 / 1024.0; /* [1/B] */
-    const double b1 =  1.3496;
-    const double a2 =  0.0410 / 1024.0; /* [1/B] */
-    const double b2 =  9.7128;
-    const double a3 =  0.0422 / 1024.0; /* [1/B] */
-    const double b3 =  1.1614;
-    //const double a4 =  0.0033 / 1024.0;  [1/B]
-    //const double b4 =  1.6761;
-
-    /* no limit on # of outstanding requests */
-    //const int max_requests = 0;
-
-    communicator_size = comm->size();
+    size_t total_dsize, dsize;
+    int alg = 1;
+    int communicator_size = comm->size();
  
-    /* need data size for decision function */
      dsize=datatype->size();
-    message_size = dsize * count;   /* needed for decision */
-
-    /**
-     * If the operation is non commutative we currently have choice of linear
-     * or in-order binary tree algorithm.
+    total_dsize = dsize * count;
+    int (*funcs[])(const void*, void*, int, MPI_Datatype, MPI_Op, int, MPI_Comm) = {
+        &reduce__ompi_basic_linear,
+        &reduce__ompi_chain,
+        &reduce__ompi_pipeline,
+        &reduce__ompi_binary,
+        &reduce__ompi_binomial,
+        &reduce__ompi_in_order_binary,
+        //&reduce__rab our rab can't be used with all datatypes
+        &reduce__ompi_basic_linear
+    };
+    /** Algorithms:
+     *  {1, "linear"},
+     *  {2, "chain"},
+     *  {3, "pipeline"},
+     *  {4, "binary"},
+     *  {5, "binomial"},
+     *  {6, "in-order_binary"},
+     *  {7, "rabenseifner"},
+     *
+     * Currently, only linear and in-order binary tree algorithms are
+     * capable of handling non-commutative ops.
       */
-    if ((op != MPI_OP_NULL) && not op->is_commutative()) {
-      if ((communicator_size < 12) && (message_size < 2048)) {
-        return Coll_reduce_ompi_basic_linear::reduce(sendbuf, recvbuf, count, datatype, op, root, comm /*, module*/);
-      }
-      return Coll_reduce_ompi_in_order_binary::reduce(sendbuf, recvbuf, count, datatype, op, root, comm /*, module,
-                                                             0, max_requests*/);
+     if ((op != MPI_OP_NULL) && not op->is_commutative()) {
+        if (communicator_size < 4) {
+            if (total_dsize < 8) {
+                alg = 6;
+            } else {
+                alg = 1;
+            }
+        } else if (communicator_size < 8) {
+            alg = 1;
+        } else if (communicator_size < 16) {
+            if (total_dsize < 1024) {
+                alg = 6;
+            } else if (total_dsize < 8192) {
+                alg = 1;
+            } else if (total_dsize < 16384) {
+                alg = 6;
+            } else if (total_dsize < 262144) {
+                alg = 1;
+            } else {
+                alg = 6;
+            }
+        } else if (communicator_size < 128) {
+            alg = 6;
+        } else if (communicator_size < 256) {
+            if (total_dsize < 512) {
+                alg = 6;
+            } else if (total_dsize < 1024) {
+                alg = 1;
+            } else {
+                alg = 6;
+            }
+        } else {
+            alg = 6;
+        }
+    } else {
+        if (communicator_size < 4) {
+            if (total_dsize < 8) {
+                alg = 7;
+            } else if (total_dsize < 16) {
+                alg = 4;
+            } else if (total_dsize < 32) {
+                alg = 3;
+            } else if (total_dsize < 262144) {
+                alg = 1;
+            } else if (total_dsize < 524288) {
+                alg = 3;
+            } else if (total_dsize < 1048576) {
+                alg = 2;
+            } else {
+                alg = 3;
+            }
+        } else if (communicator_size < 8) {
+            if (total_dsize < 4096) {
+                alg = 4;
+            } else if (total_dsize < 65536) {
+                alg = 2;
+            } else if (total_dsize < 262144) {
+                alg = 5;
+            } else if (total_dsize < 524288) {
+                alg = 1;
+            } else if (total_dsize < 1048576) {
+                alg = 5;
+            } else {
+                alg = 1;
+            }
+        } else if (communicator_size < 16) {
+            if (total_dsize < 8192) {
+                alg = 4;
+            } else {
+                alg = 5;
+            }
+        } else if (communicator_size < 32) {
+            if (total_dsize < 4096) {
+                alg = 4;
+            } else {
+                alg = 5;
+            }
+        } else if (communicator_size < 256) {
+            alg = 5;
+        } else if (communicator_size < 512) {
+            if (total_dsize < 8192) {
+                alg = 5;
+            } else if (total_dsize < 16384) {
+                alg = 6;
+            } else {
+                alg = 5;
+            }
+        } else if (communicator_size < 2048) {
+            alg = 5;
+        } else if (communicator_size < 4096) {
+            if (total_dsize < 512) {
+                alg = 5;
+            } else if (total_dsize < 1024) {
+                alg = 6;
+            } else if (total_dsize < 8192) {
+                alg = 5;
+            } else if (total_dsize < 16384) {
+                alg = 6;
+            } else {
+                alg = 5;
+            }
+        } else {
+            if (total_dsize < 16) {
+                alg = 5;
+            } else if (total_dsize < 32) {
+                alg = 6;
+            } else if (total_dsize < 1024) {
+                alg = 5;
+            } else if (total_dsize < 2048) {
+                alg = 6;
+            } else if (total_dsize < 8192) {
+                alg = 5;
+            } else if (total_dsize < 16384) {
+                alg = 6;
+            } else {
+                alg = 5;
+            }
+        }
      }
  
-    if ((communicator_size < 8) && (message_size < 512)){
-        /* Linear_0K */
-        return Coll_reduce_ompi_basic_linear::reduce (sendbuf, recvbuf, count, datatype, op, root, comm);
-    } else if (((communicator_size < 8) && (message_size < 20480)) ||
-               (message_size < 2048) || (count <= 1)) {
-        /* Binomial_0K */
-        //segsize = 0;
-        return Coll_reduce_ompi_binomial::reduce(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
-                                                     segsize, max_requests*/);
-    } else if (communicator_size > (a1 * message_size + b1)) {
-        // Binomial_1K
-        //segsize = 1024;
-        return Coll_reduce_ompi_binomial::reduce(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
-                                                     segsize, max_requests*/);
-    } else if (communicator_size > (a2 * message_size + b2)) {
-        // Pipeline_1K
-        //segsize = 1024;
-        return Coll_reduce_ompi_pipeline::reduce (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
-                                                      segsize, max_requests*/);
-    } else if (communicator_size > (a3 * message_size + b3)) {
-        // Binary_32K
-        //segsize = 32*1024;
-        return Coll_reduce_ompi_binary::reduce( sendbuf, recvbuf, count, datatype, op, root,
-                                                    comm/*, module, segsize, max_requests*/);
-    }
-//    if (communicator_size > (a4 * message_size + b4)) {
-        // Pipeline_32K
-//        segsize = 32*1024;
-//    } else {
-        // Pipeline_64K
-//        segsize = 64*1024;
-//    }
-    return Coll_reduce_ompi_pipeline::reduce (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
-                                                  segsize, max_requests*/);
-
-#if 0
-    /* for small messages use linear algorithm */
-    if (message_size <= 4096) {
-        segsize = 0;
-        fanout = communicator_size - 1;
-        /* when linear implemented or taken from basic put here, right now using chain as a linear system */
-        /* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
-        return Coll_reduce_intra_basic_linear::reduce (sendbuf, recvbuf, count, datatype, op, root, comm, module);
-        /*        return Coll_reduce_intra_chain::reduce (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
-    }
-    if (message_size < 524288) {
-        if (message_size <= 65536 ) {
-            segsize = 32768;
-            fanout = 8;
-        } else {
-            segsize = 1024;
-            fanout = communicator_size/2;
-        }
-        /* later swap this for a binary tree */
-        /*         fanout = 2; */
-        return Coll_reduce_intra_chain::reduce (sendbuf, recvbuf, count, datatype, op, root, comm, module,
-                                                   segsize, fanout, max_requests);
-    }
-    segsize = 1024;
-    return Coll_reduce_intra_pipeline::reduce (sendbuf, recvbuf, count, datatype, op, root, comm, module,
-                                                  segsize, max_requests);
-#endif  /* 0 */
+    return funcs[alg-1] (sendbuf, recvbuf, count, datatype, op, root, comm);
  }
  
-int Coll_reduce_scatter_ompi::reduce_scatter(const void *sbuf, void *rbuf,
-                                                    const int *rcounts,
-                                                    MPI_Datatype dtype,
-                                                    MPI_Op  op,
-                                                    MPI_Comm  comm
-                                                    )
+int reduce_scatter__ompi(const void *sbuf, void *rbuf,
+                         const int *rcounts,
+                         MPI_Datatype dtype,
+                         MPI_Op  op,
+                         MPI_Comm  comm
+                         )
  {
-    int comm_size, i, pow2;
-    size_t total_message_size, dsize;
-    const double a = 0.0012;
-    const double b = 8.0;
-    const size_t small_message_size = 12 * 1024;
-    const size_t large_message_size = 256 * 1024;
+    size_t total_dsize, dsize;
+    int communicator_size = comm->size();
+    int alg = 1;
      int zerocounts = 0;
-
-    XBT_DEBUG("Coll_reduce_scatter_ompi::reduce_scatter");
-
-    comm_size = comm->size();
-    // We need data size for decision function
      dsize=dtype->size();
-    total_message_size = 0;
-    for (i = 0; i < comm_size; i++) {
-        total_message_size += rcounts[i];
-        if (0 == rcounts[i]) {
-            zerocounts = 1;
-        }
+    total_dsize = 0;
+    for (int i = 0; i < communicator_size; i++) {
+        total_dsize += rcounts[i];
+       // if (0 == rcounts[i]) {
+        //    zerocounts = 1;
+        //}
      }
-
+    total_dsize *= dsize;
+    int (*funcs[])(const void*, void*, const int*, MPI_Datatype, MPI_Op, MPI_Comm) = {
+        &reduce_scatter__default,
+        &reduce_scatter__ompi_basic_recursivehalving,
+        &reduce_scatter__ompi_ring,
+        &reduce_scatter__ompi_butterfly,
+    };
+    /** Algorithms:
+     *  {1, "non-overlapping"},
+     *  {2, "recursive_halving"},
+     *  {3, "ring"},
+     *  {4, "butterfly"},
+     *
+     * Non commutative algorithm capability needs re-investigation.
+     * Defaulting to non overlapping for non commutative ops.
+     *
+     * NOTE(review), documentation only:
+     *  - total_dsize is the sum of rcounts[] over all ranks of comm,
+     *    multiplied by dtype->size().
+     *  - alg is a 1-based position in funcs[] (the call is funcs[alg-1]),
+     *    matching the numbering of the list above.
+     *  - zerocounts is initialised to 0 and the only code that set it is
+     *    commented out in the loop above, so the "|| (zerocounts)" part of
+     *    the condition below is always false in this version.
+     */
      if (((op != MPI_OP_NULL) && not op->is_commutative()) || (zerocounts)) {
-      Coll_reduce_scatter_default::reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
-      return MPI_SUCCESS;
-    }
-
-    total_message_size *= dsize;
-
-    // compute the nearest power of 2
-    for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);
-
-    if ((total_message_size <= small_message_size) ||
-        ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
-        (comm_size >= a * total_message_size + b)) {
-        return
-            Coll_reduce_scatter_ompi_basic_recursivehalving::reduce_scatter(sbuf, rbuf, rcounts,
-                                                                        dtype, op,
-                                                                        comm);
+        alg = 1;
+    } else {
+        if (communicator_size < 4) {
+            if (total_dsize < 65536) {
+                alg = 3;
+            } else if (total_dsize < 131072) {
+                alg = 4;
+            } else {
+                alg = 3;
+            }
+        } else if (communicator_size < 8) {
+            if (total_dsize < 8) {
+                alg = 1;
+            } else if (total_dsize < 262144) {
+                alg = 2;
+            } else {
+                alg = 3;
+            }
+        } else if (communicator_size < 32) {
+            if (total_dsize < 262144) {
+                alg = 2;
+            } else {
+                alg = 3;
+            }
+        } else if (communicator_size < 64) {
+            if (total_dsize < 64) {
+                alg = 1;
+            } else if (total_dsize < 2048) {
+                alg = 2;
+            } else if (total_dsize < 524288) {
+                alg = 4;
+            } else {
+                alg = 3;
+            }
+        } else if (communicator_size < 128) {
+            if (total_dsize < 256) {
+                alg = 1;
+            } else if (total_dsize < 512) {
+                alg = 2;
+            } else if (total_dsize < 2048) {
+                alg = 4;
+            } else if (total_dsize < 4096) {
+                alg = 2;
+            } else {
+                alg = 4;
+            }
+        } else if (communicator_size < 256) {
+            if (total_dsize < 256) {
+                alg = 1;
+            } else if (total_dsize < 512) {
+                alg = 2;
+            } else {
+                alg = 4;
+            }
+        } else if (communicator_size < 512) {
+            if (total_dsize < 256) {
+                alg = 1;
+            } else if (total_dsize < 1024) {
+                alg = 2;
+            } else {
+                alg = 4;
+            }
+        } else if (communicator_size < 1024) {
+            if (total_dsize < 512) {
+                alg = 1;
+            } else if (total_dsize < 2048) {
+                alg = 2;
+            } else if (total_dsize < 8192) {
+                alg = 4;
+            } else if (total_dsize < 16384) {
+                alg = 2;
+            } else {
+                alg = 4;
+            }
+        } else if (communicator_size < 2048) {
+            if (total_dsize < 512) {
+                alg = 1;
+            } else if (total_dsize < 4096) {
+                alg = 2;
+            } else if (total_dsize < 16384) {
+                alg = 4;
+            } else if (total_dsize < 32768) {
+                alg = 2;
+            } else {
+                alg = 4;
+            }
+        } else if (communicator_size < 4096) {
+            if (total_dsize < 512) {
+                alg = 1;
+            } else if (total_dsize < 4096) {
+                alg = 2;
+            } else {
+                alg = 4;
+            }
+        } else {
+            if (total_dsize < 1024) {
+                alg = 1;
+            } else if (total_dsize < 8192) {
+                alg = 2;
+            } else {
+                alg = 4;
+            }
+        }
      }
-    return Coll_reduce_scatter_ompi_ring::reduce_scatter(sbuf, rbuf, rcounts,
-                                                     dtype, op,
-                                                     comm);
-
-
  
+    return funcs[alg-1] (sbuf, rbuf, rcounts, dtype, op, comm);
  }
  
-int Coll_allgather_ompi::allgather(const void *sbuf, int scount,
-                                              MPI_Datatype sdtype,
-                                              void* rbuf, int rcount,
-                                              MPI_Datatype rdtype,
-                                              MPI_Comm  comm
-                                              )
+int allgather__ompi(const void *sbuf, int scount,
+                    MPI_Datatype sdtype,
+                    void* rbuf, int rcount,
+                    MPI_Datatype rdtype,
+                    MPI_Comm  comm
+                    )
  {
-    int communicator_size, pow2_size;
+    int communicator_size;
      size_t dsize, total_dsize;
-
+    int alg = 1;
      communicator_size = comm->size();
-
-    /* Special case for 2 processes */
-    if (communicator_size == 2) {
-        return Coll_allgather_pair::allgather (sbuf, scount, sdtype,
-                                                          rbuf, rcount, rdtype,
-                                                          comm/*, module*/);
+    if (MPI_IN_PLACE != sbuf) {
+        dsize = sdtype->size();
+    } else {
+        dsize = rdtype->size();
      }
-
-    /* Determine complete data size */
-    dsize=sdtype->size();
-    total_dsize = dsize * scount * communicator_size;
-
-    for (pow2_size  = 1; pow2_size < communicator_size; pow2_size <<=1);
-
-    /* Decision based on MX 2Gb results from Grig cluster at
-       The University of Tennesse, Knoxville
-       - if total message size is less than 50KB use either bruck or
-       recursive doubling for non-power of two and power of two nodes,
-       respectively.
-       - else use ring and neighbor exchange algorithms for odd and even
-       number of nodes, respectively.
-    */
-    if (total_dsize < 50000) {
-        if (pow2_size == communicator_size) {
-            return Coll_allgather_rdb::allgather(sbuf, scount, sdtype,
-                                                                     rbuf, rcount, rdtype,
-                                                                     comm);
-        } else {
-            return Coll_allgather_bruck::allgather(sbuf, scount, sdtype,
-                                                         rbuf, rcount, rdtype,
-                                                         comm);
+    total_dsize = dsize * (ptrdiff_t)scount;
+    int (*funcs[])(const void*, int, MPI_Datatype, void*, int, MPI_Datatype, MPI_Comm) = {
+        &allgather__NTSLR_NB,
+        &allgather__bruck,
+        &allgather__rdb,
+        &allgather__ring,
+        &allgather__ompi_neighborexchange,
+        &allgather__pair
+    };
+    /** Algorithms:
+     *  {1, "linear"},
+     *  {2, "bruck"},
+     *  {3, "recursive_doubling"},
+     *  {4, "ring"},
+     *  {5, "neighbor"},
+     *  {6, "two_proc"}
+     *
+     * NOTE(review), documentation only:
+     *  - alg is a 1-based position in funcs[] (the call is funcs[alg-1]),
+     *    e.g. 1 is allgather__NTSLR_NB and 6 is allgather__pair.
+     *  - total_dsize is dsize * scount, with dsize taken from sdtype, or
+     *    from rdtype when sbuf is MPI_IN_PLACE; it is not multiplied by
+     *    the communicator size.
+     *  - every branch of the chain below assigns alg, so the initial
+     *    value 1 ("linear") is never the one dispatched.
+     *  - scount is used even in the MPI_IN_PLACE case; presumably the
+     *    caller has already normalised it - TODO confirm.
+     */
+    if (communicator_size == 2) {
+        alg = 6;
+    } else if (communicator_size < 32) {
+        alg = 3;
+    } else if (communicator_size < 64) {
+        if (total_dsize < 1024) {
+            alg = 3;
+        } else if (total_dsize < 65536) {
+            alg = 5;
+        } else {
+            alg = 4;
+        }
+    } else if (communicator_size < 128) {
+        if (total_dsize < 512) {
+            alg = 3;
+        } else if (total_dsize < 65536) {
+            alg = 5;
+        } else {
+            alg = 4;
+        }
+    } else if (communicator_size < 256) {
+        if (total_dsize < 512) {
+            alg = 3;
+        } else if (total_dsize < 131072) {
+            alg = 5;
+        } else if (total_dsize < 524288) {
+            alg = 4;
+        } else if (total_dsize < 1048576) {
+            alg = 5;
+        } else {
+            alg = 4;
+        }
+    } else if (communicator_size < 512) {
+        if (total_dsize < 32) {
+            alg = 3;
+        } else if (total_dsize < 128) {
+            alg = 2;
+        } else if (total_dsize < 1024) {
+            alg = 3;
+        } else if (total_dsize < 131072) {
+            alg = 5;
+        } else if (total_dsize < 524288) {
+            alg = 4;
+        } else if (total_dsize < 1048576) {
+            alg = 5;
+        } else {
+            alg = 4;
+        }
+    } else if (communicator_size < 1024) {
+        if (total_dsize < 64) {
+            alg = 3;
+        } else if (total_dsize < 256) {
+            alg = 2;
+        } else if (total_dsize < 2048) {
+            alg = 3;
+        } else {
+            alg = 5;
+        }
+    } else if (communicator_size < 2048) {
+        if (total_dsize < 4) {
+            alg = 3;
+        } else if (total_dsize < 8) {
+            alg = 2;
+        } else if (total_dsize < 16) {
+            alg = 3;
+        } else if (total_dsize < 32) {
+            alg = 2;
+        } else if (total_dsize < 256) {
+            alg = 3;
+        } else if (total_dsize < 512) {
+            alg = 2;
+        } else if (total_dsize < 4096) {
+            alg = 3;
+        } else {
+            alg = 5;
+        }
+    } else if (communicator_size < 4096) {
+        if (total_dsize < 32) {
+            alg = 2;
+        } else if (total_dsize < 128) {
+            alg = 3;
+        } else if (total_dsize < 512) {
+            alg = 2;
+        } else if (total_dsize < 4096) {
+            alg = 3;
+        } else {
+            alg = 5;
          }
      } else {
-        if (communicator_size % 2) {
-            return Coll_allgather_ring::allgather(sbuf, scount, sdtype,
-                                                        rbuf, rcount, rdtype,
-                                                        comm);
+        if (total_dsize < 2) {
+            alg = 3;
+        } else if (total_dsize < 8) {
+            alg = 2;
+        } else if (total_dsize < 16) {
+            alg = 3;
+        } else if (total_dsize < 512) {
+            alg = 2;
+        } else if (total_dsize < 4096) {
+            alg = 3;
          } else {
-            return  Coll_allgather_ompi_neighborexchange::allgather(sbuf, scount, sdtype,
-                                                                     rbuf, rcount, rdtype,
-                                                                     comm);
+            alg = 5;
          }
      }
  
-#if defined(USE_MPICH2_DECISION)
-    /* Decision as in MPICH-2
-       presented in Thakur et.al. "Optimization of Collective Communication
-       Operations in MPICH", International Journal of High Performance Computing
-       Applications, Vol. 19, No. 1, 49-66 (2005)
-       - for power-of-two processes and small and medium size messages
-       (up to 512KB) use recursive doubling
-       - for non-power-of-two processes and small messages (80KB) use bruck,
-       - for everything else use ring.
-    */
-    if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
-        return Coll_allgather_rdb::allgather(sbuf, scount, sdtype,
-                                                                 rbuf, rcount, rdtype,
-                                                                 comm);
-    } else if (total_dsize <= 81920) {
-        return Coll_allgather_bruck::allgather(sbuf, scount, sdtype,
-                                                     rbuf, rcount, rdtype,
-                                                     comm);
-    }
-    return Coll_allgather_ring::allgather(sbuf, scount, sdtype,
-                                                rbuf, rcount, rdtype,
-                                                comm);
-#endif  /* defined(USE_MPICH2_DECISION) */
+    return funcs[alg-1](sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
+
  }
  
-int Coll_allgatherv_ompi::allgatherv(const void *sbuf, int scount,
-                                               MPI_Datatype sdtype,
-                                               void* rbuf, const int *rcounts,
-                                               const int *rdispls,
-                                               MPI_Datatype rdtype,
-                                               MPI_Comm  comm
-                                               )
+int allgatherv__ompi(const void *sbuf, int scount,
+                     MPI_Datatype sdtype,
+                     void* rbuf, const int *rcounts,
+                     const int *rdispls,
+                     MPI_Datatype rdtype,
+                     MPI_Comm  comm
+                     )
  {
      int i;
      int communicator_size;
      size_t dsize, total_dsize;
-
+    int alg = 1;
      communicator_size = comm->size();
-
-    /* Special case for 2 processes */
-    if (communicator_size == 2) {
-        return Coll_allgatherv_pair::allgatherv(sbuf, scount, sdtype,
-                                                           rbuf, rcounts, rdispls, rdtype,
-                                                           comm);
+    if (MPI_IN_PLACE != sbuf) {
+        dsize = sdtype->size();
+    } else {
+        dsize = rdtype->size();
      }
  
-    /* Determine complete data size */
-    dsize=sdtype->size();
      total_dsize = 0;
      for (i = 0; i < communicator_size; i++) {
          total_dsize += dsize * rcounts[i];
      }
  
-    /* Decision based on allgather decision.   */
-    if (total_dsize < 50000) {
-        return Coll_allgatherv_ompi_bruck::allgatherv(sbuf, scount, sdtype,
-                                                      rbuf, rcounts, rdispls, rdtype,
-                                                      comm);
-
+    /* use the per-rank data size as basis, similar to allgather */
+    size_t per_rank_dsize = total_dsize / communicator_size;
+
+    int (*funcs[])(const void*, int, MPI_Datatype, void*, const int*, const int*, MPI_Datatype, MPI_Comm) = {
+        &allgatherv__GB,
+        &allgatherv__ompi_bruck,
+        &allgatherv__mpich_ring,
+        &allgatherv__ompi_neighborexchange,
+        &allgatherv__pair
+    };
+    /** Algorithms:
+     *  {1, "default"},
+     *  {2, "bruck"},
+     *  {3, "ring"},
+     *  {4, "neighbor"},
+     *  {5, "two_proc"},
+     *
+     * NOTE(review), documentation only:
+     *  - alg is a 1-based position in funcs[] (the call is funcs[alg-1]),
+     *    e.g. 1 is allgatherv__GB and 5 is allgatherv__pair.
+     *  - the thresholds below are compared against per_rank_dsize: the
+     *    sum of dsize * rcounts[i] over all ranks, integer-divided by
+     *    communicator_size.
+     *  - dsize comes from sdtype, or from rdtype when sbuf is
+     *    MPI_IN_PLACE.
+     */
+    if (communicator_size == 2) {
+        if (per_rank_dsize < 2048) {
+            alg = 3;
+        } else if (per_rank_dsize < 4096) {
+            alg = 5;
+        } else if (per_rank_dsize < 8192) {
+            alg = 3;
+        } else {
+            alg = 5;
+        }
+    } else if (communicator_size < 8) {
+        if (per_rank_dsize < 256) {
+            alg = 1;
+        } else if (per_rank_dsize < 4096) {
+            alg = 4;
+        } else if (per_rank_dsize < 8192) {
+            alg = 3;
+        } else if (per_rank_dsize < 16384) {
+            alg = 4;
+        } else if (per_rank_dsize < 262144) {
+            alg = 2;
+        } else {
+            alg = 4;
+        }
+    } else if (communicator_size < 16) {
+        if (per_rank_dsize < 1024) {
+            alg = 1;
+        } else {
+            alg = 2;
+        }
+    } else if (communicator_size < 32) {
+        if (per_rank_dsize < 128) {
+            alg = 1;
+        } else if (per_rank_dsize < 262144) {
+            alg = 2;
+        } else {
+            alg = 3;
+        }
+    } else if (communicator_size < 64) {
+        if (per_rank_dsize < 256) {
+            alg = 1;
+        } else if (per_rank_dsize < 8192) {
+            alg = 2;
+        } else {
+            alg = 3;
+        }
+    } else if (communicator_size < 128) {
+        if (per_rank_dsize < 256) {
+            alg = 1;
+        } else if (per_rank_dsize < 4096) {
+            alg = 2;
+        } else {
+            alg = 3;
+        }
+    } else if (communicator_size < 256) {
+        if (per_rank_dsize < 1024) {
+            alg = 2;
+        } else if (per_rank_dsize < 65536) {
+            alg = 4;
+        } else {
+            alg = 3;
+        }
+    } else if (communicator_size < 512) {
+        if (per_rank_dsize < 1024) {
+            alg = 2;
+        } else {
+            alg = 3;
+        }
+    } else if (communicator_size < 1024) {
+        if (per_rank_dsize < 512) {
+            alg = 2;
+        } else if (per_rank_dsize < 1024) {
+            alg = 1;
+        } else if (per_rank_dsize < 4096) {
+            alg = 2;
+        } else if (per_rank_dsize < 1048576) {
+            alg = 4;
+        } else {
+            alg = 3;
+        }
      } else {
-        if (communicator_size % 2) {
-            return Coll_allgatherv_ring::allgatherv(sbuf, scount, sdtype,
-                                                         rbuf, rcounts, rdispls, rdtype,
-                                                         comm);
+        if (per_rank_dsize < 4096) {
+            alg = 2;
          } else {
-            return  Coll_allgatherv_ompi_neighborexchange::allgatherv(sbuf, scount, sdtype,
-                                                                      rbuf, rcounts, rdispls, rdtype,
-                                                                      comm);
+            alg = 4;
          }
      }
+
+    return funcs[alg-1](sbuf, scount, sdtype, rbuf, rcounts, rdispls, rdtype, comm);
  }
  
-int Coll_gather_ompi::gather(const void *sbuf, int scount,
-                                           MPI_Datatype sdtype,
-                                           void* rbuf, int rcount,
-                                           MPI_Datatype rdtype,
-                                           int root,
-                                           MPI_Comm  comm
-                                           )
+int gather__ompi(const void *sbuf, int scount,
+                 MPI_Datatype sdtype,
+                 void* rbuf, int rcount,
+                 MPI_Datatype rdtype,
+                 int root,
+                 MPI_Comm  comm
+                 )
  {
-    //const int large_segment_size = 32768;
-    //const int small_segment_size = 1024;
-
-    //const size_t large_block_size = 92160;
-    const size_t intermediate_block_size = 6000;
-    const size_t small_block_size = 1024;
-
-    const int large_communicator_size = 60;
-    const int small_communicator_size = 10;
-
      int communicator_size, rank;
-    size_t dsize, block_size;
-
-    XBT_DEBUG("smpi_coll_tuned_gather_ompi");
-
+    size_t dsize, total_dsize;
+    int alg = 1;
      communicator_size = comm->size();
      rank = comm->rank();
  
-    // Determine block size
      if (rank == root) {
          dsize = rdtype->size();
-        block_size = dsize * rcount;
+        total_dsize = dsize * rcount;
      } else {
          dsize = sdtype->size();
-        block_size = dsize * scount;
+        total_dsize = dsize * scount;
      }
-
-/*    if (block_size > large_block_size) {*/
-/*        return smpi_coll_tuned_gather_ompi_linear_sync (sbuf, scount, sdtype, */
-/*                                                         rbuf, rcount, rdtype, */
-/*                                                         root, comm);*/
-
-/*    } else*/ if (block_size > intermediate_block_size) {
-        return Coll_gather_ompi_linear_sync::gather (sbuf, scount, sdtype,
-                                                         rbuf, rcount, rdtype,
-                                                         root, comm);
-
-    } else if ((communicator_size > large_communicator_size) ||
-               ((communicator_size > small_communicator_size) &&
-                (block_size < small_block_size))) {
-        return Coll_gather_ompi_binomial::gather (sbuf, scount, sdtype,
-                                                      rbuf, rcount, rdtype,
-                                                      root, comm);
-
+    int (*funcs[])(const void*, int, MPI_Datatype, void*, int, MPI_Datatype, int, MPI_Comm) = {
+        &gather__ompi_basic_linear,
+        &gather__ompi_binomial,
+        &gather__ompi_linear_sync
+    };
+    /** Algorithms:
+     *  {1, "basic_linear"},
+     *  {2, "binomial"},
+     *  {3, "linear_sync"},
+     *
+     * We do not make any rank specific checks since the params
+     * should be uniform across ranks.
+     *
+     * NOTE(review), documentation only:
+     *  - alg is a 1-based position in funcs[] (the call is funcs[alg-1]).
+     *  - total_dsize is computed above as rdtype->size() * rcount on the
+     *    root and sdtype->size() * scount on every other rank; it is not
+     *    multiplied by the communicator size.
+     *  - alg 3 ("linear_sync") is only selected for communicators of
+     *    fewer than 8 ranks.
+     */
+    if (communicator_size < 4) {
+        if (total_dsize < 2) {
+            alg = 3;
+        } else if (total_dsize < 4) {
+            alg = 1;
+        } else if (total_dsize < 32768) {
+            alg = 2;
+        } else if (total_dsize < 65536) {
+            alg = 1;
+        } else if (total_dsize < 131072) {
+            alg = 2;
+        } else {
+            alg = 3;
+        }
+    } else if (communicator_size < 8) {
+        if (total_dsize < 1024) {
+            alg = 2;
+        } else if (total_dsize < 8192) {
+            alg = 1;
+        } else if (total_dsize < 32768) {
+            alg = 2;
+        } else if (total_dsize < 262144) {
+            alg = 1;
+        } else {
+            alg = 3;
+        }
+    } else if (communicator_size < 256) {
+        alg = 2;
+    } else if (communicator_size < 512) {
+        if (total_dsize < 2048) {
+            alg = 2;
+        } else if (total_dsize < 8192) {
+            alg = 1;
+        } else {
+            alg = 2;
+        }
+    } else {
+        alg = 2;
      }
-    // Otherwise, use basic linear
-    return Coll_gather_ompi_basic_linear::gather (sbuf, scount, sdtype,
-                                                      rbuf, rcount, rdtype,
-                                                      root, comm);
+
+    return funcs[alg-1](sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm);
  }
  
  
-int Coll_scatter_ompi::scatter(const void *sbuf, int scount,
-                                            MPI_Datatype sdtype,
-                                            void* rbuf, int rcount,
-                                            MPI_Datatype rdtype,
-                                            int root, MPI_Comm  comm
-                                            )
+int scatter__ompi(const void *sbuf, int scount,
+                  MPI_Datatype sdtype,
+                  void* rbuf, int rcount,
+                  MPI_Datatype rdtype,
+                  int root, MPI_Comm  comm
+                  )
  {
-    const size_t small_block_size = 300;
-    const int small_comm_size = 10;
      int communicator_size, rank;
-    size_t dsize, block_size;
-
-    XBT_DEBUG("Coll_scatter_ompi::scatter");
+    size_t dsize, total_dsize;
+    int alg = 1;
  
      communicator_size = comm->size();
      rank = comm->rank();
-    // Determine block size
      if (root == rank) {
          dsize=sdtype->size();
-        block_size = dsize * scount;
+        total_dsize = dsize * scount;
      } else {
          dsize=rdtype->size();
-        block_size = dsize * rcount;
+        total_dsize = dsize * rcount;
      }
-
-    if ((communicator_size > small_comm_size) &&
-        (block_size < small_block_size)) {
-      std::unique_ptr<unsigned char[]> tmp_buf;
-      if (rank != root) {
-        tmp_buf.reset(new unsigned char[rcount * rdtype->get_extent()]);
-        sbuf   = tmp_buf.get();
-        scount = rcount;
-        sdtype = rdtype;
-      }
-      return Coll_scatter_ompi_binomial::scatter(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm);
+    int (*funcs[])(const void*, int, MPI_Datatype, void*, int, MPI_Datatype, int, MPI_Comm) = {
+        &scatter__ompi_basic_linear,
+        &scatter__ompi_binomial,
+        &scatter__ompi_linear_nb
+    };
+    /** Algorithms:
+     *  {1, "basic_linear"},
+     *  {2, "binomial"},
+     *  {3, "linear_nb"},
+     *
+     * We do not make any rank specific checks since the params
+     * should be uniform across ranks.
+     *
+     * NOTE(review), documentation only:
+     *  - alg is a 1-based position in funcs[] (the call is funcs[alg-1]).
+     *  - total_dsize is computed above as sdtype->size() * scount on the
+     *    root and rdtype->size() * rcount on every other rank.
+     *  - the removed code gave non-root ranks a temporary send buffer
+     *    (and rewrote scount/sdtype) before calling the binomial scatter;
+     *    this version forwards sbuf/scount/sdtype unchanged. Presumably
+     *    scatter__ompi_binomial no longer relies on that - TODO confirm.
+     */
+    if (communicator_size < 4) {
+        if (total_dsize < 2) {
+            alg = 3;
+        } else if (total_dsize < 131072) {
+            alg = 1;
+        } else if (total_dsize < 262144) {
+            alg = 3;
+        } else {
+            alg = 1;
+        }
+    } else if (communicator_size < 8) {
+        if (total_dsize < 2048) {
+            alg = 2;
+        } else if (total_dsize < 4096) {
+            alg = 1;
+        } else if (total_dsize < 8192) {
+            alg = 2;
+        } else if (total_dsize < 32768) {
+            alg = 1;
+        } else if (total_dsize < 1048576) {
+            alg = 3;
+        } else {
+            alg = 1;
+        }
+    } else if (communicator_size < 16) {
+        if (total_dsize < 16384) {
+            alg = 2;
+        } else if (total_dsize < 1048576) {
+            alg = 3;
+        } else {
+            alg = 1;
+        }
+    } else if (communicator_size < 32) {
+        if (total_dsize < 16384) {
+            alg = 2;
+        } else if (total_dsize < 32768) {
+            alg = 1;
+        } else {
+            alg = 3;
+        }
+    } else if (communicator_size < 64) {
+        if (total_dsize < 512) {
+            alg = 2;
+        } else if (total_dsize < 8192) {
+            alg = 3;
+        } else if (total_dsize < 16384) {
+            alg = 2;
+        } else {
+            alg = 3;
+        }
+    } else {
+        if (total_dsize < 512) {
+            alg = 2;
+        } else {
+            alg = 3;
+        }
      }
-    return Coll_scatter_ompi_basic_linear::scatter (sbuf, scount, sdtype,
-                                                       rbuf, rcount, rdtype,
-                                                       root, comm);
-}
  
+    return funcs[alg-1](sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm);
  }
-}
+
+} // namespace simgrid::smpi