+/*
+ * ompi_coll_base_reduce_scatter_intra_butterfly
+ *
+ * Function: Butterfly algorithm for reduce_scatter
+ * Accepts: Same as MPI_Reduce_scatter
+ * Returns: MPI_SUCCESS or error code
+ *
+ * Description: Implements butterfly algorithm for MPI_Reduce_scatter [*].
+ * The algorithm supports both commutative and non-commutative operations and
+ * works for power-of-two as well as non-power-of-two numbers of processes.
+ *
+ * [*] J. L. Traff. An Improved Algorithm for (Non-commutative) Reduce-Scatter
+ * with an Application. In Proc. of EuroPVM/MPI 2005, pp. 129-137.
+ *
+ * Time complexity: O(m\lambda + log(p)\alpha + m\beta + m\gamma),
+ * where m = sum of rcounts[], p = comm_size
+ * Memory requirements (per process): 2 * m * typesize + comm_size
+ *
+ * Example: comm_size=6, nprocs_pof2=4, nprocs_rem=2, rcounts[]=1, sbuf=[0,1,...,5]
+ * Step 1. Reduce the number of processes to 4
+ * rank 0: [0|1|2|3|4|5]: send to 1: vrank -1
+ * rank 1: [0|1|2|3|4|5]: recv from 0, op: vrank 0: [0|2|4|6|8|10]
+ * rank 2: [0|1|2|3|4|5]: send to 3: vrank -1
+ * rank 3: [0|1|2|3|4|5]: recv from 2, op: vrank 1: [0|2|4|6|8|10]
+ * rank 4: [0|1|2|3|4|5]: vrank 2: [0|1|2|3|4|5]
+ * rank 5: [0|1|2|3|4|5]: vrank 3: [0|1|2|3|4|5]
+ *
+ * Step 2. Butterfly. Buffer of 6 elements is divided into 4 blocks.
+ * Round 1 (mask=1, nblocks=2)
+ * 0: vrank -1
+ * 1: vrank 0 [0 2|4 6|8|10]: exch with 1: send [2,3], recv [0,1]: [0 4|8 12|*|*]
+ * 2: vrank -1
+ * 3: vrank 1 [0 2|4 6|8|10]: exch with 0: send [0,1], recv [2,3]: [**|**|16|20]
+ * 4: vrank 2 [0 1|2 3|4|5] : exch with 3: send [2,3], recv [0,1]: [0 2|4 6|*|*]
+ * 5: vrank 3 [0 1|2 3|4|5] : exch with 2: send [0,1], recv [2,3]: [**|**|8|10]
+ *
+ * Round 2 (mask=2, nblocks=1)
+ * 0: vrank -1
+ * 1: vrank 0 [0 4|8 12|*|*]: exch with 2: send [1], recv [0]: [0 6|**|*|*]
+ * 2: vrank -1
+ * 3: vrank 1 [**|**|16|20] : exch with 3: send [3], recv [2]: [**|**|24|*]
+ * 4: vrank 2 [0 2|4 6|*|*] : exch with 0: send [0], recv [1]: [**|12 18|*|*]
+ * 5: vrank 3 [**|**|8|10] : exch with 1: send [2], recv [3]: [**|**|*|30]
+ *
+ * Step 3. Exchange with the remote process according to a mirror permutation
+ * (a bit reversal of the vrank; see the sketch after this comment block):
+ * mperm(0)=0, mperm(1)=2, mperm(2)=1, mperm(3)=3
+ * 0: vrank -1: recv result "0" from process 1
+ * 1: vrank 0 [0 6|**|*|*]: send "0" to 0, copy "6" to rbuf (mperm(0)=0)
+ * 2: vrank -1: recv result "12" from process 4
+ * 3: vrank 1 [**|**|24|*]: send "24" to 4, recv result "18" from process 4
+ * 4: vrank 2 [**|12 18|*|*]: send "12" to 2, send "18" to 3, recv "24" from 3
+ * 5: vrank 3 [**|**|*|30]: copy "30" to rbuf (mperm(3)=3)
+ */
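+/*
+ * A minimal sketch of the mirror permutation used in Step 3: it reverses the
+ * log2(nprocs_pof2) low-order bits of the virtual rank. Illustrative only; the
+ * helper name below is hypothetical and not part of this patch.
+ *
+ *   static unsigned int mirror_perm(unsigned int x, int nbits)
+ *   {
+ *     unsigned int y = 0;
+ *     for (int i = 0; i < nbits; i++) {
+ *       y = (y << 1) | (x & 1);   // move the lowest remaining bit of x into y
+ *       x >>= 1;
+ *     }
+ *     return y;                   // nbits = 2: 0 -> 0, 1 -> 2, 2 -> 1, 3 -> 3
+ *   }
+ */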
+int reduce_scatter__ompi_butterfly(
+ const void *sbuf, void *rbuf, const int *rcounts, MPI_Datatype dtype,
+ MPI_Op op, MPI_Comm comm)
+{
+ char *tmpbuf[2] = {NULL, NULL}, *psend, *precv;
+ int *displs = NULL, index;
+ ptrdiff_t span, gap, totalcount, extent;
+ int err = MPI_SUCCESS;
+ int comm_size = comm->size();
+ int rank = comm->rank();
+ int vrank = -1;
+ int nprocs_rem = 0;
+
+ XBT_DEBUG("coll:base:reduce_scatter_intra_butterfly: rank %d/%d",
+ rank, comm_size);
+ if (comm_size < 2)
+ return MPI_SUCCESS;
+
+ displs = (int*)malloc(sizeof(*displs) * comm_size);
+ if (NULL == displs) {
+ err = MPI_ERR_OTHER;
+ goto cleanup_and_return;
+ }
+ displs[0] = 0;
+ for (int i = 1; i < comm_size; i++) {
+ displs[i] = displs[i - 1] + rcounts[i - 1];
+ }
+ totalcount = displs[comm_size - 1] + rcounts[comm_size - 1];
+ dtype->extent(&gap, &extent);
+ span = extent * totalcount;
+ tmpbuf[0] = (char*)malloc(span);
+ tmpbuf[1] = (char*)malloc(span);
+ if (NULL == tmpbuf[0] || NULL == tmpbuf[1]) {
+ err = MPI_ERR_OTHER;
+ goto cleanup_and_return;
+ }
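+ /* Shift the buffer pointers by the datatype's lower bound (gap) so that the
+  * true start of the data coincides with the allocated memory. */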
+ psend = tmpbuf[0] - gap;
+ precv = tmpbuf[1] - gap;
+
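+ /* Load the full input vector into the working send buffer; with MPI_IN_PLACE
+  * the caller's data is already in rbuf, so copy from there instead. */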
+ if (sbuf != MPI_IN_PLACE) {
+ err = Datatype::copy(sbuf, totalcount, dtype, psend, totalcount, dtype);
+ if (MPI_SUCCESS != err) { goto cleanup_and_return; }
+ } else {
+ err = Datatype::copy(rbuf, totalcount, dtype, psend, totalcount, dtype);
+ if (MPI_SUCCESS != err) { goto cleanup_and_return; }
+ }
+
+ /*
+ * Step 1. Reduce the number of processes to the nearest lower power of two
+ * p' = 2^{\floor{\log_2 p}} by removing r = p - p' processes.
+ * Among the first 2r processes (ranks 0 to 2r - 1), every even rank sends
+ * its input vector to its neighbor (rank + 1), and every odd rank receives
+ * that vector and performs a local reduction.
+ * The odd ranks (1 to 2r - 1) then hold the reduction of their own input
+ * vector with that of their even-ranked neighbor. These r odd processes
+ * and the last p - 2r processes are renumbered (assigned virtual ranks)
+ * from 0 to p' - 1. The even ranks among the first 2r processes do not
+ * participate in the rest of the algorithm.
+ */
+
+ /* Find nearest power-of-two less than or equal to comm_size */
+ int nprocs_pof2, size;
+ for( nprocs_pof2 = 1, size = comm_size; size > 0; size >>= 1, nprocs_pof2 <<= 1 );
+ nprocs_pof2 >>= 1;
+
+ nprocs_rem = comm_size - nprocs_pof2;
+ int log2_size;
+ for (log2_size = 0, size = 1; size < nprocs_pof2; ++log2_size, size <<= 1);
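+ /* For the running example from the header comment (comm_size = 6):
+  * nprocs_pof2 = 4, nprocs_rem = 2, log2_size = 2. */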
+
+ if (rank < 2 * nprocs_rem) {
+ if ((rank % 2) == 0) {
+ /* Even process */
+ Request::send(psend, totalcount, dtype, rank + 1,
+ COLL_TAG_REDUCE_SCATTER, comm);
+ /* This process does not participate in the rest of the algorithm */
+ vrank = -1;
+ } else {
+ /* Odd process */
+ Request::recv(precv, totalcount, dtype, rank - 1,
+ COLL_TAG_REDUCE_SCATTER, comm, MPI_STATUS_IGNORE);
+ /* Op::apply() takes the element count as an int, while totalcount is a
+  * ptrdiff_t, so pass a real int rather than reinterpreting the pointer */
+ int tcount = (int)totalcount;
+ op->apply(precv, psend, &tcount, dtype);
+ /* Adjust the rank: surviving odd ranks take virtual ranks 0 to r - 1 */
+ vrank = rank / 2;
+ }
+ } else {
+ /* Shift the rank down by nprocs_rem to account for the even ranks that dropped out */
+ vrank = rank - nprocs_rem;
+ }
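+ /* For the running example, the resulting virtual ranks are: ranks 0 and 2 ->
+  * vrank -1 (dropped out), rank 1 -> vrank 0, rank 3 -> vrank 1,
+  * rank 4 -> vrank 2, rank 5 -> vrank 3. */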