+ /*
+ * Step 1. Reduce the number of processes to the nearest lower power of two
+ * p' = 2^{\floor{\log_2 p}} by removing r = p - p' processes.
+ * Among the first 2r processes (ranks 0 to 2r - 1), every even rank sends
+ * its input vector to its neighbor (rank + 1), and every odd rank receives
+ * it and performs a local reduction.
+ * Each odd rank in 0..2r - 1 then holds the reduction of its own input
+ * vector with that of its even neighbor. The first r odd processes and
+ * the last p - 2r processes are renumbered from 0 to
+ * p' - 1 = 2^{\floor{\log_2 p}} - 1. Even ranks do not participate in the
+ * rest of the algorithm.
+ */
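+ /*
+ * Worked example (illustration only, not part of the algorithm): p = 13
+ * gives p' = 8 and r = 5. Ranks 0..9 pair up as 0->1, 2->3, 4->5, 6->7,
+ * 8->9; the odd ranks 1, 3, 5, 7, 9 become vranks 0..4 (vrank = rank / 2)
+ * and ranks 10, 11, 12 become vranks 5, 6, 7 (vrank = rank - r).
+ */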
+
+ /* Find nearest power-of-two less than or equal to comm_size */
+ int nprocs_pof2, size;
+ for (nprocs_pof2 = 1, size = comm_size; size > 0; size >>= 1, nprocs_pof2 <<= 1);
+ nprocs_pof2 >>= 1;
+
+ nprocs_rem = comm_size - nprocs_pof2;
+ int log2_size;
+ for (log2_size = 0, size = 1; size < nprocs_pof2; ++log2_size, size <<= 1);
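+ /*
+ * Sanity check on the example above: comm_size = 13 leaves the first loop
+ * with nprocs_pof2 = 16, shifted down to 8; nprocs_rem = 5; log2_size = 3.
+ */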
+
+ if (rank < 2 * nprocs_rem) {
+ if ((rank % 2) == 0) {
+ /* Even process */
+ Request::send(psend, totalcount, dtype, rank + 1,
+ COLL_TAG_REDUCE_SCATTER, comm);
+ /* This process does not participate in the rest of the algorithm */
+ vrank = -1;
+ } else {
+ /* Odd process */
+ Request::recv(precv, totalcount, dtype, rank - 1,
+ COLL_TAG_REDUCE_SCATTER, comm, MPI_STATUS_IGNORE);
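+ /* psend = precv <op> psend: accumulate the even neighbor's vector locally */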
+ op->apply(precv, psend, (int*)&totalcount, dtype);
+ /* Map this odd rank onto one of the first r virtual ranks */
+ vrank = rank / 2;
+ }
+ } else {
+ /* Shift the rank down by r to account for the even ranks that dropped out */
+ vrank = rank - nprocs_rem;
+ }
+
+ if (vrank != -1) {
+ /*
+ * Now the psend vector of size totalcount is divided into nprocs_pof2 blocks:
+ * block 0: rcounts[0] and rcounts[1] -- for processes 0 and 1
+ * block 1: rcounts[2] and rcounts[3] -- for processes 2 and 3
+ * ...
+ * block r-1: rcounts[2*(r-1)] and rcounts[2*(r-1)+1]
+ * block r: rcounts[2r]
+ * block r+1: rcounts[2r+1]
+ * ...
+ * block nprocs_pof2 - 1: rcounts[r + nprocs_pof2 - 1]
+ */
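+ /*
+ * For the p = 13 example (r = 5, nprocs_pof2 = 8): blocks 0..4 cover
+ * rcounts[0..9] (two entries each), block 5 is rcounts[10], block 6 is
+ * rcounts[11], and block 7 is rcounts[12].
+ */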
+ int nblocks = nprocs_pof2, send_index = 0, recv_index = 0;
+ for (int mask = 1; mask < nprocs_pof2; mask <<= 1) {
+ int vpeer = vrank ^ mask;
+ int peer = (vpeer < nprocs_rem) ? vpeer * 2 + 1 : vpeer + nprocs_rem;
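+ /*
+ * vpeer-to-rank mapping, e.g. with r = 5: vpeer 2 is real rank 5
+ * (an odd rank from step 1), while vpeer 6 is real rank 11.
+ */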
+
+ nblocks /= 2;
+ if ((vrank & mask) == 0) {
+ /* Send the upper half of the reduction buffer, recv the lower half */
+ send_index += nblocks;
+ } else {
+ /* Send the lower half of the reduction buffer, recv the upper half */
+ recv_index += nblocks;
+ }
+
+ /* Send blocks: [send_index, send_index + nblocks - 1] */
+ int send_count = ompi_sum_counts(rcounts, displs, nprocs_rem,
+ send_index, send_index + nblocks - 1);
+ index = (send_index < nprocs_rem) ? 2 * send_index : nprocs_rem + send_index;
+ ptrdiff_t sdispl = displs[index];
+
+ /* Recv blocks: [recv_index, recv_index + nblocks - 1] */
+ int recv_count = ompi_sum_counts(rcounts, displs, nprocs_rem,
+ recv_index, recv_index + nblocks - 1);
+ index = (recv_index < nprocs_rem) ? 2 * recv_index : nprocs_rem + recv_index;
+ ptrdiff_t rdispl = displs[index];
+
+ Request::sendrecv(psend + (ptrdiff_t)sdispl * extent, send_count,
+ dtype, peer, COLL_TAG_REDUCE_SCATTER,
+ precv + (ptrdiff_t)rdispl * extent, recv_count,
+ dtype, peer, COLL_TAG_REDUCE_SCATTER,
+ comm, MPI_STATUS_IGNORE);
+
+ if (vrank < vpeer) {
+ /* precv = psend <op> precv */
+ op->apply(psend + (ptrdiff_t)rdispl * extent,
+ precv + (ptrdiff_t)rdispl * extent, &recv_count, dtype);
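+ /* Swap psend and precv: the freshly reduced half now lives in precv */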
+ char *p = psend;
+ psend = precv;
+ precv = p;
+ } else {
+ /* psend = precv <op> psend */
+ op->apply(precv + (ptrdiff_t)rdispl * extent,
+ psend + (ptrdiff_t)rdispl * extent, &recv_count, dtype);
+ }
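+ /* Continue with the half this process kept: it starts at recv_index */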
+ send_index = recv_index;
+ }
+ /*
+ * psend now holds the reduced result for block [send_index].
+ * Exchange results with the remote process given by the mirror permutation.
+ */
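+ /*
+ * ompi_mirror_perm(vrank, log2_size) reverses the lowest log2_size bits of
+ * vrank (presumably, as in the Open MPI helper this is ported from): e.g.
+ * for log2_size = 3, vrank 1 (001b) maps to 4 (100b) and 3 (011b) to 6 (110b).
+ */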
+ int vpeer = ompi_mirror_perm(vrank, log2_size);
+ int peer = (vpeer < nprocs_rem) ? vpeer * 2 + 1 : vpeer + nprocs_rem;
+ index = (send_index < nprocs_rem) ? 2 * send_index : nprocs_rem + send_index;
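+ /*
+ * Continuing the p = 13 example: vrank 1 has vpeer 4, i.e. real rank 9;
+ * since vpeer < r, this process holds the block of excluded even rank 8
+ * (rcounts[8]) in addition to rank 9's own block (rcounts[9]).
+ */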
+
+ if (vpeer < nprocs_rem) {
+ /*
+ * This process holds two blocks: one for the excluded (even) process
+ * and its own. Send the first block to the excluded process.
+ */
+ Request::send(psend + (ptrdiff_t)displs[index] * extent,
+ rcounts[index], dtype, peer - 1,
+ COLL_TAG_REDUCE_SCATTER,
+ comm);
+ }
+
+ /* If process has two blocks, then send the second block (own block) */
+ if (vpeer < nprocs_rem)
+ index++;
+ if (vpeer != vrank) {
+ Request::sendrecv(psend + (ptrdiff_t)displs[index] * extent,
+ rcounts[index], dtype, peer,
+ COLL_TAG_REDUCE_SCATTER,
+ rbuf, rcounts[rank], dtype, peer,
+ COLL_TAG_REDUCE_SCATTER,
+ comm, MPI_STATUS_IGNORE);
+ } else {
+ err = Datatype::copy(psend + (ptrdiff_t)displs[rank] * extent, rcounts[rank], dtype,
+ rbuf, rcounts[rank], dtype);
+ if (MPI_SUCCESS != err) { goto cleanup_and_return; }
+ }
+
+ } else {
+ /* Excluded process: receive result */
+ int vpeer = ompi_mirror_perm((rank + 1) / 2, log2_size);
+ int peer = (vpeer < nprocs_rem) ? vpeer * 2 + 1 : vpeer + nprocs_rem;
+ Request::recv(rbuf, rcounts[rank], dtype, peer,
+ COLL_TAG_REDUCE_SCATTER, comm,
+ MPI_STATUS_IGNORE);
+ }
+
+cleanup_and_return:
+ if (displs)
+ free(displs);
+ if (tmpbuf[0])
+ free(tmpbuf[0]);
+ if (tmpbuf[1])
+ free(tmpbuf[1]);
+ return err;
+}
+} // namespace simgrid::smpi