/* selector for collective algorithms based on mpich decision logic */

/* Copyright (c) 2009-2010, 2013-2017. The SimGrid Team.
 * All rights reserved. */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

#include "colls_private.hpp"
/* This is the default implementation of allreduce. The algorithm is:

   Algorithm: MPI_Allreduce

   For the heterogeneous case, we call MPI_Reduce followed by MPI_Bcast
   in order to meet the requirement that all processes must have the
   same result. For the homogeneous case, we use the following algorithms.

   For long messages and for builtin ops and if count >= pof2 (where
   pof2 is the nearest power-of-two less than or equal to the number
   of processes), we use Rabenseifner's algorithm (see
   http://www.hlrs.de/mpi/myreduce.html).
   This algorithm implements the allreduce in two steps: first a
   reduce-scatter, followed by an allgather. A recursive-halving
   algorithm (beginning with processes that are distance 1 apart) is
   used for the reduce-scatter, and a recursive doubling
   algorithm is used for the allgather. The non-power-of-two case is
   handled by dropping to the nearest lower power-of-two: the first
   few even-numbered processes send their data to their right neighbors
   (rank+1), and the reduce-scatter and allgather happen among the remaining
   power-of-two processes. At the end, the first few even-numbered
   processes get the result from their right neighbors.

   For the power-of-two case, the cost for the reduce-scatter is
   lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma. The cost for the
   allgather is lgp.alpha + n.((p-1)/p).beta. Therefore, the
   total cost is:
   Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta + n.((p-1)/p).gamma

   For the non-power-of-two case,
   Cost = (2.floor(lgp)+2).alpha + (2.((p-1)/p) + 2).n.beta + n.(1+(p-1)/p).gamma

   For short messages, for user-defined ops, and for count < pof2,
   we use a recursive doubling algorithm (similar to the one in
   MPI_Allgather). We use this algorithm in the case of user-defined ops
   because in this case derived datatypes are allowed, and the user
   could pass basic datatypes on one process and derived on another as
   long as the type maps are the same. Breaking up derived datatypes
   to do the reduce-scatter is tricky.

   Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma

   Possible improvements:

   End Algorithm: MPI_Allreduce
*/

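/* A minimal cost-model sketch (not used by the selector): it evaluates the two formulas
 * quoted above for the power-of-two case, with alpha the per-message latency, beta the
 * per-byte transfer cost, gamma the per-byte reduction cost, n the message size, p the
 * number of processes and lgp = log2(p). The helper names are ours, not MPICH's. */
static inline double allreduce_cost_rabenseifner(double alpha, double beta, double gamma,
                                                 double n, double p, double lgp)
{
  /* Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta + n.((p-1)/p).gamma */
  return 2 * lgp * alpha + 2 * n * ((p - 1) / p) * beta + n * ((p - 1) / p) * gamma;
}
static inline double allreduce_cost_recursive_doubling(double alpha, double beta, double gamma,
                                                       double n, double lgp)
{
  /* Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma */
  return lgp * alpha + n * lgp * beta + n * lgp * gamma;
}
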
int Coll_allreduce_mpich::allreduce(void *sbuf, void *rbuf, int count,
                                    MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
{
    size_t dsize, block_dsize;
    int comm_size = comm->size();
    const size_t large_message = 2048; //MPIR_PARAM_ALLREDUCE_SHORT_MSG_SIZE

    dsize = dtype->size();
    block_dsize = dsize * count;

    /* find nearest power-of-two less than or equal to comm_size */
    int pof2 = 1;
    while (pof2 <= comm_size) pof2 <<= 1;
    pof2 >>= 1;

    if (block_dsize > large_message && count >= pof2 && (op == MPI_OP_NULL || op->is_commutative())) {
      // for long messages, builtin (commutative) ops and count >= pof2: Rabenseifner
      return (Coll_allreduce_rab_rdb::allreduce(sbuf, rbuf, count, dtype, op, comm));
    }
    // for short messages, user-defined ops and count < pof2: recursive doubling
    return (Coll_allreduce_rdb::allreduce(sbuf, rbuf, count, dtype, op, comm));
}

/* This is the default implementation of alltoall. The algorithm is:

   Algorithm: MPI_Alltoall

   We use four algorithms for alltoall. For short messages and
   (comm_size >= 8), we use the algorithm by Jehoshua Bruck et al,
   IEEE TPDS, Nov. 1997. It is a store-and-forward algorithm that
   takes lgp steps. Because of the extra communication, the bandwidth
   requirement is (n/2).lgp.beta.

   Cost = lgp.alpha + (n/2).lgp.beta

   where n is the total amount of data a process needs to send to all
   other processes.

   For medium-size messages and (short messages for comm_size < 8), we
   use an algorithm that posts all irecvs and isends and then does a
   waitall. We scatter the order of sources and destinations among the
   processes, so that all processes don't try to send/recv to/from the
   same process at the same time.

   *** Modification: We post only a small number of isends and irecvs
   at a time and wait on them as suggested by Tony Ladd. ***
   *** See comments below about an additional modification that
   we may want to consider ***

   For long messages and power-of-two number of processes, we use a
   pairwise exchange algorithm, which takes p-1 steps. We
   calculate the pairs by using an exclusive-or algorithm:
   for (i=1; i<comm_size; i++)
       dest = rank ^ i;
   This algorithm doesn't work if the number of processes is not a power of
   two. For a non-power-of-two number of processes, we use an
   algorithm in which, in step i, each process receives from (rank-i)
   and sends to (rank+i).

   Cost = (p-1).alpha + n.beta

   where n is the total amount of data a process needs to send to all
   other processes.

   Possible improvements:

   End Algorithm: MPI_Alltoall
*/

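/* A minimal sketch (not used by the selector) of the peer selection described above for
 * the long-message cases: XOR pairing when the number of processes is a power of two,
 * otherwise a shifted ring where step i sends to (rank+i) and receives from (rank-i).
 * The helper name and signature are ours, for illustration only. */
static inline void alltoall_step_peers(int rank, int comm_size, int step, int* src, int* dst)
{
  if ((comm_size & (comm_size - 1)) == 0) { /* power of two: pairwise exchange */
    *src = *dst = rank ^ step;
  } else {                                  /* general case: shifted ring */
    *dst = (rank + step) % comm_size;
    *src = (rank - step + comm_size) % comm_size;
  }
}
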
int Coll_alltoall_mpich::alltoall(void *sbuf, int scount,
                                  MPI_Datatype sdtype,
                                  void* rbuf, int rcount,
                                  MPI_Datatype rdtype,
                                  MPI_Comm comm)
{
    int communicator_size;
    size_t dsize, block_dsize;
    communicator_size = comm->size();

    unsigned int short_size  = 256;
    unsigned int medium_size = 32768;
    // short messages and comm_size >= 8 -> bruck

    // medium-size messages and (short messages for comm_size < 8):
    // an algorithm that posts all irecvs and isends and then does a waitall

    // long messages and power-of-two number of processes:
    // pairwise exchange algorithm

    // non-power-of-two number of processes: an algorithm in which, in step i,
    // each process receives from (rank-i) and sends to (rank+i)

    dsize = sdtype->size();
    block_dsize = dsize * scount;

    if ((block_dsize < short_size) && (communicator_size >= 8)) {
        return Coll_alltoall_bruck::alltoall(sbuf, scount, sdtype,
                                             rbuf, rcount, rdtype,
                                             comm);
    } else if (block_dsize < medium_size) {
        return Coll_alltoall_basic_linear::alltoall(sbuf, scount, sdtype,
                                                    rbuf, rcount, rdtype,
                                                    comm);
    } else if (communicator_size % 2) {
        return Coll_alltoall_ring::alltoall(sbuf, scount, sdtype,
                                            rbuf, rcount, rdtype,
                                            comm);
    }
    return Coll_alltoall_ring::alltoall(sbuf, scount, sdtype,
                                        rbuf, rcount, rdtype,
                                        comm);
}

int Coll_alltoallv_mpich::alltoallv(void *sbuf, int *scounts, int *sdisps, MPI_Datatype sdtype,
                                    void *rbuf, int *rcounts, int *rdisps, MPI_Datatype rdtype,
                                    MPI_Comm comm)
{
    /* For starters, just keep the original algorithm. */
    return Coll_alltoallv_bruck::alltoallv(sbuf, scounts, sdisps, sdtype,
                                           rbuf, rcounts, rdisps, rdtype,
                                           comm);
}

int Coll_barrier_mpich::barrier(MPI_Comm comm)
{
    return Coll_barrier_ompi_bruck::barrier(comm);
}

/* This is the default implementation of broadcast. The algorithm is:

   Algorithm: MPI_Bcast

   For short messages, we use a binomial tree algorithm.
   Cost = lgp.alpha + n.lgp.beta

   For long messages, we do a scatter followed by an allgather.
   We first scatter the buffer using a binomial tree algorithm. This costs
   lgp.alpha + n.((p-1)/p).beta
   If the datatype is contiguous and the communicator is homogeneous,
   we treat the data as bytes and divide (scatter) it among processes
   by using ceiling division. For the noncontiguous or heterogeneous
   cases, we first pack the data into a temporary buffer by using
   MPI_Pack, scatter it as bytes, and unpack it after the allgather.

   For the allgather, we use a recursive doubling algorithm for
   medium-size messages and power-of-two number of processes. This
   takes lgp steps. In each step pairs of processes exchange all the
   data they have (we take care of non-power-of-two situations). This
   costs approximately lgp.alpha + n.((p-1)/p).beta. (Approximately
   because it may be slightly more in the non-power-of-two case, but
   it's still a logarithmic algorithm.) Therefore, for long messages
   Total Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta

   Note that this algorithm has twice the latency of the tree algorithm
   we use for short messages, but requires lower bandwidth: 2.n.beta
   versus n.lgp.beta. Therefore, for long messages and when lgp > 2,
   this algorithm will perform better.

   For long messages and for medium-size messages and non-power-of-two
   processes, we use a ring algorithm for the allgather, which
   takes p-1 steps, because it performs better than recursive doubling.
   Total Cost = (lgp+p-1).alpha + 2.n.((p-1)/p).beta

   Possible improvements:
   For clusters of SMPs, we may want to do something differently to
   take advantage of shared memory on each node.

   End Algorithm: MPI_Bcast
*/

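/* A minimal sketch (not used by the selector) of the "ceiling division" mentioned above:
 * for the long-message broadcast, the contiguous buffer is scattered as bytes, each
 * process owning a chunk of ceiling(nbytes/p) bytes (the last chunks may be shorter or
 * empty). The helper name is ours, for illustration only. */
static inline void bcast_scatter_chunk(size_t nbytes, int comm_size, int relative_rank,
                                       size_t* offset, size_t* chunk)
{
  size_t scatter_size = (nbytes + comm_size - 1) / comm_size; /* ceiling(nbytes / p) */
  *offset = scatter_size * relative_rank;
  if (*offset >= nbytes)
    *chunk = 0;
  else
    *chunk = (nbytes - *offset < scatter_size) ? (nbytes - *offset) : scatter_size;
}
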
int Coll_bcast_mpich::bcast(void *buff, int count,
                            MPI_Datatype datatype, int root,
                            MPI_Comm comm)
{
    /* Decision function based on MX results for
       messages up to 36MB and communicator sizes up to 64 nodes */
    const size_t small_message_size = 12288;
    const size_t intermediate_message_size = 524288;

    int communicator_size;
    size_t message_size, dsize;

    communicator_size = comm->size();

    /* we need the data size for the decision function */
    dsize = datatype->size();
    message_size = dsize * (unsigned long)count; /* needed for decision */

    /* Handle messages of small and intermediate size, and
       single-element broadcasts */
    if ((message_size < small_message_size) || (communicator_size <= 8)) {
        /* Binomial tree without segmentation */
        return Coll_bcast_binomial_tree::bcast(buff, count, datatype,
                                               root, comm);
    } else if (message_size < intermediate_message_size && !(communicator_size % 2)) {
        // Scatter followed by recursive-doubling allgather
        return Coll_bcast_scatter_rdb_allgather::bcast(buff, count, datatype,
                                                       root, comm);
    }
    // Handle large message sizes: scatter followed by ring allgather
    return Coll_bcast_scatter_LR_allgather::bcast(buff, count, datatype,
                                                  root, comm);
}

/* This is the default implementation of reduce. The algorithm is:

   Algorithm: MPI_Reduce

   For long messages and for builtin ops and if count >= pof2 (where
   pof2 is the nearest power-of-two less than or equal to the number
   of processes), we use Rabenseifner's algorithm (see
   http://www.hlrs.de/organization/par/services/models/mpi/myreduce.html).
   This algorithm implements the reduce in two steps: first a
   reduce-scatter, followed by a gather to the root. A
   recursive-halving algorithm (beginning with processes that are
   distance 1 apart) is used for the reduce-scatter, and a binomial tree
   algorithm is used for the gather. The non-power-of-two case is
   handled by dropping to the nearest lower power-of-two: the first
   few odd-numbered processes send their data to their left neighbors
   (rank-1), and the reduce-scatter happens among the remaining
   power-of-two processes. If the root is one of the excluded
   processes, then after the reduce-scatter, rank 0 sends its result to
   the root and exits; the root now acts as rank 0 in the binomial tree
   algorithm for gather.

   For the power-of-two case, the cost for the reduce-scatter is
   lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma. The cost for the
   gather to root is lgp.alpha + n.((p-1)/p).beta. Therefore, the
   total cost is:
   Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta + n.((p-1)/p).gamma

   For the non-power-of-two case, assuming the root is not one of the
   odd-numbered processes that get excluded in the reduce-scatter,
   Cost = (2.floor(lgp)+1).alpha + (2.((p-1)/p) + 1).n.beta +
          n.(1+(p-1)/p).gamma

   For short messages, for user-defined ops, and for count < pof2,
   we use a binomial tree algorithm.

   Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma

   We use the binomial tree algorithm in the case of user-defined ops
   because in this case derived datatypes are allowed, and the user
   could pass basic datatypes on one process and derived on another as
   long as the type maps are the same. Breaking up derived datatypes
   to do the reduce-scatter is tricky.

   FIXME: Per the MPI-2.1 standard this case is not possible. We
   should be able to use the reduce-scatter/gather approach as long as
   count >= pof2. [goodell@ 2009-01-21]

   Possible improvements:

   End Algorithm: MPI_Reduce
*/

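/* A minimal sketch (not used by the selector) of the rank folding described above for the
 * non-power-of-two case: with rem = p - pof2, the first 2*rem ranks are paired, the odd
 * ones send their data to (rank-1) and sit out the reduce-scatter, and the survivors get
 * a contiguous "new rank" in [0, pof2). The helper name and the exact renumbering are
 * ours, for illustration only. */
static inline int reduce_folded_rank(int rank, int comm_size, int pof2)
{
  int rem = comm_size - pof2;
  if (rank < 2 * rem)
    return (rank % 2 == 1) ? -1 : rank / 2; /* odd ranks drop out after sending to rank-1 */
  return rank - rem;                        /* remaining ranks shift down */
}
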
int Coll_reduce_mpich::reduce(void *sendbuf, void *recvbuf,
                              int count, MPI_Datatype datatype,
                              MPI_Op op, int root, MPI_Comm comm)
{
    int communicator_size = 0;
    size_t message_size, dsize;

    communicator_size = comm->size();

    /* need data size for decision function */
    dsize = datatype->size();
    message_size = dsize * count; /* needed for decision */

    /* find nearest power-of-two less than or equal to communicator_size */
    int pof2 = 1;
    while (pof2 <= communicator_size) pof2 <<= 1;
    pof2 >>= 1;

    if ((count < pof2) || (message_size < 2048) || (op != MPI_OP_NULL && not op->is_commutative())) {
        return Coll_reduce_binomial::reduce(sendbuf, recvbuf, count, datatype, op, root, comm);
    }
    return Coll_reduce_scatter_gather::reduce(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
                                              segsize, max_requests*/);
}

/* This is the default implementation of reduce_scatter. The algorithm is:

   Algorithm: MPI_Reduce_scatter

   If the operation is commutative, for short and medium-size
   messages, we use a recursive-halving
   algorithm in which the first p/2 processes send the second n/2 data
   to their counterparts in the other half and receive the first n/2
   data from them. This procedure continues recursively, halving the
   data communicated at each step, for a total of lgp steps. If the
   number of processes is not a power-of-two, we convert it to the
   nearest lower power-of-two by having the first few even-numbered
   processes send their data to the neighboring odd-numbered process
   at (rank+1). Those odd-numbered processes compute the result for
   their left neighbor as well in the recursive halving algorithm, and
   then at the end send the result back to the processes that didn't
   participate.

   Therefore, if p is a power-of-two,
   Cost = lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma
   If p is not a power-of-two,
   Cost = (floor(lgp)+2).alpha + n.(1+(p-1+n)/p).beta + n.(1+(p-1)/p).gamma
   The above cost in the non-power-of-two case is approximate because
   there is some imbalance in the amount of work each process does
   because some processes do the work of their neighbors as well.

   For commutative operations and very long messages we use
   a pairwise exchange algorithm similar to
   the one used in MPI_Alltoall. At step i, each process sends n/p
   amount of data to (rank+i) and receives n/p amount of data from
   (rank-i).

   Cost = (p-1).alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma

   If the operation is not commutative, we do the following:

   We use a recursive doubling algorithm, which
   takes lgp steps. At step 1, processes exchange (n-n/p) amount of
   data; at step 2, (n-2n/p) amount of data; at step 3, (n-4n/p)
   amount of data, and so forth.

   Cost = lgp.alpha + n.(lgp-(p-1)/p).beta + n.(lgp-(p-1)/p).gamma

   Possible improvements:

   End Algorithm: MPI_Reduce_scatter
*/

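/* A minimal sketch (not used by the selector) of one recursive-halving step described
 * above for commutative ops: at step k the peer sits at distance pof2/2^(k+1) and the
 * amount of data still being exchanged is n/2^(k+1). The helper name is ours. */
static inline void reduce_scatter_halving_step(int rank, int pof2, int step, int total_count,
                                               int* peer, int* exchanged_count)
{
  int distance     = pof2 >> (step + 1);        /* pof2/2, pof2/4, ..., 1 */
  *peer            = rank ^ distance;           /* counterpart in the other half of the current group */
  *exchanged_count = total_count >> (step + 1); /* n/2, n/4, ... elements still to exchange */
}
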
int Coll_reduce_scatter_mpich::reduce_scatter(void *sbuf, void *rbuf, int *rcounts,
                                              MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
{
    int comm_size, i;
    size_t total_message_size;

    if (sbuf == rbuf) sbuf = MPI_IN_PLACE; // restore MPI_IN_PLACE as these algorithms handle it
    XBT_DEBUG("Coll_reduce_scatter_mpich::reduce_scatter");

    comm_size = comm->size();
    // We need data size for decision function
    total_message_size = 0;
    for (i = 0; i < comm_size; i++) {
        total_message_size += rcounts[i];
    }

    if ((op == MPI_OP_NULL || op->is_commutative()) && total_message_size > 524288) {
        return Coll_reduce_scatter_mpich_pair::reduce_scatter(sbuf, rbuf, rcounts,
                                                              dtype, op, comm);
    } else if (op != MPI_OP_NULL && not op->is_commutative()) {
        int is_block_regular = 1;
        for (i = 0; i < (comm_size - 1); ++i) {
            if (rcounts[i] != rcounts[i + 1]) {
                is_block_regular = 0;
                break;
            }
        }

        /* slightly retask pof2 to mean pof2 equal or greater, not always greater as it is above */
        int pof2 = 1;
        while (pof2 < comm_size)
            pof2 <<= 1;

        if (pof2 == comm_size && is_block_regular) {
            /* noncommutative, pof2 size, and block regular */
            return Coll_reduce_scatter_mpich_noncomm::reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
        }
        return Coll_reduce_scatter_mpich_rdb::reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
    }
    return Coll_reduce_scatter_mpich_rdb::reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
}

/* This is the default implementation of allgather. The algorithm is:

   Algorithm: MPI_Allgather

   For short messages and non-power-of-two no. of processes, we use
   the algorithm from the Jehoshua Bruck et al IEEE TPDS Nov 97
   paper. It is a variant of the dissemination algorithm for
   barrier. It takes ceiling(lg p) steps.

   Cost = lgp.alpha + n.((p-1)/p).beta
   where n is total size of data gathered on each process.

   For short or medium-size messages and power-of-two no. of
   processes, we use the recursive doubling algorithm.

   Cost = lgp.alpha + n.((p-1)/p).beta

   TODO: On TCP, we may want to use recursive doubling instead of the Bruck
   algorithm in all cases because of the pairwise-exchange property of
   recursive doubling (see Benson et al paper in Euro PVM/MPI
   2003).

   It is interesting to note that either of the above algorithms for
   MPI_Allgather has the same cost as the tree algorithm for MPI_Gather!

   For long messages or medium-size messages and non-power-of-two
   no. of processes, we use a ring algorithm. In the first step, each
   process i sends its contribution to process i+1 and receives
   the contribution from process i-1 (with wrap-around). From the
   second step onwards, each process i forwards to process i+1 the
   data it received from process i-1 in the previous step. This takes
   a total of p-1 steps.

   Cost = (p-1).alpha + n.((p-1)/p).beta

   We use this algorithm instead of recursive doubling for long
   messages because we find that this communication pattern (nearest
   neighbor) performs twice as fast as recursive doubling for long
   messages (on Myrinet and IBM SP).

   Possible improvements:

   End Algorithm: MPI_Allgather
*/

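/* A minimal sketch (not used by the selector) of the ring schedule described above: every
 * process always sends to its right neighbor and receives from its left neighbor, and at
 * step k it forwards the block that originated at rank-k. The helper name is ours. */
static inline void allgather_ring_step(int rank, int comm_size, int step,
                                       int* sendto, int* recvfrom, int* forwarded_block)
{
  *sendto          = (rank + 1) % comm_size;
  *recvfrom        = (rank - 1 + comm_size) % comm_size;
  *forwarded_block = (rank - step + comm_size) % comm_size; /* own block at step 0, then what arrived last step */
}
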
int Coll_allgather_mpich::allgather(void *sbuf, int scount,
                                    MPI_Datatype sdtype,
                                    void* rbuf, int rcount,
                                    MPI_Datatype rdtype,
                                    MPI_Comm comm)
{
    int communicator_size, pow2_size;
    size_t dsize, total_dsize;

    communicator_size = comm->size();

    /* Determine complete data size */
    dsize = sdtype->size();
    total_dsize = dsize * scount * communicator_size;

    for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<= 1);

    /* Decision as in MPICH-2,
       presented in Thakur et al., "Optimization of Collective Communication
       Operations in MPICH", International Journal of High Performance Computing
       Applications, Vol. 19, No. 1, 49-66 (2005):
       - for power-of-two processes and small and medium-size messages
         (up to 512KB) use recursive doubling,
       - for non-power-of-two processes and small messages (up to 80KB) use bruck,
       - for everything else use ring. */
    if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
        return Coll_allgather_rdb::allgather(sbuf, scount, sdtype,
                                             rbuf, rcount, rdtype,
                                             comm);
    } else if (total_dsize <= 81920) {
        return Coll_allgather_bruck::allgather(sbuf, scount, sdtype,
                                               rbuf, rcount, rdtype,
                                               comm);
    }
    return Coll_allgather_ring::allgather(sbuf, scount, sdtype,
                                          rbuf, rcount, rdtype,
                                          comm);
}

/* This is the default implementation of allgatherv. The algorithm is:

   Algorithm: MPI_Allgatherv

   For short messages and non-power-of-two no. of processes, we use
   the algorithm from the Jehoshua Bruck et al IEEE TPDS Nov 97
   paper. It is a variant of the dissemination algorithm for
   barrier. It takes ceiling(lg p) steps.

   Cost = lgp.alpha + n.((p-1)/p).beta
   where n is total size of data gathered on each process.

   For short or medium-size messages and power-of-two no. of
   processes, we use the recursive doubling algorithm.

   Cost = lgp.alpha + n.((p-1)/p).beta

   TODO: On TCP, we may want to use recursive doubling instead of the Bruck
   algorithm in all cases because of the pairwise-exchange property of
   recursive doubling (see Benson et al paper in Euro PVM/MPI
   2003).

   For long messages or medium-size messages and non-power-of-two
   no. of processes, we use a ring algorithm. In the first step, each
   process i sends its contribution to process i+1 and receives
   the contribution from process i-1 (with wrap-around). From the
   second step onwards, each process i forwards to process i+1 the
   data it received from process i-1 in the previous step. This takes
   a total of p-1 steps.

   Cost = (p-1).alpha + n.((p-1)/p).beta

   Possible improvements:

   End Algorithm: MPI_Allgatherv
*/

int Coll_allgatherv_mpich::allgatherv(void *sbuf, int scount,
                                      MPI_Datatype sdtype,
                                      void* rbuf, int *rcounts,
                                      int *rdispls, MPI_Datatype rdtype,
                                      MPI_Comm comm)
{
    int communicator_size, pow2_size, i;
    size_t total_dsize = 0;

    communicator_size = comm->size();

    /* Determine complete data size */
    for (i = 0; i < communicator_size; i++)
        total_dsize += rcounts[i];
    if (total_dsize == 0)
        return MPI_SUCCESS;

    for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<= 1);

    if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
        return Coll_allgatherv_mpich_rdb::allgatherv(sbuf, scount, sdtype,
                                                     rbuf, rcounts, rdispls, rdtype,
                                                     comm);
    } else if (total_dsize <= 81920) {
        return Coll_allgatherv_ompi_bruck::allgatherv(sbuf, scount, sdtype,
                                                      rbuf, rcounts, rdispls, rdtype,
                                                      comm);
    }
    return Coll_allgatherv_mpich_ring::allgatherv(sbuf, scount, sdtype,
                                                  rbuf, rcounts, rdispls, rdtype,
                                                  comm);
}

/* This is the default implementation of gather. The algorithm is:

   Algorithm: MPI_Gather

   We use a binomial tree algorithm for both short and long
   messages. At nodes other than leaf nodes we need to allocate a
   temporary buffer to store the incoming message. If the root is not
   rank 0, for very small messages, we pack it into a temporary
   contiguous buffer and reorder it to be placed in the right
   order. For small (but not very small) messages, we use a derived
   datatype to unpack the incoming data into non-contiguous buffers in
   the right order. In the heterogeneous case we first pack the
   buffers by using MPI_Pack and then do the gather.

   Cost = lgp.alpha + n.((p-1)/p).beta
   where n is the total size of the data gathered at the root.

   Possible improvements:

   End Algorithm: MPI_Gather
*/

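/* A minimal sketch (not used by the selector) of the binomial tree described above,
 * expressed in relative ranks so that an arbitrary root plays the role of rank 0: a
 * node's parent is its relative rank with the lowest set bit cleared. The helper name
 * is ours, for illustration only. */
static inline int gather_binomial_parent(int rank, int root, int comm_size)
{
  int relative_rank = (rank - root + comm_size) % comm_size;
  if (relative_rank == 0)
    return -1;                                               /* the root has no parent */
  int relative_parent = relative_rank & (relative_rank - 1); /* clear the lowest set bit */
  return (relative_parent + root) % comm_size;
}
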
int Coll_gather_mpich::gather(void *sbuf, int scount, MPI_Datatype sdtype,
                              void* rbuf, int rcount, MPI_Datatype rdtype,
                              int root, MPI_Comm comm)
{
    return Coll_gather_ompi_binomial::gather(sbuf, scount, sdtype,
                                             rbuf, rcount, rdtype,
                                             root, comm);
}

/* This is the default implementation of scatter. The algorithm is:

   Algorithm: MPI_Scatter

   We use a binomial tree algorithm for both short and
   long messages. At nodes other than leaf nodes we need to allocate
   a temporary buffer to store the incoming message. If the root is
   not rank 0, we reorder the sendbuf in order of relative ranks by
   copying it into a temporary buffer, so that all the sends from the
   root are contiguous and in the right order. In the heterogeneous
   case, we first pack the buffer by using MPI_Pack and then do the
   scatter.

   Cost = lgp.alpha + n.((p-1)/p).beta
   where n is the total size of the data to be scattered from the root.

   Possible improvements:

   End Algorithm: MPI_Scatter
*/

int Coll_scatter_mpich::scatter(void *sbuf, int scount, MPI_Datatype sdtype,
                                void* rbuf, int rcount, MPI_Datatype rdtype,
                                int root, MPI_Comm comm)
{
  if (comm->rank() != root) {
    // non-root ranks have no valid send buffer: hand the algorithm a dummy one
    sbuf   = xbt_malloc(rcount * rdtype->get_extent());
    scount = rcount;
    sdtype = rdtype;
  }
  int ret = Coll_scatter_ompi_binomial::scatter(sbuf, scount, sdtype, rbuf, rcount, rdtype,
                                                root, comm);
  if (comm->rank() != root)
    xbt_free(sbuf);
  return ret;
}