/* selector for collective algorithms based on mpich decision logic */

/* Copyright (c) 2009, 2010. The SimGrid Team.
 * All rights reserved.                                                     */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

#include "colls_private.h"
/* This is the default implementation of allreduce. The algorithm is:

   Algorithm: MPI_Allreduce

   For the heterogeneous case, we call MPI_Reduce followed by MPI_Bcast
   in order to meet the requirement that all processes must have the
   same result. For the homogeneous case, we use the following algorithms.

   For long messages, for builtin ops, and if count >= pof2 (where
   pof2 is the nearest power-of-two less than or equal to the number
   of processes), we use Rabenseifner's algorithm (see
   http://www.hlrs.de/mpi/myreduce.html).
   This algorithm implements the allreduce in two steps: first a
   reduce-scatter, followed by an allgather. A recursive-halving
   algorithm (beginning with processes that are distance 1 apart) is
   used for the reduce-scatter, and a recursive doubling
   algorithm is used for the allgather. The non-power-of-two case is
   handled by dropping to the nearest lower power-of-two: the first
   few even-numbered processes send their data to their right neighbors
   (rank+1), and the reduce-scatter and allgather happen among the remaining
   power-of-two processes. At the end, the first few even-numbered
   processes get the result from their right neighbors.

   For the power-of-two case, the cost for the reduce-scatter is
   lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma. The cost for the
   allgather is lgp.alpha + n.((p-1)/p).beta. Therefore, the total cost is:

   Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta + n.((p-1)/p).gamma

   For the non-power-of-two case,

   Cost = (2.floor(lgp)+2).alpha + (2.((p-1)/p) + 2).n.beta + n.(1+(p-1)/p).gamma

   For short messages, for user-defined ops, and for count < pof2,
   we use a recursive doubling algorithm (similar to the one in
   MPI_Allgather). We use this algorithm in the case of user-defined ops
   because in this case derived datatypes are allowed, and the user
   could pass basic datatypes on one process and derived on another as
   long as the type maps are the same. Breaking up derived datatypes
   to do the reduce-scatter is tricky.

   Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma

   Possible improvements:

   End Algorithm: MPI_Allreduce
*/
int smpi_coll_tuned_allreduce_mpich(void *sbuf, void *rbuf, int count,
                                    MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
{
    size_t dsize, block_dsize;
    int comm_size = smpi_comm_size(comm);
    const size_t large_message = 2048; //MPIR_PARAM_ALLREDUCE_SHORT_MSG_SIZE

    dsize = smpi_datatype_size(dtype);
    block_dsize = dsize * count;

    /* find nearest power-of-two less than or equal to comm_size */
    int pof2 = 1;
    while (pof2 <= comm_size) pof2 <<= 1;
    pof2 >>= 1;

    if (block_dsize > large_message && count >= pof2 && smpi_op_is_commute(op)) {
        //for long messages
        return (smpi_coll_tuned_allreduce_rab_rsag(sbuf, rbuf, count, dtype,
                                                   op, comm));
    }
    //for short ones and count < pof2
    return (smpi_coll_tuned_allreduce_rdb(sbuf, rbuf, count, dtype,
                                          op, comm));
}
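
/* Illustration only: a minimal sketch of the recursive-doubling allreduce
   selected above for short messages, written against the standard MPI
   point-to-point API (not the SMPI internals) and assuming a power-of-two
   number of processes, MPI_INT data, and a sum reduction; assumes
   <stdlib.h> and the usual MPI declarations are available. */
static void allreduce_rdb_sketch(int *buf, int count, MPI_Comm comm)
{
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);        /* assumed to be a power of two */
    int *tmp = malloc(count * sizeof(int));
    /* At step k the partner is rank ^ 2^k, so distances double each round
       and after lgp rounds every process holds the full reduction, which
       matches the lgp.alpha + n.lgp.beta + n.lgp.gamma cost above. */
    for (int mask = 1; mask < size; mask <<= 1) {
        int partner = rank ^ mask;
        MPI_Sendrecv(buf, count, MPI_INT, partner, 0,
                     tmp, count, MPI_INT, partner, 0,
                     comm, MPI_STATUS_IGNORE);
        for (int i = 0; i < count; i++)
            buf[i] += tmp[i];          /* commutative reduction (sum) */
    }
    free(tmp);
}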
/* This is the default implementation of alltoall. The algorithm is:

   Algorithm: MPI_Alltoall

   We use four algorithms for alltoall. For short messages and
   comm_size >= 8, we use the algorithm by Jehoshua Bruck et al,
   IEEE TPDS, Nov. 1997. It is a store-and-forward algorithm that
   takes lgp steps. Because of the extra communication, the bandwidth
   requirement is (n/2).lgp.beta.

   Cost = lgp.alpha + (n/2).lgp.beta

   where n is the total amount of data a process needs to send to all
   other processes.

   For medium-size messages (and short messages for comm_size < 8), we
   use an algorithm that posts all irecvs and isends and then does a
   waitall. We scatter the order of sources and destinations among the
   processes, so that all processes don't try to send/recv to/from the
   same process at the same time.

   *** Modification: We post only a small number of isends and irecvs
   at a time and wait on them as suggested by Tony Ladd. ***
   *** See comments below about an additional modification that
   we may want to consider ***

   For long messages and a power-of-two number of processes, we use a
   pairwise exchange algorithm, which takes p-1 steps. We
   calculate the pairs by using an exclusive-or algorithm:
       for (i=1; i<comm_size; i++)
           dest = rank ^ i;
   This algorithm doesn't work if the number of processes is not a power of
   two. For a non-power-of-two number of processes, we use an
   algorithm in which, in step i, each process receives from (rank-i)
   and sends to (rank+i).

   Cost = (p-1).alpha + n.beta

   where n is the total amount of data a process needs to send to all
   other processes.

   Possible improvements:

   End Algorithm: MPI_Alltoall
*/
int smpi_coll_tuned_alltoall_mpich(void *sbuf, int scount,
                                   MPI_Datatype sdtype,
                                   void* rbuf, int rcount,
                                   MPI_Datatype rdtype,
                                   MPI_Comm comm)
{
    int communicator_size;
    size_t dsize, block_dsize;
    communicator_size = smpi_comm_size(comm);

    const int short_size = 256;   // MPIR_PARAM_ALLTOALL_SHORT_MSG_SIZE (MPICH default)
    const int medium_size = 32768;
    // short messages and comm_size >= 8 -> bruck
    // medium-size messages (and short messages for comm_size < 8) ->
    //   an algorithm that posts all irecvs and isends and then does a waitall
    // long messages and a power-of-two number of processes -> pairwise exchange
    // non-power-of-two number of processes -> an algorithm in which, in step i,
    //   each process receives from (rank-i) and sends to (rank+i)

    dsize = smpi_datatype_size(sdtype);
    block_dsize = dsize * scount;

    if ((block_dsize < short_size) && (communicator_size >= 8)) {
        return smpi_coll_tuned_alltoall_bruck(sbuf, scount, sdtype,
                                              rbuf, rcount, rdtype,
                                              comm);
    } else if (block_dsize < medium_size) {
        return smpi_coll_tuned_alltoall_simple(sbuf, scount, sdtype,
                                               rbuf, rcount, rdtype,
                                               comm);
    } else if (communicator_size % 2) {
        return smpi_coll_tuned_alltoall_ring(sbuf, scount, sdtype,
                                             rbuf, rcount, rdtype,
                                             comm);
    }
    return smpi_coll_tuned_alltoall_pair(sbuf, scount, sdtype,
                                         rbuf, rcount, rdtype,
                                         comm);
}
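
/* Illustration only: the exclusive-or pairing behind the long-message
   pairwise-exchange alltoall described above, assuming a power-of-two
   number of processes and fixed blocks of 'blocklen' bytes per peer.
   A sketch against the standard MPI API, not the routine selected above;
   assumes <string.h> for memcpy. */
static void alltoall_pairwise_sketch(char *sbuf, char *rbuf,
                                     int blocklen, MPI_Comm comm)
{
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);        /* assumed to be a power of two */
    /* the local block needs no communication */
    memcpy(rbuf + rank * blocklen, sbuf + rank * blocklen, blocklen);
    /* p-1 steps; rank ^ i pairs every process with exactly one partner */
    for (int i = 1; i < size; i++) {
        int partner = rank ^ i;
        MPI_Sendrecv(sbuf + partner * blocklen, blocklen, MPI_CHAR, partner, 0,
                     rbuf + partner * blocklen, blocklen, MPI_CHAR, partner, 0,
                     comm, MPI_STATUS_IGNORE);
    }
}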
int smpi_coll_tuned_alltoallv_mpich(void *sbuf, int *scounts, int *sdisps,
                                    MPI_Datatype sdtype,
                                    void *rbuf, int *rcounts, int *rdisps,
                                    MPI_Datatype rdtype,
                                    MPI_Comm comm)
{
    /* For starters, just keep the original algorithm. */
    return smpi_coll_tuned_alltoallv_bruck(sbuf, scounts, sdisps, sdtype,
                                           rbuf, rcounts, rdisps, rdtype,
                                           comm);
}
int smpi_coll_tuned_barrier_mpich(MPI_Comm comm)
{
    return smpi_coll_tuned_barrier_ompi_bruck(comm);
}
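
/* Illustration only: the dissemination pattern underlying the Bruck-style
   barrier chosen above: ceil(lg p) rounds of zero-byte token exchanges at
   doubling distances, after which every process is known to have arrived.
   A sketch against the standard MPI API, not the OMPI/SMPI routine. */
static void barrier_dissemination_sketch(MPI_Comm comm)
{
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    for (int dist = 1; dist < size; dist <<= 1) {
        int to   = (rank + dist) % size;
        int from = (rank - dist + size) % size;
        MPI_Sendrecv(NULL, 0, MPI_BYTE, to, 0,
                     NULL, 0, MPI_BYTE, from, 0,
                     comm, MPI_STATUS_IGNORE);
    }
}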
/* This is the default implementation of broadcast. The algorithm is:

   Algorithm: MPI_Bcast

   For short messages, we use a binomial tree algorithm.
   Cost = lgp.alpha + n.lgp.beta

   For long messages, we do a scatter followed by an allgather.
   We first scatter the buffer using a binomial tree algorithm. This costs
   lgp.alpha + n.((p-1)/p).beta
   If the datatype is contiguous and the communicator is homogeneous,
   we treat the data as bytes and divide (scatter) it among processes
   by using ceiling division. For the noncontiguous or heterogeneous
   cases, we first pack the data into a temporary buffer by using
   MPI_Pack, scatter it as bytes, and unpack it after the allgather.

   For the allgather, we use a recursive doubling algorithm for
   medium-size messages and a power-of-two number of processes. This
   takes lgp steps. In each step pairs of processes exchange all the
   data they have (we take care of non-power-of-two situations). This
   costs approximately lgp.alpha + n.((p-1)/p).beta. (Approximately
   because it may be slightly more in the non-power-of-two case, but
   it's still a logarithmic algorithm.) Therefore, for long messages

   Total Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta

   Note that this algorithm has twice the latency of the tree algorithm
   we use for short messages, but requires lower bandwidth: 2.n.beta
   versus n.lgp.beta. Therefore, for long messages and when lgp > 2,
   this algorithm will perform better.

   For long messages, and for medium-size messages with a non-power-of-two
   number of processes, we use a ring algorithm for the allgather, which
   takes p-1 steps, because it performs better than recursive doubling.

   Total Cost = (lgp+p-1).alpha + 2.n.((p-1)/p).beta

   Possible improvements:
   For clusters of SMPs, we may want to do something differently to
   take advantage of shared memory on each node.

   End Algorithm: MPI_Bcast
*/
int smpi_coll_tuned_bcast_mpich(void *buff, int count,
                                MPI_Datatype datatype, int root,
                                MPI_Comm comm)
{
    /* Decision function based on MX results for
       messages up to 36MB and communicator sizes up to 64 nodes */
    const size_t small_message_size = 12288;
    const size_t intermediate_message_size = 524288;

    int communicator_size;
    size_t message_size, dsize;

    communicator_size = smpi_comm_size(comm);

    /* we need the data size for the decision function */
    dsize = smpi_datatype_size(datatype);
    message_size = dsize * (unsigned long)count;   /* needed for decision */

    /* Handle messages of small and intermediate size, and
       single-element broadcasts */
    if ((message_size < small_message_size) || (communicator_size <= 8)) {
        /* Binomial without segmentation */
        return smpi_coll_tuned_bcast_binomial_tree(buff, count, datatype,
                                                   root, comm);
    } else if (message_size < intermediate_message_size && !(communicator_size % 2)) {
        // SplittedBinary with 1KB segments
        return smpi_coll_tuned_bcast_scatter_rdb_allgather(buff, count, datatype,
                                                           root, comm);
    }
    // Handle large message sizes
    return smpi_coll_tuned_bcast_scatter_LR_allgather(buff, count, datatype,
                                                      root, comm);
}
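
/* Illustration only: the short-message binomial-tree broadcast described
   above, with the root relabeled to relative rank 0. Each process first
   receives from its ancestor (determined by the lowest set bit of its
   relative rank), then forwards to descendants at decreasing distances.
   A sketch against the standard MPI API, not
   smpi_coll_tuned_bcast_binomial_tree. */
static void bcast_binomial_sketch(void *buf, int count, MPI_Datatype dt,
                                  int root, MPI_Comm comm)
{
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    int relrank = (rank - root + size) % size;   /* root becomes 0 */
    int mask = 1;
    while (mask < size) {
        if (relrank & mask) {
            int src = ((relrank & ~mask) + root) % size;
            MPI_Recv(buf, count, dt, src, 0, comm, MPI_STATUS_IGNORE);
            break;                               /* got the data; now forward */
        }
        mask <<= 1;
    }
    mask >>= 1;
    while (mask > 0) {
        if (relrank + mask < size) {
            int dst = (relrank + mask + root) % size;
            MPI_Send(buf, count, dt, dst, 0, comm);
        }
        mask >>= 1;
    }
}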
/* This is the default implementation of reduce. The algorithm is:

   Algorithm: MPI_Reduce

   For long messages, for builtin ops, and if count >= pof2 (where
   pof2 is the nearest power-of-two less than or equal to the number
   of processes), we use Rabenseifner's algorithm (see
   http://www.hlrs.de/organization/par/services/models/mpi/myreduce.html).
   This algorithm implements the reduce in two steps: first a
   reduce-scatter, followed by a gather to the root. A
   recursive-halving algorithm (beginning with processes that are
   distance 1 apart) is used for the reduce-scatter, and a binomial tree
   algorithm is used for the gather. The non-power-of-two case is
   handled by dropping to the nearest lower power-of-two: the first
   few odd-numbered processes send their data to their left neighbors
   (rank-1), and the reduce-scatter happens among the remaining
   power-of-two processes. If the root is one of the excluded
   processes, then after the reduce-scatter, rank 0 sends its result to
   the root and exits; the root now acts as rank 0 in the binomial tree
   algorithm for gather.

   For the power-of-two case, the cost for the reduce-scatter is
   lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma. The cost for the
   gather to root is lgp.alpha + n.((p-1)/p).beta. Therefore, the total cost is:

   Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta + n.((p-1)/p).gamma

   For the non-power-of-two case, assuming the root is not one of the
   odd-numbered processes that get excluded in the reduce-scatter,

   Cost = (2.floor(lgp)+1).alpha + (2.((p-1)/p) + 1).n.beta +
          n.(1+(p-1)/p).gamma

   For short messages, for user-defined ops, and for count < pof2,
   we use a binomial tree algorithm.

   Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma

   We use the binomial tree algorithm in the case of user-defined ops
   because in this case derived datatypes are allowed, and the user
   could pass basic datatypes on one process and derived on another as
   long as the type maps are the same. Breaking up derived datatypes
   to do the reduce-scatter is tricky.

   FIXME: Per the MPI-2.1 standard this case is not possible. We
   should be able to use the reduce-scatter/gather approach as long as
   count >= pof2. [goodell@ 2009-01-21]

   Possible improvements:

   End Algorithm: MPI_Reduce
*/
int smpi_coll_tuned_reduce_mpich(void *sendbuf, void *recvbuf,
                                 int count, MPI_Datatype datatype,
                                 MPI_Op op, int root, MPI_Comm comm)
{
    int communicator_size = 0;
    size_t message_size, dsize;
    communicator_size = smpi_comm_size(comm);

    /* we need the data size for the decision function */
    dsize = smpi_datatype_size(datatype);
    message_size = dsize * count;   /* needed for decision */

    /* find nearest power-of-two less than or equal to communicator_size */
    int pof2 = 1;
    while (pof2 <= communicator_size) pof2 <<= 1;
    pof2 >>= 1;

    if ((count < pof2) || (message_size < 2048) || !smpi_op_is_commute(op)) {
        return smpi_coll_tuned_reduce_binomial(sendbuf, recvbuf, count, datatype, op, root, comm);
    }
    return smpi_coll_tuned_reduce_scatter_gather(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
                                                 segsize, max_requests*/);
}
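
/* Illustration only: the binomial-tree reduce used for short messages,
   user-defined ops, and count < pof2, sketched for a commutative sum on
   ints with the root relabeled to relative rank 0; the partial results
   accumulate in 'buf' and the final result ends up at the root. A sketch
   against the standard MPI API, not smpi_coll_tuned_reduce_binomial;
   assumes <stdlib.h>. */
static void reduce_binomial_sketch(int *buf, int count, int root,
                                   MPI_Comm comm)
{
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    int relrank = (rank - root + size) % size;   /* root becomes 0 */
    int *tmp = malloc(count * sizeof(int));
    for (int mask = 1; mask < size; mask <<= 1) {
        if (relrank & mask) {
            /* our subtree is reduced: pass the partial result up and stop */
            int dst = ((relrank & ~mask) + root) % size;
            MPI_Send(buf, count, MPI_INT, dst, 0, comm);
            break;
        } else if (relrank + mask < size) {
            int src = (relrank + mask + root) % size;
            MPI_Recv(tmp, count, MPI_INT, src, 0, comm, MPI_STATUS_IGNORE);
            for (int i = 0; i < count; i++)
                buf[i] += tmp[i];                /* commutative reduction */
        }
    }
    free(tmp);
}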
/* This is the default implementation of reduce_scatter. The algorithm is:

   Algorithm: MPI_Reduce_scatter

   If the operation is commutative, for short and medium-size
   messages, we use a recursive-halving
   algorithm in which the first p/2 processes send the second n/2 data
   to their counterparts in the other half and receive the first n/2
   data from them. This procedure continues recursively, halving the
   data communicated at each step, for a total of lgp steps. If the
   number of processes is not a power-of-two, we convert it to the
   nearest lower power-of-two by having the first few even-numbered
   processes send their data to the neighboring odd-numbered process
   at (rank+1). Those odd-numbered processes compute the result for
   their left neighbor as well in the recursive-halving algorithm, and
   then at the end send the result back to the processes that didn't
   participate.
   Therefore, if p is a power-of-two,

   Cost = lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma

   If p is not a power-of-two,

   Cost = (floor(lgp)+2).alpha + n.(1+(p-1+n)/p).beta + n.(1+(p-1)/p).gamma

   The above cost in the non-power-of-two case is approximate because
   there is some imbalance in the amount of work each process does,
   since some processes do the work of their neighbors as well.

   For commutative operations and very long messages, we use
   a pairwise exchange algorithm similar to
   the one used in MPI_Alltoall. At step i, each process sends n/p
   amount of data to (rank+i) and receives n/p amount of data from
   (rank-i).

   Cost = (p-1).alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma

   If the operation is not commutative, we do the following:

   We use a recursive doubling algorithm, which
   takes lgp steps. At step 1, processes exchange (n-n/p) amount of
   data; at step 2, (n-2n/p) amount of data; at step 3, (n-4n/p)
   amount of data, and so forth.

   Cost = lgp.alpha + n.(lgp-(p-1)/p).beta + n.(lgp-(p-1)/p).gamma

   Possible improvements:

   End Algorithm: MPI_Reduce_scatter
*/
int smpi_coll_tuned_reduce_scatter_mpich(void *sbuf, void *rbuf,
                                         int *rcounts, MPI_Datatype dtype,
                                         MPI_Op op, MPI_Comm comm)
{
    int comm_size, i;
    size_t total_message_size;

    XBT_DEBUG("smpi_coll_tuned_reduce_scatter_mpich");

    comm_size = smpi_comm_size(comm);
    // We need data size for decision function
    total_message_size = 0;
    for (i = 0; i < comm_size; i++) {
        total_message_size += rcounts[i];
    }

    if (smpi_op_is_commute(op) && total_message_size > 524288) {
        return smpi_coll_tuned_reduce_scatter_mpich_pair(sbuf, rbuf, rcounts,
                                                         dtype, op, comm);
    } else if (!smpi_op_is_commute(op)) {
        int is_block_regular = 1;
        for (i = 0; i < (comm_size - 1); ++i) {
            if (rcounts[i] != rcounts[i+1]) {
                is_block_regular = 0;
                break;
            }
        }

        /* slightly retask pof2 to mean pof2 equal or greater, not always greater as it is above */
        int pof2 = 1;
        while (pof2 < comm_size) pof2 <<= 1;

        if (pof2 == comm_size && is_block_regular) {
            /* noncommutative, pof2 size, and block regular */
            return smpi_coll_tuned_reduce_scatter_mpich_noncomm(sbuf, rbuf, rcounts, dtype, op, comm);
        }
        /* noncommutative, and either non-pof2 size or block irregular */
        return smpi_coll_tuned_reduce_scatter_mpich_rdb(sbuf, rbuf, rcounts, dtype, op, comm);
    }
    /* commutative, short or medium-size messages */
    return smpi_coll_tuned_reduce_scatter_mpich_rdb(sbuf, rbuf, rcounts, dtype, op, comm);
}
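
/* Illustration only: the pairwise-exchange reduce_scatter used above for
   commutative ops and very long messages, sketched for equal blocks of
   'blockcount' ints and a sum reduction. At step i, each process sends its
   contribution for (rank+i)'s block and receives, from (rank-i), that
   process's contribution to its own block. A sketch against the standard
   MPI API, not smpi_coll_tuned_reduce_scatter_mpich_pair; assumes
   <stdlib.h> and <string.h>. */
static void reduce_scatter_pairwise_sketch(int *sbuf, int *rbuf,
                                           int blockcount, MPI_Comm comm)
{
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    int *tmp = malloc(blockcount * sizeof(int));
    /* start from our own contribution to our own block */
    memcpy(rbuf, sbuf + rank * blockcount, blockcount * sizeof(int));
    for (int i = 1; i < size; i++) {
        int dst = (rank + i) % size;         /* needs our data for its block */
        int src = (rank - i + size) % size;  /* holds data for our block */
        MPI_Sendrecv(sbuf + dst * blockcount, blockcount, MPI_INT, dst, 0,
                     tmp, blockcount, MPI_INT, src, 0,
                     comm, MPI_STATUS_IGNORE);
        for (int k = 0; k < blockcount; k++)
            rbuf[k] += tmp[k];               /* commutative reduction */
    }
    free(tmp);
}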
/* This is the default implementation of allgather. The algorithm is:

   Algorithm: MPI_Allgather

   For short messages and a non-power-of-two number of processes, we use
   the algorithm from the Jehoshua Bruck et al IEEE TPDS Nov 97
   paper. It is a variant of the dissemination algorithm for
   barrier. It takes ceiling(lg p) steps.

   Cost = lgp.alpha + n.((p-1)/p).beta
   where n is the total size of the data gathered on each process.

   For short or medium-size messages and a power-of-two number of
   processes, we use the recursive doubling algorithm.

   Cost = lgp.alpha + n.((p-1)/p).beta

   TODO: On TCP, we may want to use recursive doubling instead of the Bruck
   algorithm in all cases because of the pairwise-exchange property of
   recursive doubling (see Benson et al paper in Euro PVM/MPI
   2003).

   It is interesting to note that either of the above algorithms for
   MPI_Allgather has the same cost as the tree algorithm for MPI_Gather!

   For long messages, or medium-size messages and a non-power-of-two
   number of processes, we use a ring algorithm. In the first step, each
   process i sends its contribution to process i+1 and receives
   the contribution from process i-1 (with wrap-around). From the
   second step onwards, each process i forwards to process i+1 the
   data it received from process i-1 in the previous step. This takes
   a total of p-1 steps.

   Cost = (p-1).alpha + n.((p-1)/p).beta

   We use this algorithm instead of recursive doubling for long
   messages because we find that this communication pattern (nearest
   neighbor) performs twice as fast as recursive doubling for long
   messages (on Myrinet and IBM SP).

   Possible improvements:

   End Algorithm: MPI_Allgather
*/
int smpi_coll_tuned_allgather_mpich(void *sbuf, int scount,
                                    MPI_Datatype sdtype,
                                    void* rbuf, int rcount,
                                    MPI_Datatype rdtype,
                                    MPI_Comm comm)
{
    int communicator_size, pow2_size;
    size_t dsize, total_dsize;

    communicator_size = smpi_comm_size(comm);

    /* Determine complete data size */
    dsize = smpi_datatype_size(sdtype);
    total_dsize = dsize * scount * communicator_size;

    for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<= 1);

    /* Decision as in MPICH-2,
       presented in Thakur et al., "Optimization of Collective Communication
       Operations in MPICH", International Journal of High Performance Computing
       Applications, Vol. 19, No. 1, 49-66 (2005):
       - for power-of-two processes and small and medium size messages
         (up to 512KB) use recursive doubling,
       - for non-power-of-two processes and small messages (up to 80KB) use bruck,
       - for everything else use ring.
    */
    if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
        return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype,
                                             rbuf, rcount, rdtype,
                                             comm);
    } else if (total_dsize <= 81920) {
        return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype,
                                               rbuf, rcount, rdtype,
                                               comm);
    }
    return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
                                          rbuf, rcount, rdtype,
                                          comm);
}
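
/* Illustration only: the ring allgather selected above for long messages,
   sketched for equal blocks of 'blockcount' ints. In step i each process
   forwards to its right neighbor the block it received from its left
   neighbor in the previous step, so after p-1 steps everyone holds all p
   blocks. A sketch against the standard MPI API, not
   smpi_coll_tuned_allgather_ring; assumes <string.h>. */
static void allgather_ring_sketch(int *sbuf, int *rbuf,
                                  int blockcount, MPI_Comm comm)
{
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    int right = (rank + 1) % size;
    int left  = (rank - 1 + size) % size;
    memcpy(rbuf + rank * blockcount, sbuf, blockcount * sizeof(int));
    for (int i = 0; i < size - 1; i++) {
        int sendblock = (rank - i + size) % size;      /* block we hold */
        int recvblock = (rank - i - 1 + size) % size;  /* block left sends */
        MPI_Sendrecv(rbuf + sendblock * blockcount, blockcount, MPI_INT, right, 0,
                     rbuf + recvblock * blockcount, blockcount, MPI_INT, left, 0,
                     comm, MPI_STATUS_IGNORE);
    }
}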
/* This is the default implementation of allgatherv. The algorithm is:

   Algorithm: MPI_Allgatherv

   For short messages and a non-power-of-two number of processes, we use
   the algorithm from the Jehoshua Bruck et al IEEE TPDS Nov 97
   paper. It is a variant of the dissemination algorithm for
   barrier. It takes ceiling(lg p) steps.

   Cost = lgp.alpha + n.((p-1)/p).beta
   where n is the total size of the data gathered on each process.

   For short or medium-size messages and a power-of-two number of
   processes, we use the recursive doubling algorithm.

   Cost = lgp.alpha + n.((p-1)/p).beta

   TODO: On TCP, we may want to use recursive doubling instead of the Bruck
   algorithm in all cases because of the pairwise-exchange property of
   recursive doubling (see Benson et al paper in Euro PVM/MPI
   2003).

   For long messages, or medium-size messages and a non-power-of-two
   number of processes, we use a ring algorithm. In the first step, each
   process i sends its contribution to process i+1 and receives
   the contribution from process i-1 (with wrap-around). From the
   second step onwards, each process i forwards to process i+1 the
   data it received from process i-1 in the previous step. This takes
   a total of p-1 steps.

   Cost = (p-1).alpha + n.((p-1)/p).beta

   Possible improvements:

   End Algorithm: MPI_Allgatherv
*/
int smpi_coll_tuned_allgatherv_mpich(void *sbuf, int scount,
                                     MPI_Datatype sdtype,
                                     void* rbuf, int *rcounts,
                                     int *rdispls,
                                     MPI_Datatype rdtype,
                                     MPI_Comm comm)
{
    int communicator_size, pow2_size;
    size_t dsize, total_dsize;

    communicator_size = smpi_comm_size(comm);

    /* Determine complete data size */
    dsize = smpi_datatype_size(sdtype);
    total_dsize = dsize * scount * communicator_size;

    for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<= 1);

    if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
        return smpi_coll_tuned_allgatherv_mpich_rdb(sbuf, scount, sdtype,
                                                    rbuf, rcounts, rdispls, rdtype,
                                                    comm);
    } else if (total_dsize <= 81920) {
        return smpi_coll_tuned_allgatherv_ompi_bruck(sbuf, scount, sdtype,
                                                     rbuf, rcounts, rdispls, rdtype,
                                                     comm);
    }
    return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
                                           rbuf, rcounts, rdispls, rdtype,
                                           comm);
}
/* This is the default implementation of gather. The algorithm is:

   Algorithm: MPI_Gather

   We use a binomial tree algorithm for both short and long
   messages. At nodes other than leaf nodes we need to allocate a
   temporary buffer to store the incoming message. If the root is not
   rank 0, for very small messages, we pack the data into a temporary
   contiguous buffer and reorder it to be placed in the right
   order. For small (but not very small) messages, we use a derived
   datatype to unpack the incoming data into non-contiguous buffers in
   the right order. In the heterogeneous case we first pack the
   buffers by using MPI_Pack and then do the gather.

   Cost = lgp.alpha + n.((p-1)/p).beta
   where n is the total size of the data gathered at the root.

   Possible improvements:

   End Algorithm: MPI_Gather
*/
int smpi_coll_tuned_gather_mpich(void *sbuf, int scount,
                                 MPI_Datatype sdtype,
                                 void* rbuf, int rcount,
                                 MPI_Datatype rdtype,
                                 int root, MPI_Comm comm)
{
    return smpi_coll_tuned_gather_ompi_binomial(sbuf, scount, sdtype,
                                                rbuf, rcount, rdtype,
                                                root, comm);
}
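
/* Illustration only: the binomial-tree gather described above, sketched
   for 'scount' ints per process with the root fixed at rank 0 for brevity
   (the real routine relabels an arbitrary root). Non-leaf nodes gather
   their whole subtree into a temporary buffer before passing it up, so
   blocks arrive at the root already in rank order. A sketch against the
   standard MPI API, not smpi_coll_tuned_gather_ompi_binomial; assumes
   <stdlib.h> and <string.h>. */
static void gather_binomial_sketch(int *sbuf, int scount, int *rbuf,
                                   MPI_Comm comm)
{
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    /* temporary buffer large enough for this node's whole subtree */
    int *tmp = malloc((size_t)size * scount * sizeof(int));
    memcpy(tmp, sbuf, scount * sizeof(int));
    int nheld = 1;                       /* number of blocks held locally */
    for (int mask = 1; mask < size; mask <<= 1) {
        if (rank & mask) {
            /* our subtree is complete: send its nheld blocks to the parent */
            MPI_Send(tmp, nheld * scount, MPI_INT, rank - mask, 0, comm);
            break;
        } else if (rank + mask < size) {
            /* receive the child's subtree: 'mask' blocks, fewer at the
               edge when size is not a power of two */
            int nrecv = (rank + 2 * mask <= size) ? mask : size - rank - mask;
            MPI_Recv(tmp + nheld * scount, nrecv * scount, MPI_INT,
                     rank + mask, 0, comm, MPI_STATUS_IGNORE);
            nheld += nrecv;
        }
    }
    if (rank == 0)                       /* root: copy out the full result */
        memcpy(rbuf, tmp, (size_t)size * scount * sizeof(int));
    free(tmp);
}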
/* This is the default implementation of scatter. The algorithm is:

   Algorithm: MPI_Scatter

   We use a binomial tree algorithm for both short and
   long messages. At nodes other than leaf nodes we need to allocate
   a temporary buffer to store the incoming message. If the root is
   not rank 0, we reorder the sendbuf in order of relative ranks by
   copying it into a temporary buffer, so that all the sends from the
   root are contiguous and in the right order. In the heterogeneous
   case, we first pack the buffer by using MPI_Pack and then do the
   scatter.

   Cost = lgp.alpha + n.((p-1)/p).beta
   where n is the total size of the data to be scattered from the root.

   Possible improvements:

   End Algorithm: MPI_Scatter
*/
int smpi_coll_tuned_scatter_mpich(void *sbuf, int scount,
                                  MPI_Datatype sdtype,
                                  void* rbuf, int rcount,
                                  MPI_Datatype rdtype,
                                  int root, MPI_Comm comm)
{
    return smpi_coll_tuned_scatter_ompi_binomial(sbuf, scount, sdtype,
                                                 rbuf, rcount, rdtype,
                                                 root, comm);
}