1 /* selector for collective algorithms based on mpich decision logic */
3 /* Copyright (c) 2009, 2010. The SimGrid Team.
4 * All rights reserved. */
6 /* This program is free software; you can redistribute it and/or modify it
7 * under the terms of the license (GNU LGPL) which comes with this package. */
9 #include "colls_private.h"
11 /* This is the default implementation of allreduce. The algorithm is:
13 Algorithm: MPI_Allreduce
15 For the heterogeneous case, we call MPI_Reduce followed by MPI_Bcast
16 in order to meet the requirement that all processes must have the
17 same result. For the homogeneous case, we use the following algorithms.
20 For long messages and for builtin ops and if count >= pof2 (where
21 pof2 is the nearest power-of-two less than or equal to the number
22 of processes), we use Rabenseifner's algorithm (see
23 http://www.hlrs.de/mpi/myreduce.html).
24 This algorithm implements the allreduce in two steps: first a
25 reduce-scatter, followed by an allgather. A recursive-halving
26 algorithm (beginning with processes that are distance 1 apart) is
27 used for the reduce-scatter, and a recursive doubling
28 algorithm is used for the allgather. The non-power-of-two case is
29 handled by dropping to the nearest lower power-of-two: the first
30 few even-numbered processes send their data to their right neighbors
31 (rank+1), and the reduce-scatter and allgather happen among the remaining
32 power-of-two processes. At the end, the first few even-numbered
33 processes get the result from their right neighbors.
35 For the power-of-two case, the cost for the reduce-scatter is
36 lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma. The cost for the
37 allgather lgp.alpha + n.((p-1)/p).beta. Therefore, the
39 Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta + n.((p-1)/p).gamma
41 For the non-power-of-two case,
42 Cost = (2.floor(lgp)+2).alpha + (2.((p-1)/p) + 2).n.beta + n.(1+(p-1)/p).gamma
45 For short messages, for user-defined ops, and for count < pof2
46 we use a recursive doubling algorithm (similar to the one in
47 MPI_Allgather). We use this algorithm in the case of user-defined ops
48 because in this case derived datatypes are allowed, and the user
49 could pass basic datatypes on one process and derived on another as
50 long as the type maps are the same. Breaking up derived datatypes
51 to do the reduce-scatter is tricky.
53 Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma
55 Possible improvements:
57 End Algorithm: MPI_Allreduce
59 int smpi_coll_tuned_allreduce_mpich(void *sbuf, void *rbuf, int count,
60 MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
62 size_t dsize, block_dsize;
63 int comm_size = smpi_comm_size(comm);
64 const size_t large_message = 2048; //MPIR_PARAM_ALLREDUCE_SHORT_MSG_SIZE
66 dsize = smpi_datatype_size(dtype);
67 block_dsize = dsize * count;
70 /* find nearest power-of-two less than or equal to comm_size */
72 while (pof2 <= comm_size) pof2 <<= 1;
75 if (block_dsize > large_message && count >= pof2 && smpi_op_is_commute(op)) {
77 return (smpi_coll_tuned_allreduce_rab_rsag (sbuf, rbuf,
81 //for short ones and count < pof2
82 return (smpi_coll_tuned_allreduce_rdb (sbuf, rbuf,
89 /* This is the default implementation of alltoall. The algorithm is:
91 Algorithm: MPI_Alltoall
93 We use four algorithms for alltoall. For short messages and
94 (comm_size >= 8), we use the algorithm by Jehoshua Bruck et al,
95 IEEE TPDS, Nov. 1997. It is a store-and-forward algorithm that
96 takes lgp steps. Because of the extra communication, the bandwidth
97 requirement is (n/2).lgp.beta.
99 Cost = lgp.alpha + (n/2).lgp.beta
101 where n is the total amount of data a process needs to send to all
104 For medium size messages and (short messages for comm_size < 8), we
105 use an algorithm that posts all irecvs and isends and then does a
106 waitall. We scatter the order of sources and destinations among the
107 processes, so that all processes don't try to send/recv to/from the
108 same process at the same time.
110 *** Modification: We post only a small number of isends and irecvs
111 at a time and wait on them as suggested by Tony Ladd. ***
112 *** See comments below about an additional modification that
113 we may want to consider ***
115 For long messages and power-of-two number of processes, we use a
116 pairwise exchange algorithm, which takes p-1 steps. We
117 calculate the pairs by using an exclusive-or algorithm:
118 for (i=1; i<comm_size; i++)
120 This algorithm doesn't work if the number of processes is not a power of
121 two. For a non-power-of-two number of processes, we use an
122 algorithm in which, in step i, each process receives from (rank-i)
123 and sends to (rank+i).
125 Cost = (p-1).alpha + n.beta
127 where n is the total amount of data a process needs to send to all
130 Possible improvements:
132 End Algorithm: MPI_Alltoall
135 int smpi_coll_tuned_alltoall_mpich( void *sbuf, int scount,
137 void* rbuf, int rcount,
141 int communicator_size;
142 size_t dsize, block_dsize;
143 communicator_size = smpi_comm_size(comm);
146 int medium_size=32768;
147 //short size and comm_size >=8 -> bruck
149 // medium size messages and (short messages for comm_size < 8), we
150 // use an algorithm that posts all irecvs and isends and then does a
153 // For long messages and power-of-two number of processes, we use a
154 // pairwise exchange algorithm
156 // For a non-power-of-two number of processes, we use an
157 // algorithm in which, in step i, each process receives from (rank-i)
158 // and sends to (rank+i).
161 dsize = smpi_datatype_size(sdtype);
162 block_dsize = dsize * scount;
164 if ((block_dsize < short_size) && (communicator_size >= 8)) {
165 return smpi_coll_tuned_alltoall_bruck(sbuf, scount, sdtype,
166 rbuf, rcount, rdtype,
169 } else if (block_dsize < medium_size) {
170 return smpi_coll_tuned_alltoall_simple(sbuf, scount, sdtype,
171 rbuf, rcount, rdtype,
173 }else if (communicator_size%2){
174 return smpi_coll_tuned_alltoall_ring(sbuf, scount, sdtype,
175 rbuf, rcount, rdtype,
179 return smpi_coll_tuned_alltoall_ompi_pairwise (sbuf, scount, sdtype,
180 rbuf, rcount, rdtype,
184 int smpi_coll_tuned_alltoallv_mpich(void *sbuf, int *scounts, int *sdisps,
186 void *rbuf, int *rcounts, int *rdisps,
191 /* For starters, just keep the original algorithm. */
192 return smpi_coll_tuned_alltoallv_bruck(sbuf, scounts, sdisps, sdtype,
193 rbuf, rcounts, rdisps,rdtype,
198 int smpi_coll_tuned_barrier_mpich(MPI_Comm comm)
200 return smpi_coll_tuned_barrier_ompi_bruck(comm);
203 /* This is the default implementation of broadcast. The algorithm is:
207 For short messages, we use a binomial tree algorithm.
208 Cost = lgp.alpha + n.lgp.beta
210 For long messages, we do a scatter followed by an allgather.
211 We first scatter the buffer using a binomial tree algorithm. This costs
212 lgp.alpha + n.((p-1)/p).beta
213 If the datatype is contiguous and the communicator is homogeneous,
214 we treat the data as bytes and divide (scatter) it among processes
215 by using ceiling division. For the noncontiguous or heterogeneous
216 cases, we first pack the data into a temporary buffer by using
217 MPI_Pack, scatter it as bytes, and unpack it after the allgather.
219 For the allgather, we use a recursive doubling algorithm for
220 medium-size messages and power-of-two number of processes. This
221 takes lgp steps. In each step pairs of processes exchange all the
222 data they have (we take care of non-power-of-two situations). This
223 costs approximately lgp.alpha + n.((p-1)/p).beta. (Approximately
224 because it may be slightly more in the non-power-of-two case, but
225 it's still a logarithmic algorithm.) Therefore, for long messages
226 Total Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta
228 Note that this algorithm has twice the latency as the tree algorithm
229 we use for short messages, but requires lower bandwidth: 2.n.beta
230 versus n.lgp.beta. Therefore, for long messages and when lgp > 2,
231 this algorithm will perform better.
233 For long messages and for medium-size messages and non-power-of-two
234 processes, we use a ring algorithm for the allgather, which
235 takes p-1 steps, because it performs better than recursive doubling.
236 Total Cost = (lgp+p-1).alpha + 2.n.((p-1)/p).beta
238 Possible improvements:
239 For clusters of SMPs, we may want to do something differently to
240 take advantage of shared memory on each node.
242 End Algorithm: MPI_Bcast
246 int smpi_coll_tuned_bcast_mpich(void *buff, int count,
247 MPI_Datatype datatype, int root,
251 /* Decision function based on MX results for
252 messages up to 36MB and communicator sizes up to 64 nodes */
253 const size_t small_message_size = 12288;
254 const size_t intermediate_message_size = 524288;
256 int communicator_size;
258 size_t message_size, dsize;
260 communicator_size = smpi_comm_size(comm);
262 /* else we need data size for decision function */
263 dsize = smpi_datatype_size(datatype);
264 message_size = dsize * (unsigned long)count; /* needed for decision */
266 /* Handle messages of small and intermediate size, and
267 single-element broadcasts */
268 if ((message_size < small_message_size) || (communicator_size <= 8)) {
269 /* Binomial without segmentation */
270 return smpi_coll_tuned_bcast_binomial_tree (buff, count, datatype,
273 } else if (message_size < intermediate_message_size && !(communicator_size%2)) {
274 // SplittedBinary with 1KB segments
275 return smpi_coll_tuned_bcast_scatter_rdb_allgather(buff, count, datatype,
279 //Handle large message sizes
280 return smpi_coll_tuned_bcast_scatter_LR_allgather (buff, count, datatype,
287 /* This is the default implementation of reduce. The algorithm is:
289 Algorithm: MPI_Reduce
291 For long messages and for builtin ops and if count >= pof2 (where
292 pof2 is the nearest power-of-two less than or equal to the number
293 of processes), we use Rabenseifner's algorithm (see
294 http://www.hlrs.de/organization/par/services/models/mpi/myreduce.html ).
295 This algorithm implements the reduce in two steps: first a
296 reduce-scatter, followed by a gather to the root. A
297 recursive-halving algorithm (beginning with processes that are
298 distance 1 apart) is used for the reduce-scatter, and a binomial tree
299 algorithm is used for the gather. The non-power-of-two case is
300 handled by dropping to the nearest lower power-of-two: the first
301 few odd-numbered processes send their data to their left neighbors
302 (rank-1), and the reduce-scatter happens among the remaining
303 power-of-two processes. If the root is one of the excluded
304 processes, then after the reduce-scatter, rank 0 sends its result to
305 the root and exits; the root now acts as rank 0 in the binomial tree
306 algorithm for gather.
308 For the power-of-two case, the cost for the reduce-scatter is
309 lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma. The cost for the
310 gather to root is lgp.alpha + n.((p-1)/p).beta. Therefore, the
312 Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta + n.((p-1)/p).gamma
314 For the non-power-of-two case, assuming the root is not one of the
315 odd-numbered processes that get excluded in the reduce-scatter,
316 Cost = (2.floor(lgp)+1).alpha + (2.((p-1)/p) + 1).n.beta +
320 For short messages, user-defined ops, and count < pof2, we use a
321 binomial tree algorithm for both short and long messages.
323 Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma
326 We use the binomial tree algorithm in the case of user-defined ops
327 because in this case derived datatypes are allowed, and the user
328 could pass basic datatypes on one process and derived on another as
329 long as the type maps are the same. Breaking up derived datatypes
330 to do the reduce-scatter is tricky.
332 FIXME: Per the MPI-2.1 standard this case is not possible. We
333 should be able to use the reduce-scatter/gather approach as long as
334 count >= pof2. [goodell@ 2009-01-21]
336 Possible improvements:
338 End Algorithm: MPI_Reduce
342 int smpi_coll_tuned_reduce_mpich( void *sendbuf, void *recvbuf,
343 int count, MPI_Datatype datatype,
348 int communicator_size=0;
350 size_t message_size, dsize;
351 communicator_size = smpi_comm_size(comm);
353 /* need data size for decision function */
354 dsize=smpi_datatype_size(datatype);
355 message_size = dsize * count; /* needed for decision */
358 while (pof2 <= communicator_size) pof2 <<= 1;
362 if ((count < pof2) || (message_size < 2048) || !smpi_op_is_commute(op)) {
363 return smpi_coll_tuned_reduce_binomial (sendbuf, recvbuf, count, datatype, op, root, comm);
365 return smpi_coll_tuned_reduce_scatter_gather(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
366 segsize, max_requests*/);
371 /* This is the default implementation of reduce_scatter. The algorithm is:
373 Algorithm: MPI_Reduce_scatter
375 If the operation is commutative, for short and medium-size
376 messages, we use a recursive-halving
377 algorithm in which the first p/2 processes send the second n/2 data
378 to their counterparts in the other half and receive the first n/2
379 data from them. This procedure continues recursively, halving the
380 data communicated at each step, for a total of lgp steps. If the
381 number of processes is not a power-of-two, we convert it to the
382 nearest lower power-of-two by having the first few even-numbered
383 processes send their data to the neighboring odd-numbered process
384 at (rank+1). Those odd-numbered processes compute the result for
385 their left neighbor as well in the recursive halving algorithm, and
386 then at the end send the result back to the processes that didn't
388 Therefore, if p is a power-of-two,
389 Cost = lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma
390 If p is not a power-of-two,
391 Cost = (floor(lgp)+2).alpha + n.(1+(p-1+n)/p).beta + n.(1+(p-1)/p).gamma
392 The above cost in the non power-of-two case is approximate because
393 there is some imbalance in the amount of work each process does
394 because some processes do the work of their neighbors as well.
396 For commutative operations and very long messages we use
397 we use a pairwise exchange algorithm similar to
398 the one used in MPI_Alltoall. At step i, each process sends n/p
399 amount of data to (rank+i) and receives n/p amount of data from
401 Cost = (p-1).alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma
404 If the operation is not commutative, we do the following:
406 We use a recursive doubling algorithm, which
407 takes lgp steps. At step 1, processes exchange (n-n/p) amount of
408 data; at step 2, (n-2n/p) amount of data; at step 3, (n-4n/p)
409 amount of data, and so forth.
411 Cost = lgp.alpha + n.(lgp-(p-1)/p).beta + n.(lgp-(p-1)/p).gamma
413 Possible improvements:
415 End Algorithm: MPI_Reduce_scatter
419 int smpi_coll_tuned_reduce_scatter_mpich( void *sbuf, void *rbuf,
427 size_t total_message_size;
429 if(sbuf==rbuf)sbuf=MPI_IN_PLACE; //restore MPI_IN_PLACE as these algorithms handle it
431 XBT_DEBUG("smpi_coll_tuned_reduce_scatter_mpich");
433 comm_size = smpi_comm_size(comm);
434 // We need data size for decision function
435 total_message_size = 0;
436 for (i = 0; i < comm_size; i++) {
437 total_message_size += rcounts[i];
440 if( smpi_op_is_commute(op) && total_message_size > 524288) {
441 return smpi_coll_tuned_reduce_scatter_mpich_pair (sbuf, rbuf, rcounts,
444 }else if (!smpi_op_is_commute(op)) {
445 int is_block_regular = 1;
446 for (i = 0; i < (comm_size - 1); ++i) {
447 if (rcounts[i] != rcounts[i+1]) {
448 is_block_regular = 0;
453 /* slightly retask pof2 to mean pof2 equal or greater, not always greater as it is above */
455 while (pof2 < comm_size) pof2 <<= 1;
457 if (pof2 == comm_size && is_block_regular) {
458 /* noncommutative, pof2 size, and block regular */
459 return smpi_coll_tuned_reduce_scatter_mpich_noncomm(sbuf, rbuf, rcounts, dtype, op, comm);
462 return smpi_coll_tuned_reduce_scatter_mpich_rdb(sbuf, rbuf, rcounts, dtype, op, comm);
464 return smpi_coll_tuned_reduce_scatter_mpich_rdb(sbuf, rbuf, rcounts, dtype, op, comm);
469 /* This is the default implementation of allgather. The algorithm is:
471 Algorithm: MPI_Allgather
473 For short messages and non-power-of-two no. of processes, we use
474 the algorithm from the Jehoshua Bruck et al IEEE TPDS Nov 97
475 paper. It is a variant of the disemmination algorithm for
476 barrier. It takes ceiling(lg p) steps.
478 Cost = lgp.alpha + n.((p-1)/p).beta
479 where n is total size of data gathered on each process.
481 For short or medium-size messages and power-of-two no. of
482 processes, we use the recursive doubling algorithm.
484 Cost = lgp.alpha + n.((p-1)/p).beta
486 TODO: On TCP, we may want to use recursive doubling instead of the Bruck
487 algorithm in all cases because of the pairwise-exchange property of
488 recursive doubling (see Benson et al paper in Euro PVM/MPI
491 It is interesting to note that either of the above algorithms for
492 MPI_Allgather has the same cost as the tree algorithm for MPI_Gather!
494 For long messages or medium-size messages and non-power-of-two
495 no. of processes, we use a ring algorithm. In the first step, each
496 process i sends its contribution to process i+1 and receives
497 the contribution from process i-1 (with wrap-around). From the
498 second step onwards, each process i forwards to process i+1 the
499 data it received from process i-1 in the previous step. This takes
500 a total of p-1 steps.
502 Cost = (p-1).alpha + n.((p-1)/p).beta
504 We use this algorithm instead of recursive doubling for long
505 messages because we find that this communication pattern (nearest
506 neighbor) performs twice as fast as recursive doubling for long
507 messages (on Myrinet and IBM SP).
509 Possible improvements:
511 End Algorithm: MPI_Allgather
514 int smpi_coll_tuned_allgather_mpich(void *sbuf, int scount,
516 void* rbuf, int rcount,
521 int communicator_size, pow2_size;
522 size_t dsize, total_dsize;
524 communicator_size = smpi_comm_size(comm);
526 /* Determine complete data size */
527 dsize=smpi_datatype_size(sdtype);
528 total_dsize = dsize * scount * communicator_size;
530 for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<=1);
532 /* Decision as in MPICH-2
533 presented in Thakur et.al. "Optimization of Collective Communication
534 Operations in MPICH", International Journal of High Performance Computing
535 Applications, Vol. 19, No. 1, 49-66 (2005)
536 - for power-of-two processes and small and medium size messages
537 (up to 512KB) use recursive doubling
538 - for non-power-of-two processes and small messages (80KB) use bruck,
539 - for everything else use ring.
541 if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
542 return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype,
543 rbuf, rcount, rdtype,
545 } else if (total_dsize <= 81920) {
546 return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype,
547 rbuf, rcount, rdtype,
550 return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
551 rbuf, rcount, rdtype,
556 /* This is the default implementation of allgatherv. The algorithm is:
558 Algorithm: MPI_Allgatherv
560 For short messages and non-power-of-two no. of processes, we use
561 the algorithm from the Jehoshua Bruck et al IEEE TPDS Nov 97
562 paper. It is a variant of the disemmination algorithm for
563 barrier. It takes ceiling(lg p) steps.
565 Cost = lgp.alpha + n.((p-1)/p).beta
566 where n is total size of data gathered on each process.
568 For short or medium-size messages and power-of-two no. of
569 processes, we use the recursive doubling algorithm.
571 Cost = lgp.alpha + n.((p-1)/p).beta
573 TODO: On TCP, we may want to use recursive doubling instead of the Bruck
574 algorithm in all cases because of the pairwise-exchange property of
575 recursive doubling (see Benson et al paper in Euro PVM/MPI
578 For long messages or medium-size messages and non-power-of-two
579 no. of processes, we use a ring algorithm. In the first step, each
580 process i sends its contribution to process i+1 and receives
581 the contribution from process i-1 (with wrap-around). From the
582 second step onwards, each process i forwards to process i+1 the
583 data it received from process i-1 in the previous step. This takes
584 a total of p-1 steps.
586 Cost = (p-1).alpha + n.((p-1)/p).beta
588 Possible improvements:
590 End Algorithm: MPI_Allgatherv
592 int smpi_coll_tuned_allgatherv_mpich(void *sbuf, int scount,
594 void* rbuf, int *rcounts,
600 int communicator_size, pow2_size,i;
601 size_t dsize, total_dsize;
603 communicator_size = smpi_comm_size(comm);
605 /* Determine complete data size */
606 dsize=smpi_datatype_size(sdtype);
607 total_dsize = dsize * scount * communicator_size;
610 for (i=0; i<communicator_size; i++)
611 total_dsize += rcounts[i];
612 if (total_dsize == 0) return MPI_SUCCESS;
614 for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<=1);
616 if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
617 return smpi_coll_tuned_allgatherv_mpich_rdb(sbuf, scount, sdtype,
618 rbuf, rcounts, rdispls, rdtype,
620 } else if (total_dsize <= 81920) {
621 return smpi_coll_tuned_allgatherv_ompi_bruck(sbuf, scount, sdtype,
622 rbuf, rcounts, rdispls, rdtype,
625 return smpi_coll_tuned_allgatherv_mpich_ring(sbuf, scount, sdtype,
626 rbuf, rcounts, rdispls, rdtype,
630 /* This is the default implementation of gather. The algorithm is:
632 Algorithm: MPI_Gather
634 We use a binomial tree algorithm for both short and long
635 messages. At nodes other than leaf nodes we need to allocate a
636 temporary buffer to store the incoming message. If the root is not
637 rank 0, for very small messages, we pack it into a temporary
638 contiguous buffer and reorder it to be placed in the right
639 order. For small (but not very small) messages, we use a derived
640 datatype to unpack the incoming data into non-contiguous buffers in
641 the right order. In the heterogeneous case we first pack the
642 buffers by using MPI_Pack and then do the gather.
644 Cost = lgp.alpha + n.((p-1)/p).beta
645 where n is the total size of the data gathered at the root.
647 Possible improvements:
649 End Algorithm: MPI_Gather
652 int smpi_coll_tuned_gather_mpich(void *sbuf, int scount,
654 void* rbuf, int rcount,
660 return smpi_coll_tuned_gather_ompi_binomial (sbuf, scount, sdtype,
661 rbuf, rcount, rdtype,
665 /* This is the default implementation of scatter. The algorithm is:
667 Algorithm: MPI_Scatter
669 We use a binomial tree algorithm for both short and
670 long messages. At nodes other than leaf nodes we need to allocate
671 a temporary buffer to store the incoming message. If the root is
672 not rank 0, we reorder the sendbuf in order of relative ranks by
673 copying it into a temporary buffer, so that all the sends from the
674 root are contiguous and in the right order. In the heterogeneous
675 case, we first pack the buffer by using MPI_Pack and then do the
678 Cost = lgp.alpha + n.((p-1)/p).beta
679 where n is the total size of the data to be scattered from the root.
681 Possible improvements:
683 End Algorithm: MPI_Scatter
687 int smpi_coll_tuned_scatter_mpich(void *sbuf, int scount,
689 void* rbuf, int rcount,
691 int root, MPI_Comm comm
694 if(smpi_comm_rank(comm)!=root){
695 sbuf=xbt_malloc(rcount*smpi_datatype_get_extent(rdtype));
699 return smpi_coll_tuned_scatter_ompi_binomial (sbuf, scount, sdtype,
700 rbuf, rcount, rdtype,