/* selector for collective algorithms based on openmpi's default coll_tuned_decision_fixed selector */

/* Copyright (c) 2009-2022. The SimGrid Team.
 * All rights reserved. */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

#include "colls_private.hpp"
16 int allreduce__ompi(const void *sbuf, void *rbuf, int count,
17 MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
19 size_t dsize, block_dsize;
20 int comm_size = comm->size();
21 const size_t intermediate_message = 10000;
24 * Decision function based on MX results from the Grig cluster at UTK.
26 * Currently, linear, recursive doubling, and nonoverlapping algorithms
27 * can handle both commutative and non-commutative operations.
28 * Ring algorithm does not support non-commutative operations.
30 dsize = dtype->size();
31 block_dsize = dsize * count;
33 if (block_dsize < intermediate_message) {
34 return allreduce__rdb(sbuf, rbuf, count, dtype, op, comm);
37 if( ((op==MPI_OP_NULL) || op->is_commutative()) && (count > comm_size) ) {
38 const size_t segment_size = 1 << 20; /* 1 MB */
39 if ((comm_size * segment_size >= block_dsize)) {
40 //FIXME: ok, these are not the right algorithms, try to find closer ones
41 // lr is a good match for allreduce_ring (difference is mainly the use of sendrecv)
42 return allreduce__lr(sbuf, rbuf, count, dtype, op, comm);
44 return allreduce__ompi_ring_segmented(sbuf, rbuf, count, dtype, op, comm /*segment_size*/);
48 return allreduce__redbcast(sbuf, rbuf, count, dtype, op, comm);
53 int alltoall__ompi(const void *sbuf, int scount,
55 void* rbuf, int rcount,
59 int communicator_size;
60 size_t dsize, block_dsize;
61 communicator_size = comm->size();
63 /* Decision function based on measurement on Grig cluster at
64 the University of Tennessee (2GB MX) up to 64 nodes.
65 Has better performance for messages of intermediate sizes than the old one */
66 /* determine block size */
67 dsize = sdtype->size();
68 block_dsize = dsize * scount;
70 if ((block_dsize < 200) && (communicator_size > 12)) {
71 return alltoall__bruck(sbuf, scount, sdtype,
72 rbuf, rcount, rdtype, comm);
74 } else if (block_dsize < 3000) {
75 return alltoall__basic_linear(sbuf, scount, sdtype,
76 rbuf, rcount, rdtype, comm);
79 return alltoall__ring(sbuf, scount, sdtype,
80 rbuf, rcount, rdtype, comm);
83 int alltoallv__ompi(const void *sbuf, const int *scounts, const int *sdisps,
85 void *rbuf, const int *rcounts, const int *rdisps,
90 /* For starters, just keep the original algorithm. */
91 return alltoallv__ring(sbuf, scounts, sdisps, sdtype,
92 rbuf, rcounts, rdisps,rdtype,
96 int barrier__ompi(MPI_Comm comm)
97 { int communicator_size = comm->size();
99 if( 2 == communicator_size )
100 return barrier__ompi_two_procs(comm);
101 /* * Basic optimisation. If we have a power of 2 number of nodes*/
102 /* * the use the recursive doubling algorithm, otherwise*/
103 /* * bruck is the one we want.*/
105 bool has_one = false;
106 for( ; communicator_size > 0; communicator_size >>= 1 ) {
107 if( communicator_size & 0x1 ) {
109 return barrier__ompi_bruck(comm);
114 return barrier__ompi_recursivedoubling(comm);
117 int bcast__ompi(void *buff, int count, MPI_Datatype datatype, int root, MPI_Comm comm)
119 /* Decision function based on MX results for
120 messages up to 36MB and communicator sizes up to 64 nodes */
121 const size_t small_message_size = 2048;
122 const size_t intermediate_message_size = 370728;
123 const double a_p16 = 3.2118e-6; /* [1 / byte] */
124 const double b_p16 = 8.7936;
125 const double a_p64 = 2.3679e-6; /* [1 / byte] */
126 const double b_p64 = 1.1787;
127 const double a_p128 = 1.6134e-6; /* [1 / byte] */
128 const double b_p128 = 2.1102;
130 int communicator_size;
132 size_t message_size, dsize;
134 communicator_size = comm->size();
136 /* else we need data size for decision function */
137 dsize = datatype->size();
138 message_size = dsize * (unsigned long)count; /* needed for decision */
140 /* Handle messages of small and intermediate size, and
141 single-element broadcasts */
142 if ((message_size < small_message_size) || (count <= 1)) {
143 /* Binomial without segmentation */
144 return bcast__binomial_tree(buff, count, datatype, root, comm);
146 } else if (message_size < intermediate_message_size) {
147 // SplittedBinary with 1KB segments
148 return bcast__ompi_split_bintree(buff, count, datatype, root, comm);
151 //Handle large message sizes
152 else if (communicator_size < (a_p128 * message_size + b_p128)) {
153 //Pipeline with 128KB segments
154 //segsize = 1024 << 7;
155 return bcast__ompi_pipeline(buff, count, datatype, root, comm);
158 } else if (communicator_size < 13) {
159 // Split Binary with 8KB segments
160 return bcast__ompi_split_bintree(buff, count, datatype, root, comm);
162 } else if (communicator_size < (a_p64 * message_size + b_p64)) {
163 // Pipeline with 64KB segments
164 //segsize = 1024 << 6;
165 return bcast__ompi_pipeline(buff, count, datatype, root, comm);
168 } else if (communicator_size < (a_p16 * message_size + b_p16)) {
169 //Pipeline with 16KB segments
170 //segsize = 1024 << 4;
171 return bcast__ompi_pipeline(buff, count, datatype, root, comm);
175 /* Pipeline with 8KB segments */
176 //segsize = 1024 << 3;
177 return bcast__flattree_pipeline(buff, count, datatype, root, comm /*segsize*/);
179 /* this is based on gige measurements */
181 if (communicator_size < 4) {
182 return bcast__intra_basic_linear(buff, count, datatype, root, comm, module);
184 if (communicator_size == 4) {
185 if (message_size < 524288) segsize = 0;
186 else segsize = 16384;
187 return bcast__intra_bintree(buff, count, datatype, root, comm, module, segsize);
189 if (communicator_size <= 8 && message_size < 4096) {
190 return bcast__intra_basic_linear(buff, count, datatype, root, comm, module);
192 if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
194 return bcast__intra_bintree(buff, count, datatype, root, comm, module, segsize);
196 if (message_size >= 524288) {
198 return bcast__intra_pipeline(buff, count, datatype, root, comm, module, segsize);
201 /* once tested can swap this back in */
202 /* return bcast__intra_bmtree(buff, count, datatype, root, comm, segsize); */
203 return bcast__intra_bintree(buff, count, datatype, root, comm, module, segsize);
207 int reduce__ompi(const void *sendbuf, void *recvbuf,
208 int count, MPI_Datatype datatype,
212 int communicator_size=0;
214 size_t message_size, dsize;
215 const double a1 = 0.6016 / 1024.0; /* [1/B] */
216 const double b1 = 1.3496;
217 const double a2 = 0.0410 / 1024.0; /* [1/B] */
218 const double b2 = 9.7128;
219 const double a3 = 0.0422 / 1024.0; /* [1/B] */
220 const double b3 = 1.1614;
221 //const double a4 = 0.0033 / 1024.0; [1/B]
222 //const double b4 = 1.6761;
224 /* no limit on # of outstanding requests */
225 //const int max_requests = 0;
227 communicator_size = comm->size();
229 /* need data size for decision function */
230 dsize=datatype->size();
231 message_size = dsize * count; /* needed for decision */
234 * If the operation is non commutative we currently have choice of linear
235 * or in-order binary tree algorithm.
237 if ((op != MPI_OP_NULL) && not op->is_commutative()) {
238 if ((communicator_size < 12) && (message_size < 2048)) {
239 return reduce__ompi_basic_linear(sendbuf, recvbuf, count, datatype, op, root, comm /*, module*/);
241 return reduce__ompi_in_order_binary(sendbuf, recvbuf, count, datatype, op, root, comm /*, module,
245 if ((communicator_size < 8) && (message_size < 512)){
247 return reduce__ompi_basic_linear(sendbuf, recvbuf, count, datatype, op, root, comm);
248 } else if (((communicator_size < 8) && (message_size < 20480)) ||
249 (message_size < 2048) || (count <= 1)) {
252 return reduce__ompi_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module, segsize, max_requests*/);
253 } else if (communicator_size > (a1 * message_size + b1)) {
256 return reduce__ompi_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
257 segsize, max_requests*/);
258 } else if (communicator_size > (a2 * message_size + b2)) {
261 return reduce__ompi_pipeline(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
262 segsize, max_requests*/);
263 } else if (communicator_size > (a3 * message_size + b3)) {
266 return reduce__ompi_binary( sendbuf, recvbuf, count, datatype, op, root,
267 comm/*, module, segsize, max_requests*/);
269 // if (communicator_size > (a4 * message_size + b4)) {
271 // segsize = 32*1024;
274 // segsize = 64*1024;
276 return reduce__ompi_pipeline(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
277 segsize, max_requests*/);
280 /* for small messages use linear algorithm */
281 if (message_size <= 4096) {
283 fanout = communicator_size - 1;
284 /* when linear implemented or taken from basic put here, right now using chain as a linear system */
285 /* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
286 return reduce__intra_basic_linear(sendbuf, recvbuf, count, datatype, op, root, comm, module);
287 /* return reduce__intra_chain(sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
289 if (message_size < 524288) {
290 if (message_size <= 65536 ) {
295 fanout = communicator_size/2;
297 /* later swap this for a binary tree */
299 return reduce__intra_chain(sendbuf, recvbuf, count, datatype, op, root, comm, module,
300 segsize, fanout, max_requests);
303 return reduce__intra_pipeline(sendbuf, recvbuf, count, datatype, op, root, comm, module,
304 segsize, max_requests);
308 int reduce_scatter__ompi(const void *sbuf, void *rbuf,
315 int comm_size, i, pow2;
316 size_t total_message_size, dsize;
317 const double a = 0.0012;
318 const double b = 8.0;
319 const size_t small_message_size = 12 * 1024;
320 const size_t large_message_size = 256 * 1024;
323 XBT_DEBUG("reduce_scatter__ompi");
325 comm_size = comm->size();
326 // We need data size for decision function
328 total_message_size = 0;
329 for (i = 0; i < comm_size; i++) {
330 total_message_size += rcounts[i];
331 if (0 == rcounts[i]) {
336 if (((op != MPI_OP_NULL) && not op->is_commutative()) || (zerocounts)) {
337 reduce_scatter__default(sbuf, rbuf, rcounts, dtype, op, comm);
341 total_message_size *= dsize;
343 // compute the nearest power of 2
344 for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);
346 if ((total_message_size <= small_message_size) ||
347 ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
348 (comm_size >= a * total_message_size + b)) {
349 return reduce_scatter__ompi_basic_recursivehalving(sbuf, rbuf, rcounts, dtype, op, comm);
351 return reduce_scatter__ompi_ring(sbuf, rbuf, rcounts, dtype, op, comm);
354 int allgather__ompi(const void *sbuf, int scount,
356 void* rbuf, int rcount,
361 int communicator_size, pow2_size;
362 size_t dsize, total_dsize;
364 communicator_size = comm->size();
366 /* Special case for 2 processes */
367 if (communicator_size == 2) {
368 return allgather__pair(sbuf, scount, sdtype,
369 rbuf, rcount, rdtype,
373 /* Determine complete data size */
374 dsize=sdtype->size();
375 total_dsize = dsize * scount * communicator_size;
377 for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<=1);
379 /* Decision based on MX 2Gb results from Grig cluster at
380 The University of Tennesse, Knoxville
381 - if total message size is less than 50KB use either bruck or
382 recursive doubling for non-power of two and power of two nodes,
384 - else use ring and neighbor exchange algorithms for odd and even
385 number of nodes, respectively.
387 if (total_dsize < 50000) {
388 if (pow2_size == communicator_size) {
389 return allgather__rdb(sbuf, scount, sdtype,
390 rbuf, rcount, rdtype,
393 return allgather__bruck(sbuf, scount, sdtype,
394 rbuf, rcount, rdtype,
398 if (communicator_size % 2) {
399 return allgather__ring(sbuf, scount, sdtype,
400 rbuf, rcount, rdtype,
403 return allgather__ompi_neighborexchange(sbuf, scount, sdtype,
404 rbuf, rcount, rdtype,
409 #if defined(USE_MPICH2_DECISION)
410 /* Decision as in MPICH-2
411 presented in Thakur et.al. "Optimization of Collective Communication
412 Operations in MPICH", International Journal of High Performance Computing
413 Applications, Vol. 19, No. 1, 49-66 (2005)
414 - for power-of-two processes and small and medium size messages
415 (up to 512KB) use recursive doubling
416 - for non-power-of-two processes and small messages (80KB) use bruck,
417 - for everything else use ring.
419 if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
420 return allgather__rdb(sbuf, scount, sdtype,
421 rbuf, rcount, rdtype,
423 } else if (total_dsize <= 81920) {
424 return allgather__bruck(sbuf, scount, sdtype,
425 rbuf, rcount, rdtype,
428 return allgather__ring(sbuf, scount, sdtype,
429 rbuf, rcount, rdtype,
431 #endif /* defined(USE_MPICH2_DECISION) */
434 int allgatherv__ompi(const void *sbuf, int scount,
436 void* rbuf, const int *rcounts,
443 int communicator_size;
444 size_t dsize, total_dsize;
446 communicator_size = comm->size();
448 /* Special case for 2 processes */
449 if (communicator_size == 2) {
450 return allgatherv__pair(sbuf, scount, sdtype,
451 rbuf, rcounts, rdispls, rdtype,
455 /* Determine complete data size */
456 dsize=sdtype->size();
458 for (i = 0; i < communicator_size; i++) {
459 total_dsize += dsize * rcounts[i];
462 /* Decision based on allgather decision. */
463 if (total_dsize < 50000) {
464 return allgatherv__ompi_bruck(sbuf, scount, sdtype,
465 rbuf, rcounts, rdispls, rdtype,
469 if (communicator_size % 2) {
470 return allgatherv__ring(sbuf, scount, sdtype,
471 rbuf, rcounts, rdispls, rdtype,
474 return allgatherv__ompi_neighborexchange(sbuf, scount, sdtype,
475 rbuf, rcounts, rdispls, rdtype,
481 int gather__ompi(const void *sbuf, int scount,
483 void* rbuf, int rcount,
489 //const int large_segment_size = 32768;
490 //const int small_segment_size = 1024;
492 //const size_t large_block_size = 92160;
493 const size_t intermediate_block_size = 6000;
494 const size_t small_block_size = 1024;
496 const int large_communicator_size = 60;
497 const int small_communicator_size = 10;
499 int communicator_size, rank;
500 size_t dsize, block_size;
502 XBT_DEBUG("smpi_coll_tuned_gather_ompi");
504 communicator_size = comm->size();
507 // Determine block size
509 dsize = rdtype->size();
510 block_size = dsize * rcount;
512 dsize = sdtype->size();
513 block_size = dsize * scount;
516 /* if (block_size > large_block_size) {*/
517 /* return smpi_coll_tuned_gather_ompi_linear_sync (sbuf, scount, sdtype, */
518 /* rbuf, rcount, rdtype, */
521 /* } else*/ if (block_size > intermediate_block_size) {
522 return gather__ompi_linear_sync(sbuf, scount, sdtype,
523 rbuf, rcount, rdtype,
526 } else if ((communicator_size > large_communicator_size) ||
527 ((communicator_size > small_communicator_size) &&
528 (block_size < small_block_size))) {
529 return gather__ompi_binomial(sbuf, scount, sdtype,
530 rbuf, rcount, rdtype,
534 // Otherwise, use basic linear
535 return gather__ompi_basic_linear(sbuf, scount, sdtype,
536 rbuf, rcount, rdtype,
541 int scatter__ompi(const void *sbuf, int scount,
543 void* rbuf, int rcount,
545 int root, MPI_Comm comm
548 const size_t small_block_size = 300;
549 const int small_comm_size = 10;
550 int communicator_size, rank;
551 size_t dsize, block_size;
553 XBT_DEBUG("Coll_scatter_ompi::scatter");
555 communicator_size = comm->size();
557 // Determine block size
559 dsize=sdtype->size();
560 block_size = dsize * scount;
562 dsize=rdtype->size();
563 block_size = dsize * rcount;
566 if ((communicator_size > small_comm_size) &&
567 (block_size < small_block_size)) {
568 std::unique_ptr<unsigned char[]> tmp_buf;
570 tmp_buf = std::make_unique<unsigned char[]>(rcount * rdtype->get_extent());
571 sbuf = tmp_buf.get();
575 return scatter__ompi_binomial(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm);
577 return scatter__ompi_basic_linear(sbuf, scount, sdtype,
578 rbuf, rcount, rdtype,