1 /* selector for collective algorithms based on openmpi's default coll_tuned_decision_fixed selector */
3 /* Copyright (c) 2009, 2010. The SimGrid Team.
4 * All rights reserved. */
6 /* This program is free software; you can redistribute it and/or modify it
7 * under the terms of the license (GNU LGPL) which comes with this package. */
9 #include "colls_private.h"
/* Select an allreduce algorithm from the total message size (dsize*count)
 * and the commutativity of `op`, following Open MPI's fixed decision rules.
 * NOTE(review): this copy is truncated — several call argument lists and the
 * closing braces are missing; verify against the upstream file. */
12 int smpi_coll_tuned_allreduce_ompi(void *sbuf, void *rbuf, int count,
13 MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
15 size_t dsize, block_dsize;
16 int comm_size = smpi_comm_size(comm);
17 const size_t intermediate_message = 10000; /* bytes: below this use recursive doubling */
20 * Decision function based on MX results from the Grig cluster at UTK.
22 * Currently, linear, recursive doubling, and nonoverlapping algorithms
23 * can handle both commutative and non-commutative operations.
24 * Ring algorithm does not support non-commutative operations.
26 dsize = smpi_datatype_size(dtype);
27 block_dsize = dsize * count;
/* Small messages: recursive doubling works for any operation. */
29 if (block_dsize < intermediate_message) {
30 return (smpi_coll_tuned_allreduce_rdb (sbuf, rbuf,
/* Ring-style algorithms require a commutative op and count > comm_size. */
35 if( smpi_op_is_commute(op) && (count > comm_size) ) {
36 const size_t segment_size = 1 << 20; /* 1 MB */
37 if ((comm_size * segment_size >= block_dsize)) {
38 //FIXME: ok, these are not the right algorithms, try to find closer ones
39 // lr is a good match for allreduce_ring (difference is mainly the use of sendrecv)
40 return smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype,
43 // return (smpi_coll_tuned_allreduce_intra_ring_segmented (sbuf, rbuf,
44 return (smpi_coll_tuned_allreduce_ompi_ring_segmented (sbuf, rbuf,
/* Fallback (non-commutative op or small count): reduce followed by bcast. */
51 return (smpi_coll_tuned_allreduce_redbcast(sbuf, rbuf, count,
/* Select an alltoall algorithm from the per-block send size and the
 * communicator size.
 * NOTE(review): argument lists are truncated in this copy — the trailing
 * rbuf/rcount/rdtype/comm arguments of each call are missing; verify. */
57 int smpi_coll_tuned_alltoall_ompi( void *sbuf, int scount,
59 void* rbuf, int rcount,
63 int communicator_size;
64 size_t dsize, block_dsize;
65 communicator_size = smpi_comm_size(comm);
67 /* Decision function based on measurement on Grig cluster at
68 the University of Tennessee (2GB MX) up to 64 nodes.
69 Has better performance for messages of intermediate sizes than the old one */
70 /* determine block size */
71 dsize = smpi_datatype_size(sdtype);
72 block_dsize = dsize * scount;
/* Tiny blocks (<200 B) on larger communicators: Bruck's algorithm. */
74 if ((block_dsize < 200) && (communicator_size > 12)) {
75 return smpi_coll_tuned_alltoall_bruck(sbuf, scount, sdtype,
/* Intermediate blocks (<3000 B): simple isend/irecv exchange. */
79 } else if (block_dsize < 3000) {
80 return smpi_coll_tuned_alltoall_simple(sbuf, scount, sdtype,
/* Large blocks: pairwise exchange. */
85 return smpi_coll_tuned_alltoall_pair (sbuf, scount, sdtype,
/* alltoallv selector: no tuned decision yet — unconditionally delegates to
 * the Bruck implementation.
 * NOTE(review): the sdtype parameter line and the final call arguments are
 * missing in this truncated copy. */
90 int smpi_coll_tuned_alltoallv_ompi(void *sbuf, int *scounts, int *sdisps,
92 void *rbuf, int *rcounts, int *rdisps,
97 /* For starters, just keep the original algorithm. */
98 return smpi_coll_tuned_alltoallv_bruck(sbuf, scounts, sdisps, sdtype,
99 rbuf, rcounts, rdisps,rdtype,
/* Select a barrier algorithm: dedicated two-process variant, recursive
 * doubling when the communicator size is a power of two, Bruck otherwise.
 * NOTE(review): `module` is used but not declared in the visible lines, and
 * the loop body is missing lines (e.g. where `has_one` is updated) — this
 * copy appears truncated; confirm against the upstream file. */
104 void smpi_coll_tuned_barrier_ompi(MPI_Comm comm)
105 { int communicator_size = smpi_comm_size(comm);
107 if( 2 == communicator_size )
108 return smpi_coll_tuned_barrier_intra_two_procs(comm, module);
109 * Basic optimisation. If we have a power of 2 number of nodes
110 * the use the recursive doubling algorithm, otherwise
111 * bruck is the one we want.
/* Scan the bits of communicator_size; a second set bit means it is not a
 * power of two, so fall back to Bruck. */
113 bool has_one = false;
114 for( ; communicator_size > 0; communicator_size >>= 1 ) {
115 if( communicator_size & 0x1 ) {
117 return smpi_coll_tuned_barrier_intra_bruck(comm, module);
122 return smpi_coll_tuned_barrier_intra_recursivedoubling(comm, module);
/* Select a broadcast algorithm from message size and communicator size.
 * The a_pNN/b_pNN pairs are linear-fit coefficients (communicator_size as a
 * function of message_size) separating the pipeline segment-size regimes.
 * NOTE(review): this copy is truncated (argument lists cut, braces missing),
 * and everything after the flattree_pipeline return is unreachable as
 * written — in the upstream file that "gige" decision block is disabled;
 * confirm before relying on it. */
125 int smpi_coll_tuned_bcast_ompi(void *buff, int count,
126 MPI_Datatype datatype, int root,
130 /* Decision function based on MX results for
131 messages up to 36MB and communicator sizes up to 64 nodes */
132 const size_t small_message_size = 2048;
133 const size_t intermediate_message_size = 370728;
134 const double a_p16 = 3.2118e-6; /* [1 / byte] */
135 const double b_p16 = 8.7936;
136 const double a_p64 = 2.3679e-6; /* [1 / byte] */
137 const double b_p64 = 1.1787;
138 const double a_p128 = 1.6134e-6; /* [1 / byte] */
139 const double b_p128 = 2.1102;
141 int communicator_size;
143 size_t message_size, dsize;
145 communicator_size = smpi_comm_size(comm);
147 /* else we need data size for decision function */
148 dsize = smpi_datatype_size(datatype);
149 message_size = dsize * (unsigned long)count; /* needed for decision */
151 /* Handle messages of small and intermediate size, and
152 single-element broadcasts */
153 if ((message_size < small_message_size) || (count <= 1)) {
154 /* Binomial without segmentation */
155 return smpi_coll_tuned_bcast_binomial_tree (buff, count, datatype,
158 } else if (message_size < intermediate_message_size) {
159 // SplittedBinary with 1KB segments
160 return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype,
164 //Handle large message sizes
165 else if (communicator_size < (a_p128 * message_size + b_p128)) {
166 //Pipeline with 128KB segments
167 //segsize = 1024 << 7;
168 return smpi_coll_tuned_bcast_ompi_pipeline (buff, count, datatype,
172 } else if (communicator_size < 13) {
173 // Split Binary with 8KB segments
174 return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype,
177 } else if (communicator_size < (a_p64 * message_size + b_p64)) {
178 // Pipeline with 64KB segments
179 //segsize = 1024 << 6;
180 return smpi_coll_tuned_bcast_ompi_pipeline (buff, count, datatype,
184 } else if (communicator_size < (a_p16 * message_size + b_p16)) {
185 //Pipeline with 16KB segments
186 //segsize = 1024 << 4;
187 return smpi_coll_tuned_bcast_ompi_pipeline (buff, count, datatype,
/* Final fallback of the MX-based decision: flat-tree pipeline. */
192 /* Pipeline with 8KB segments */
193 //segsize = 1024 << 3;
194 return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype,
/* ---- Alternative decision table below; dead code after the return above.
 * NOTE(review): uses `segsize`/`module`, neither declared in the visible
 * lines — presumably from the disabled upstream variant; verify. ---- */
198 /* this is based on gige measurements */
200 if (communicator_size < 4) {
201 return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
203 if (communicator_size == 4) {
204 if (message_size < 524288) segsize = 0;
205 else segsize = 16384;
206 return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
208 if (communicator_size <= 8 && message_size < 4096) {
209 return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
211 if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
213 return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
215 if (message_size >= 524288) {
217 return smpi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, module, segsize);
220 /* once tested can swap this back in */
221 /* return smpi_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
222 return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
/* Select a reduce algorithm. Non-commutative ops are restricted to linear
 * or in-order binary tree; commutative ops choose among linear, binomial,
 * binary and pipeline variants via the a1..a3/b1..b3 linear-fit thresholds
 * (communicator_size vs. message_size).
 * NOTE(review): truncated copy — braces and some segsize assignments are
 * missing, and the tail after line 300 references `fanout`, `segsize`,
 * `max_requests` and `module`, none declared in the visible lines;
 * presumably a disabled alternative decision block upstream — verify. */
226 int smpi_coll_tuned_reduce_ompi( void *sendbuf, void *recvbuf,
227 int count, MPI_Datatype datatype,
232 int communicator_size=0;
234 size_t message_size, dsize;
235 const double a1 = 0.6016 / 1024.0; /* [1/B] */
236 const double b1 = 1.3496;
237 const double a2 = 0.0410 / 1024.0; /* [1/B] */
238 const double b2 = 9.7128;
239 const double a3 = 0.0422 / 1024.0; /* [1/B] */
240 const double b3 = 1.1614;
241 //const double a4 = 0.0033 / 1024.0; /* [1/B] */
242 //const double b4 = 1.6761;
244 //const int max_requests = 0; /* no limit on # of outstanding requests */
246 communicator_size = smpi_comm_size(comm);
248 /* need data size for decision function */
249 dsize=smpi_datatype_size(datatype);
250 message_size = dsize * count; /* needed for decision */
253 * If the operation is non commutative we currently have choice of linear
254 * or in-order binary tree algorithm.
256 if( !smpi_op_is_commute(op) ) {
257 if ((communicator_size < 12) && (message_size < 2048)) {
258 return smpi_coll_tuned_reduce_ompi_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm/*, module*/);
260 return smpi_coll_tuned_reduce_ompi_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
/* Commutative operation: size-based selection below. */
264 if ((communicator_size < 8) && (message_size < 512)){
266 return smpi_coll_tuned_reduce_ompi_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm);
267 } else if (((communicator_size < 8) && (message_size < 20480)) ||
268 (message_size < 2048) || (count <= 1)) {
271 return smpi_coll_tuned_reduce_ompi_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
272 segsize, max_requests*/);
273 } else if (communicator_size > (a1 * message_size + b1)) {
276 return smpi_coll_tuned_reduce_ompi_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
277 segsize, max_requests*/);
278 } else if (communicator_size > (a2 * message_size + b2)) {
281 return smpi_coll_tuned_reduce_ompi_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
282 segsize, max_requests*/);
283 } else if (communicator_size > (a3 * message_size + b3)) {
286 return smpi_coll_tuned_reduce_ompi_binary( sendbuf, recvbuf, count, datatype, op, root,
287 comm/*, module, segsize, max_requests*/);
289 /*if (communicator_size > (a4 * message_size + b4)) {
296 return smpi_coll_tuned_reduce_ompi_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
297 segsize, max_requests*/);
300 /* for small messages use linear algorithm */
301 if (message_size <= 4096) {
303 fanout = communicator_size - 1;
304 /* when linear implemented or taken from basic put here, right now using chain as a linear system */
305 /* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
306 return smpi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
307 /* return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
309 if (message_size < 524288) {
310 if (message_size <= 65536 ) {
315 fanout = communicator_size/2;
317 /* later swap this for a binary tree */
319 return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, module,
320 segsize, fanout, max_requests);
323 return smpi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
324 segsize, max_requests);
/* reduce_scatter selector — note the opening token is "/*int": this whole
 * function appears to be commented out (it also calls ompi_/OPAL_ symbols
 * instead of the smpi_ wrappers used elsewhere in this file).
 * NOTE(review): confirm the closing comment delimiter exists in the full
 * file; //-style notes are used below to avoid nesting block comments. */
328 /*int smpi_coll_tuned_reduce_scatter_ompi( void *sbuf, void *rbuf,
335 int comm_size, i, pow2;
336 size_t total_message_size, dsize;
337 const double a = 0.0012;
338 const double b = 8.0;
339 const size_t small_message_size = 12 * 1024;
340 const size_t large_message_size = 256 * 1024;
341 bool zerocounts = false;
343 OPAL_OUTPUT((smpi_coll_tuned_stream, "smpi_coll_tuned_reduce_scatter_ompi"));
345 comm_size = smpi_comm_size(comm);
346 // We need data size for decision function
347 ompi_datatype_type_size(dtype, &dsize);
348 total_message_size = 0;
// Sum the receive counts; any zero count forces the nonoverlapping variant.
349 for (i = 0; i < comm_size; i++) {
350 total_message_size += rcounts[i];
351 if (0 == rcounts[i]) {
// Non-commutative ops and zero counts only work with nonoverlapping.
356 if( !ompi_op_is_commute(op) || (zerocounts)) {
357 return smpi_coll_tuned_reduce_scatter_intra_nonoverlapping (sbuf, rbuf, rcounts,
362 total_message_size *= dsize;
364 // compute the nearest power of 2
365 for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);
// Recursive halving for small totals, power-of-two sizes with medium
// totals, or large communicators relative to message size; ring otherwise.
367 if ((total_message_size <= small_message_size) ||
368 ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
369 (comm_size >= a * total_message_size + b)) {
371 smpi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
375 return smpi_coll_tuned_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
380 return smpi_coll_tuned_reduce_scatter(sbuf, rbuf, rcounts,
/* Select an allgather algorithm from total gathered size and whether the
 * communicator size is a power of two. Two decision tables exist: the
 * MX/Grig-based one, and (under USE_MPICH2_DECISION) the MPICH-2 one.
 * NOTE(review): the #if line for the first table and several closing braces
 * are missing in this truncated copy; verify the preprocessor structure. */
386 int smpi_coll_tuned_allgather_ompi(void *sbuf, int scount,
388 void* rbuf, int rcount,
393 int communicator_size, pow2_size;
394 size_t dsize, total_dsize;
396 communicator_size = smpi_comm_size(comm);
398 /* Special case for 2 processes */
399 if (communicator_size == 2) {
400 return smpi_coll_tuned_allgather_pair (sbuf, scount, sdtype,
401 rbuf, rcount, rdtype,
405 /* Determine complete data size */
406 dsize=smpi_datatype_size(sdtype);
407 total_dsize = dsize * scount * communicator_size;
/* Smallest power of two >= communicator_size. */
409 for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<=1);
411 /* Decision based on MX 2Gb results from Grig cluster at
412 The University of Tennesse, Knoxville
413 - if total message size is less than 50KB use either bruck or
414 recursive doubling for non-power of two and power of two nodes,
416 - else use ring and neighbor exchange algorithms for odd and even
417 number of nodes, respectively.
419 if (total_dsize < 50000) {
420 if (pow2_size == communicator_size) {
421 return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype,
422 rbuf, rcount, rdtype,
425 return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype,
426 rbuf, rcount, rdtype,
/* Large totals: ring (odd/even split with neighborexchange is disabled). */
430 //if (communicator_size % 2) {
431 return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
432 rbuf, rcount, rdtype,
435 return smpi_coll_tuned_allgather_intra_neighborexchange(sbuf, scount, sdtype,
436 rbuf, rcount, rdtype,
441 #if defined(USE_MPICH2_DECISION)
442 /* Decision as in MPICH-2
443 presented in Thakur et.al. "Optimization of Collective Communication
444 Operations in MPICH", International Journal of High Performance Computing
445 Applications, Vol. 19, No. 1, 49-66 (2005)
446 - for power-of-two processes and small and medium size messages
447 (up to 512KB) use recursive doubling
448 - for non-power-of-two processes and small messages (80KB) use bruck,
449 - for everything else use ring.
451 if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
452 return smpi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
453 rbuf, rcount, rdtype,
455 } else if (total_dsize <= 81920) {
456 return smpi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
457 rbuf, rcount, rdtype,
460 return smpi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
461 rbuf, rcount, rdtype,
463 #endif /* defined(USE_MPICH2_DECISION) */
/* Select an allgatherv algorithm, mirroring the allgather decision but
 * summing the per-rank receive counts for the total size.
 * NOTE(review): no `total_dsize = 0;` initializer (nor the declaration of
 * `i`) is visible before the accumulation loop — likely lost in truncation;
 * verify, otherwise the sum reads an uninitialized value (UB). */
466 int smpi_coll_tuned_allgatherv_ompi(void *sbuf, int scount,
468 void* rbuf, int *rcounts,
475 int communicator_size;
476 size_t dsize, total_dsize;
478 communicator_size = smpi_comm_size(comm);
480 /* Special case for 2 processes */
481 if (communicator_size == 2) {
482 return smpi_coll_tuned_allgatherv_pair(sbuf, scount, sdtype,
483 rbuf, rcounts, rdispls, rdtype,
487 /* Determine complete data size */
488 dsize=smpi_datatype_size(sdtype);
490 for (i = 0; i < communicator_size; i++) {
491 total_dsize += dsize * rcounts[i];
494 /* Decision based on allgather decision. */
495 if (total_dsize < 50000) {
496 /* return smpi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype,
497 rbuf, rcounts, rdispls, rdtype,
499 return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
500 rbuf, rcounts, rdispls, rdtype,
/* Large totals: ring either way (odd/even neighborexchange split disabled). */
504 // if (communicator_size % 2) {
505 return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
506 rbuf, rcounts, rdispls, rdtype,
509 return smpi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
510 rbuf, rcounts, rdispls, rdtype,
/* Select a gather algorithm from the per-rank block size and communicator
 * size: synchronized linear for large/intermediate blocks, binomial for
 * large communicators or many small blocks, basic linear otherwise.
 * NOTE(review): the two back-to-back block_size computations (lines 545-549)
 * were presumably guarded by an `if (rank == root) ... else ...` lost in
 * truncation — as shown, the second simply overwrites the first; verify.
 * This block also calls ompi_/OPAL_ symbols unlike the smpi_ wrappers used
 * elsewhere in the file. */
516 int smpi_coll_tuned_gather_ompi(void *sbuf, int scount,
518 void* rbuf, int rcount,
524 const int large_segment_size = 32768;
525 const int small_segment_size = 1024;
527 const size_t large_block_size = 92160;
528 const size_t intermediate_block_size = 6000;
529 const size_t small_block_size = 1024;
531 const int large_communicator_size = 60;
532 const int small_communicator_size = 10;
534 int communicator_size, rank;
535 size_t dsize, block_size;
537 OPAL_OUTPUT((smpi_coll_tuned_stream,
538 "smpi_coll_tuned_gather_ompi"));
540 communicator_size = smpi_comm_size(comm);
541 rank = ompi_comm_rank(comm);
543 // Determine block size
545 ompi_datatype_type_size(rdtype, &dsize);
546 block_size = dsize * rcount;
548 ompi_datatype_type_size(sdtype, &dsize);
549 block_size = dsize * scount;
// Large blocks: linear with synchronization (large segments presumably).
552 if (block_size > large_block_size) {
553 return smpi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
554 rbuf, rcount, rdtype,
558 } else if (block_size > intermediate_block_size) {
559 return smpi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
560 rbuf, rcount, rdtype,
// Big communicators, or medium ones with small blocks: binomial tree.
564 } else if ((communicator_size > large_communicator_size) ||
565 ((communicator_size > small_communicator_size) &&
566 (block_size < small_block_size))) {
567 return smpi_coll_tuned_gather_intra_binomial (sbuf, scount, sdtype,
568 rbuf, rcount, rdtype,
572 // Otherwise, use basic linear
573 return smpi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype,
574 rbuf, rcount, rdtype,
/* Select a scatter algorithm: binomial tree for larger communicators with
 * small blocks, basic linear otherwise.
 * NOTE(review): the function runs past the end of this excerpt (the final
 * call's trailing arguments and closing brace are not visible), and the two
 * block_size computations (lines 597-601) were presumably root/non-root
 * branches lost in truncation — as shown the second overwrites the first. */
578 int smpi_coll_tuned_scatter_ompi(void *sbuf, int scount,
580 void* rbuf, int rcount,
582 int root, MPI_Comm comm,
585 const size_t small_block_size = 300;
586 const int small_comm_size = 10;
587 int communicator_size, rank;
588 size_t dsize, block_size;
590 OPAL_OUTPUT((smpi_coll_tuned_stream,
591 "smpi_coll_tuned_scatter_ompi"));
593 communicator_size = smpi_comm_size(comm);
594 rank = ompi_comm_rank(comm);
595 // Determine block size
597 ompi_datatype_type_size(sdtype, &dsize);
598 block_size = dsize * scount;
600 ompi_datatype_type_size(rdtype, &dsize);
601 block_size = dsize * rcount;
// Many ranks with small blocks: binomial tree; otherwise basic linear.
604 if ((communicator_size > small_comm_size) &&
605 (block_size < small_block_size)) {
606 return smpi_coll_tuned_scatter_intra_binomial (sbuf, scount, sdtype,
607 rbuf, rcount, rdtype,
610 return smpi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
611 rbuf, rcount, rdtype,