1 /* selector for collective algorithms based on openmpi's default coll_tuned_decision_fixed selector */
3 /* Copyright (c) 2009-2010, 2013. The SimGrid Team.
4 * All rights reserved. */
6 /* This program is free software; you can redistribute it and/or modify it
7 * under the terms of the license (GNU LGPL) which comes with this package. */
9 #include "colls_private.h"
/* Allreduce algorithm selector, mirroring Open MPI's coll_tuned fixed
 * decision: pick recursive doubling, lr (ring-like), segmented ring, or
 * reduce+broadcast based on total message size and op commutativity.
 * NOTE(review): this view of the file has elided lines (call arguments and
 * some braces are missing); comments describe only the visible structure. */
12 int smpi_coll_tuned_allreduce_ompi(void *sbuf, void *rbuf, int count,
13 MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
15 size_t dsize, block_dsize;
16 int comm_size = smpi_comm_size(comm);
/* size threshold (bytes) separating "small" from "large" messages */
17 const size_t intermediate_message = 10000;
20 * Decision function based on MX results from the Grig cluster at UTK.
22 * Currently, linear, recursive doubling, and nonoverlapping algorithms
23 * can handle both commutative and non-commutative operations.
24 * Ring algorithm does not support non-commutative operations.
/* total payload in bytes = datatype size * element count */
26 dsize = smpi_datatype_size(dtype);
27 block_dsize = dsize * count;
/* small messages: recursive doubling handles any (even non-commutative) op */
29 if (block_dsize < intermediate_message) {
30 return (smpi_coll_tuned_allreduce_rdb (sbuf, rbuf,
/* ring-style algorithms require a commutative op and count > comm_size */
35 if( smpi_op_is_commute(op) && (count > comm_size) ) {
36 const size_t segment_size = 1 << 20; /* 1 MB */
/* message fits in one 1 MB segment per rank: plain lr/ring variant */
37 if ((comm_size * segment_size >= block_dsize)) {
38 //FIXME: ok, these are not the right algorithms, try to find closer ones
39 // lr is a good match for allreduce_ring (difference is mainly the use of sendrecv)
40 return smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype,
/* otherwise: segmented ring, pipelining the reduction in segments */
43 return (smpi_coll_tuned_allreduce_ompi_ring_segmented (sbuf, rbuf,
/* fallback (non-commutative op or small count): redbcast — per its name,
 * presumably a reduce-to-root followed by broadcast; verify in full source */
50 return (smpi_coll_tuned_allreduce_redbcast(sbuf, rbuf, count,
/* Alltoall algorithm selector: bruck for tiny messages on larger
 * communicators, basic linear for intermediate sizes, ring otherwise. */
56 int smpi_coll_tuned_alltoall_ompi( void *sbuf, int scount,
58 void* rbuf, int rcount,
62 int communicator_size;
63 size_t dsize, block_dsize;
64 communicator_size = smpi_comm_size(comm);
66 /* Decision function based on measurement on Grig cluster at
67 the University of Tennessee (2GB MX) up to 64 nodes.
68 Has better performance for messages of intermediate sizes than the old one */
69 /* determine block size */
70 dsize = smpi_datatype_size(sdtype);
71 block_dsize = dsize * scount;
/* tiny per-peer blocks (< 200 B) on > 12 ranks: bruck minimizes latency */
73 if ((block_dsize < 200) && (communicator_size > 12)) {
74 return smpi_coll_tuned_alltoall_bruck(sbuf, scount, sdtype,
/* intermediate blocks (< 3000 B): basic linear (all pairs at once) */
78 } else if (block_dsize < 3000) {
79 return smpi_coll_tuned_alltoall_basic_linear(sbuf, scount, sdtype,
/* large blocks: ring exchange to limit congestion */
84 return smpi_coll_tuned_alltoall_ring (sbuf, scount, sdtype,
/* Alltoallv "selector": no tuned decision yet — unconditionally delegates
 * to the basic linear implementation. */
89 int smpi_coll_tuned_alltoallv_ompi(void *sbuf, int *scounts, int *sdisps,
91 void *rbuf, int *rcounts, int *rdisps,
96 /* For starters, just keep the original algorithm. */
97 return smpi_coll_tuned_alltoallv_ompi_basic_linear(sbuf, scounts, sdisps, sdtype,
98 rbuf, rcounts, rdisps,rdtype,
/* Barrier selector: dedicated two-process algorithm for size 2; otherwise
 * recursive doubling for power-of-two communicators, bruck for the rest. */
103 int smpi_coll_tuned_barrier_ompi(MPI_Comm comm)
104 { int communicator_size = smpi_comm_size(comm);
106 if( 2 == communicator_size )
107 return smpi_coll_tuned_barrier_ompi_two_procs(comm);
108 /* * Basic optimisation. If we have a power of 2 number of nodes*/
109 /* * the use the recursive doubling algorithm, otherwise*/
110 /* * bruck is the one we want.*/
/* scan the bits of communicator_size from LSB up; a set low bit signals a
 * non-power-of-two size. NOTE(review): lines are elided between L57 and
 * L58 (orig. 115), so the exact power-of-two guard isn't fully visible. */
113 for( ; communicator_size > 0; communicator_size >>= 1 ) {
114 if( communicator_size & 0x1 ) {
116 return smpi_coll_tuned_barrier_ompi_bruck(comm);
/* reached only for power-of-two sizes (per the comment above) */
121 return smpi_coll_tuned_barrier_ompi_recursivedoubling(comm);
/* Broadcast algorithm selector. First section: MX-based decision (binomial,
 * split-binary, or pipeline with decreasing segment sizes as the linear
 * model communicator_size < a*message_size + b selects larger clusters).
 * Second section: an alternative GigE-based decision; given the returns
 * above it looks unreachable as shown, but lines are elided — verify
 * against the full source. */
124 int smpi_coll_tuned_bcast_ompi(void *buff, int count,
125 MPI_Datatype datatype, int root,
129 /* Decision function based on MX results for
130 messages up to 36MB and communicator sizes up to 64 nodes */
131 const size_t small_message_size = 2048;
132 const size_t intermediate_message_size = 370728;
/* linear-model coefficients fit for 16-, 64- and 128-node runs:
 * a_pN [1/byte] slope, b_pN intercept of the size/ranks trade-off lines */
133 const double a_p16 = 3.2118e-6; /* [1 / byte] */
134 const double b_p16 = 8.7936;
135 const double a_p64 = 2.3679e-6; /* [1 / byte] */
136 const double b_p64 = 1.1787;
137 const double a_p128 = 1.6134e-6; /* [1 / byte] */
138 const double b_p128 = 2.1102;
140 int communicator_size;
142 size_t message_size, dsize;
144 communicator_size = smpi_comm_size(comm);
146 /* else we need data size for decision function */
147 dsize = smpi_datatype_size(datatype);
148 message_size = dsize * (unsigned long)count; /* needed for decision */
150 /* Handle messages of small and intermediate size, and
151 single-element broadcasts */
152 if ((message_size < small_message_size) || (count <= 1)) {
153 /* Binomial without segmentation */
154 return smpi_coll_tuned_bcast_binomial_tree (buff, count, datatype,
157 } else if (message_size < intermediate_message_size) {
158 // SplittedBinary with 1KB segments
159 return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype,
163 //Handle large message sizes
164 else if (communicator_size < (a_p128 * message_size + b_p128)) {
165 //Pipeline with 128KB segments
166 //segsize = 1024 << 7;
167 return smpi_coll_tuned_bcast_ompi_pipeline (buff, count, datatype,
171 } else if (communicator_size < 13) {
172 // Split Binary with 8KB segments
173 return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype,
176 } else if (communicator_size < (a_p64 * message_size + b_p64)) {
177 // Pipeline with 64KB segments
178 //segsize = 1024 << 6;
179 return smpi_coll_tuned_bcast_ompi_pipeline (buff, count, datatype,
183 } else if (communicator_size < (a_p16 * message_size + b_p16)) {
184 //Pipeline with 16KB segments
185 //segsize = 1024 << 4;
186 return smpi_coll_tuned_bcast_ompi_pipeline (buff, count, datatype,
191 /* Pipeline with 8KB segments */
192 //segsize = 1024 << 3;
193 return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype,
/* ---- alternative decision path, GigE measurements (see header note) ---- */
197 /* this is based on gige measurements */
199 if (communicator_size < 4) {
200 return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
202 if (communicator_size == 4) {
/* 512 KB threshold: unsegmented below, 16 KB segments above */
203 if (message_size < 524288) segsize = 0;
204 else segsize = 16384;
205 return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
207 if (communicator_size <= 8 && message_size < 4096) {
208 return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
210 if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
212 return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
214 if (message_size >= 524288) {
216 return smpi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, module, segsize);
219 /* once tested can swap this back in */
220 /* return smpi_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
221 return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
/* Reduce algorithm selector. Non-commutative ops are restricted to linear
 * or in-order binary tree; commutative ops choose among linear, binomial,
 * pipeline, binary and chain using message size, communicator size and
 * three fitted linear models (aN*message_size + bN thresholds). */
225 int smpi_coll_tuned_reduce_ompi( void *sendbuf, void *recvbuf,
226 int count, MPI_Datatype datatype,
231 int communicator_size=0;
233 size_t message_size, dsize;
/* fitted decision-line coefficients: slope aN [1/B], intercept bN */
234 const double a1 = 0.6016 / 1024.0; /* [1/B] */
235 const double b1 = 1.3496;
236 const double a2 = 0.0410 / 1024.0; /* [1/B] */
237 const double b2 = 9.7128;
238 const double a3 = 0.0422 / 1024.0; /* [1/B] */
239 const double b3 = 1.1614;
240 //const double a4 = 0.0033 / 1024.0; /* [1/B] */
241 //const double b4 = 1.6761;
243 //const int max_requests = 0; /* no limit on # of outstanding requests */
245 communicator_size = smpi_comm_size(comm);
247 /* need data size for decision function */
248 dsize=smpi_datatype_size(datatype);
249 message_size = dsize * count; /* needed for decision */
252 * If the operation is non commutative we currently have choice of linear
253 * or in-order binary tree algorithm.
255 if( !smpi_op_is_commute(op) ) {
/* small communicator + small message: linear; otherwise in-order binary
 * tree preserves the operand order non-commutative ops require */
256 if ((communicator_size < 12) && (message_size < 2048)) {
257 return smpi_coll_tuned_reduce_ompi_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm/*, module*/);
259 return smpi_coll_tuned_reduce_ompi_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
/* ---- commutative operations from here on ---- */
263 if ((communicator_size < 8) && (message_size < 512)){
265 return smpi_coll_tuned_reduce_ompi_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm);
266 } else if (((communicator_size < 8) && (message_size < 20480)) ||
267 (message_size < 2048) || (count <= 1)) {
270 return smpi_coll_tuned_reduce_ompi_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
271 segsize, max_requests*/);
272 } else if (communicator_size > (a1 * message_size + b1)) {
275 return smpi_coll_tuned_reduce_ompi_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
276 segsize, max_requests*/);
277 } else if (communicator_size > (a2 * message_size + b2)) {
280 return smpi_coll_tuned_reduce_ompi_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
281 segsize, max_requests*/);
282 } else if (communicator_size > (a3 * message_size + b3)) {
285 return smpi_coll_tuned_reduce_ompi_binary( sendbuf, recvbuf, count, datatype, op, root,
286 comm/*, module, segsize, max_requests*/);
288 /*if (communicator_size > (a4 * message_size + b4)) {
295 return smpi_coll_tuned_reduce_ompi_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
296 segsize, max_requests*/);
/* ---- alternative/legacy decision path below; given the returns above it
 * looks unreachable as shown, but lines are elided — verify in full source */
299 /* for small messages use linear algorithm */
300 if (message_size <= 4096) {
302 fanout = communicator_size - 1;
303 /* when linear implemented or taken from basic put here, right now using chain as a linear system */
304 /* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
305 return smpi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
306 /* return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
308 if (message_size < 524288) {
309 if (message_size <= 65536 ) {
314 fanout = communicator_size/2;
316 /* later swap this for a binary tree */
318 return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, module,
319 segsize, fanout, max_requests);
322 return smpi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
323 segsize, max_requests);
/* Reduce_scatter selector: non-commutative ops or zero-sized receive counts
 * fall back to the default smpi_mpi_reduce_scatter; otherwise pick
 * recursive halving (small/medium messages, or power-of-two size, or per
 * the fitted model) versus ring for large messages. */
327 int smpi_coll_tuned_reduce_scatter_ompi( void *sbuf, void *rbuf,
334 int comm_size, i, pow2;
335 size_t total_message_size, dsize;
/* fitted decision line: comm_size >= a*total_message_size + b */
336 const double a = 0.0012;
337 const double b = 8.0;
338 const size_t small_message_size = 12 * 1024;
339 const size_t large_message_size = 256 * 1024;
342 XBT_DEBUG("smpi_coll_tuned_reduce_scatter_ompi");
344 comm_size = smpi_comm_size(comm);
345 // We need data size for decision function
346 dsize=smpi_datatype_size(dtype);
347 total_message_size = 0;
/* sum all per-rank receive counts; an elided line (orig. 351) presumably
 * flags zerocounts when some rcounts[i] == 0 */
348 for (i = 0; i < comm_size; i++) {
349 total_message_size += rcounts[i];
350 if (0 == rcounts[i]) {
/* tuned variants need a commutative op and no zero counts */
355 if( !smpi_op_is_commute(op) || (zerocounts)) {
356 smpi_mpi_reduce_scatter (sbuf, rbuf, rcounts,
/* convert element total to bytes for the thresholds below */
362 total_message_size *= dsize;
364 // compute the nearest power of 2
365 for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);
367 if ((total_message_size <= small_message_size) ||
368 ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
369 (comm_size >= a * total_message_size + b)) {
371 smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving(sbuf, rbuf, rcounts,
375 return smpi_coll_tuned_reduce_scatter_ompi_ring(sbuf, rbuf, rcounts,
/* Allgather selector. Default path (Grig/MX-based): pair for 2 ranks, then
 * rdb/bruck for < 50 KB total (power-of-two vs not), then ring vs
 * neighbor-exchange (odd vs even rank count). The USE_MPICH2_DECISION
 * variant applies the thresholds from Thakur et al. instead. */
383 int smpi_coll_tuned_allgather_ompi(void *sbuf, int scount,
385 void* rbuf, int rcount,
390 int communicator_size, pow2_size;
391 size_t dsize, total_dsize;
393 communicator_size = smpi_comm_size(comm);
395 /* Special case for 2 processes */
396 if (communicator_size == 2) {
397 return smpi_coll_tuned_allgather_pair (sbuf, scount, sdtype,
398 rbuf, rcount, rdtype,
402 /* Determine complete data size */
403 dsize=smpi_datatype_size(sdtype);
404 total_dsize = dsize * scount * communicator_size;
/* smallest power of two >= communicator_size */
406 for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<=1);
408 /* Decision based on MX 2Gb results from Grig cluster at
409 The University of Tennesse, Knoxville
410 - if total message size is less than 50KB use either bruck or
411 recursive doubling for non-power of two and power of two nodes,
413 - else use ring and neighbor exchange algorithms for odd and even
414 number of nodes, respectively.
416 if (total_dsize < 50000) {
417 if (pow2_size == communicator_size) {
418 return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype,
419 rbuf, rcount, rdtype,
422 return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype,
423 rbuf, rcount, rdtype,
/* >= 50 KB: odd rank count -> ring, even -> neighbor exchange */
427 if (communicator_size % 2) {
428 return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
429 rbuf, rcount, rdtype,
432 return smpi_coll_tuned_allgather_ompi_neighborexchange(sbuf, scount, sdtype,
433 rbuf, rcount, rdtype,
438 #if defined(USE_MPICH2_DECISION)
439 /* Decision as in MPICH-2
440 presented in Thakur et.al. "Optimization of Collective Communication
441 Operations in MPICH", International Journal of High Performance Computing
442 Applications, Vol. 19, No. 1, 49-66 (2005)
443 - for power-of-two processes and small and medium size messages
444 (up to 512KB) use recursive doubling
445 - for non-power-of-two processes and small messages (80KB) use bruck,
446 - for everything else use ring.
448 if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
449 return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype,
450 rbuf, rcount, rdtype,
452 } else if (total_dsize <= 81920) {
453 return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype,
454 rbuf, rcount, rdtype,
457 return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
458 rbuf, rcount, rdtype,
460 #endif /* defined(USE_MPICH2_DECISION) */
/* Allgatherv selector, mirroring the allgather decision: pair for 2 ranks,
 * ring for < 50 KB total (bruck variant commented out), then ring vs
 * neighbor-exchange for odd vs even rank counts. */
463 int smpi_coll_tuned_allgatherv_ompi(void *sbuf, int scount,
465 void* rbuf, int *rcounts,
472 int communicator_size;
473 size_t dsize, total_dsize;
475 communicator_size = smpi_comm_size(comm);
477 /* Special case for 2 processes */
478 if (communicator_size == 2) {
479 return smpi_coll_tuned_allgatherv_pair(sbuf, scount, sdtype,
480 rbuf, rcounts, rdispls, rdtype,
484 /* Determine complete data size */
485 dsize=smpi_datatype_size(sdtype);
/* NOTE(review): total_dsize's initialization to 0 is on an elided line
 * (orig. 486) — confirm against the full source before changing this */
487 for (i = 0; i < communicator_size; i++) {
488 total_dsize += dsize * rcounts[i];
491 /* Decision based on allgather decision. */
492 if (total_dsize < 50000) {
493 /* return smpi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype,
494 rbuf, rcounts, rdispls, rdtype,
496 return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
497 rbuf, rcounts, rdispls, rdtype,
/* >= 50 KB: odd rank count -> ring, even -> neighbor exchange */
501 if (communicator_size % 2) {
502 return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
503 rbuf, rcounts, rdispls, rdtype,
506 return smpi_coll_tuned_allgatherv_ompi_neighborexchange(sbuf, scount, sdtype,
507 rbuf, rcounts, rdispls, rdtype,
/* Gather selector: linear-with-sync for large blocks, binomial for big
 * communicators (or mid-size communicators with small blocks), basic
 * linear otherwise. */
513 int smpi_coll_tuned_gather_ompi(void *sbuf, int scount,
515 void* rbuf, int rcount,
521 //const int large_segment_size = 32768;
522 //const int small_segment_size = 1024;
524 //const size_t large_block_size = 92160;
525 const size_t intermediate_block_size = 6000;
526 const size_t small_block_size = 1024;
528 const int large_communicator_size = 60;
529 const int small_communicator_size = 10;
531 int communicator_size, rank;
532 size_t dsize, block_size;
534 XBT_DEBUG("smpi_coll_tuned_gather_ompi");
536 communicator_size = smpi_comm_size(comm);
537 rank = smpi_comm_rank(comm);
539 // Determine block size
/* block_size is computed from rdtype*rcount then sdtype*scount; elided
 * lines (orig. 540/543) presumably guard these on root vs non-root rank —
 * confirm in the full source */
541 dsize = smpi_datatype_size(rdtype);
542 block_size = dsize * rcount;
544 dsize = smpi_datatype_size(sdtype);
545 block_size = dsize * scount;
548 /* if (block_size > large_block_size) {*/
549 /* return smpi_coll_tuned_gather_ompi_linear_sync (sbuf, scount, sdtype, */
550 /* rbuf, rcount, rdtype, */
/* large blocks: linear with synchronization to avoid flooding the root */
553 /* } else*/ if (block_size > intermediate_block_size) {
554 return smpi_coll_tuned_gather_ompi_linear_sync (sbuf, scount, sdtype,
555 rbuf, rcount, rdtype,
/* many ranks, or mid-size communicator with small blocks: binomial tree */
558 } else if ((communicator_size > large_communicator_size) ||
559 ((communicator_size > small_communicator_size) &&
560 (block_size < small_block_size))) {
561 return smpi_coll_tuned_gather_ompi_binomial (sbuf, scount, sdtype,
562 rbuf, rcount, rdtype,
566 // Otherwise, use basic linear
567 return smpi_coll_tuned_gather_ompi_basic_linear (sbuf, scount, sdtype,
568 rbuf, rcount, rdtype,
572 int smpi_coll_tuned_scatter_ompi(void *sbuf, int scount,
574 void* rbuf, int rcount,
576 int root, MPI_Comm comm
579 const size_t small_block_size = 300;
580 const int small_comm_size = 10;
581 int communicator_size, rank;
582 size_t dsize, block_size;
584 XBT_DEBUG("smpi_coll_tuned_scatter_ompi");
586 communicator_size = smpi_comm_size(comm);
587 rank = smpi_comm_rank(comm);
588 // Determine block size
590 dsize=smpi_datatype_size(sdtype);
591 block_size = dsize * scount;
593 dsize=smpi_datatype_size(rdtype);
594 block_size = dsize * rcount;
597 if ((communicator_size > small_comm_size) &&
598 (block_size < small_block_size)) {
600 sbuf=xbt_malloc(rcount*smpi_datatype_get_extent(rdtype));
604 int ret=smpi_coll_tuned_scatter_ompi_binomial (sbuf, scount, sdtype,
605 rbuf, rcount, rdtype,
612 return smpi_coll_tuned_scatter_ompi_basic_linear (sbuf, scount, sdtype,
613 rbuf, rcount, rdtype,