1 /* selector for collective algorithms based on openmpi's default coll_tuned_decision_fixed selector
4 /* Copyright (c) 2009-2022. The SimGrid Team.
5 * All rights reserved. */
7 /* This program is free software; you can redistribute it and/or modify it
8 * under the terms of the license (GNU LGPL) which comes with this package. */
10 #include "colls_private.hpp"
16 allreduce nonoverlapping, basic linear
/*
 * allreduce__ompi: pick an allreduce implementation and run it.
 *
 * The decision uses two quantities computed below: the communicator size
 * and the total payload size in bytes (datatype size times count). The
 * chosen algorithm is a 1-based index into the local funcs table, and the
 * function returns whatever the selected implementation returns.
 *
 * NOTE(review): the statements that assign the algorithm index are not
 * visible in this excerpt; only the threshold tests are.
 */
24 int allreduce__ompi(const void *sbuf, void *rbuf, int count,
25 MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
// Payload size in bytes; count is converted to ptrdiff_t before the multiplication.
27 size_t total_dsize = dtype->size() * (ptrdiff_t)count;
28 int communicator_size = comm->size();
// Dispatch table of candidate implementations, all sharing the allreduce signature.
30 int(*funcs[]) (const void*, void*, int, MPI_Datatype, MPI_Op, MPI_Comm)={
35 &allreduce__ompi_ring_segmented,
40 * {1, "basic_linear"},
41 * {2, "nonoverlapping"},
42 * {3, "recursive_doubling"},
44 * {5, "segmented_ring"},
47 * Currently, ring, segmented ring, and rabenseifner do not support
48 * non-commutative operations.
// First decision tree: taken when an op is supplied and it reports itself as non-commutative.
50 if ((op != MPI_OP_NULL) && not op->is_commutative()) {
51 if (communicator_size < 4) {
52 if (total_dsize < 131072) {
57 } else if (communicator_size < 8) {
59 } else if (communicator_size < 16) {
60 if (total_dsize < 1048576) {
65 } else if (communicator_size < 128) {
67 } else if (communicator_size < 256) {
68 if (total_dsize < 131072) {
70 } else if (total_dsize < 524288) {
75 } else if (communicator_size < 512) {
76 if (total_dsize < 4096) {
78 } else if (total_dsize < 524288) {
84 if (total_dsize < 2048) {
// Second decision tree; presumably the branch for the remaining (commutative) case,
// but the line introducing it is not visible in this excerpt.
91 if (communicator_size < 4) {
92 if (total_dsize < 8) {
94 } else if (total_dsize < 4096) {
96 } else if (total_dsize < 8192) {
98 } else if (total_dsize < 16384) {
100 } else if (total_dsize < 65536) {
102 } else if (total_dsize < 262144) {
107 } else if (communicator_size < 8) {
108 if (total_dsize < 16) {
110 } else if (total_dsize < 8192) {
115 } else if (communicator_size < 16) {
116 if (total_dsize < 8192) {
121 } else if (communicator_size < 32) {
122 if (total_dsize < 64) {
124 } else if (total_dsize < 4096) {
129 } else if (communicator_size < 64) {
130 if (total_dsize < 128) {
135 } else if (communicator_size < 128) {
136 if (total_dsize < 262144) {
141 } else if (communicator_size < 256) {
142 if (total_dsize < 131072) {
144 } else if (total_dsize < 262144) {
149 } else if (communicator_size < 512) {
150 if (total_dsize < 4096) {
155 } else if (communicator_size < 2048) {
156 if (total_dsize < 2048) {
158 } else if (total_dsize < 16384) {
163 } else if (communicator_size < 4096) {
164 if (total_dsize < 2048) {
166 } else if (total_dsize < 4096) {
168 } else if (total_dsize < 16384) {
174 if (total_dsize < 2048) {
176 } else if (total_dsize < 16384) {
178 } else if (total_dsize < 32768) {
// alg is 1-based, hence the -1 when indexing the table.
185 return funcs[alg-1](sbuf, rbuf, count, dtype, op, comm);
/*
 * alltoall__ompi: pick an alltoall implementation and run it.
 *
 * The thresholds below compare the communicator size and a byte size
 * computed as one element size times scount. The chosen algorithm is a
 * 1-based index into the local funcs table; the function returns the
 * result of the selected implementation.
 *
 * NOTE(review): part of the signature and the statements assigning the
 * algorithm index are not visible in this excerpt.
 */
190 int alltoall__ompi(const void *sbuf, int scount,
192 void* rbuf, int rcount,
197 size_t dsize, total_dsize;
198 int communicator_size = comm->size();
// Element size comes from sdtype when sbuf is not MPI_IN_PLACE; the second
// assignment uses rdtype (the line separating the two is not visible here).
200 if (MPI_IN_PLACE != sbuf) {
201 dsize = sdtype->size();
203 dsize = rdtype->size();
// Byte size used by the thresholds: element size times scount.
205 total_dsize = dsize * (ptrdiff_t)scount;
// Dispatch table; alltoall__basic_linear appears in more than one slot.
206 int (*funcs[])(const void *, int, MPI_Datatype, void*, int, MPI_Datatype, MPI_Comm) = {
207 &alltoall__basic_linear,
210 &alltoall__basic_linear,
211 &alltoall__basic_linear
216 * {3, "modified_bruck"},
217 * {4, "linear_sync"},
// A communicator of exactly two ranks has its own set of size thresholds.
220 if (communicator_size == 2) {
221 if (total_dsize < 2) {
223 } else if (total_dsize < 4) {
225 } else if (total_dsize < 16) {
227 } else if (total_dsize < 64) {
229 } else if (total_dsize < 256) {
231 } else if (total_dsize < 4096) {
233 } else if (total_dsize < 32768) {
235 } else if (total_dsize < 262144) {
237 } else if (total_dsize < 1048576) {
242 } else if (communicator_size < 8) {
243 if (total_dsize < 8192) {
245 } else if (total_dsize < 16384) {
247 } else if (total_dsize < 65536) {
249 } else if (total_dsize < 524288) {
251 } else if (total_dsize < 1048576) {
256 } else if (communicator_size < 16) {
257 if (total_dsize < 262144) {
262 } else if (communicator_size < 32) {
263 if (total_dsize < 4) {
265 } else if (total_dsize < 512) {
267 } else if (total_dsize < 8192) {
269 } else if (total_dsize < 32768) {
271 } else if (total_dsize < 262144) {
273 } else if (total_dsize < 524288) {
278 } else if (communicator_size < 64) {
279 if (total_dsize < 512) {
281 } else if (total_dsize < 524288) {
286 } else if (communicator_size < 128) {
287 if (total_dsize < 1024) {
289 } else if (total_dsize < 2048) {
291 } else if (total_dsize < 4096) {
293 } else if (total_dsize < 262144) {
298 } else if (communicator_size < 256) {
299 if (total_dsize < 1024) {
301 } else if (total_dsize < 2048) {
303 } else if (total_dsize < 262144) {
308 } else if (communicator_size < 512) {
309 if (total_dsize < 1024) {
311 } else if (total_dsize < 8192) {
313 } else if (total_dsize < 32768) {
318 } else if (communicator_size < 1024) {
319 if (total_dsize < 512) {
321 } else if (total_dsize < 8192) {
323 } else if (total_dsize < 16384) {
325 } else if (total_dsize < 131072) {
327 } else if (total_dsize < 262144) {
332 } else if (communicator_size < 2048) {
333 if (total_dsize < 512) {
335 } else if (total_dsize < 1024) {
337 } else if (total_dsize < 2048) {
339 } else if (total_dsize < 16384) {
341 } else if (total_dsize < 262144) {
346 } else if (communicator_size < 4096) {
347 if (total_dsize < 1024) {
349 } else if (total_dsize < 4096) {
351 } else if (total_dsize < 8192) {
353 } else if (total_dsize < 131072) {
359 if (total_dsize < 2048) {
361 } else if (total_dsize < 8192) {
363 } else if (total_dsize < 16384) {
365 } else if (total_dsize < 32768) {
367 } else if (total_dsize < 65536) {
// alg is 1-based, hence the -1 when indexing the table.
374 return funcs[alg-1](sbuf, scount, sdtype,
375 rbuf, rcount, rdtype, comm);
/*
 * alltoallv__ompi: pick an alltoallv implementation and run it.
 *
 * Only the communicator size is tested in the visible thresholds; no
 * message size is computed in this function. The chosen algorithm is a
 * 1-based index into the local funcs table, and the call forwards the
 * count and displacement arrays unchanged.
 *
 * NOTE(review): part of the signature, the rest of the table and the
 * statements assigning the algorithm index are not visible in this excerpt.
 */
378 int alltoallv__ompi(const void *sbuf, const int *scounts, const int *sdisps,
380 void *rbuf, const int *rcounts, const int *rdisps,
385 int communicator_size = comm->size();
// Dispatch table of candidate implementations sharing the alltoallv signature.
387 int (*funcs[])(const void *, const int*, const int*, MPI_Datatype, void*, const int*, const int*, MPI_Datatype, MPI_Comm) = {
388 &alltoallv__ompi_basic_linear,
392 * {1, "basic_linear"},
395 * We can only optimize based on com size
397 if (communicator_size < 4) {
399 } else if (communicator_size < 64) {
401 } else if (communicator_size < 128) {
403 } else if (communicator_size < 256) {
405 } else if (communicator_size < 1024) {
// alg is 1-based, hence the -1 when indexing the table.
410 return funcs[alg-1](sbuf, scounts, sdisps, sdtype,
411 rbuf, rcounts, rdisps,rdtype,
/*
 * barrier__ompi: pick a barrier implementation and run it on comm.
 *
 * Only the communicator size is tested. The chosen algorithm is a 1-based
 * index into the local funcs table; the function returns the result of
 * the selected implementation.
 *
 * NOTE(review): the statements assigning the algorithm index are not
 * visible in this excerpt.
 */
415 int barrier__ompi(MPI_Comm comm)
417 int communicator_size = comm->size();
// Dispatch table. The first two slots both point at the basic linear barrier,
// although the list further down names slot 2 "double_ring" - presumably a
// stand-in for an algorithm not provided here; confirm.
419 int (*funcs[])(MPI_Comm) = {
420 &barrier__ompi_basic_linear,
421 &barrier__ompi_basic_linear,
422 &barrier__ompi_recursivedoubling,
423 &barrier__ompi_bruck,
424 &barrier__ompi_two_procs,
429 * {2, "double_ring"},
430 * {3, "recursive_doubling"},
435 * We can only optimize based on com size
437 if (communicator_size < 4) {
439 } else if (communicator_size < 8) {
441 } else if (communicator_size < 64) {
443 } else if (communicator_size < 256) {
445 } else if (communicator_size < 512) {
447 } else if (communicator_size < 1024) {
449 } else if (communicator_size < 4096) {
// alg is 1-based, hence the -1 when indexing the table.
455 return funcs[alg-1](comm);
/*
 * bcast__ompi: pick a broadcast implementation and run it.
 *
 * The thresholds compare the communicator size and the payload size in
 * bytes (datatype size times count). The chosen algorithm is a 1-based
 * index into the local funcs table; buff, count, datatype, root and comm
 * are forwarded unchanged and the implementation's result is returned.
 *
 * NOTE(review): some table entries and the statements assigning the
 * algorithm index are not visible in this excerpt.
 */
458 int bcast__ompi(void *buff, int count, MPI_Datatype datatype, int root, MPI_Comm comm)
461 size_t total_dsize, dsize;
463 int communicator_size = comm->size();
// Payload size in bytes; count is converted to unsigned long before the multiplication.
465 dsize = datatype->size();
466 total_dsize = dsize * (unsigned long)count;
// Dispatch table; bcast__ompi_pipeline occupies two consecutive slots.
467 int (*funcs[])(void*, int, MPI_Datatype, int, MPI_Comm) = {
469 &bcast__ompi_pipeline,
470 &bcast__ompi_pipeline,
471 &bcast__ompi_split_bintree,
473 &bcast__binomial_tree,
474 &bcast__mvapich2_knomial_intra_node,
475 &bcast__scatter_rdb_allgather,
476 &bcast__scatter_LR_allgather,
479 * {1, "basic_linear"},
482 * {4, "split_binary_tree"},
483 * {5, "binary_tree"},
486 * {8, "scatter_allgather"},
487 * {9, "scatter_allgather_ring"},
// Outer tests bucket by communicator size, inner tests by payload size.
489 if (communicator_size < 4) {
490 if (total_dsize < 32) {
492 } else if (total_dsize < 256) {
494 } else if (total_dsize < 512) {
496 } else if (total_dsize < 1024) {
498 } else if (total_dsize < 32768) {
500 } else if (total_dsize < 131072) {
502 } else if (total_dsize < 262144) {
504 } else if (total_dsize < 524288) {
506 } else if (total_dsize < 1048576) {
511 } else if (communicator_size < 8) {
512 if (total_dsize < 64) {
514 } else if (total_dsize < 128) {
516 } else if (total_dsize < 2048) {
518 } else if (total_dsize < 8192) {
520 } else if (total_dsize < 1048576) {
525 } else if (communicator_size < 16) {
526 if (total_dsize < 8) {
528 } else if (total_dsize < 64) {
530 } else if (total_dsize < 4096) {
532 } else if (total_dsize < 16384) {
534 } else if (total_dsize < 32768) {
539 } else if (communicator_size < 32) {
540 if (total_dsize < 4096) {
542 } else if (total_dsize < 1048576) {
547 } else if (communicator_size < 64) {
548 if (total_dsize < 2048) {
553 } else if (communicator_size < 128) {
555 } else if (communicator_size < 256) {
556 if (total_dsize < 2) {
558 } else if (total_dsize < 16384) {
560 } else if (total_dsize < 32768) {
562 } else if (total_dsize < 65536) {
567 } else if (communicator_size < 1024) {
568 if (total_dsize < 16384) {
570 } else if (total_dsize < 32768) {
575 } else if (communicator_size < 2048) {
576 if (total_dsize < 524288) {
581 } else if (communicator_size < 4096) {
582 if (total_dsize < 262144) {
588 if (total_dsize < 8192) {
590 } else if (total_dsize < 16384) {
592 } else if (total_dsize < 262144) {
// alg is 1-based, hence the -1 when indexing the table.
598 return funcs[alg-1](buff, count, datatype, root, comm);
/*
 * reduce__ompi: pick a reduce implementation and run it.
 *
 * The thresholds compare the communicator size and the payload size in
 * bytes (datatype size times count). Two decision trees are visible: one
 * guarded by a test for a non-commutative op, and a second one after it.
 * The chosen algorithm is a 1-based index into the local funcs table and
 * the selected implementation's result is returned.
 *
 * NOTE(review): part of the signature, one table entry and the
 * statements assigning the algorithm index are not visible in this excerpt.
 */
601 int reduce__ompi(const void *sendbuf, void *recvbuf,
602 int count, MPI_Datatype datatype,
606 size_t total_dsize, dsize;
608 int communicator_size = comm->size();
// Payload size in bytes. NOTE(review): count is multiplied here without the
// explicit widening cast that allreduce__ompi applies - confirm this is intended.
610 dsize=datatype->size();
611 total_dsize = dsize * count;
// Dispatch table. The last slot is filled with the basic linear reduce because
// the rab variant is disabled (see the commented-out entry).
612 int (*funcs[])(const void*, void*, int, MPI_Datatype, MPI_Op, int, MPI_Comm) = {
613 &reduce__ompi_basic_linear,
615 &reduce__ompi_pipeline,
616 &reduce__ompi_binary,
617 &reduce__ompi_binomial,
618 &reduce__ompi_in_order_binary,
619 //&reduce__rab our rab can't be used with all datatypes
620 &reduce__ompi_basic_linear
628 * {6, "in-order_binary"},
629 * {7, "rabenseifner"},
631 * Currently, only linear and in-order binary tree algorithms are
632 * capable of non commutative ops.
// First decision tree: taken when an op is supplied and it reports itself as non-commutative.
634 if ((op != MPI_OP_NULL) && not op->is_commutative()) {
635 if (communicator_size < 4) {
636 if (total_dsize < 8) {
641 } else if (communicator_size < 8) {
643 } else if (communicator_size < 16) {
644 if (total_dsize < 1024) {
646 } else if (total_dsize < 8192) {
648 } else if (total_dsize < 16384) {
650 } else if (total_dsize < 262144) {
655 } else if (communicator_size < 128) {
657 } else if (communicator_size < 256) {
658 if (total_dsize < 512) {
660 } else if (total_dsize < 1024) {
// Second decision tree; presumably the branch for the remaining (commutative) case,
// but the line introducing it is not visible in this excerpt.
669 if (communicator_size < 4) {
670 if (total_dsize < 8) {
672 } else if (total_dsize < 16) {
674 } else if (total_dsize < 32) {
676 } else if (total_dsize < 262144) {
678 } else if (total_dsize < 524288) {
680 } else if (total_dsize < 1048576) {
685 } else if (communicator_size < 8) {
686 if (total_dsize < 4096) {
688 } else if (total_dsize < 65536) {
690 } else if (total_dsize < 262144) {
692 } else if (total_dsize < 524288) {
694 } else if (total_dsize < 1048576) {
699 } else if (communicator_size < 16) {
700 if (total_dsize < 8192) {
705 } else if (communicator_size < 32) {
706 if (total_dsize < 4096) {
711 } else if (communicator_size < 256) {
713 } else if (communicator_size < 512) {
714 if (total_dsize < 8192) {
716 } else if (total_dsize < 16384) {
721 } else if (communicator_size < 2048) {
723 } else if (communicator_size < 4096) {
724 if (total_dsize < 512) {
726 } else if (total_dsize < 1024) {
728 } else if (total_dsize < 8192) {
730 } else if (total_dsize < 16384) {
736 if (total_dsize < 16) {
738 } else if (total_dsize < 32) {
740 } else if (total_dsize < 1024) {
742 } else if (total_dsize < 2048) {
744 } else if (total_dsize < 8192) {
746 } else if (total_dsize < 16384) {
// alg is 1-based, hence the -1 when indexing the table.
754 return funcs[alg-1] (sendbuf, recvbuf, count, datatype, op, root, comm);
/*
 * reduce_scatter__ompi: pick a reduce_scatter implementation and run it.
 *
 * The payload size is obtained by summing rcounts over all ranks of the
 * communicator and multiplying the sum by dsize. The thresholds then
 * compare the communicator size and that byte size. The chosen algorithm
 * is a 1-based index into the local funcs table and the selected
 * implementation's result is returned.
 *
 * NOTE(review): part of the signature, the initialisation of total_dsize
 * and dsize, the declaration of zerocounts and the statements assigning
 * the algorithm index are not visible in this excerpt.
 */
757 int reduce_scatter__ompi(const void *sbuf, void *rbuf,
764 size_t total_dsize, dsize;
765 int communicator_size = comm->size();
// Accumulate the element counts of every rank; the zero-count check is commented out.
770 for (int i = 0; i < communicator_size; i++) {
771 total_dsize += rcounts[i];
772 // if (0 == rcounts[i]) {
// Convert the accumulated element count into bytes.
776 total_dsize *= dsize;
// Dispatch table of candidate implementations sharing the reduce_scatter signature.
777 int (*funcs[])(const void*, void*, const int*, MPI_Datatype, MPI_Op, MPI_Comm) = {
778 &reduce_scatter__default,
779 &reduce_scatter__ompi_basic_recursivehalving,
780 &reduce_scatter__ompi_ring,
781 &reduce_scatter__ompi_butterfly,
784 * {1, "non-overlapping"},
785 * {2, "recursive_halving"},
789 * Non commutative algorithm capability needs re-investigation.
790 * Defaulting to non overlapping for non commutative ops.
// Special case: a supplied non-commutative op, or zerocounts being set.
792 if (((op != MPI_OP_NULL) && not op->is_commutative()) || (zerocounts)) {
795 if (communicator_size < 4) {
796 if (total_dsize < 65536) {
798 } else if (total_dsize < 131072) {
803 } else if (communicator_size < 8) {
804 if (total_dsize < 8) {
806 } else if (total_dsize < 262144) {
811 } else if (communicator_size < 32) {
812 if (total_dsize < 262144) {
817 } else if (communicator_size < 64) {
818 if (total_dsize < 64) {
820 } else if (total_dsize < 2048) {
822 } else if (total_dsize < 524288) {
827 } else if (communicator_size < 128) {
828 if (total_dsize < 256) {
830 } else if (total_dsize < 512) {
832 } else if (total_dsize < 2048) {
834 } else if (total_dsize < 4096) {
839 } else if (communicator_size < 256) {
840 if (total_dsize < 256) {
842 } else if (total_dsize < 512) {
847 } else if (communicator_size < 512) {
848 if (total_dsize < 256) {
850 } else if (total_dsize < 1024) {
855 } else if (communicator_size < 1024) {
856 if (total_dsize < 512) {
858 } else if (total_dsize < 2048) {
860 } else if (total_dsize < 8192) {
862 } else if (total_dsize < 16384) {
867 } else if (communicator_size < 2048) {
868 if (total_dsize < 512) {
870 } else if (total_dsize < 4096) {
872 } else if (total_dsize < 16384) {
874 } else if (total_dsize < 32768) {
879 } else if (communicator_size < 4096) {
880 if (total_dsize < 512) {
882 } else if (total_dsize < 4096) {
888 if (total_dsize < 1024) {
890 } else if (total_dsize < 8192) {
// alg is 1-based, hence the -1 when indexing the table.
898 return funcs[alg-1] (sbuf, rbuf, rcounts, dtype, op, comm);
/*
 * allgather__ompi: pick an allgather implementation and run it.
 *
 * The thresholds compare the communicator size and a byte size computed
 * as one element size times scount. The chosen algorithm is a 1-based
 * index into the local funcs table; all arguments are forwarded unchanged
 * and the selected implementation's result is returned.
 *
 * NOTE(review): part of the signature, some table entries and the
 * statements assigning the algorithm index are not visible in this excerpt.
 */
901 int allgather__ompi(const void *sbuf, int scount,
903 void* rbuf, int rcount,
908 int communicator_size;
909 size_t dsize, total_dsize;
911 communicator_size = comm->size();
// Element size comes from sdtype when sbuf is not MPI_IN_PLACE; the second
// assignment uses rdtype (the line separating the two is not visible here).
912 if (MPI_IN_PLACE != sbuf) {
913 dsize = sdtype->size();
915 dsize = rdtype->size();
// Byte size used by the thresholds: element size times scount.
917 total_dsize = dsize * (ptrdiff_t)scount;
// Dispatch table of candidate implementations sharing the allgather signature.
918 int (*funcs[])(const void*, int, MPI_Datatype, void*, int, MPI_Datatype, MPI_Comm) = {
919 &allgather__NTSLR_NB,
923 &allgather__ompi_neighborexchange,
929 * {3, "recursive_doubling"},
// Communicators of exactly two ranks and of fewer than 32 ranks are decided on size alone.
934 if (communicator_size == 2) {
936 } else if (communicator_size < 32) {
938 } else if (communicator_size < 64) {
939 if (total_dsize < 1024) {
941 } else if (total_dsize < 65536) {
946 } else if (communicator_size < 128) {
947 if (total_dsize < 512) {
949 } else if (total_dsize < 65536) {
954 } else if (communicator_size < 256) {
955 if (total_dsize < 512) {
957 } else if (total_dsize < 131072) {
959 } else if (total_dsize < 524288) {
961 } else if (total_dsize < 1048576) {
966 } else if (communicator_size < 512) {
967 if (total_dsize < 32) {
969 } else if (total_dsize < 128) {
971 } else if (total_dsize < 1024) {
973 } else if (total_dsize < 131072) {
975 } else if (total_dsize < 524288) {
977 } else if (total_dsize < 1048576) {
982 } else if (communicator_size < 1024) {
983 if (total_dsize < 64) {
985 } else if (total_dsize < 256) {
987 } else if (total_dsize < 2048) {
992 } else if (communicator_size < 2048) {
993 if (total_dsize < 4) {
995 } else if (total_dsize < 8) {
997 } else if (total_dsize < 16) {
999 } else if (total_dsize < 32) {
1001 } else if (total_dsize < 256) {
1003 } else if (total_dsize < 512) {
1005 } else if (total_dsize < 4096) {
1010 } else if (communicator_size < 4096) {
1011 if (total_dsize < 32) {
1013 } else if (total_dsize < 128) {
1015 } else if (total_dsize < 512) {
1017 } else if (total_dsize < 4096) {
1023 if (total_dsize < 2) {
1025 } else if (total_dsize < 8) {
1027 } else if (total_dsize < 16) {
1029 } else if (total_dsize < 512) {
1031 } else if (total_dsize < 4096) {
// alg is 1-based, hence the -1 when indexing the table.
1038 return funcs[alg-1](sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
/*
 * allgatherv__ompi: pick an allgatherv implementation and run it.
 *
 * The byte size of all contributions is summed from rcounts and then
 * divided by the communicator size; the thresholds compare the
 * communicator size and that per-rank average. The chosen algorithm is a
 * 1-based index into the local funcs table and the selected
 * implementation's result is returned.
 *
 * NOTE(review): part of the signature, the declaration of i, the
 * initialisation of total_dsize, one table entry and the statements
 * assigning the algorithm index are not visible in this excerpt.
 */
1042 int allgatherv__ompi(const void *sbuf, int scount,
1043 MPI_Datatype sdtype,
1044 void* rbuf, const int *rcounts,
1046 MPI_Datatype rdtype,
1051 int communicator_size;
1052 size_t dsize, total_dsize;
1054 communicator_size = comm->size();
// Element size comes from sdtype when sbuf is not MPI_IN_PLACE; the second
// assignment uses rdtype (the line separating the two is not visible here).
1055 if (MPI_IN_PLACE != sbuf) {
1056 dsize = sdtype->size();
1058 dsize = rdtype->size();
// Add up the byte size contributed by every rank.
1062 for (i = 0; i < communicator_size; i++) {
1063 total_dsize += dsize * rcounts[i];
1066 /* use the per-rank data size as basis, similar to allgather */
1067 size_t per_rank_dsize = total_dsize / communicator_size;
// Dispatch table of candidate implementations sharing the allgatherv signature.
1069 int (*funcs[])(const void*, int, MPI_Datatype, void*, const int*, const int*, MPI_Datatype, MPI_Comm) = {
1071 &allgatherv__ompi_bruck,
1072 &allgatherv__mpich_ring,
1073 &allgatherv__ompi_neighborexchange,
// Outer tests bucket by communicator size, inner tests by the per-rank average size.
1083 if (communicator_size == 2) {
1084 if (per_rank_dsize < 2048) {
1086 } else if (per_rank_dsize < 4096) {
1088 } else if (per_rank_dsize < 8192) {
1093 } else if (communicator_size < 8) {
1094 if (per_rank_dsize < 256) {
1096 } else if (per_rank_dsize < 4096) {
1098 } else if (per_rank_dsize < 8192) {
1100 } else if (per_rank_dsize < 16384) {
1102 } else if (per_rank_dsize < 262144) {
1107 } else if (communicator_size < 16) {
1108 if (per_rank_dsize < 1024) {
1113 } else if (communicator_size < 32) {
1114 if (per_rank_dsize < 128) {
1116 } else if (per_rank_dsize < 262144) {
1121 } else if (communicator_size < 64) {
1122 if (per_rank_dsize < 256) {
1124 } else if (per_rank_dsize < 8192) {
1129 } else if (communicator_size < 128) {
1130 if (per_rank_dsize < 256) {
1132 } else if (per_rank_dsize < 4096) {
1137 } else if (communicator_size < 256) {
1138 if (per_rank_dsize < 1024) {
1140 } else if (per_rank_dsize < 65536) {
1145 } else if (communicator_size < 512) {
1146 if (per_rank_dsize < 1024) {
1151 } else if (communicator_size < 1024) {
1152 if (per_rank_dsize < 512) {
1154 } else if (per_rank_dsize < 1024) {
1156 } else if (per_rank_dsize < 4096) {
1158 } else if (per_rank_dsize < 1048576) {
1164 if (per_rank_dsize < 4096) {
// alg is 1-based, hence the -1 when indexing the table.
1171 return funcs[alg-1](sbuf, scount, sdtype, rbuf, rcounts, rdispls, rdtype, comm);
/*
 * gather__ompi: pick a gather implementation and run it.
 *
 * The thresholds compare the communicator size and a byte size. Two
 * alternative size computations are visible: receive datatype size times
 * rcount, and send datatype size times scount. The chosen algorithm is a
 * 1-based index into the three-entry funcs table and the selected
 * implementation's result is returned.
 *
 * NOTE(review): part of the signature, the condition choosing between the
 * two size computations and the statements assigning the algorithm index
 * are not visible in this excerpt.
 */
1174 int gather__ompi(const void *sbuf, int scount,
1175 MPI_Datatype sdtype,
1176 void* rbuf, int rcount,
1177 MPI_Datatype rdtype,
1182 int communicator_size, rank;
1183 size_t dsize, total_dsize;
1185 communicator_size = comm->size();
1186 rank = comm->rank();
// Receive-side size first, send-side size second; presumably selected by a
// test involving rank, which is read just above - confirm against the full file.
1189 dsize = rdtype->size();
1190 total_dsize = dsize * rcount;
1192 dsize = sdtype->size();
1193 total_dsize = dsize * scount;
// Dispatch table of the three candidate implementations.
1195 int (*funcs[])(const void*, int, MPI_Datatype, void*, int, MPI_Datatype, int, MPI_Comm) = {
1196 &gather__ompi_basic_linear,
1197 &gather__ompi_binomial,
1198 &gather__ompi_linear_sync
1201 * {1, "basic_linear"},
1203 * {3, "linear_sync"},
1205 * We do not make any rank specific checks since the params
1206 * should be uniform across ranks.
// Outer tests bucket by communicator size, inner tests by byte size.
1208 if (communicator_size < 4) {
1209 if (total_dsize < 2) {
1211 } else if (total_dsize < 4) {
1213 } else if (total_dsize < 32768) {
1215 } else if (total_dsize < 65536) {
1217 } else if (total_dsize < 131072) {
1222 } else if (communicator_size < 8) {
1223 if (total_dsize < 1024) {
1225 } else if (total_dsize < 8192) {
1227 } else if (total_dsize < 32768) {
1229 } else if (total_dsize < 262144) {
1234 } else if (communicator_size < 256) {
1236 } else if (communicator_size < 512) {
1237 if (total_dsize < 2048) {
1239 } else if (total_dsize < 8192) {
// alg is 1-based, hence the -1 when indexing the table.
1248 return funcs[alg-1](sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm);
/*
 * scatter__ompi: pick a scatter implementation and run it.
 *
 * The thresholds compare the communicator size and a byte size. Two
 * alternative size computations are visible: send datatype size times
 * scount, and receive datatype size times rcount. The chosen algorithm is
 * a 1-based index into the three-entry funcs table and the selected
 * implementation's result is returned.
 *
 * NOTE(review): the condition choosing between the two size computations
 * and the statements assigning the algorithm index are not visible in
 * this excerpt.
 */
1252 int scatter__ompi(const void *sbuf, int scount,
1253 MPI_Datatype sdtype,
1254 void* rbuf, int rcount,
1255 MPI_Datatype rdtype,
1256 int root, MPI_Comm comm
1259 int communicator_size, rank;
1260 size_t dsize, total_dsize;
1263 communicator_size = comm->size();
1264 rank = comm->rank();
// Send-side size first, receive-side size second; presumably selected by a
// test involving rank, which is read just above - confirm against the full file.
1266 dsize=sdtype->size();
1267 total_dsize = dsize * scount;
1269 dsize=rdtype->size();
1270 total_dsize = dsize * rcount;
// Dispatch table of the three candidate implementations.
1272 int (*funcs[])(const void*, int, MPI_Datatype, void*, int, MPI_Datatype, int, MPI_Comm) = {
1273 &scatter__ompi_basic_linear,
1274 &scatter__ompi_binomial,
1275 &scatter__ompi_linear_nb
1278 * {1, "basic_linear"},
1282 * We do not make any rank specific checks since the params
1283 * should be uniform across ranks.
// Outer tests bucket by communicator size, inner tests by byte size.
1285 if (communicator_size < 4) {
1286 if (total_dsize < 2) {
1288 } else if (total_dsize < 131072) {
1290 } else if (total_dsize < 262144) {
1295 } else if (communicator_size < 8) {
1296 if (total_dsize < 2048) {
1298 } else if (total_dsize < 4096) {
1300 } else if (total_dsize < 8192) {
1302 } else if (total_dsize < 32768) {
1304 } else if (total_dsize < 1048576) {
1309 } else if (communicator_size < 16) {
1310 if (total_dsize < 16384) {
1312 } else if (total_dsize < 1048576) {
1317 } else if (communicator_size < 32) {
1318 if (total_dsize < 16384) {
1320 } else if (total_dsize < 32768) {
1325 } else if (communicator_size < 64) {
1326 if (total_dsize < 512) {
1328 } else if (total_dsize < 8192) {
1330 } else if (total_dsize < 16384) {
1336 if (total_dsize < 512) {
// alg is 1-based, hence the -1 when indexing the table.
1343 return funcs[alg-1](sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm);