1 /* selector for collective algorithms based on openmpi's default coll_tuned_decision_fixed selector
4 /* Copyright (c) 2009-2022. The SimGrid Team.
5 * All rights reserved. */
7 /* This program is free software; you can redistribute it and/or modify it
8 * under the terms of the license (GNU LGPL) which comes with this package. */
10 #include "colls_private.hpp"
16 allreduce nonoverlapping, basic linear
19 reduce_scatter butterfly
/* Selects and runs an allreduce implementation, following the Open MPI fixed decision rules.
 * The choice depends on the communicator size, on the payload size (datatype size times count)
 * and on whether the reduction operation is commutative. The selected 1-based id `alg` indexes
 * the `funcs` table as funcs[alg-1]; the value returned by that implementation is returned.
 * NOTE(review): the statements assigning `alg` are not visible in this excerpt; confirm each id
 * against the table order before changing either of them. */
26 int allreduce__ompi(const void *sbuf, void *rbuf, int count,
27 MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
/* payload size: datatype size times the element count (count is widened before multiplying) */
29 size_t total_dsize = dtype->size() * (ptrdiff_t)count;
30 int communicator_size = comm->size();
/* table of candidate implementations, addressed with the 1-based algorithm id */
32 int(*funcs[]) (const void*, void*, int, MPI_Datatype, MPI_Op, MPI_Comm)={
37 &allreduce__ompi_ring_segmented,
42 * {1, "basic_linear"},
43 * {2, "nonoverlapping"},
44 * {3, "recursive_doubling"},
46 * {5, "segmented_ring"},
49 * Currently, ring, segmented ring, and rabenseifner do not support
50 * non-commutative operations.
/* decision tree used when a non-null operation reports that it is not commutative */
52 if ((op != MPI_OP_NULL) && not op->is_commutative()) {
53 if (communicator_size < 4) {
54 if (total_dsize < 131072) {
59 } else if (communicator_size < 8) {
61 } else if (communicator_size < 16) {
62 if (total_dsize < 1048576) {
67 } else if (communicator_size < 128) {
69 } else if (communicator_size < 256) {
70 if (total_dsize < 131072) {
72 } else if (total_dsize < 524288) {
77 } else if (communicator_size < 512) {
78 if (total_dsize < 4096) {
80 } else if (total_dsize < 524288) {
86 if (total_dsize < 2048) {
/* NOTE(review): presumably the start of the branch for the other operations; the else line is not visible here */
93 if (communicator_size < 4) {
94 if (total_dsize < 8) {
96 } else if (total_dsize < 4096) {
98 } else if (total_dsize < 8192) {
100 } else if (total_dsize < 16384) {
102 } else if (total_dsize < 65536) {
104 } else if (total_dsize < 262144) {
109 } else if (communicator_size < 8) {
110 if (total_dsize < 16) {
112 } else if (total_dsize < 8192) {
117 } else if (communicator_size < 16) {
118 if (total_dsize < 8192) {
123 } else if (communicator_size < 32) {
124 if (total_dsize < 64) {
126 } else if (total_dsize < 4096) {
131 } else if (communicator_size < 64) {
132 if (total_dsize < 128) {
137 } else if (communicator_size < 128) {
138 if (total_dsize < 262144) {
143 } else if (communicator_size < 256) {
144 if (total_dsize < 131072) {
146 } else if (total_dsize < 262144) {
151 } else if (communicator_size < 512) {
152 if (total_dsize < 4096) {
157 } else if (communicator_size < 2048) {
158 if (total_dsize < 2048) {
160 } else if (total_dsize < 16384) {
165 } else if (communicator_size < 4096) {
166 if (total_dsize < 2048) {
168 } else if (total_dsize < 4096) {
170 } else if (total_dsize < 16384) {
176 if (total_dsize < 2048) {
178 } else if (total_dsize < 16384) {
180 } else if (total_dsize < 32768) {
/* dispatch: ids are 1-based, the table is 0-based */
187 return funcs[alg-1](sbuf, rbuf, count, dtype, op, comm);
/* Selects and runs an alltoall implementation from the communicator size and the per-call
 * payload size, then returns the value produced by funcs[alg-1].
 * The datatype size is taken from sdtype unless sbuf is MPI_IN_PLACE, in which case rdtype is used.
 * NOTE(review): the statements assigning `alg` are not visible in this excerpt. */
192 int alltoall__ompi(const void *sbuf, int scount,
194 void* rbuf, int rcount,
199 size_t dsize, total_dsize;
200 int communicator_size = comm->size();
/* pick the datatype that actually describes the data: send side normally, receive side when in place */
202 if (MPI_IN_PLACE != sbuf) {
203 dsize = sdtype->size();
205 dsize = rdtype->size();
/* NOTE(review): scount is used here even when sbuf is MPI_IN_PLACE - confirm this is intended */
207 total_dsize = dsize * (ptrdiff_t)scount;
/* table of candidate implementations, addressed with the 1-based algorithm id.
 * NOTE(review): judging from the original line numbers, slot 4 (listed below as "linear_sync")
 * points to alltoall__basic_linear - confirm this substitution is intentional. */
208 int (*funcs[])(const void *, int, MPI_Datatype, void*, int, MPI_Datatype, MPI_Comm) = {
209 &alltoall__basic_linear,
212 &alltoall__basic_linear,
213 &alltoall__basic_linear
218 * {3, "modified_bruck"},
219 * {4, "linear_sync"},
/* thresholds: first on the communicator size, then on the payload size */
222 if (communicator_size == 2) {
223 if (total_dsize < 2) {
225 } else if (total_dsize < 4) {
227 } else if (total_dsize < 16) {
229 } else if (total_dsize < 64) {
231 } else if (total_dsize < 256) {
233 } else if (total_dsize < 4096) {
235 } else if (total_dsize < 32768) {
237 } else if (total_dsize < 262144) {
239 } else if (total_dsize < 1048576) {
244 } else if (communicator_size < 8) {
245 if (total_dsize < 8192) {
247 } else if (total_dsize < 16384) {
249 } else if (total_dsize < 65536) {
251 } else if (total_dsize < 524288) {
253 } else if (total_dsize < 1048576) {
258 } else if (communicator_size < 16) {
259 if (total_dsize < 262144) {
264 } else if (communicator_size < 32) {
265 if (total_dsize < 4) {
267 } else if (total_dsize < 512) {
269 } else if (total_dsize < 8192) {
271 } else if (total_dsize < 32768) {
273 } else if (total_dsize < 262144) {
275 } else if (total_dsize < 524288) {
280 } else if (communicator_size < 64) {
281 if (total_dsize < 512) {
283 } else if (total_dsize < 524288) {
288 } else if (communicator_size < 128) {
289 if (total_dsize < 1024) {
291 } else if (total_dsize < 2048) {
293 } else if (total_dsize < 4096) {
295 } else if (total_dsize < 262144) {
300 } else if (communicator_size < 256) {
301 if (total_dsize < 1024) {
303 } else if (total_dsize < 2048) {
305 } else if (total_dsize < 262144) {
310 } else if (communicator_size < 512) {
311 if (total_dsize < 1024) {
313 } else if (total_dsize < 8192) {
315 } else if (total_dsize < 32768) {
320 } else if (communicator_size < 1024) {
321 if (total_dsize < 512) {
323 } else if (total_dsize < 8192) {
325 } else if (total_dsize < 16384) {
327 } else if (total_dsize < 131072) {
329 } else if (total_dsize < 262144) {
334 } else if (communicator_size < 2048) {
335 if (total_dsize < 512) {
337 } else if (total_dsize < 1024) {
339 } else if (total_dsize < 2048) {
341 } else if (total_dsize < 16384) {
343 } else if (total_dsize < 262144) {
348 } else if (communicator_size < 4096) {
349 if (total_dsize < 1024) {
351 } else if (total_dsize < 4096) {
353 } else if (total_dsize < 8192) {
355 } else if (total_dsize < 131072) {
361 if (total_dsize < 2048) {
363 } else if (total_dsize < 8192) {
365 } else if (total_dsize < 16384) {
367 } else if (total_dsize < 32768) {
369 } else if (total_dsize < 65536) {
/* dispatch: ids are 1-based, the table is 0-based */
376 return funcs[alg-1](sbuf, scount, sdtype,
377 rbuf, rcount, rdtype, comm);
/* Selects and runs an alltoallv implementation and returns the value produced by funcs[alg-1].
 * Only the communicator size drives the decision in the visible code; no message size is computed.
 * NOTE(review): the statements assigning `alg` are not visible in this excerpt. */
380 int alltoallv__ompi(const void *sbuf, const int *scounts, const int *sdisps,
382 void *rbuf, const int *rcounts, const int *rdisps,
387 int communicator_size = comm->size();
/* table of candidate implementations, addressed with the 1-based algorithm id */
389 int (*funcs[])(const void *, const int*, const int*, MPI_Datatype, void*, const int*, const int*, MPI_Datatype, MPI_Comm) = {
390 &alltoallv__ompi_basic_linear,
394 * {1, "basic_linear"},
397 * We can only optimize based on com size
/* thresholds on the communicator size only */
399 if (communicator_size < 4) {
401 } else if (communicator_size < 64) {
403 } else if (communicator_size < 128) {
405 } else if (communicator_size < 256) {
407 } else if (communicator_size < 1024) {
/* dispatch: ids are 1-based, the table is 0-based */
412 return funcs[alg-1](sbuf, scounts, sdisps, sdtype,
413 rbuf, rcounts, rdisps,rdtype,
/* Selects and runs a barrier implementation based on the communicator size alone, and returns
 * the value produced by funcs[alg-1].
 * NOTE(review): the statements assigning `alg` are not visible in this excerpt. */
417 int barrier__ompi(MPI_Comm comm)
419 int communicator_size = comm->size();
/* table of candidate implementations, addressed with the 1-based algorithm id.
 * NOTE(review): slot 2 is listed below as "double_ring" but points to barrier__ompi_basic_linear,
 * the same function as slot 1 - confirm this substitution is intentional. */
421 int (*funcs[])(MPI_Comm) = {
422 &barrier__ompi_basic_linear,
423 &barrier__ompi_basic_linear,
424 &barrier__ompi_recursivedoubling,
425 &barrier__ompi_bruck,
426 &barrier__ompi_two_procs,
431 * {2, "double_ring"},
432 * {3, "recursive_doubling"},
437 * We can only optimize based on com size
/* thresholds on the communicator size only */
439 if (communicator_size < 4) {
441 } else if (communicator_size < 8) {
443 } else if (communicator_size < 64) {
445 } else if (communicator_size < 256) {
447 } else if (communicator_size < 512) {
449 } else if (communicator_size < 1024) {
451 } else if (communicator_size < 4096) {
/* dispatch: ids are 1-based, the table is 0-based */
457 return funcs[alg-1](comm);
/* Selects and runs a broadcast implementation from the communicator size and the payload size
 * (datatype size times count), then returns the value produced by funcs[alg-1].
 * NOTE(review): the statements assigning `alg` are not visible in this excerpt. */
460 int bcast__ompi(void *buff, int count, MPI_Datatype datatype, int root, MPI_Comm comm)
463 size_t total_dsize, dsize;
465 int communicator_size = comm->size();
467 dsize = datatype->size();
/* count is converted to an unsigned type before the multiplication */
468 total_dsize = dsize * (unsigned long)count;
/* table of candidate implementations, addressed with the 1-based algorithm id;
 * bcast__ompi_pipeline appears twice in a row among the visible entries */
469 int (*funcs[])(void*, int, MPI_Datatype, int, MPI_Comm) = {
471 &bcast__ompi_pipeline,
472 &bcast__ompi_pipeline,
473 &bcast__ompi_split_bintree,
475 &bcast__binomial_tree,
476 &bcast__mvapich2_knomial_intra_node,
477 &bcast__scatter_rdb_allgather,
478 &bcast__scatter_LR_allgather,
481 * {1, "basic_linear"},
484 * {4, "split_binary_tree"},
485 * {5, "binary_tree"},
488 * {8, "scatter_allgather"},
489 * {9, "scatter_allgather_ring"},
/* thresholds: first on the communicator size, then on the payload size */
491 if (communicator_size < 4) {
492 if (total_dsize < 32) {
494 } else if (total_dsize < 256) {
496 } else if (total_dsize < 512) {
498 } else if (total_dsize < 1024) {
500 } else if (total_dsize < 32768) {
502 } else if (total_dsize < 131072) {
504 } else if (total_dsize < 262144) {
506 } else if (total_dsize < 524288) {
508 } else if (total_dsize < 1048576) {
513 } else if (communicator_size < 8) {
514 if (total_dsize < 64) {
516 } else if (total_dsize < 128) {
518 } else if (total_dsize < 2048) {
520 } else if (total_dsize < 8192) {
522 } else if (total_dsize < 1048576) {
527 } else if (communicator_size < 16) {
528 if (total_dsize < 8) {
530 } else if (total_dsize < 64) {
532 } else if (total_dsize < 4096) {
534 } else if (total_dsize < 16384) {
536 } else if (total_dsize < 32768) {
541 } else if (communicator_size < 32) {
542 if (total_dsize < 4096) {
544 } else if (total_dsize < 1048576) {
549 } else if (communicator_size < 64) {
550 if (total_dsize < 2048) {
555 } else if (communicator_size < 128) {
557 } else if (communicator_size < 256) {
558 if (total_dsize < 2) {
560 } else if (total_dsize < 16384) {
562 } else if (total_dsize < 32768) {
564 } else if (total_dsize < 65536) {
569 } else if (communicator_size < 1024) {
570 if (total_dsize < 16384) {
572 } else if (total_dsize < 32768) {
577 } else if (communicator_size < 2048) {
578 if (total_dsize < 524288) {
583 } else if (communicator_size < 4096) {
584 if (total_dsize < 262144) {
590 if (total_dsize < 8192) {
592 } else if (total_dsize < 16384) {
594 } else if (total_dsize < 262144) {
/* dispatch: ids are 1-based, the table is 0-based */
600 return funcs[alg-1](buff, count, datatype, root, comm);
/* Selects and runs a reduce implementation from the communicator size, the payload size
 * (datatype size times count) and the commutativity of the operation, then returns the value
 * produced by funcs[alg-1].
 * NOTE(review): the statements assigning `alg` are not visible in this excerpt. */
603 int reduce__ompi(const void *sendbuf, void *recvbuf,
604 int count, MPI_Datatype datatype,
608 size_t total_dsize, dsize;
610 int communicator_size = comm->size();
612 dsize=datatype->size();
613 total_dsize = dsize * count;
/* table of candidate implementations, addressed with the 1-based algorithm id;
 * the last slot uses basic linear in place of the disabled reduce__rab entry */
614 int (*funcs[])(const void*, void*, int, MPI_Datatype, MPI_Op, int, MPI_Comm) = {
615 &reduce__ompi_basic_linear,
617 &reduce__ompi_pipeline,
618 &reduce__ompi_binary,
619 &reduce__ompi_binomial,
620 &reduce__ompi_in_order_binary,
621 //&reduce__rab our rab can't be used with all datatypes
622 &reduce__ompi_basic_linear
630 * {6, "in-order_binary"},
631 * {7, "rabenseifner"},
633 * Currently, only linear and in-order binary tree algorithms are
634 * capable of non commutative ops.
/* decision tree used when a non-null operation reports that it is not commutative */
636 if ((op != MPI_OP_NULL) && not op->is_commutative()) {
637 if (communicator_size < 4) {
638 if (total_dsize < 8) {
643 } else if (communicator_size < 8) {
645 } else if (communicator_size < 16) {
646 if (total_dsize < 1024) {
648 } else if (total_dsize < 8192) {
650 } else if (total_dsize < 16384) {
652 } else if (total_dsize < 262144) {
657 } else if (communicator_size < 128) {
659 } else if (communicator_size < 256) {
660 if (total_dsize < 512) {
662 } else if (total_dsize < 1024) {
/* NOTE(review): presumably the start of the branch for the other operations; the else line is not visible here */
671 if (communicator_size < 4) {
672 if (total_dsize < 8) {
674 } else if (total_dsize < 16) {
676 } else if (total_dsize < 32) {
678 } else if (total_dsize < 262144) {
680 } else if (total_dsize < 524288) {
682 } else if (total_dsize < 1048576) {
687 } else if (communicator_size < 8) {
688 if (total_dsize < 4096) {
690 } else if (total_dsize < 65536) {
692 } else if (total_dsize < 262144) {
694 } else if (total_dsize < 524288) {
696 } else if (total_dsize < 1048576) {
701 } else if (communicator_size < 16) {
702 if (total_dsize < 8192) {
707 } else if (communicator_size < 32) {
708 if (total_dsize < 4096) {
713 } else if (communicator_size < 256) {
715 } else if (communicator_size < 512) {
716 if (total_dsize < 8192) {
718 } else if (total_dsize < 16384) {
723 } else if (communicator_size < 2048) {
725 } else if (communicator_size < 4096) {
726 if (total_dsize < 512) {
728 } else if (total_dsize < 1024) {
730 } else if (total_dsize < 8192) {
732 } else if (total_dsize < 16384) {
738 if (total_dsize < 16) {
740 } else if (total_dsize < 32) {
742 } else if (total_dsize < 1024) {
744 } else if (total_dsize < 2048) {
746 } else if (total_dsize < 8192) {
748 } else if (total_dsize < 16384) {
/* dispatch: ids are 1-based, the table is 0-based */
756 return funcs[alg-1] (sendbuf, recvbuf, count, datatype, op, root, comm);
/* Selects and runs a reduce_scatter implementation, then returns the value produced by
 * funcs[alg-1]. The payload size is the sum of all rcounts entries multiplied by dsize.
 * NOTE(review): the statements assigning `alg`, and the initialisation of total_dsize, dsize
 * and zerocounts, are not visible in this excerpt. */
759 int reduce_scatter__ompi(const void *sbuf, void *rbuf,
766 size_t total_dsize, dsize;
767 int communicator_size = comm->size();
/* accumulate the element counts of every rank */
772 for (int i = 0; i < communicator_size; i++) {
773 total_dsize += rcounts[i];
/* NOTE(review): the zero-count detection that starts here is commented out in the visible code */
774 // if (0 == rcounts[i]) {
/* turn the summed element count into a size */
778 total_dsize *= dsize;
/* table of candidate implementations, addressed with the 1-based algorithm id;
 * reduce_scatter__ompi_ring appears twice in a row among the visible entries */
779 int (*funcs[])(const void*, void*, const int*, MPI_Datatype, MPI_Op, MPI_Comm) = {
780 &reduce_scatter__default,
781 &reduce_scatter__ompi_basic_recursivehalving,
782 &reduce_scatter__ompi_ring,
783 &reduce_scatter__ompi_ring,
786 * {1, "non-overlapping"},
787 * {2, "recursive_halving"},
791 * Non commutative algorithm capability needs re-investigation.
792 * Defaulting to non overlapping for non commutative ops.
/* special case: non-commutative operation, or zerocounts set */
794 if (((op != MPI_OP_NULL) && not op->is_commutative()) || (zerocounts)) {
797 if (communicator_size < 4) {
798 if (total_dsize < 65536) {
800 } else if (total_dsize < 131072) {
805 } else if (communicator_size < 8) {
806 if (total_dsize < 8) {
808 } else if (total_dsize < 262144) {
813 } else if (communicator_size < 32) {
814 if (total_dsize < 262144) {
819 } else if (communicator_size < 64) {
820 if (total_dsize < 64) {
822 } else if (total_dsize < 2048) {
824 } else if (total_dsize < 524288) {
829 } else if (communicator_size < 128) {
830 if (total_dsize < 256) {
832 } else if (total_dsize < 512) {
834 } else if (total_dsize < 2048) {
836 } else if (total_dsize < 4096) {
841 } else if (communicator_size < 256) {
842 if (total_dsize < 256) {
844 } else if (total_dsize < 512) {
849 } else if (communicator_size < 512) {
850 if (total_dsize < 256) {
852 } else if (total_dsize < 1024) {
857 } else if (communicator_size < 1024) {
858 if (total_dsize < 512) {
860 } else if (total_dsize < 2048) {
862 } else if (total_dsize < 8192) {
864 } else if (total_dsize < 16384) {
869 } else if (communicator_size < 2048) {
870 if (total_dsize < 512) {
872 } else if (total_dsize < 4096) {
874 } else if (total_dsize < 16384) {
876 } else if (total_dsize < 32768) {
881 } else if (communicator_size < 4096) {
882 if (total_dsize < 512) {
884 } else if (total_dsize < 4096) {
890 if (total_dsize < 1024) {
892 } else if (total_dsize < 8192) {
/* dispatch: ids are 1-based, the table is 0-based */
900 return funcs[alg-1] (sbuf, rbuf, rcounts, dtype, op, comm);
/* Selects and runs an allgather implementation from the communicator size and the payload
 * size, then returns the value produced by funcs[alg-1].
 * The datatype size is taken from sdtype unless sbuf is MPI_IN_PLACE, in which case rdtype is used.
 * NOTE(review): the statements assigning `alg` are not visible in this excerpt. */
903 int allgather__ompi(const void *sbuf, int scount,
905 void* rbuf, int rcount,
910 int communicator_size;
911 size_t dsize, total_dsize;
913 communicator_size = comm->size();
/* pick the datatype that actually describes the data: send side normally, receive side when in place */
914 if (MPI_IN_PLACE != sbuf) {
915 dsize = sdtype->size();
917 dsize = rdtype->size();
/* NOTE(review): scount is used here even when sbuf is MPI_IN_PLACE - confirm this is intended */
919 total_dsize = dsize * (ptrdiff_t)scount;
/* table of candidate implementations, addressed with the 1-based algorithm id */
920 int (*funcs[])(const void*, int, MPI_Datatype, void*, int, MPI_Datatype, MPI_Comm) = {
921 &allgather__NTSLR_NB,
925 &allgather__ompi_neighborexchange,
931 * {3, "recursive_doubling"},
/* thresholds: first on the communicator size, then on the payload size */
936 if (communicator_size == 2) {
938 } else if (communicator_size < 32) {
940 } else if (communicator_size < 64) {
941 if (total_dsize < 1024) {
943 } else if (total_dsize < 65536) {
948 } else if (communicator_size < 128) {
949 if (total_dsize < 512) {
951 } else if (total_dsize < 65536) {
956 } else if (communicator_size < 256) {
957 if (total_dsize < 512) {
959 } else if (total_dsize < 131072) {
961 } else if (total_dsize < 524288) {
963 } else if (total_dsize < 1048576) {
968 } else if (communicator_size < 512) {
969 if (total_dsize < 32) {
971 } else if (total_dsize < 128) {
973 } else if (total_dsize < 1024) {
975 } else if (total_dsize < 131072) {
977 } else if (total_dsize < 524288) {
979 } else if (total_dsize < 1048576) {
984 } else if (communicator_size < 1024) {
985 if (total_dsize < 64) {
987 } else if (total_dsize < 256) {
989 } else if (total_dsize < 2048) {
994 } else if (communicator_size < 2048) {
995 if (total_dsize < 4) {
997 } else if (total_dsize < 8) {
999 } else if (total_dsize < 16) {
1001 } else if (total_dsize < 32) {
1003 } else if (total_dsize < 256) {
1005 } else if (total_dsize < 512) {
1007 } else if (total_dsize < 4096) {
1012 } else if (communicator_size < 4096) {
1013 if (total_dsize < 32) {
1015 } else if (total_dsize < 128) {
1017 } else if (total_dsize < 512) {
1019 } else if (total_dsize < 4096) {
1025 if (total_dsize < 2) {
1027 } else if (total_dsize < 8) {
1029 } else if (total_dsize < 16) {
1031 } else if (total_dsize < 512) {
1033 } else if (total_dsize < 4096) {
/* dispatch: ids are 1-based, the table is 0-based */
1040 return funcs[alg-1](sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
/* Selects and runs an allgatherv implementation, then returns the value produced by
 * funcs[alg-1]. The decision uses the communicator size and the average size per rank:
 * the sum of dsize * rcounts[i] over all ranks, divided by the communicator size.
 * The datatype size is taken from sdtype unless sbuf is MPI_IN_PLACE, in which case rdtype is used.
 * NOTE(review): the statements assigning `alg`, the declaration of `i` and the initialisation
 * of total_dsize are not visible in this excerpt. */
1044 int allgatherv__ompi(const void *sbuf, int scount,
1045 MPI_Datatype sdtype,
1046 void* rbuf, const int *rcounts,
1048 MPI_Datatype rdtype,
1053 int communicator_size;
1054 size_t dsize, total_dsize;
1056 communicator_size = comm->size();
/* pick the datatype that actually describes the data: send side normally, receive side when in place */
1057 if (MPI_IN_PLACE != sbuf) {
1058 dsize = sdtype->size();
1060 dsize = rdtype->size();
/* accumulate the size contributed by every rank */
1064 for (i = 0; i < communicator_size; i++) {
1065 total_dsize += dsize * rcounts[i];
1068 /* use the per-rank data size as basis, similar to allgather */
1069 size_t per_rank_dsize = total_dsize / communicator_size;
/* table of candidate implementations, addressed with the 1-based algorithm id */
1071 int (*funcs[])(const void*, int, MPI_Datatype, void*, const int*, const int*, MPI_Datatype, MPI_Comm) = {
1073 &allgatherv__ompi_bruck,
1074 &allgatherv__mpich_ring,
1075 &allgatherv__ompi_neighborexchange,
/* thresholds: first on the communicator size, then on the per-rank size */
1085 if (communicator_size == 2) {
1086 if (per_rank_dsize < 2048) {
1088 } else if (per_rank_dsize < 4096) {
1090 } else if (per_rank_dsize < 8192) {
1095 } else if (communicator_size < 8) {
1096 if (per_rank_dsize < 256) {
1098 } else if (per_rank_dsize < 4096) {
1100 } else if (per_rank_dsize < 8192) {
1102 } else if (per_rank_dsize < 16384) {
1104 } else if (per_rank_dsize < 262144) {
1109 } else if (communicator_size < 16) {
1110 if (per_rank_dsize < 1024) {
1115 } else if (communicator_size < 32) {
1116 if (per_rank_dsize < 128) {
1118 } else if (per_rank_dsize < 262144) {
1123 } else if (communicator_size < 64) {
1124 if (per_rank_dsize < 256) {
1126 } else if (per_rank_dsize < 8192) {
1131 } else if (communicator_size < 128) {
1132 if (per_rank_dsize < 256) {
1134 } else if (per_rank_dsize < 4096) {
1139 } else if (communicator_size < 256) {
1140 if (per_rank_dsize < 1024) {
1142 } else if (per_rank_dsize < 65536) {
1147 } else if (communicator_size < 512) {
1148 if (per_rank_dsize < 1024) {
1153 } else if (communicator_size < 1024) {
1154 if (per_rank_dsize < 512) {
1156 } else if (per_rank_dsize < 1024) {
1158 } else if (per_rank_dsize < 4096) {
1160 } else if (per_rank_dsize < 1048576) {
1166 if (per_rank_dsize < 4096) {
/* dispatch: ids are 1-based, the table is 0-based */
1173 return funcs[alg-1](sbuf, scount, sdtype, rbuf, rcounts, rdispls, rdtype, comm);
/* Selects and runs a gather implementation from the communicator size and the payload size,
 * then returns the value produced by funcs[alg-1].
 * The payload size is computed either from rdtype and rcount or from sdtype and scount.
 * NOTE(review): the condition choosing between these two computations and the statements
 * assigning `alg` are not visible in this excerpt; presumably the choice depends on `rank`
 * and `root` - confirm against the full file. */
1176 int gather__ompi(const void *sbuf, int scount,
1177 MPI_Datatype sdtype,
1178 void* rbuf, int rcount,
1179 MPI_Datatype rdtype,
1184 int communicator_size, rank;
1185 size_t dsize, total_dsize;
1187 communicator_size = comm->size();
1188 rank = comm->rank();
/* size computed from the receive-side description */
1191 dsize = rdtype->size();
1192 total_dsize = dsize * rcount;
/* size computed from the send-side description */
1194 dsize = sdtype->size();
1195 total_dsize = dsize * scount;
/* table of candidate implementations, addressed with the 1-based algorithm id */
1197 int (*funcs[])(const void*, int, MPI_Datatype, void*, int, MPI_Datatype, int, MPI_Comm) = {
1198 &gather__ompi_basic_linear,
1199 &gather__ompi_binomial,
1200 &gather__ompi_linear_sync
1203 * {1, "basic_linear"},
1205 * {3, "linear_sync"},
1207 * We do not make any rank specific checks since the params
1208 * should be uniform across ranks.
/* thresholds: first on the communicator size, then on the payload size */
1210 if (communicator_size < 4) {
1211 if (total_dsize < 2) {
1213 } else if (total_dsize < 4) {
1215 } else if (total_dsize < 32768) {
1217 } else if (total_dsize < 65536) {
1219 } else if (total_dsize < 131072) {
1224 } else if (communicator_size < 8) {
1225 if (total_dsize < 1024) {
1227 } else if (total_dsize < 8192) {
1229 } else if (total_dsize < 32768) {
1231 } else if (total_dsize < 262144) {
1236 } else if (communicator_size < 256) {
1238 } else if (communicator_size < 512) {
1239 if (total_dsize < 2048) {
1241 } else if (total_dsize < 8192) {
/* dispatch: ids are 1-based, the table is 0-based */
1250 return funcs[alg-1](sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm);
/* Selects and runs a scatter implementation from the communicator size and the payload size,
 * then returns the value produced by funcs[alg-1].
 * The payload size is computed either from sdtype and scount or from rdtype and rcount.
 * NOTE(review): the condition choosing between these two computations and the statements
 * assigning `alg` are not visible in this excerpt; presumably the choice depends on `rank`
 * and `root` - confirm against the full file. */
1254 int scatter__ompi(const void *sbuf, int scount,
1255 MPI_Datatype sdtype,
1256 void* rbuf, int rcount,
1257 MPI_Datatype rdtype,
1258 int root, MPI_Comm comm
1261 int communicator_size, rank;
1262 size_t dsize, total_dsize;
1265 communicator_size = comm->size();
1266 rank = comm->rank();
/* size computed from the send-side description */
1268 dsize=sdtype->size();
1269 total_dsize = dsize * scount;
/* size computed from the receive-side description */
1271 dsize=rdtype->size();
1272 total_dsize = dsize * rcount;
/* table of candidate implementations, addressed with the 1-based algorithm id;
 * slots 1 and 3 both point to scatter__ompi_basic_linear */
1274 int (*funcs[])(const void*, int, MPI_Datatype, void*, int, MPI_Datatype, int, MPI_Comm) = {
1275 &scatter__ompi_basic_linear,
1276 &scatter__ompi_binomial,
1277 &scatter__ompi_basic_linear
1280 * {1, "basic_linear"},
1284 * We do not make any rank specific checks since the params
1285 * should be uniform across ranks.
/* thresholds: first on the communicator size, then on the payload size */
1287 if (communicator_size < 4) {
1288 if (total_dsize < 2) {
1290 } else if (total_dsize < 131072) {
1292 } else if (total_dsize < 262144) {
1297 } else if (communicator_size < 8) {
1298 if (total_dsize < 2048) {
1300 } else if (total_dsize < 4096) {
1302 } else if (total_dsize < 8192) {
1304 } else if (total_dsize < 32768) {
1306 } else if (total_dsize < 1048576) {
1311 } else if (communicator_size < 16) {
1312 if (total_dsize < 16384) {
1314 } else if (total_dsize < 1048576) {
1319 } else if (communicator_size < 32) {
1320 if (total_dsize < 16384) {
1322 } else if (total_dsize < 32768) {
1327 } else if (communicator_size < 64) {
1328 if (total_dsize < 512) {
1330 } else if (total_dsize < 8192) {
1332 } else if (total_dsize < 16384) {
1338 if (total_dsize < 512) {
/* dispatch: ids are 1-based, the table is 0-based */
1345 return funcs[alg-1](sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm);