1 /* Selector for collective algorithms based on MVAPICH decision logic, calibrated on the Stampede cluster at TACC. */
3 /* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team.
4 * All rights reserved. */
6 /* This is the tuning used by MVAPICH for the Stampede platform, based on (MV2_ARCH_INTEL_XEON_E5_2680_16, MV2_HCA_MLX_CX_FDR). */
10 /************ Alltoall variables and initializers */
/* Capacity of each fixed-size threshold array embedded in this file's tuning-table structs. */
12 #define MV2_MAX_NB_THRESHOLDS 32
18 int (*MV2_pt_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
19 void *recvbuf, int recvcount, MPI_Datatype recvtype,
21 } mv2_alltoall_tuning_element;
26 mv2_alltoall_tuning_element algo_table[MV2_MAX_NB_THRESHOLDS];
27 mv2_alltoall_tuning_element in_place_algo_table[MV2_MAX_NB_THRESHOLDS];
28 } mv2_alltoall_tuning_table;
/* Alltoall implementation pointer, NULL at load time; where it gets assigned is not visible in this part of the file. */
30 int (*MV2_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm_ptr)=NULL;
32 /* Processes-per-node value of each configuration; the array is allocated and filled by init_mv2_alltoall_tables_stampede(). */
33 int *mv2_alltoall_table_ppn_conf = NULL;
34 /* Total number of processes-per-node configurations; the Stampede initializer overwrites this default with 3. */
35 int mv2_alltoall_num_ppn_conf = 1;
/* Number of tuning-table entries belonging to each configuration. */
36 int *mv2_size_alltoall_tuning_table = NULL;
/* One pointer per configuration; the initializer makes a single allocation for slot 0 and points the other slots into it. */
37 mv2_alltoall_tuning_table **mv2_alltoall_thresholds_table = NULL;
/* Aliases that bind the MVAPICH2 alltoall algorithm names referenced by the tuning tables
 * to smpi_coll_tuned_alltoall_* implementations.
 * Note that the "inplace" name is bound to the ring implementation. */
40 #define MPIR_Alltoall_bruck_MV2 smpi_coll_tuned_alltoall_bruck
41 #define MPIR_Alltoall_RD_MV2 smpi_coll_tuned_alltoall_rdb
42 #define MPIR_Alltoall_Scatter_dest_MV2 smpi_coll_tuned_alltoall_mvapich2_scatter_dest
43 #define MPIR_Alltoall_pairwise_MV2 smpi_coll_tuned_alltoall_pair
44 #define MPIR_Alltoall_inplace_MV2 smpi_coll_tuned_alltoall_ring
47 static void init_mv2_alltoall_tables_stampede(){
49 int agg_table_sum = 0;
50 mv2_alltoall_tuning_table **table_ptrs = NULL;
51 mv2_alltoall_num_ppn_conf = 3;
52 mv2_alltoall_thresholds_table = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
53 * mv2_alltoall_num_ppn_conf);
54 table_ptrs = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
55 * mv2_alltoall_num_ppn_conf);
56 mv2_size_alltoall_tuning_table = xbt_malloc(sizeof(int) *
57 mv2_alltoall_num_ppn_conf);
58 mv2_alltoall_table_ppn_conf = xbt_malloc(mv2_alltoall_num_ppn_conf * sizeof(int));
59 mv2_alltoall_table_ppn_conf[0] = 1;
60 mv2_size_alltoall_tuning_table[0] = 6;
61 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_1ppn[] = {
64 {{0, -1, &MPIR_Alltoall_pairwise_MV2},
67 {{0, -1, &MPIR_Alltoall_inplace_MV2},
73 {{0, 262144, &MPIR_Alltoall_Scatter_dest_MV2},
74 {262144, -1, &MPIR_Alltoall_pairwise_MV2},
77 {{0, -1, &MPIR_Alltoall_inplace_MV2},
83 {{0, 8, &MPIR_Alltoall_RD_MV2},
84 {8, -1, &MPIR_Alltoall_Scatter_dest_MV2},
87 {{0, -1, &MPIR_Alltoall_inplace_MV2},
93 {{0, 64, &MPIR_Alltoall_RD_MV2},
94 {64, 512, &MPIR_Alltoall_bruck_MV2},
95 {512, -1, &MPIR_Alltoall_Scatter_dest_MV2},
98 {{0,-1, &MPIR_Alltoall_inplace_MV2},
104 {{0, 32, &MPIR_Alltoall_RD_MV2},
105 {32, 2048, &MPIR_Alltoall_bruck_MV2},
106 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
109 {{0, -1, &MPIR_Alltoall_inplace_MV2},
115 {{0, 8, &MPIR_Alltoall_RD_MV2},
116 {8, 1024, &MPIR_Alltoall_bruck_MV2},
117 {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
120 {{0, -1, &MPIR_Alltoall_inplace_MV2},
124 table_ptrs[0] = mv2_tmp_alltoall_thresholds_table_1ppn;
125 mv2_alltoall_table_ppn_conf[1] = 2;
126 mv2_size_alltoall_tuning_table[1] = 6;
127 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_2ppn[] = {
130 {{0, 32, &MPIR_Alltoall_RD_MV2},
131 {32, -1, &MPIR_Alltoall_Scatter_dest_MV2},
134 {{0, -1, &MPIR_Alltoall_inplace_MV2},
140 {{0, 64, &MPIR_Alltoall_RD_MV2},
141 {64, -1, &MPIR_Alltoall_Scatter_dest_MV2},
144 {{0, -1, &MPIR_Alltoall_inplace_MV2},
150 {{0, 64, &MPIR_Alltoall_RD_MV2},
151 {64, 2048, &MPIR_Alltoall_bruck_MV2},
152 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
155 {{0,-1, &MPIR_Alltoall_inplace_MV2},
161 {{0, 16, &MPIR_Alltoall_RD_MV2},
162 {16, 2048, &MPIR_Alltoall_bruck_MV2},
163 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
166 {{0, -1, &MPIR_Alltoall_inplace_MV2},
172 {{0, 8, &MPIR_Alltoall_RD_MV2},
173 {8, 1024, &MPIR_Alltoall_bruck_MV2},
174 {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
177 {{0, -1, &MPIR_Alltoall_inplace_MV2},
183 {{0, 4, &MPIR_Alltoall_RD_MV2},
184 {4, 2048, &MPIR_Alltoall_bruck_MV2},
185 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
188 {{0, -1, &MPIR_Alltoall_inplace_MV2},
192 table_ptrs[1] = mv2_tmp_alltoall_thresholds_table_2ppn;
193 mv2_alltoall_table_ppn_conf[2] = 16;
194 mv2_size_alltoall_tuning_table[2] = 7;
195 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_16ppn[] = {
198 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
199 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
202 {{32768, -1, &MPIR_Alltoall_inplace_MV2},
208 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
209 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
212 {{16384, -1, &MPIR_Alltoall_inplace_MV2},
218 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
219 {2048, 16384, &MPIR_Alltoall_Scatter_dest_MV2},
220 {16384, -1, &MPIR_Alltoall_pairwise_MV2},
223 {{32768, 131072, &MPIR_Alltoall_inplace_MV2},
229 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
230 {2048, -1, &MPIR_Alltoall_pairwise_MV2},
233 {{16384,65536, &MPIR_Alltoall_inplace_MV2},
239 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
240 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
243 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
249 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
250 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
253 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
258 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
259 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
262 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
267 table_ptrs[2] = mv2_tmp_alltoall_thresholds_table_16ppn;
269 for (i = 0; i < mv2_alltoall_num_ppn_conf; i++) {
270 agg_table_sum += mv2_size_alltoall_tuning_table[i];
272 mv2_alltoall_thresholds_table[0] =
273 xbt_malloc(agg_table_sum * sizeof (mv2_alltoall_tuning_table));
274 memcpy(mv2_alltoall_thresholds_table[0], table_ptrs[0],
275 (sizeof(mv2_alltoall_tuning_table)
276 * mv2_size_alltoall_tuning_table[0]));
277 for (i = 1; i < mv2_alltoall_num_ppn_conf; i++) {
278 mv2_alltoall_thresholds_table[i] =
279 mv2_alltoall_thresholds_table[i - 1]
280 + mv2_size_alltoall_tuning_table[i - 1];
281 memcpy(mv2_alltoall_thresholds_table[i], table_ptrs[i],
282 (sizeof(mv2_alltoall_tuning_table)
283 * mv2_size_alltoall_tuning_table[i]));
285 xbt_free(table_ptrs);
291 /************ Allgather variables and initializers */
296 int (*MV2_pt_Allgather_function)(void *sendbuf,
298 MPI_Datatype sendtype,
301 MPI_Datatype recvtype, MPI_Comm comm_ptr);
302 } mv2_allgather_tuning_element;
306 int two_level[MV2_MAX_NB_THRESHOLDS];
307 int size_inter_table;
308 mv2_allgather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
309 } mv2_allgather_tuning_table;
311 int (*MV2_Allgather_function)(void *sendbuf,
313 MPI_Datatype sendtype,
316 MPI_Datatype recvtype, MPI_Comm comm);
/* Processes-per-node value of each allgather configuration; allocated and filled by init_mv2_allgather_tables_stampede(). */
318 int *mv2_allgather_table_ppn_conf = NULL;
/* Number of processes-per-node configurations; the Stampede initializer overwrites this default with 3. */
319 int mv2_allgather_num_ppn_conf = 1;
/* Number of tuning-table entries belonging to each configuration. */
320 int *mv2_size_allgather_tuning_table = NULL;
/* One pointer per configuration; the initializer makes a single allocation for slot 0 and points the other slots into it. */
321 mv2_allgather_tuning_table **mv2_allgather_thresholds_table = NULL;
323 static int MPIR_Allgather_RD_Allgather_Comm_MV2( void *sendbuf,
325 MPI_Datatype sendtype,
328 MPI_Datatype recvtype, MPI_Comm comm_ptr)
/* Aliases that bind the MVAPICH2 allgather algorithm names to smpi_coll_tuned_allgather_* implementations. */
333 #define MPIR_Allgather_Bruck_MV2 smpi_coll_tuned_allgather_bruck
334 #define MPIR_Allgather_RD_MV2 smpi_coll_tuned_allgather_rdb
335 #define MPIR_Allgather_Ring_MV2 smpi_coll_tuned_allgather_ring
336 #define MPIR_2lvl_Allgather_MV2 smpi_coll_tuned_allgather_mvapich2_smp
338 static void init_mv2_allgather_tables_stampede(){
340 int agg_table_sum = 0;
341 mv2_allgather_tuning_table **table_ptrs = NULL;
342 mv2_allgather_num_ppn_conf = 3;
343 mv2_allgather_thresholds_table
344 = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
345 * mv2_allgather_num_ppn_conf);
346 table_ptrs = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
347 * mv2_allgather_num_ppn_conf);
348 mv2_size_allgather_tuning_table = xbt_malloc(sizeof(int) *
349 mv2_allgather_num_ppn_conf);
350 mv2_allgather_table_ppn_conf
351 = xbt_malloc(mv2_allgather_num_ppn_conf * sizeof(int));
352 mv2_allgather_table_ppn_conf[0] = 1;
353 mv2_size_allgather_tuning_table[0] = 6;
354 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_1ppn[] = {
360 {0, -1, &MPIR_Allgather_Ring_MV2},
368 {0, 262144, &MPIR_Allgather_RD_MV2},
369 {262144, -1, &MPIR_Allgather_Ring_MV2},
377 {0, 131072, &MPIR_Allgather_RD_MV2},
378 {131072, -1, &MPIR_Allgather_Ring_MV2},
386 {0, 131072, &MPIR_Allgather_RD_MV2},
387 {131072, -1, &MPIR_Allgather_Ring_MV2},
395 {0, 65536, &MPIR_Allgather_RD_MV2},
396 {65536, -1, &MPIR_Allgather_Ring_MV2},
404 {0, 32768, &MPIR_Allgather_RD_MV2},
405 {32768, -1, &MPIR_Allgather_Ring_MV2},
409 table_ptrs[0] = mv2_tmp_allgather_thresholds_table_1ppn;
410 mv2_allgather_table_ppn_conf[1] = 2;
411 mv2_size_allgather_tuning_table[1] = 6;
412 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_2ppn[] = {
418 {0, 524288, &MPIR_Allgather_RD_MV2},
419 {524288, -1, &MPIR_Allgather_Ring_MV2},
427 {0, 32768, &MPIR_Allgather_RD_MV2},
428 {32768, 524288, &MPIR_Allgather_Ring_MV2},
429 {524288, -1, &MPIR_Allgather_Ring_MV2},
437 {0, 16384, &MPIR_Allgather_RD_MV2},
438 {16384, 524288, &MPIR_Allgather_Ring_MV2},
439 {524288, -1, &MPIR_Allgather_Ring_MV2},
447 {0, 65536, &MPIR_Allgather_RD_MV2},
448 {65536, 524288, &MPIR_Allgather_Ring_MV2},
449 {524288, -1, &MPIR_Allgather_Ring_MV2},
457 {0, 32768, &MPIR_Allgather_RD_MV2},
458 {32768, 524288, &MPIR_Allgather_Ring_MV2},
459 {524288, -1, &MPIR_Allgather_Ring_MV2},
467 {0, 65536, &MPIR_Allgather_RD_MV2},
468 {65536, 524288, &MPIR_Allgather_Ring_MV2},
469 {524288, -1, &MPIR_Allgather_Ring_MV2},
473 table_ptrs[1] = mv2_tmp_allgather_thresholds_table_2ppn;
474 mv2_allgather_table_ppn_conf[2] = 16;
475 mv2_size_allgather_tuning_table[2] = 6;
476 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_16ppn[] = {
482 {0, 1024, &MPIR_Allgather_RD_MV2},
483 {1024, -1, &MPIR_Allgather_Ring_MV2},
491 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
492 {1024, -1, &MPIR_Allgather_Ring_MV2},
500 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
501 {1024, -1, &MPIR_Allgather_Ring_MV2},
509 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
510 {1024, -1, &MPIR_Allgather_Ring_MV2},
518 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
519 {1024, -1, &MPIR_Allgather_Ring_MV2},
527 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
528 {1024, -1, &MPIR_Allgather_Ring_MV2},
533 table_ptrs[2] = mv2_tmp_allgather_thresholds_table_16ppn;
535 for (i = 0; i < mv2_allgather_num_ppn_conf; i++) {
536 agg_table_sum += mv2_size_allgather_tuning_table[i];
538 mv2_allgather_thresholds_table[0] =
539 xbt_malloc(agg_table_sum * sizeof (mv2_allgather_tuning_table));
540 memcpy(mv2_allgather_thresholds_table[0], table_ptrs[0],
541 (sizeof(mv2_allgather_tuning_table)
542 * mv2_size_allgather_tuning_table[0]));
543 for (i = 1; i < mv2_allgather_num_ppn_conf; i++) {
544 mv2_allgather_thresholds_table[i] =
545 mv2_allgather_thresholds_table[i - 1]
546 + mv2_size_allgather_tuning_table[i - 1];
547 memcpy(mv2_allgather_thresholds_table[i], table_ptrs[i],
548 (sizeof(mv2_allgather_tuning_table)
549 * mv2_size_allgather_tuning_table[i]));
551 xbt_free(table_ptrs);
555 /************ Gather variables and initializers */
560 int (*MV2_pt_Gather_function)(void *sendbuf, int sendcnt,
561 MPI_Datatype sendtype, void *recvbuf, int recvcnt,
562 MPI_Datatype recvtype, int root, MPI_Comm comm_ptr);
563 } mv2_gather_tuning_element;
568 int size_inter_table;
569 mv2_gather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
570 int size_intra_table;
571 mv2_gather_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
572 } mv2_gather_tuning_table;
/* Number of entries in the gather tuning table; init_mv2_gather_tables_stampede() assigns 7 again. */
574 int mv2_size_gather_tuning_table=7;
/* Gather tuning entries; NULL until the initializer allocates the array and copies its local table into it. */
575 mv2_gather_tuning_table * mv2_gather_thresholds_table=NULL;
577 typedef int (*MV2_Gather_function_ptr) (void *sendbuf,
579 MPI_Datatype sendtype,
582 MPI_Datatype recvtype,
583 int root, MPI_Comm comm);
/* Two gather function pointers, both NULL at load time. Judging by the names, one serves the
 * inter-leader step and the other the intra-node step; their assignment is not visible here. */
585 MV2_Gather_function_ptr MV2_Gather_inter_leader_function = NULL;
586 MV2_Gather_function_ptr MV2_Gather_intra_node_function = NULL;
/* Aliases that bind the MVAPICH2 gather algorithm names to smpi_coll_tuned_gather_* implementations. */
589 #define MPIR_Gather_MV2_Direct smpi_coll_tuned_gather_ompi_basic_linear
590 #define MPIR_Gather_MV2_two_level_Direct smpi_coll_tuned_gather_mvapich2_two_level
591 #define MPIR_Gather_intra smpi_coll_tuned_gather_mpich
594 static void init_mv2_gather_tables_stampede(){
596 mv2_size_gather_tuning_table=7;
597 mv2_gather_thresholds_table = xbt_malloc(mv2_size_gather_tuning_table*
598 sizeof (mv2_gather_tuning_table));
599 mv2_gather_tuning_table mv2_tmp_gather_thresholds_table[]={
601 2,{{0, 524288, &MPIR_Gather_MV2_Direct},
602 {524288, -1, &MPIR_Gather_intra}},
603 1,{{0, -1, &MPIR_Gather_MV2_Direct}}},
605 3,{{0, 16384, &MPIR_Gather_MV2_Direct},
606 {16384, 131072, &MPIR_Gather_intra},
607 {131072, -1, &MPIR_Gather_MV2_two_level_Direct}},
608 1,{{0, -1, &MPIR_Gather_intra}}},
610 3,{{0, 256, &MPIR_Gather_MV2_two_level_Direct},
611 {256, 16384, &MPIR_Gather_MV2_Direct},
612 {256, -1, &MPIR_Gather_MV2_two_level_Direct}},
613 1,{{0, -1, &MPIR_Gather_intra}}},
615 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
616 {512, 16384, &MPIR_Gather_MV2_Direct},
617 {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
618 1,{{0, -1, &MPIR_Gather_intra}}},
620 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
621 {512, 16384, &MPIR_Gather_MV2_Direct},
622 {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
623 1,{{0, -1, &MPIR_Gather_intra}}},
625 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
626 {512, 16384, &MPIR_Gather_MV2_Direct},
627 {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
628 1,{{0, -1, &MPIR_Gather_intra}}},
630 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
631 {512, 16384, &MPIR_Gather_MV2_Direct},
632 {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
633 1,{{0, -1, &MPIR_Gather_intra}}},
636 memcpy(mv2_gather_thresholds_table, mv2_tmp_gather_thresholds_table,
637 mv2_size_gather_tuning_table * sizeof (mv2_gather_tuning_table));
642 /************ Allgatherv variables and initializers */
647 int (*MV2_pt_Allgatherv_function)(void *sendbuf,
649 MPI_Datatype sendtype,
653 MPI_Datatype recvtype,
655 } mv2_allgatherv_tuning_element;
659 int size_inter_table;
660 mv2_allgatherv_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
661 } mv2_allgatherv_tuning_table;
663 int (*MV2_Allgatherv_function)(void *sendbuf,
665 MPI_Datatype sendtype,
669 MPI_Datatype recvtype,
/* Number of entries in the allgatherv tuning table; init_mv2_allgatherv_tables_stampede() sets it to 6. */
672 int mv2_size_allgatherv_tuning_table = 0;
/* Allgatherv tuning entries; NULL until the initializer allocates the array and copies its local table into it. */
673 mv2_allgatherv_tuning_table *mv2_allgatherv_thresholds_table = NULL;
/* Aliases that bind the MVAPICH2 allgatherv algorithm names to smpi_coll_tuned_allgatherv_* implementations. */
675 #define MPIR_Allgatherv_Rec_Doubling_MV2 smpi_coll_tuned_allgatherv_mpich_rdb
676 #define MPIR_Allgatherv_Bruck_MV2 smpi_coll_tuned_allgatherv_ompi_bruck
677 #define MPIR_Allgatherv_Ring_MV2 smpi_coll_tuned_allgatherv_mpich_ring
680 static void init_mv2_allgatherv_tables_stampede(){
681 mv2_size_allgatherv_tuning_table = 6;
682 mv2_allgatherv_thresholds_table = xbt_malloc(mv2_size_allgatherv_tuning_table *
683 sizeof (mv2_allgatherv_tuning_table));
684 mv2_allgatherv_tuning_table mv2_tmp_allgatherv_thresholds_table[] = {
689 {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
690 {512, -1, &MPIR_Allgatherv_Ring_MV2},
697 {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
698 {512, -1, &MPIR_Allgatherv_Ring_MV2},
705 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
706 {256, -1, &MPIR_Allgatherv_Ring_MV2},
713 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
714 {256, -1, &MPIR_Allgatherv_Ring_MV2},
721 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
722 {256, -1, &MPIR_Allgatherv_Ring_MV2},
729 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
730 {256, -1, &MPIR_Allgatherv_Ring_MV2},
735 memcpy(mv2_allgatherv_thresholds_table, mv2_tmp_allgatherv_thresholds_table,
736 mv2_size_allgatherv_tuning_table * sizeof (mv2_allgatherv_tuning_table));
740 /************ Allreduce variables and initializers */
745 int (*MV2_pt_Allreduce_function)(void *sendbuf,
748 MPI_Datatype datatype,
749 MPI_Op op, MPI_Comm comm);
750 } mv2_allreduce_tuning_element;
755 int is_two_level_allreduce[MV2_MAX_NB_THRESHOLDS];
756 int size_inter_table;
757 mv2_allreduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
758 int size_intra_table;
759 mv2_allreduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
760 } mv2_allreduce_tuning_table;
763 int (*MV2_Allreduce_function)(void *sendbuf,
766 MPI_Datatype datatype,
767 MPI_Op op, MPI_Comm comm)=NULL;
770 int (*MV2_Allreduce_intra_function)( void *sendbuf,
773 MPI_Datatype datatype,
774 MPI_Op op, MPI_Comm comm)=NULL;
/* Number of entries in the allreduce tuning table; init_mv2_allreduce_tables_stampede() sets it to 8. */
776 int mv2_size_allreduce_tuning_table = 0;
/* Allreduce tuning entries; NULL until the initializer allocates the array and copies its local table into it. */
777 mv2_allreduce_tuning_table *mv2_allreduce_thresholds_table = NULL;
783 static int MPIR_Allreduce_mcst_reduce_two_level_helper_MV2( void *sendbuf,
786 MPI_Datatype datatype,
787 MPI_Op op, MPI_Comm comm)
792 static int MPIR_Allreduce_mcst_reduce_redscat_gather_MV2( void *sendbuf,
795 MPI_Datatype datatype,
796 MPI_Op op, MPI_Comm comm)
801 static int MPIR_Allreduce_reduce_p2p_MV2( void *sendbuf,
804 MPI_Datatype datatype,
805 MPI_Op op, MPI_Comm comm)
807 mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
811 static int MPIR_Allreduce_reduce_shmem_MV2( void *sendbuf,
814 MPI_Datatype datatype,
815 MPI_Op op, MPI_Comm comm)
817 mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
/* Aliases that bind the MVAPICH2 allreduce algorithm names to smpi_coll_tuned_allreduce_* implementations. */
821 #define MPIR_Allreduce_pt2pt_rd_MV2 smpi_coll_tuned_allreduce_rdb
822 #define MPIR_Allreduce_pt2pt_rs_MV2 smpi_coll_tuned_allreduce_mvapich2_rs
823 #define MPIR_Allreduce_two_level_MV2 smpi_coll_tuned_allreduce_mvapich2_two_level
826 static void init_mv2_allreduce_tables_stampede(){
827 mv2_size_allreduce_tuning_table = 8;
828 mv2_allreduce_thresholds_table = xbt_malloc(mv2_size_allreduce_tuning_table *
829 sizeof (mv2_allreduce_tuning_table));
830 mv2_allreduce_tuning_table mv2_tmp_allreduce_thresholds_table[] = {
837 {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
838 {1024, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
842 {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
843 {1024, -1, &MPIR_Allreduce_reduce_p2p_MV2},
852 {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
853 {1024, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
854 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
858 {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
859 {1024, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
868 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
869 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
870 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
874 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
875 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
884 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
885 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
886 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
890 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
891 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
900 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
901 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
902 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
906 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
907 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
916 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
917 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
918 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
922 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
923 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
932 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
933 {512, 8192, &MPIR_Allreduce_pt2pt_rd_MV2},
934 {8192, 65536, &MPIR_Allreduce_pt2pt_rs_MV2},
935 {65536, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
939 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
940 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
949 {0, 64, &MPIR_Allreduce_pt2pt_rd_MV2},
950 {64, 512, &MPIR_Allreduce_reduce_p2p_MV2},
951 {512, 4096, &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2},
952 {4096, 16384, &MPIR_Allreduce_pt2pt_rs_MV2},
953 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
957 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
958 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
963 memcpy(mv2_allreduce_thresholds_table, mv2_tmp_allreduce_thresholds_table,
964 mv2_size_allreduce_tuning_table * sizeof (mv2_allreduce_tuning_table));
973 int (*MV2_pt_Bcast_function) (void *buf, int count, MPI_Datatype datatype,
974 int root, MPI_Comm comm_ptr);
975 int zcpy_pipelined_knomial_factor;
976 } mv2_bcast_tuning_element;
980 int bcast_segment_size;
981 int intra_node_knomial_factor;
982 int inter_node_knomial_factor;
983 int is_two_level_bcast[MV2_MAX_NB_THRESHOLDS];
984 int size_inter_table;
985 mv2_bcast_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
986 int size_intra_table;
987 mv2_bcast_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
988 } mv2_bcast_tuning_table;
/* Number of entries in the broadcast tuning table; init_mv2_bcast_tables_stampede() sets it to 8. */
990 int mv2_size_bcast_tuning_table = 0;
/* Broadcast tuning entries; NULL until the initializer allocates the array and copies its local table into it. */
991 mv2_bcast_tuning_table *mv2_bcast_thresholds_table = NULL;
/* Broadcast function pointer, NULL at load time; its assignment is not visible in this part of the file. */
994 int (*MV2_Bcast_function) (void *buffer, int count, MPI_Datatype datatype,
995 int root, MPI_Comm comm_ptr) = NULL;
/* Second broadcast pointer with the same signature, presumably for the intra-node step (per its name) -- confirm at the use site. */
997 int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype datatype,
998 int root, MPI_Comm comm_ptr) = NULL;
/* File-scope broadcast parameters with their initial values.
 * Their readers and writers are outside this part of the file. */
int zcpy_knomial_factor = 2;
int mv2_pipelined_zcpy_knomial_factor = -1;
int bcast_segment_size = 8192;
int mv2_inter_node_knomial_factor = 4;
int mv2_intra_node_knomial_factor = 4;

/* Broadcast-related constants; their use sites are outside this part of the file. */
#define mv2_bcast_two_level_system_size 64
#define mv2_bcast_short_msg 16384
/* Parenthesized so the product stays a single operand wherever the macro expands
 * (e.g. `x / mv2_bcast_large_msg` must divide by 524288, not by 512 and then multiply by 1024). */
#define mv2_bcast_large_msg (512*1024)

#define INTRA_NODE_ROOT 0
/* Aliases that bind the MVAPICH2 broadcast algorithm names to smpi_coll_tuned_bcast_* implementations.
 * Several distinct MVAPICH2 names share one target: the two pipelined variants and the shmem
 * variant all resolve to smpi_coll_tuned_bcast_mpich, both scatter-ring-allgather variants resolve
 * to smpi_coll_tuned_bcast_scatter_LR_allgather, and both inter-node helpers resolve to
 * smpi_coll_tuned_bcast_mvapich2_inter_node. */
1011 #define MPIR_Pipelined_Bcast_Zcpy_MV2 smpi_coll_tuned_bcast_mpich
1012 #define MPIR_Pipelined_Bcast_MV2 smpi_coll_tuned_bcast_mpich
1013 #define MPIR_Bcast_binomial_MV2 smpi_coll_tuned_bcast_binomial_tree
1014 #define MPIR_Bcast_scatter_ring_allgather_shm_MV2 smpi_coll_tuned_bcast_scatter_LR_allgather
1015 #define MPIR_Bcast_scatter_doubling_allgather_MV2 smpi_coll_tuned_bcast_scatter_rdb_allgather
1016 #define MPIR_Bcast_scatter_ring_allgather_MV2 smpi_coll_tuned_bcast_scatter_LR_allgather
1017 #define MPIR_Shmem_Bcast_MV2 smpi_coll_tuned_bcast_mpich
1018 #define MPIR_Bcast_tune_inter_node_helper_MV2 smpi_coll_tuned_bcast_mvapich2_inter_node
1019 #define MPIR_Bcast_inter_node_helper_MV2 smpi_coll_tuned_bcast_mvapich2_inter_node
1020 #define MPIR_Knomial_Bcast_intra_node_MV2 smpi_coll_tuned_bcast_mvapich2_knomial_intra_node
1021 #define MPIR_Bcast_intra_MV2 smpi_coll_tuned_bcast_mvapich2_intra_node
1023 static void init_mv2_bcast_tables_stampede(){
1025 mv2_size_bcast_tuning_table=8;
1026 mv2_bcast_thresholds_table = xbt_malloc(mv2_size_bcast_tuning_table *
1027 sizeof (mv2_bcast_tuning_table));
1029 mv2_bcast_tuning_table mv2_tmp_bcast_thresholds_table[]={
1033 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
1036 {0, 8, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1037 {8, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1038 {16, 1024, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1039 {1024, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1040 {8192, 16384, &MPIR_Bcast_binomial_MV2, -1},
1041 {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1042 {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1043 {65536, 131072, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1044 {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_MV2, -1},
1045 {262144, 524288, &MPIR_Bcast_scatter_doubling_allgather_MV2, -1},
1046 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1050 {0, 8, &MPIR_Shmem_Bcast_MV2, 2},
1051 {8, 16, &MPIR_Shmem_Bcast_MV2, 4},
1052 {16, 1024, &MPIR_Shmem_Bcast_MV2, 2},
1053 {1024, 8192, &MPIR_Shmem_Bcast_MV2, 4},
1054 {8192, 16384, &MPIR_Shmem_Bcast_MV2, -1},
1055 {16384, 32768, &MPIR_Shmem_Bcast_MV2, 4},
1056 {32768, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1057 {65536, 131072, &MPIR_Shmem_Bcast_MV2, -1},
1058 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1059 {262144, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1060 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1066 {1, 1, 1, 1, 1, 1, 1, 1},
1069 {0, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1070 {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1071 {256, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1072 {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1073 {65536, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1074 {131072, 262144, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1075 {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1076 {524288, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}
1080 {0, 128, &MPIR_Shmem_Bcast_MV2, 2},
1081 {128, 256, &MPIR_Shmem_Bcast_MV2, 4},
1082 {256, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1083 {32768, 65536, &MPIR_Shmem_Bcast_MV2, 4},
1084 {65536, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1085 {131072, 262144, &MPIR_Shmem_Bcast_MV2, 8},
1086 {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1087 {524288, -1, &MPIR_Shmem_Bcast_MV2, 8}
1093 {1, 1, 1, 1, 1, 1, 1, 1, 1},
1096 {0, 2, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1097 {2, 4, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1098 {4, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1099 {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1100 {32, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1101 {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1102 {256, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1103 {4096, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1104 {32768, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}
1108 {0, 2, &MPIR_Shmem_Bcast_MV2, 4},
1109 {2, 4, &MPIR_Shmem_Bcast_MV2, 8},
1110 {4, 16, &MPIR_Shmem_Bcast_MV2, 4},
1111 {16, 32, &MPIR_Shmem_Bcast_MV2, 8},
1112 {32, 128, &MPIR_Shmem_Bcast_MV2, 4},
1113 {128, 256, &MPIR_Shmem_Bcast_MV2, 8},
1114 {256, 4096, &MPIR_Shmem_Bcast_MV2, 4},
1115 {4096, 32768, &MPIR_Shmem_Bcast_MV2, 8},
1116 {32768, -1, &MPIR_Shmem_Bcast_MV2, 2}
1125 {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1126 {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1127 {16384, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1128 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1132 {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1133 {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1134 {16384, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1135 {524288, -1, NULL, -1}
1144 {0, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1145 {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1146 {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1147 {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1148 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1152 {0, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1153 {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1154 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1155 {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1156 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1165 {0, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1166 {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1167 {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1168 {131072, 262144, &MPIR_Pipelined_Bcast_MV2, -1},
1169 {262144, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1173 {0, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1174 {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1175 {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1176 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1177 {262144, -1, &MPIR_Shmem_Bcast_MV2, -1}
1186 {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1187 {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1188 {16384, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1189 {65536, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1190 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1194 {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1195 {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1196 {16384, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1197 {65536, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1198 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1204 {1, 1, 1, 1, 1, 1, 1},
1207 {0, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1208 {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1209 {32, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1210 {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1211 {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1212 {32768, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1213 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1217 {0, 16, &MPIR_Shmem_Bcast_MV2, 8},
1218 {16, 32, &MPIR_Shmem_Bcast_MV2, 4},
1219 {32, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1220 {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1221 {16384, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1222 {32768, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1223 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1228 memcpy(mv2_bcast_thresholds_table, mv2_tmp_bcast_thresholds_table,
1229 mv2_size_bcast_tuning_table * sizeof (mv2_bcast_tuning_table));
1233 /************ Reduce variables and initializers */
1238 int (*MV2_pt_Reduce_function)(void *sendbuf,
1241 MPI_Datatype datatype,
1245 } mv2_reduce_tuning_element;
1251 int is_two_level_reduce[MV2_MAX_NB_THRESHOLDS];
1252 int size_inter_table;
1253 mv2_reduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1254 int size_intra_table;
1255 mv2_reduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1256 } mv2_reduce_tuning_table;
/* Number of entries in the reduce tuning table; init_mv2_reduce_tables_stampede() sets it to 8. */
1258 int mv2_size_reduce_tuning_table = 0;
/* Reduce tuning entries; NULL until the initializer allocates the array. */
1259 mv2_reduce_tuning_table *mv2_reduce_thresholds_table = NULL;
/* Knomial factors for the intra and inter reduce steps, both initialised to -1.
 * NOTE(review): what -1 means to the consumer is not visible here -- confirm at the use site. */
1262 int mv2_reduce_intra_knomial_factor = -1;
1263 int mv2_reduce_inter_knomial_factor = -1;
1265 int (*MV2_Reduce_function)( void *sendbuf,
1268 MPI_Datatype datatype,
1271 MPI_Comm comm_ptr)=NULL;
1273 int (*MV2_Reduce_intra_function)( void *sendbuf,
1276 MPI_Datatype datatype,
1279 MPI_Comm comm_ptr)=NULL;
/* Aliases that bind the MVAPICH2 reduce algorithm names to smpi_coll_tuned_reduce_* implementations.
 * The inter and intra knomial wrappers share the same target, and the shmem name is bound to
 * the OMPI basic-linear implementation. */
1282 #define MPIR_Reduce_inter_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
1283 #define MPIR_Reduce_intra_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
1284 #define MPIR_Reduce_binomial_MV2 smpi_coll_tuned_reduce_binomial
1285 #define MPIR_Reduce_redscat_gather_MV2 smpi_coll_tuned_reduce_scatter_gather
1286 #define MPIR_Reduce_shmem_MV2 smpi_coll_tuned_reduce_ompi_basic_linear
1287 #define MPIR_Reduce_two_level_helper_MV2 smpi_coll_tuned_reduce_mvapich2_two_level
1290 static void init_mv2_reduce_tables_stampede(){
1292 mv2_size_reduce_tuning_table = 8;
1293 mv2_reduce_thresholds_table = xbt_malloc(mv2_size_reduce_tuning_table *
1294 sizeof (mv2_reduce_tuning_table));
1295 mv2_reduce_tuning_table mv2_tmp_reduce_thresholds_table[] = {
1303 {0, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1304 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1305 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1309 {0, 65536, &MPIR_Reduce_shmem_MV2},
1310 {65536,-1, &MPIR_Reduce_binomial_MV2},
1317 {1, 1, 1, 1, 0, 0, 0},
1320 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1321 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1322 {16384, 32768, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1323 {32768, 65536, &MPIR_Reduce_binomial_MV2},
1324 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1325 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1326 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1330 {0, 8192, &MPIR_Reduce_shmem_MV2},
1331 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1332 {16384, 32768, &MPIR_Reduce_shmem_MV2},
1333 {32768, 65536, &MPIR_Reduce_shmem_MV2},
1334 {65536, 262144, &MPIR_Reduce_shmem_MV2},
1335 {262144,-1, &MPIR_Reduce_binomial_MV2},
1345 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1346 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1347 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1348 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1349 {262144, -1, &MPIR_Reduce_redscat_gather_MV2},
1353 {0, 8192, &MPIR_Reduce_shmem_MV2},
1354 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1355 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1356 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1357 {262144, -1, &MPIR_Reduce_binomial_MV2},
1367 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1368 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1369 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1370 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1371 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1372 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1376 {0, 8192, &MPIR_Reduce_shmem_MV2},
1377 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1378 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1379 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1380 {262144, -1, &MPIR_Reduce_binomial_MV2},
1387 {1, 1, 1, 0, 1, 1, 0},
1390 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1391 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1392 {16384, 32768, &MPIR_Reduce_binomial_MV2},
1393 {32768, 65536, &MPIR_Reduce_binomial_MV2},
1394 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1395 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1396 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1400 {0, 8192, &MPIR_Reduce_shmem_MV2},
1401 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1402 {16384, 32768, &MPIR_Reduce_shmem_MV2},
1403 {32768, 65536, &MPIR_Reduce_shmem_MV2},
1404 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1405 {262144, -1, &MPIR_Reduce_binomial_MV2},
1415 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1416 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1417 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1418 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1419 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1420 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1424 {0, 8192, &MPIR_Reduce_shmem_MV2},
1425 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1426 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1427 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1428 {262144, -1, &MPIR_Reduce_binomial_MV2},
1438 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1439 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1440 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1441 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1442 {262144, -1, &MPIR_Reduce_binomial_MV2},
1446 {0, 8192, &MPIR_Reduce_shmem_MV2},
1447 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1448 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1449 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1450 {262144, -1, &MPIR_Reduce_binomial_MV2},
1460 {0, 2048, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1461 {2048, 4096, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1462 {4096, 16384, &MPIR_Reduce_binomial_MV2},
1463 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1464 {65536, 131072, &MPIR_Reduce_binomial_MV2},
1465 {131072, -1, &MPIR_Reduce_binomial_MV2},
1469 {0, 2048, &MPIR_Reduce_shmem_MV2},
1470 {2048, 4096, &MPIR_Reduce_shmem_MV2},
1471 {4096, 16384, &MPIR_Reduce_shmem_MV2},
1472 {16384, 65536, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1473 {65536, 131072, &MPIR_Reduce_binomial_MV2},
1474 {131072, -1, &MPIR_Reduce_shmem_MV2},
1479 memcpy(mv2_reduce_thresholds_table, mv2_tmp_reduce_thresholds_table,
1480 mv2_size_reduce_tuning_table * sizeof (mv2_reduce_tuning_table));
1483 /************ Reduce scatter variables and initializers */
/* Member of mv2_red_scat_tuning_element: pointer to the reduce-scatter
 * implementation associated with this tuning entry. The pointed-to function
 * returns int and takes the send buffer and an MPI_Datatype among its
 * arguments. */
1488 int (*MV2_pt_Red_scat_function)(void *sendbuf,
1491 MPI_Datatype datatype,
1494 } mv2_red_scat_tuning_element;
/* Reduce-scatter tuning table. size_inter_table is presumably the number of
 * inter_leader entries actually in use (TODO confirm against the selector
 * code); inter_leader can hold up to MV2_MAX_NB_THRESHOLDS tuning elements. */
1498 int size_inter_table;
1499 mv2_red_scat_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1500 } mv2_red_scat_tuning_table;
/* Number of tables in mv2_red_scat_thresholds_table; stays 0 until
 * init_mv2_reduce_scatter_tables_stampede() sets it. */
1502 int mv2_size_red_scat_tuning_table = 0;
/* Array of reduce-scatter tuning tables; stays NULL until
 * init_mv2_reduce_scatter_tables_stampede() allocates it with xbt_malloc and
 * fills it. */
1503 mv2_red_scat_tuning_table *mv2_red_scat_thresholds_table = NULL;
/* File-scope pointer to a reduce-scatter implementation (returns int; takes
 * the send buffer and an MPI_Datatype among its arguments). Presumably set by
 * the selector to the algorithm picked from the tuning table - TODO confirm. */
1506 int (*MV2_Red_scat_function)(void *sendbuf,
1509 MPI_Datatype datatype,
/* Thin wrapper that exposes smpi_mpi_reduce_scatter() under an MVAPICH2-style
 * name so it can be referenced from the reduce-scatter tuning table. It
 * forwards sendbuf, recvbuf, recvcnts, datatype, op and comm unchanged. */
1515 static int MPIR_Reduce_Scatter_Basic_MV2(void *sendbuf,
1518 MPI_Datatype datatype,
1522 smpi_mpi_reduce_scatter(sendbuf,recvbuf,recvcnts,datatype,op,comm);
/* Aliases mapping the MVAPICH2 reduce-scatter algorithm names onto the SMPI
 * collective implementations that stand in for them. */
1525 #define MPIR_Reduce_scatter_non_comm_MV2 smpi_coll_tuned_reduce_scatter_mpich_noncomm
1526 #define MPIR_Reduce_scatter_Rec_Halving_MV2 smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving
1527 #define MPIR_Reduce_scatter_Pair_Wise_MV2 smpi_coll_tuned_reduce_scatter_mpich_pair
/* Populate the global reduce-scatter tuning tables with the Stampede
 * calibration.
 *
 * Sets mv2_size_red_scat_tuning_table to 6, allocates
 * mv2_red_scat_thresholds_table for that many tables with xbt_malloc, builds
 * the tables in a local array and copies them into the allocated storage.
 * Takes no arguments and returns nothing; it only writes the two globals. */
1532 static void init_mv2_reduce_scatter_tables_stampede(){
1533 mv2_size_red_scat_tuning_table = 6;
1534 mv2_red_scat_thresholds_table = xbt_malloc(mv2_size_red_scat_tuning_table *
1535 sizeof (mv2_red_scat_tuning_table));
/* Local staging copy of the tables. Each row below is two integers followed
 * by the address of a reduce-scatter implementation; the integers look like a
 * [lower, upper) message-size range with -1 meaning "no upper bound" -
 * TODO confirm against the element typedef and the selector code. */
1536 mv2_red_scat_tuning_table mv2_tmp_red_scat_thresholds_table[] = {
1541 {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1542 {64, 65536, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1543 {65536, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1550 {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1551 {64, 131072, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1552 {131072, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1559 {0, 1024, &MPIR_Reduce_Scatter_Basic_MV2},
1560 {1024, 262144, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1561 {262144, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1568 {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1569 {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1576 {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1577 {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1584 {0, 256, &MPIR_Reduce_Scatter_Basic_MV2},
1585 {256, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
/* Copy the staged tables (mv2_size_red_scat_tuning_table of them) into the
 * storage allocated above, since the local array does not outlive this call. */
1590 memcpy(mv2_red_scat_thresholds_table, mv2_tmp_red_scat_thresholds_table,
1591 mv2_size_red_scat_tuning_table * sizeof (mv2_red_scat_tuning_table));
1594 /************ Scatter variables and initializers */
/* Member of mv2_scatter_tuning_element: pointer to the scatter implementation
 * associated with this tuning entry. The pointed-to function returns int and
 * takes, among its arguments, the send buffer, the send and receive
 * datatypes, the root rank and the communicator. */
1599 int (*MV2_pt_Scatter_function)(void *sendbuf,
1601 MPI_Datatype sendtype,
1604 MPI_Datatype recvtype,
1605 int root, MPI_Comm comm);
1606 } mv2_scatter_tuning_element;
/* Scatter tuning table with two lists of tuning elements, inter_leader and
 * intra_node, each holding up to MV2_MAX_NB_THRESHOLDS entries.
 * size_inter_table and size_intra_table are presumably the number of entries
 * in use in the respective list - TODO confirm against the selector code. */
1610 int size_inter_table;
1611 mv2_scatter_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1612 int size_intra_table;
1613 mv2_scatter_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1614 } mv2_scatter_tuning_table;
/* One value per configuration; init_mv2_scatter_tables_stampede() stores 1, 2
 * and 16 here (presumably processes per node, going by the "ppn" naming). */
1617 int *mv2_scatter_table_ppn_conf = NULL;
/* Number of configurations; defaults to 1 and is set to 3 by
 * init_mv2_scatter_tables_stampede(). */
1618 int mv2_scatter_num_ppn_conf = 1;
/* Per-configuration count of tuning tables (filled by the init function). */
1619 int *mv2_size_scatter_tuning_table = NULL;
/* Per-configuration pointer to that configuration's first tuning table
 * (allocated and filled by the init function). */
1620 mv2_scatter_tuning_table **mv2_scatter_thresholds_table = NULL;
/* File-scope pointer to a scatter implementation with the standard scatter
 * argument list; NULL by default. Presumably set by the selector to the
 * algorithm picked from a table's inter_leader list - TODO confirm. */
1622 int (*MV2_Scatter_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1623 void *recvbuf, int recvcount, MPI_Datatype recvtype,
1624 int root, MPI_Comm comm)=NULL;
/* Same signature as MV2_Scatter_function; NULL by default. Presumably holds
 * the algorithm picked from a table's intra_node list - TODO confirm. */
1626 int (*MV2_Scatter_intra_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1627 void *recvbuf, int recvcount, MPI_Datatype recvtype,
1628 int root, MPI_Comm comm)=NULL;
/* Forward declaration of MPIR_Scatter_mcst_wrap_MV2, which is referenced by
 * address from the scatter tuning tables. It returns int and takes, among its
 * arguments, the send buffer, the send and receive datatypes, the root rank
 * and the communicator. */
1629 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1631 MPI_Datatype sendtype,
1634 MPI_Datatype recvtype,
1635 int root, MPI_Comm comm_ptr);
/* Definition header of MPIR_Scatter_mcst_wrap_MV2.
 * NOTE(review): no function body follows this header here - verify the
 * definition against the upstream file before relying on its behaviour. */
1637 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1639 MPI_Datatype sendtype,
1642 MPI_Datatype recvtype,
1643 int root, MPI_Comm comm_ptr)
/* Aliases mapping the MVAPICH2 scatter algorithm names used in the tuning
 * tables onto the SMPI collective implementations that stand in for them. */
1648 #define MPIR_Scatter_MV2_Binomial smpi_coll_tuned_scatter_ompi_binomial
1649 #define MPIR_Scatter_MV2_Direct smpi_coll_tuned_scatter_ompi_basic_linear
1650 #define MPIR_Scatter_MV2_two_level_Binomial smpi_coll_tuned_scatter_mvapich2_two_level_binomial
1651 #define MPIR_Scatter_MV2_two_level_Direct smpi_coll_tuned_scatter_mvapich2_two_level_direct
/* Populate the global scatter tuning tables with the Stampede calibration.
 *
 * Three configurations are set up, tagged 1, 2 and 16 in
 * mv2_scatter_table_ppn_conf (presumably processes per node, going by the
 * "ppn" naming), holding 6, 6 and 8 tuning tables respectively. The tables are
 * staged in local arrays and then copied into a single xbt_malloc'ed buffer;
 * mv2_scatter_thresholds_table[i] points at configuration i's first table
 * inside that buffer. Takes no arguments and returns nothing; it only writes
 * the scatter globals. */
1656 static void init_mv2_scatter_tables_stampede(){
1658 int agg_table_sum = 0;
1660 mv2_scatter_tuning_table **table_ptrs = NULL;
1661 mv2_scatter_num_ppn_conf = 3;
/* Allocate the per-configuration bookkeeping arrays (one slot per
 * configuration); table_ptrs is scratch storage released before returning. */
1662 mv2_scatter_thresholds_table
1663 = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1664 * mv2_scatter_num_ppn_conf);
1665 table_ptrs = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1666 * mv2_scatter_num_ppn_conf);
1667 mv2_size_scatter_tuning_table = xbt_malloc(sizeof(int) *
1668 mv2_scatter_num_ppn_conf);
1669 mv2_scatter_table_ppn_conf
1670 = xbt_malloc(mv2_scatter_num_ppn_conf * sizeof(int));
/* Configuration 0: tagged 1, six tables. Each row in the initializers below
 * is two integers followed by the address of a scatter implementation; the
 * integers look like a [lower, upper) message-size range with -1 meaning
 * "no upper bound" - TODO confirm against the element typedef. */
1671 mv2_scatter_table_ppn_conf[0] = 1;
1672 mv2_size_scatter_tuning_table[0] = 6;
1673 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_1ppn[] = {
1677 {0, -1, &MPIR_Scatter_MV2_Binomial},
1681 {0, -1, &MPIR_Scatter_MV2_Binomial},
1688 {0, -1, &MPIR_Scatter_MV2_Direct},
1692 {0, -1, &MPIR_Scatter_MV2_Direct},
1699 {0, -1, &MPIR_Scatter_MV2_Direct},
1703 {0, -1, &MPIR_Scatter_MV2_Direct},
1710 {0, -1, &MPIR_Scatter_MV2_Direct},
1714 {0, -1, &MPIR_Scatter_MV2_Direct},
1721 {0, -1, &MPIR_Scatter_MV2_Direct},
1725 {0, -1, &MPIR_Scatter_MV2_Direct},
1732 {0, 32, &MPIR_Scatter_MV2_Binomial},
1733 {32, -1, &MPIR_Scatter_MV2_Direct},
1737 {0, -1, &MPIR_Scatter_MV2_Binomial},
1741 table_ptrs[0] = mv2_tmp_scatter_thresholds_table_1ppn;
/* Configuration 1: tagged 2, six tables. */
1742 mv2_scatter_table_ppn_conf[1] = 2;
1743 mv2_size_scatter_tuning_table[1] = 6;
1744 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_2ppn[] = {
1748 {0, 4096, &MPIR_Scatter_MV2_Binomial},
1749 {4096, -1, &MPIR_Scatter_MV2_Direct},
1753 {0, -1, &MPIR_Scatter_MV2_Direct},
1760 {0, 512, &MPIR_Scatter_MV2_two_level_Direct},
1761 {512, -1, &MPIR_Scatter_MV2_Direct},
1765 {0, -1, &MPIR_Scatter_MV2_Binomial},
1772 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1773 {2048, -1, &MPIR_Scatter_MV2_Direct},
1777 {0, -1, &MPIR_Scatter_MV2_Binomial},
1784 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1785 {2048, -1, &MPIR_Scatter_MV2_Direct},
1789 {0, -1, &MPIR_Scatter_MV2_Binomial},
1796 {0, 8192, &MPIR_Scatter_MV2_two_level_Direct},
1797 {8192, -1, &MPIR_Scatter_MV2_Direct},
1801 {0, -1, &MPIR_Scatter_MV2_Binomial},
1808 {0, 16, &MPIR_Scatter_MV2_Binomial},
1809 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1810 {128, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1811 {16384, -1, &MPIR_Scatter_MV2_Direct},
1815 {0, 128, &MPIR_Scatter_MV2_Direct},
1816 {128, -1, &MPIR_Scatter_MV2_Binomial},
1820 table_ptrs[1] = mv2_tmp_scatter_thresholds_table_2ppn;
/* Configuration 2: tagged 16, eight tables. */
1821 mv2_scatter_table_ppn_conf[2] = 16;
1822 mv2_size_scatter_tuning_table[2] = 8;
1823 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_16ppn[] = {
1828 {0, 256, &MPIR_Scatter_MV2_Binomial},
1829 {256, -1, &MPIR_Scatter_MV2_Direct},
1833 { 0, -1, &MPIR_Scatter_MV2_Direct},
1841 {0, 512, &MPIR_Scatter_MV2_Binomial},
1842 {512, -1, &MPIR_Scatter_MV2_Direct},
1846 { 0, -1, &MPIR_Scatter_MV2_Direct},
1854 {0, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1855 {1024, -1, &MPIR_Scatter_MV2_Direct},
1859 { 0, -1, &MPIR_Scatter_MV2_Direct},
/* NOTE(review): from here on the first two rows of each group both start at
 * 0 with upper bound 16 (an mcst wrapper row followed by a second algorithm).
 * Presumably the selector skips the mcst row when that path is unavailable -
 * confirm against the selector code. */
1867 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1868 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1869 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1870 {2048, -1, &MPIR_Scatter_MV2_Direct},
1874 { 0, -1, &MPIR_Scatter_MV2_Direct},
1882 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1883 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1884 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1885 {2048, -1, &MPIR_Scatter_MV2_Direct},
1889 { 0, -1, &MPIR_Scatter_MV2_Direct},
1897 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
/* NOTE(review): this row is {16, 16} whereas the same slot in the
 * neighbouring groups is {0, 16} - confirm against the upstream MVAPICH2
 * tuning data whether this is intentional. */
1898 {16, 16, &MPIR_Scatter_MV2_two_level_Direct},
1899 {16, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1900 {4096, -1, &MPIR_Scatter_MV2_Direct},
1904 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1911 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1912 {0, 16, &MPIR_Scatter_MV2_Binomial},
1913 {16, 32, &MPIR_Scatter_MV2_Binomial},
1914 {32, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1915 {4096, -1, &MPIR_Scatter_MV2_Direct},
1919 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1926 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1927 {0, 16, &MPIR_Scatter_MV2_two_level_Binomial},
1928 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1929 {128, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1930 {1024, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1931 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1932 {65536, -1, &MPIR_Scatter_MV2_two_level_Direct},
1936 {0, 16, &MPIR_Scatter_MV2_Binomial},
1937 {16, 128, &MPIR_Scatter_MV2_Binomial},
1938 {128, 1024, &MPIR_Scatter_MV2_Binomial},
1939 {1024, 16384, &MPIR_Scatter_MV2_Direct},
1940 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1941 {65536, -1, &MPIR_Scatter_MV2_Direct},
1945 table_ptrs[2] = mv2_tmp_scatter_thresholds_table_16ppn;
/* Total number of tables across all configurations. */
1947 for (i = 0; i < mv2_scatter_num_ppn_conf; i++) {
1948 agg_table_sum += mv2_size_scatter_tuning_table[i];
/* One buffer holds every configuration's tables back to back; slot 0 owns the
 * allocation and receives the first configuration's tables. */
1950 mv2_scatter_thresholds_table[0] =
1951 xbt_malloc(agg_table_sum * sizeof (mv2_scatter_tuning_table));
1952 memcpy(mv2_scatter_thresholds_table[0], table_ptrs[0],
1953 (sizeof(mv2_scatter_tuning_table)
1954 * mv2_size_scatter_tuning_table[0]));
/* Each later configuration starts right after the previous one's tables in
 * the same buffer, then gets its staged tables copied in. */
1955 for (i = 1; i < mv2_scatter_num_ppn_conf; i++) {
1956 mv2_scatter_thresholds_table[i] =
1957 mv2_scatter_thresholds_table[i - 1]
1958 + mv2_size_scatter_tuning_table[i - 1];
1959 memcpy(mv2_scatter_thresholds_table[i], table_ptrs[i],
1960 (sizeof(mv2_scatter_tuning_table)
1961 * mv2_size_scatter_tuning_table[i]));
/* The staging pointer array is no longer needed once the tables are copied. */
1963 xbt_free(table_ptrs);