1 /* Selector for collective algorithms based on the MVAPICH decision logic, with calibration from the Stampede cluster at TACC. */
3 /* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team.
4 * All rights reserved. */
6 /* This is the tuning used by MVAPICH for the Stampede platform, based on (MV2_ARCH_INTEL_XEON_E5_2680_16, MV2_HCA_MLX_CX_FDR). */
10 /************ Alltoall variables and initializers */
12 #define MV2_MAX_NB_THRESHOLDS 32
18 int (*MV2_pt_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
19 void *recvbuf, int recvcount, MPI_Datatype recvtype,
21 } mv2_alltoall_tuning_element;
26 mv2_alltoall_tuning_element algo_table[MV2_MAX_NB_THRESHOLDS];
27 mv2_alltoall_tuning_element in_place_algo_table[MV2_MAX_NB_THRESHOLDS];
28 } mv2_alltoall_tuning_table;
30 int (*MV2_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm_ptr)=NULL;
32 /* Processes-per-node (ppn) value of each alltoall tuning configuration */
33 int *mv2_alltoall_table_ppn_conf = NULL;
34 /* Total number of ppn configurations */
35 int mv2_alltoall_num_ppn_conf = 1;
36 int *mv2_size_alltoall_tuning_table = NULL; /* number of tuning-table entries for each ppn configuration */
37 mv2_alltoall_tuning_table **mv2_alltoall_thresholds_table = NULL; /* for each ppn configuration, pointer to its array of tuning tables */
40 #define MPIR_Alltoall_bruck_MV2 smpi_coll_tuned_alltoall_bruck /* MVAPICH2 algorithm names are aliased to smpi_coll_tuned_* functions */
41 #define MPIR_Alltoall_RD_MV2 smpi_coll_tuned_alltoall_rdb
42 #define MPIR_Alltoall_Scatter_dest_MV2 smpi_coll_tuned_alltoall_mvapich2_scatter_dest
43 #define MPIR_Alltoall_pairwise_MV2 smpi_coll_tuned_alltoall_pair
44 #define MPIR_Alltoall_inplace_MV2 smpi_coll_tuned_alltoall_ring /* the "inplace" name is served by the ring algorithm */
47 static void init_mv2_alltoall_tables_stampede(){
49 int agg_table_sum = 0;
50 mv2_alltoall_tuning_table **table_ptrs = NULL;
51 mv2_alltoall_num_ppn_conf = 3;
52 mv2_alltoall_thresholds_table = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
53 * mv2_alltoall_num_ppn_conf);
54 table_ptrs = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
55 * mv2_alltoall_num_ppn_conf);
56 mv2_size_alltoall_tuning_table = xbt_malloc(sizeof(int) *
57 mv2_alltoall_num_ppn_conf);
58 mv2_alltoall_table_ppn_conf = xbt_malloc(mv2_alltoall_num_ppn_conf * sizeof(int));
59 mv2_alltoall_table_ppn_conf[0] = 1;
60 mv2_size_alltoall_tuning_table[0] = 6;
61 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_1ppn[] = {
64 {{0, -1, &MPIR_Alltoall_pairwise_MV2},
67 {{0, -1, &MPIR_Alltoall_inplace_MV2},
73 {{0, 262144, &MPIR_Alltoall_Scatter_dest_MV2},
74 {262144, -1, &MPIR_Alltoall_pairwise_MV2},
77 {{0, -1, &MPIR_Alltoall_inplace_MV2},
83 {{0, 8, &MPIR_Alltoall_RD_MV2},
84 {8, -1, &MPIR_Alltoall_Scatter_dest_MV2},
87 {{0, -1, &MPIR_Alltoall_inplace_MV2},
93 {{0, 64, &MPIR_Alltoall_RD_MV2},
94 {64, 512, &MPIR_Alltoall_bruck_MV2},
95 {512, -1, &MPIR_Alltoall_Scatter_dest_MV2},
98 {{0,-1, &MPIR_Alltoall_inplace_MV2},
104 {{0, 32, &MPIR_Alltoall_RD_MV2},
105 {32, 2048, &MPIR_Alltoall_bruck_MV2},
106 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
109 {{0, -1, &MPIR_Alltoall_inplace_MV2},
115 {{0, 8, &MPIR_Alltoall_RD_MV2},
116 {8, 1024, &MPIR_Alltoall_bruck_MV2},
117 {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
120 {{0, -1, &MPIR_Alltoall_inplace_MV2},
124 table_ptrs[0] = mv2_tmp_alltoall_thresholds_table_1ppn;
125 mv2_alltoall_table_ppn_conf[1] = 2;
126 mv2_size_alltoall_tuning_table[1] = 6;
127 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_2ppn[] = {
130 {{0, 32, &MPIR_Alltoall_RD_MV2},
131 {32, -1, &MPIR_Alltoall_Scatter_dest_MV2},
134 {{0, -1, &MPIR_Alltoall_inplace_MV2},
140 {{0, 64, &MPIR_Alltoall_RD_MV2},
141 {64, -1, &MPIR_Alltoall_Scatter_dest_MV2},
144 {{0, -1, &MPIR_Alltoall_inplace_MV2},
150 {{0, 64, &MPIR_Alltoall_RD_MV2},
151 {64, 2048, &MPIR_Alltoall_bruck_MV2},
152 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
155 {{0,-1, &MPIR_Alltoall_inplace_MV2},
161 {{0, 16, &MPIR_Alltoall_RD_MV2},
162 {16, 2048, &MPIR_Alltoall_bruck_MV2},
163 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
166 {{0, -1, &MPIR_Alltoall_inplace_MV2},
172 {{0, 8, &MPIR_Alltoall_RD_MV2},
173 {8, 1024, &MPIR_Alltoall_bruck_MV2},
174 {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
177 {{0, -1, &MPIR_Alltoall_inplace_MV2},
183 {{0, 4, &MPIR_Alltoall_RD_MV2},
184 {4, 2048, &MPIR_Alltoall_bruck_MV2},
185 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
188 {{0, -1, &MPIR_Alltoall_inplace_MV2},
192 table_ptrs[1] = mv2_tmp_alltoall_thresholds_table_2ppn;
193 mv2_alltoall_table_ppn_conf[2] = 16;
194 mv2_size_alltoall_tuning_table[2] = 7;
195 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_16ppn[] = {
198 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
199 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
202 {{32768, -1, &MPIR_Alltoall_inplace_MV2},
208 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
209 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
212 {{16384, -1, &MPIR_Alltoall_inplace_MV2},
218 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
219 {2048, 16384, &MPIR_Alltoall_Scatter_dest_MV2},
220 {16384, -1, &MPIR_Alltoall_pairwise_MV2},
223 {{32768, 131072, &MPIR_Alltoall_inplace_MV2},
229 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
230 {2048, -1, &MPIR_Alltoall_pairwise_MV2},
233 {{16384,65536, &MPIR_Alltoall_inplace_MV2},
239 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
240 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
243 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
249 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
250 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
253 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
258 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
259 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
262 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
267 table_ptrs[2] = mv2_tmp_alltoall_thresholds_table_16ppn;
269 for (i = 0; i < mv2_alltoall_num_ppn_conf; i++) {
270 agg_table_sum += mv2_size_alltoall_tuning_table[i];
272 mv2_alltoall_thresholds_table[0] =
273 xbt_malloc(agg_table_sum * sizeof (mv2_alltoall_tuning_table));
274 memcpy(mv2_alltoall_thresholds_table[0], table_ptrs[0],
275 (sizeof(mv2_alltoall_tuning_table)
276 * mv2_size_alltoall_tuning_table[0]));
277 for (i = 1; i < mv2_alltoall_num_ppn_conf; i++) {
278 mv2_alltoall_thresholds_table[i] =
279 mv2_alltoall_thresholds_table[i - 1]
280 + mv2_size_alltoall_tuning_table[i - 1];
281 memcpy(mv2_alltoall_thresholds_table[i], table_ptrs[i],
282 (sizeof(mv2_alltoall_tuning_table)
283 * mv2_size_alltoall_tuning_table[i]));
285 xbt_free(table_ptrs);
291 /************ Allgather variables and initializers */
296 int (*MV2_pt_Allgather_function)(void *sendbuf,
298 MPI_Datatype sendtype,
301 MPI_Datatype recvtype, MPI_Comm comm_ptr);
302 } mv2_allgather_tuning_element;
306 int two_level[MV2_MAX_NB_THRESHOLDS];
307 int size_inter_table;
308 mv2_allgather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
309 } mv2_allgather_tuning_table;
311 int (*MV2_Allgather_function)(void *sendbuf,
313 MPI_Datatype sendtype,
316 MPI_Datatype recvtype, MPI_Comm comm);
318 int *mv2_allgather_table_ppn_conf = NULL; /* processes-per-node (ppn) value of each allgather tuning configuration */
319 int mv2_allgather_num_ppn_conf = 1; /* total number of ppn configurations */
320 int *mv2_size_allgather_tuning_table = NULL; /* number of tuning-table entries for each ppn configuration */
321 mv2_allgather_tuning_table **mv2_allgather_thresholds_table = NULL; /* for each ppn configuration, pointer to its array of tuning tables */
323 static int MPIR_Allgather_RD_Allgather_Comm_MV2( void *sendbuf,
325 MPI_Datatype sendtype,
328 MPI_Datatype recvtype, MPI_Comm comm_ptr)
333 #define MPIR_Allgather_Bruck_MV2 smpi_coll_tuned_allgather_bruck
334 #define MPIR_Allgather_RD_MV2 smpi_coll_tuned_allgather_rdb
335 #define MPIR_Allgather_Ring_MV2 smpi_coll_tuned_allgather_ring
336 #define MPIR_2lvl_Allgather_MV2 smpi_coll_tuned_allgather_mvapich2_smp
338 static void init_mv2_allgather_tables_stampede(){
340 int agg_table_sum = 0;
341 mv2_allgather_tuning_table **table_ptrs = NULL;
342 mv2_allgather_num_ppn_conf = 3;
343 mv2_allgather_thresholds_table
344 = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
345 * mv2_allgather_num_ppn_conf);
346 table_ptrs = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
347 * mv2_allgather_num_ppn_conf);
348 mv2_size_allgather_tuning_table = xbt_malloc(sizeof(int) *
349 mv2_allgather_num_ppn_conf);
350 mv2_allgather_table_ppn_conf
351 = xbt_malloc(mv2_allgather_num_ppn_conf * sizeof(int));
352 mv2_allgather_table_ppn_conf[0] = 1;
353 mv2_size_allgather_tuning_table[0] = 6;
354 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_1ppn[] = {
360 {0, -1, &MPIR_Allgather_Ring_MV2},
368 {0, 262144, &MPIR_Allgather_RD_MV2},
369 {262144, -1, &MPIR_Allgather_Ring_MV2},
377 {0, 131072, &MPIR_Allgather_RD_MV2},
378 {131072, -1, &MPIR_Allgather_Ring_MV2},
386 {0, 131072, &MPIR_Allgather_RD_MV2},
387 {131072, -1, &MPIR_Allgather_Ring_MV2},
395 {0, 65536, &MPIR_Allgather_RD_MV2},
396 {65536, -1, &MPIR_Allgather_Ring_MV2},
404 {0, 32768, &MPIR_Allgather_RD_MV2},
405 {32768, -1, &MPIR_Allgather_Ring_MV2},
409 table_ptrs[0] = mv2_tmp_allgather_thresholds_table_1ppn;
410 mv2_allgather_table_ppn_conf[1] = 2;
411 mv2_size_allgather_tuning_table[1] = 6;
412 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_2ppn[] = {
418 {0, 524288, &MPIR_Allgather_RD_MV2},
419 {524288, -1, &MPIR_Allgather_Ring_MV2},
427 {0, 32768, &MPIR_Allgather_RD_MV2},
428 {32768, 524288, &MPIR_Allgather_Ring_MV2},
429 {524288, -1, &MPIR_Allgather_Ring_MV2},
437 {0, 16384, &MPIR_Allgather_RD_MV2},
438 {16384, 524288, &MPIR_Allgather_Ring_MV2},
439 {524288, -1, &MPIR_Allgather_Ring_MV2},
447 {0, 65536, &MPIR_Allgather_RD_MV2},
448 {65536, 524288, &MPIR_Allgather_Ring_MV2},
449 {524288, -1, &MPIR_Allgather_Ring_MV2},
457 {0, 32768, &MPIR_Allgather_RD_MV2},
458 {32768, 524288, &MPIR_Allgather_Ring_MV2},
459 {524288, -1, &MPIR_Allgather_Ring_MV2},
467 {0, 65536, &MPIR_Allgather_RD_MV2},
468 {65536, 524288, &MPIR_Allgather_Ring_MV2},
469 {524288, -1, &MPIR_Allgather_Ring_MV2},
473 table_ptrs[1] = mv2_tmp_allgather_thresholds_table_2ppn;
474 mv2_allgather_table_ppn_conf[2] = 16;
475 mv2_size_allgather_tuning_table[2] = 6;
476 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_16ppn[] = {
482 {0, 1024, &MPIR_Allgather_RD_MV2},
483 {1024, -1, &MPIR_Allgather_Ring_MV2},
491 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
492 {1024, -1, &MPIR_Allgather_Ring_MV2},
500 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
501 {1024, -1, &MPIR_Allgather_Ring_MV2},
509 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
510 {1024, -1, &MPIR_Allgather_Ring_MV2},
518 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
519 {1024, -1, &MPIR_Allgather_Ring_MV2},
527 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
528 {1024, -1, &MPIR_Allgather_Ring_MV2},
533 table_ptrs[2] = mv2_tmp_allgather_thresholds_table_16ppn;
535 for (i = 0; i < mv2_allgather_num_ppn_conf; i++) {
536 agg_table_sum += mv2_size_allgather_tuning_table[i];
538 mv2_allgather_thresholds_table[0] =
539 xbt_malloc(agg_table_sum * sizeof (mv2_allgather_tuning_table));
540 memcpy(mv2_allgather_thresholds_table[0], table_ptrs[0],
541 (sizeof(mv2_allgather_tuning_table)
542 * mv2_size_allgather_tuning_table[0]));
543 for (i = 1; i < mv2_allgather_num_ppn_conf; i++) {
544 mv2_allgather_thresholds_table[i] =
545 mv2_allgather_thresholds_table[i - 1]
546 + mv2_size_allgather_tuning_table[i - 1];
547 memcpy(mv2_allgather_thresholds_table[i], table_ptrs[i],
548 (sizeof(mv2_allgather_tuning_table)
549 * mv2_size_allgather_tuning_table[i]));
551 xbt_free(table_ptrs);
555 /************ Gather variables and initializers */
560 int (*MV2_pt_Gather_function)(void *sendbuf, int sendcnt,
561 MPI_Datatype sendtype, void *recvbuf, int recvcnt,
562 MPI_Datatype recvtype, int root, MPI_Comm comm_ptr);
563 } mv2_gather_tuning_element;
568 int size_inter_table;
569 mv2_gather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
570 int size_intra_table;
571 mv2_gather_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
572 } mv2_gather_tuning_table;
574 int mv2_size_gather_tuning_table=7;
575 mv2_gather_tuning_table * mv2_gather_thresholds_table=NULL;
577 typedef int (*MV2_Gather_function_ptr) (void *sendbuf,
579 MPI_Datatype sendtype,
582 MPI_Datatype recvtype,
583 int root, MPI_Comm comm);
585 MV2_Gather_function_ptr MV2_Gather_inter_leader_function = NULL;
586 MV2_Gather_function_ptr MV2_Gather_intra_node_function = NULL;
589 #define MPIR_Gather_MV2_Direct smpi_coll_tuned_gather_ompi_basic_linear /* MVAPICH2 gather names are aliased to smpi_coll_tuned_* functions */
590 #define MPIR_Gather_MV2_two_level_Direct smpi_coll_tuned_gather_mvapich2_two_level
591 #define MPIR_Gather_intra smpi_coll_tuned_gather_mpich /* aliased to the mpich gather function */
594 static void init_mv2_gather_tables_stampede(){
596 mv2_size_gather_tuning_table=7;
597 mv2_gather_thresholds_table = xbt_malloc(mv2_size_gather_tuning_table*
598 sizeof (mv2_gather_tuning_table));
599 mv2_gather_tuning_table mv2_tmp_gather_thresholds_table[]={
601 2,{{0, 524288, &MPIR_Gather_MV2_Direct},
602 {524288, -1, &MPIR_Gather_intra}},
603 1,{{0, -1, &MPIR_Gather_MV2_Direct}}},
605 3,{{0, 16384, &MPIR_Gather_MV2_Direct},
606 {16384, 131072, &MPIR_Gather_intra},
607 {131072, -1, &MPIR_Gather_MV2_two_level_Direct}},
608 1,{{0, -1, &MPIR_Gather_intra}}},
610 3,{{0, 256, &MPIR_Gather_MV2_two_level_Direct},
611 {256, 16384, &MPIR_Gather_MV2_Direct},
612 {256, -1, &MPIR_Gather_MV2_two_level_Direct}},
613 1,{{0, -1, &MPIR_Gather_intra}}},
615 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
616 {512, 16384, &MPIR_Gather_MV2_Direct},
617 {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
618 1,{{0, -1, &MPIR_Gather_intra}}},
620 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
621 {512, 16384, &MPIR_Gather_MV2_Direct},
622 {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
623 1,{{0, -1, &MPIR_Gather_intra}}},
625 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
626 {512, 16384, &MPIR_Gather_MV2_Direct},
627 {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
628 1,{{0, -1, &MPIR_Gather_intra}}},
630 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
631 {512, 16384, &MPIR_Gather_MV2_Direct},
632 {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
633 1,{{0, -1, &MPIR_Gather_intra}}},
636 memcpy(mv2_gather_thresholds_table, mv2_tmp_gather_thresholds_table,
637 mv2_size_gather_tuning_table * sizeof (mv2_gather_tuning_table));
642 /************ Allgatherv variables and initializers */
647 int (*MV2_pt_Allgatherv_function)(void *sendbuf,
649 MPI_Datatype sendtype,
653 MPI_Datatype recvtype,
655 } mv2_allgatherv_tuning_element;
659 int size_inter_table;
660 mv2_allgatherv_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
661 } mv2_allgatherv_tuning_table;
663 int (*MV2_Allgatherv_function)(void *sendbuf,
665 MPI_Datatype sendtype,
669 MPI_Datatype recvtype,
672 int mv2_size_allgatherv_tuning_table = 0;
673 mv2_allgatherv_tuning_table *mv2_allgatherv_thresholds_table = NULL;
675 #define MPIR_Allgatherv_Rec_Doubling_MV2 smpi_coll_tuned_allgatherv_mpich_rdb
676 #define MPIR_Allgatherv_Bruck_MV2 smpi_coll_tuned_allgatherv_ompi_bruck
677 #define MPIR_Allgatherv_Ring_MV2 smpi_coll_tuned_allgatherv_mpich_ring
680 static void init_mv2_allgatherv_tables_stampede(){
681 mv2_size_allgatherv_tuning_table = 6;
682 mv2_allgatherv_thresholds_table = xbt_malloc(mv2_size_allgatherv_tuning_table *
683 sizeof (mv2_allgatherv_tuning_table));
684 mv2_allgatherv_tuning_table mv2_tmp_allgatherv_thresholds_table[] = {
689 {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
690 {512, -1, &MPIR_Allgatherv_Ring_MV2},
697 {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
698 {512, -1, &MPIR_Allgatherv_Ring_MV2},
705 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
706 {256, -1, &MPIR_Allgatherv_Ring_MV2},
713 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
714 {256, -1, &MPIR_Allgatherv_Ring_MV2},
721 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
722 {256, -1, &MPIR_Allgatherv_Ring_MV2},
729 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
730 {256, -1, &MPIR_Allgatherv_Ring_MV2},
735 memcpy(mv2_allgatherv_thresholds_table, mv2_tmp_allgatherv_thresholds_table,
736 mv2_size_allgatherv_tuning_table * sizeof (mv2_allgatherv_tuning_table));
740 /************ Allreduce variables and initializers */
745 int (*MV2_pt_Allreduce_function)(void *sendbuf,
748 MPI_Datatype datatype,
749 MPI_Op op, MPI_Comm comm);
750 } mv2_allreduce_tuning_element;
755 int is_two_level_allreduce[MV2_MAX_NB_THRESHOLDS];
756 int size_inter_table;
757 mv2_allreduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
758 int size_intra_table;
759 mv2_allreduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
760 } mv2_allreduce_tuning_table;
763 int (*MV2_Allreduce_function)(void *sendbuf,
766 MPI_Datatype datatype,
767 MPI_Op op, MPI_Comm comm)=NULL;
770 int (*MV2_Allreduce_intra_function)( void *sendbuf,
773 MPI_Datatype datatype,
774 MPI_Op op, MPI_Comm comm)=NULL;
776 int mv2_size_allreduce_tuning_table = 0;
777 mv2_allreduce_tuning_table *mv2_allreduce_thresholds_table = NULL;
783 static int MPIR_Allreduce_mcst_reduce_two_level_helper_MV2( void *sendbuf,
786 MPI_Datatype datatype,
787 MPI_Op op, MPI_Comm comm)
792 static int MPIR_Allreduce_mcst_reduce_redscat_gather_MV2( void *sendbuf,
795 MPI_Datatype datatype,
796 MPI_Op op, MPI_Comm comm)
801 static int MPIR_Allreduce_reduce_p2p_MV2( void *sendbuf,
804 MPI_Datatype datatype,
805 MPI_Op op, MPI_Comm comm)
807 mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
811 static int MPIR_Allreduce_reduce_shmem_MV2( void *sendbuf,
814 MPI_Datatype datatype,
815 MPI_Op op, MPI_Comm comm)
817 mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
821 #define MPIR_Allreduce_pt2pt_rd_MV2 smpi_coll_tuned_allreduce_rdb
822 #define MPIR_Allreduce_pt2pt_rs_MV2 smpi_coll_tuned_allreduce_mvapich2_rs
823 #define MPIR_Allreduce_two_level_MV2 smpi_coll_tuned_allreduce_mvapich2_two_level
826 static void init_mv2_allreduce_tables_stampede(){
827 mv2_size_allreduce_tuning_table = 8;
828 mv2_allreduce_thresholds_table = xbt_malloc(mv2_size_allreduce_tuning_table *
829 sizeof (mv2_allreduce_tuning_table));
830 mv2_allreduce_tuning_table mv2_tmp_allreduce_thresholds_table[] = {
837 {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
838 {1024, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
842 {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
843 {1024, -1, &MPIR_Allreduce_reduce_p2p_MV2},
852 {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
853 {1024, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
854 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
858 {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
859 {1024, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
868 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
869 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
870 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
874 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
875 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
884 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
885 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
886 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
890 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
891 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
900 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
901 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
902 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
906 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
907 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
916 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
917 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
918 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
922 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
923 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
932 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
933 {512, 8192, &MPIR_Allreduce_pt2pt_rd_MV2},
934 {8192, 65536, &MPIR_Allreduce_pt2pt_rs_MV2},
935 {65536, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
939 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
940 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
949 {0, 64, &MPIR_Allreduce_pt2pt_rd_MV2},
950 {64, 512, &MPIR_Allreduce_reduce_p2p_MV2},
951 {512, 4096, &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2},
952 {4096, 16384, &MPIR_Allreduce_pt2pt_rs_MV2},
953 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
957 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
958 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
963 memcpy(mv2_allreduce_thresholds_table, mv2_tmp_allreduce_thresholds_table,
964 mv2_size_allreduce_tuning_table * sizeof (mv2_allreduce_tuning_table));
969 Bcast is deactivated for now and defaults to the MPICH one
973 int (*MV2_pt_Bcast_function) (void *buf, int count, MPI_Datatype datatype,
974 int root, MPI_Comm comm_ptr);
975 int zcpy_pipelined_knomial_factor;
976 } mv2_bcast_tuning_element;
980 int bcast_segment_size;
981 int intra_node_knomial_factor;
982 int inter_node_knomial_factor;
983 int is_two_level_bcast[MV2_MAX_NB_THRESHOLDS];
984 int size_inter_table;
985 mv2_bcast_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
986 int size_intra_table;
987 mv2_bcast_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
988 } mv2_bcast_tuning_table;
990 int mv2_size_bcast_tuning_table = 0;
991 mv2_bcast_tuning_table *mv2_bcast_thresholds_table = NULL;
994 int (*MV2_Bcast_function) (void *buffer, int count, MPI_Datatype datatype,
995 int root, MPI_Comm comm_ptr) = NULL;
997 int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype datatype,
998 int root, MPI_Comm comm_ptr) = NULL;
1005 static void init_mv2_bcast_tables_stampede(){
1007 mv2_size_bcast_tuning_table=8;
1008 mv2_bcast_thresholds_table = xbt_malloc(mv2_size_bcast_tuning_table *
1009 sizeof (mv2_bcast_tuning_table));
1011 mv2_bcast_tuning_table mv2_tmp_bcast_thresholds_table[]={
1015 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
1018 {0, 8, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1019 {8, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1020 {16, 1024, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1021 {1024, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1022 {8192, 16384, &MPIR_Bcast_binomial_MV2, -1},
1023 {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1024 {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1025 {65536, 131072, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1026 {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_MV2, -1},
1027 {262144, 524288, &MPIR_Bcast_scatter_doubling_allgather_MV2, -1},
1028 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1032 {0, 8, &MPIR_Shmem_Bcast_MV2, 2},
1033 {8, 16, &MPIR_Shmem_Bcast_MV2, 4},
1034 {16, 1024, &MPIR_Shmem_Bcast_MV2, 2},
1035 {1024, 8192, &MPIR_Shmem_Bcast_MV2, 4},
1036 {8192, 16384, &MPIR_Shmem_Bcast_MV2, -1},
1037 {16384, 32768, &MPIR_Shmem_Bcast_MV2, 4},
1038 {32768, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1039 {65536, 131072, &MPIR_Shmem_Bcast_MV2, -1},
1040 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1041 {262144, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1042 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1048 {1, 1, 1, 1, 1, 1, 1, 1},
1051 {0, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1052 {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1053 {256, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1054 {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1055 {65536, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1056 {131072, 262144, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1057 {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1058 {524288, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}
1062 {0, 128, &MPIR_Shmem_Bcast_MV2, 2},
1063 {128, 256, &MPIR_Shmem_Bcast_MV2, 4},
1064 {256, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1065 {32768, 65536, &MPIR_Shmem_Bcast_MV2, 4},
1066 {65536, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1067 {131072, 262144, &MPIR_Shmem_Bcast_MV2, 8},
1068 {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1069 {524288, -1, &MPIR_Shmem_Bcast_MV2, 8}
1075 {1, 1, 1, 1, 1, 1, 1, 1, 1},
1078 {0, 2, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1079 {2, 4, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1080 {4, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1081 {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1082 {32, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1083 {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1084 {256, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1085 {4096, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1086 {32768, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}
1090 {0, 2, &MPIR_Shmem_Bcast_MV2, 4},
1091 {2, 4, &MPIR_Shmem_Bcast_MV2, 8},
1092 {4, 16, &MPIR_Shmem_Bcast_MV2, 4},
1093 {16, 32, &MPIR_Shmem_Bcast_MV2, 8},
1094 {32, 128, &MPIR_Shmem_Bcast_MV2, 4},
1095 {128, 256, &MPIR_Shmem_Bcast_MV2, 8},
1096 {256, 4096, &MPIR_Shmem_Bcast_MV2, 4},
1097 {4096, 32768, &MPIR_Shmem_Bcast_MV2, 8},
1098 {32768, -1, &MPIR_Shmem_Bcast_MV2, 2}
1107 {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1108 {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1109 {16384, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1110 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1114 {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1115 {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1116 {16384, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1117 {524288, -1, NULL, -1}
1126 {0, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1127 {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1128 {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1129 {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1130 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1134 {0, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1135 {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1136 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1137 {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1138 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1147 {0, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1148 {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1149 {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1150 {131072, 262144, &MPIR_Pipelined_Bcast_MV2, -1},
1151 {262144, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1155 {0, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1156 {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1157 {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1158 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1159 {262144, -1, &MPIR_Shmem_Bcast_MV2, -1}
1168 {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1169 {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1170 {16384, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1171 {65536, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1172 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1176 {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1177 {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1178 {16384, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1179 {65536, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1180 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1186 {1, 1, 1, 1, 1, 1, 1},
1189 {0, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1190 {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1191 {32, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1192 {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1193 {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1194 {32768, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1195 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1199 {0, 16, &MPIR_Shmem_Bcast_MV2, 8},
1200 {16, 32, &MPIR_Shmem_Bcast_MV2, 4},
1201 {32, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1202 {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1203 {16384, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1204 {32768, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1205 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1210 memcpy(mv2_bcast_thresholds_table, mv2_tmp_bcast_thresholds_table,
1211 mv2_size_bcast_tuning_table * sizeof (mv2_bcast_tuning_table));
1215 /************ Reduce variables and initializers */
1220 int (*MV2_pt_Reduce_function)(void *sendbuf,
1223 MPI_Datatype datatype,
1227 } mv2_reduce_tuning_element;
1233 int is_two_level_reduce[MV2_MAX_NB_THRESHOLDS];
1234 int size_inter_table;
1235 mv2_reduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1236 int size_intra_table;
1237 mv2_reduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1238 } mv2_reduce_tuning_table;
1240 int mv2_size_reduce_tuning_table = 0;
1241 mv2_reduce_tuning_table *mv2_reduce_thresholds_table = NULL;
1244 int mv2_reduce_intra_knomial_factor = 2;
1245 int mv2_reduce_inter_knomial_factor = 2;
1247 int (*MV2_Reduce_function)( void *sendbuf,
1250 MPI_Datatype datatype,
1253 MPI_Comm comm_ptr)=NULL;
1255 int (*MV2_Reduce_intra_function)( void *sendbuf,
1258 MPI_Datatype datatype,
1261 MPI_Comm comm_ptr)=NULL;
1264 #define MPIR_Reduce_inter_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial /* inter- and intra-node knomial wrappers alias the same function */
1265 #define MPIR_Reduce_intra_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
1266 #define MPIR_Reduce_binomial_MV2 smpi_coll_tuned_reduce_binomial
1267 #define MPIR_Reduce_redscat_gather_MV2 smpi_coll_tuned_reduce_scatter_gather
1268 #define MPIR_Reduce_shmem_MV2 smpi_coll_tuned_reduce_ompi_basic_linear /* the "shmem" name is aliased to the ompi basic linear reduce */
1269 #define MPIR_Reduce_two_level_helper_MV2 smpi_coll_tuned_reduce_mvapich2_two_level
1272 static void init_mv2_reduce_tables_stampede(){
1274 mv2_size_reduce_tuning_table = 8;
1275 mv2_reduce_thresholds_table = xbt_malloc(mv2_size_reduce_tuning_table *
1276 sizeof (mv2_reduce_tuning_table));
1277 mv2_reduce_tuning_table mv2_tmp_reduce_thresholds_table[] = {
1285 {0, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1286 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1287 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1291 {0, 65536, &MPIR_Reduce_shmem_MV2},
1292 {65536,-1, &MPIR_Reduce_binomial_MV2},
1299 {1, 1, 1, 1, 0, 0, 0},
1302 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1303 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1304 {16384, 32768, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1305 {32768, 65536, &MPIR_Reduce_binomial_MV2},
1306 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1307 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1308 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1312 {0, 8192, &MPIR_Reduce_shmem_MV2},
1313 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1314 {16384, 32768, &MPIR_Reduce_shmem_MV2},
1315 {32768, 65536, &MPIR_Reduce_shmem_MV2},
1316 {65536, 262144, &MPIR_Reduce_shmem_MV2},
1317 {262144,-1, &MPIR_Reduce_binomial_MV2},
1327 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1328 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1329 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1330 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1331 {262144, -1, &MPIR_Reduce_redscat_gather_MV2},
1335 {0, 8192, &MPIR_Reduce_shmem_MV2},
1336 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1337 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1338 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1339 {262144, -1, &MPIR_Reduce_binomial_MV2},
1349 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1350 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1351 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1352 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1353 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1354 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1358 {0, 8192, &MPIR_Reduce_shmem_MV2},
1359 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1360 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1361 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1362 {262144, -1, &MPIR_Reduce_binomial_MV2},
1369 {1, 1, 1, 0, 1, 1, 0},
1372 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1373 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1374 {16384, 32768, &MPIR_Reduce_binomial_MV2},
1375 {32768, 65536, &MPIR_Reduce_binomial_MV2},
1376 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1377 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1378 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1382 {0, 8192, &MPIR_Reduce_shmem_MV2},
1383 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1384 {16384, 32768, &MPIR_Reduce_shmem_MV2},
1385 {32768, 65536, &MPIR_Reduce_shmem_MV2},
1386 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1387 {262144, -1, &MPIR_Reduce_binomial_MV2},
1397 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1398 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1399 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1400 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1401 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1402 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1406 {0, 8192, &MPIR_Reduce_shmem_MV2},
1407 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1408 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1409 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1410 {262144, -1, &MPIR_Reduce_binomial_MV2},
1420 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1421 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1422 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1423 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1424 {262144, -1, &MPIR_Reduce_binomial_MV2},
1428 {0, 8192, &MPIR_Reduce_shmem_MV2},
1429 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1430 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1431 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1432 {262144, -1, &MPIR_Reduce_binomial_MV2},
1442 {0, 2048, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1443 {2048, 4096, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1444 {4096, 16384, &MPIR_Reduce_binomial_MV2},
1445 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1446 {65536, 131072, &MPIR_Reduce_binomial_MV2},
1447 {131072, -1, &MPIR_Reduce_binomial_MV2},
1451 {0, 2048, &MPIR_Reduce_shmem_MV2},
1452 {2048, 4096, &MPIR_Reduce_shmem_MV2},
1453 {4096, 16384, &MPIR_Reduce_shmem_MV2},
1454 {16384, 65536, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1455 {65536, 131072, &MPIR_Reduce_binomial_MV2},
1456 {131072, -1, &MPIR_Reduce_shmem_MV2},
1461 memcpy(mv2_reduce_thresholds_table, mv2_tmp_reduce_thresholds_table,
1462 mv2_size_reduce_tuning_table * sizeof (mv2_reduce_tuning_table));
1465 /************ Reduce scatter variables and initializers */
/* One reduce-scatter tuning entry; it carries MV2_pt_Red_scat_function, a
 * pointer to the implementation associated with the entry.
 * NOTE(review): the table initializers in this file are written as
 * {number, number, &function}, so the entry presumably also stores a lower
 * and an upper threshold -- confirm against the complete declaration. */
1470 int (*MV2_pt_Red_scat_function)(void *sendbuf,
1473 MPI_Datatype datatype,
1476 } mv2_red_scat_tuning_element;
/* One reduce-scatter tuning table: size_inter_table is stored next to
 * inter_leader, an array with room for MV2_MAX_NB_THRESHOLDS tuning entries.
 * NOTE(review): size_inter_table presumably holds the number of inter_leader
 * slots actually filled -- confirm against the selection code. */
1480 int size_inter_table;
1481 mv2_red_scat_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1482 } mv2_red_scat_tuning_table;
/* Number of tables stored in mv2_red_scat_thresholds_table; 0 until
 * init_mv2_reduce_scatter_tables_stampede() assigns it. */
1484 int mv2_size_red_scat_tuning_table = 0;
/* Array of reduce-scatter tuning tables; NULL until
 * init_mv2_reduce_scatter_tables_stampede() allocates it with xbt_malloc. */
1485 mv2_red_scat_tuning_table *mv2_red_scat_thresholds_table = NULL;
/* File-scope pointer to a reduce-scatter implementation.
 * NOTE(review): presumably assigned by the algorithm selector from one of the
 * tuning entries before the collective is run -- confirm against the caller. */
1488 int (*MV2_Red_scat_function)(void *sendbuf,
1491 MPI_Datatype datatype,
/* "Basic" reduce-scatter entry used by the tuning tables: it forwards sendbuf,
 * recvbuf, recvcnts, datatype, op and comm unchanged to
 * smpi_mpi_reduce_scatter().
 * NOTE(review): the returned status is not derived from that call as far as
 * can be told here -- confirm what callers may rely on. */
1497 static int MPIR_Reduce_Scatter_Basic_MV2(void *sendbuf,
1500 MPI_Datatype datatype,
1504 smpi_mpi_reduce_scatter(sendbuf,recvbuf,recvcnts,datatype,op,comm);
/* Aliases that let the tuning tables keep the MVAPICH2 algorithm names while
 * resolving to the reduce-scatter implementations available in SMPI. */
1507 #define MPIR_Reduce_scatter_non_comm_MV2 smpi_coll_tuned_reduce_scatter_mpich_noncomm
1508 #define MPIR_Reduce_scatter_Rec_Halving_MV2 smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving
1509 #define MPIR_Reduce_scatter_Pair_Wise_MV2 smpi_coll_tuned_reduce_scatter_mpich_pair
/* Fill mv2_red_scat_thresholds_table with the six reduce-scatter tuning tables
 * of the Stampede calibration. The tables are written as a local array
 * initializer and then copied into a buffer obtained from xbt_malloc; this
 * function does not release that buffer. */
1514 static void init_mv2_reduce_scatter_tables_stampede(){
1515 mv2_size_red_scat_tuning_table = 6;
1516 mv2_red_scat_thresholds_table = xbt_malloc(mv2_size_red_scat_tuning_table *
1517 sizeof (mv2_red_scat_tuning_table));
/* Each {a, b, &f} triple below names implementation f for thresholds a..b.
 * NOTE(review): -1 as the second value presumably means "no upper limit", and
 * the thresholds are presumably message sizes -- confirm against the selector. */
1518 mv2_red_scat_tuning_table mv2_tmp_red_scat_thresholds_table[] = {
/* Tables 1-3: Basic, then recursive halving, then pairwise; only the two
 * switch-over thresholds differ between these tables. */
1523 {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1524 {64, 65536, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1525 {65536, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1532 {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1533 {64, 131072, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1534 {131072, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1541 {0, 1024, &MPIR_Reduce_Scatter_Basic_MV2},
1542 {1024, 262144, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1543 {262144, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
/* Tables 4-6: Basic, then recursive halving for everything above the single
 * threshold; the pairwise implementation is not used. */
1550 {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1551 {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1558 {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1559 {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1566 {0, 256, &MPIR_Reduce_Scatter_Basic_MV2},
1567 {256, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
/* Copy the local initializer into the xbt_malloc'ed table; the byte count is
 * derived from mv2_size_red_scat_tuning_table, so that value must match the
 * number of tables written above. */
1572 memcpy(mv2_red_scat_thresholds_table, mv2_tmp_red_scat_thresholds_table,
1573 mv2_size_red_scat_tuning_table * sizeof (mv2_red_scat_tuning_table));
1576 /************ Scatter variables and initializers */
/* One scatter tuning entry; it carries MV2_pt_Scatter_function, a pointer to
 * the implementation associated with the entry. The pointed-to function takes
 * (among others) the send and receive datatypes, the root and the communicator.
 * NOTE(review): the table initializers in this file are written as
 * {number, number, &function}, so the entry presumably also stores a lower
 * and an upper threshold -- confirm against the complete declaration. */
1581 int (*MV2_pt_Scatter_function)(void *sendbuf,
1583 MPI_Datatype sendtype,
1586 MPI_Datatype recvtype,
1587 int root, MPI_Comm comm);
1588 } mv2_scatter_tuning_element;
/* One scatter tuning table with two lists of tuning entries, inter_leader and
 * intra_node, each with room for MV2_MAX_NB_THRESHOLDS entries and each
 * preceded by its own size field.
 * NOTE(review): the size fields presumably hold the number of slots actually
 * filled in the corresponding array -- confirm against the selection code. */
1592 int size_inter_table;
1593 mv2_scatter_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1594 int size_intra_table;
1595 mv2_scatter_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1596 } mv2_scatter_tuning_table;
/* Processes-per-node value of each configuration; allocated and filled by
 * init_mv2_scatter_tables_stampede(). */
1599 int *mv2_scatter_table_ppn_conf = NULL;
/* Number of configurations; init_mv2_scatter_tables_stampede() overwrites the
 * default of 1. */
1600 int mv2_scatter_num_ppn_conf = 1;
/* Number of tuning tables in each configuration, indexed like
 * mv2_scatter_table_ppn_conf. */
1601 int *mv2_size_scatter_tuning_table = NULL;
/* mv2_scatter_thresholds_table[k] is the first tuning table of configuration k. */
1602 mv2_scatter_tuning_table **mv2_scatter_thresholds_table = NULL;
/* File-scope pointer to a scatter implementation, NULL until assigned.
 * NOTE(review): presumably set by the algorithm selector from an inter_leader
 * tuning entry -- confirm against the caller. */
1604 int (*MV2_Scatter_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1605 void *recvbuf, int recvcount, MPI_Datatype recvtype,
1606 int root, MPI_Comm comm)=NULL;
/* Second scatter function pointer with the same signature, NULL until assigned.
 * NOTE(review): presumably set from an intra_node tuning entry -- confirm. */
1608 int (*MV2_Scatter_intra_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1609 void *recvbuf, int recvcount, MPI_Datatype recvtype,
1610 int root, MPI_Comm comm)=NULL;
/* Prototype of MPIR_Scatter_mcst_wrap_MV2, immediately followed by its
 * definition. The function has external linkage and the scatter signature used
 * by the tuning entries; its address appears in the 16-ppn tables of
 * init_mv2_scatter_tables_stampede().
 * NOTE(review): "mcst" presumably stands for multicast -- confirm what the
 * body actually does before relying on this entry being selected. */
1611 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1613 MPI_Datatype sendtype,
1616 MPI_Datatype recvtype,
1617 int root, MPI_Comm comm_ptr);
1619 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1621 MPI_Datatype sendtype,
1624 MPI_Datatype recvtype,
1625 int root, MPI_Comm comm_ptr)
/* Aliases that let the tuning tables keep the MVAPICH2 algorithm names while
 * resolving to scatter implementations available in SMPI. The two_level
 * names expand to the same functions as their plain counterparts, so table
 * entries that differ only in "two_level" select identical code. */
1630 #define MPIR_Scatter_MV2_Binomial smpi_coll_tuned_scatter_ompi_binomial
1631 #define MPIR_Scatter_MV2_Direct smpi_coll_tuned_scatter_ompi_basic_linear
1632 #define MPIR_Scatter_MV2_two_level_Binomial smpi_coll_tuned_scatter_ompi_binomial
1633 #define MPIR_Scatter_MV2_two_level_Direct smpi_coll_tuned_scatter_ompi_basic_linear
/* Build the scatter tuning tables of the Stampede calibration.
 *
 * Three configurations are set up (processes-per-node values 1, 2 and 16).
 * Each configuration is written as a local array initializer; all of them are
 * then copied back to back into one buffer obtained from xbt_malloc, and
 * mv2_scatter_thresholds_table[k] is pointed at the first table of
 * configuration k inside that buffer.
 *
 * Each {a, b, &f} triple names implementation f for thresholds a..b.
 * NOTE(review): -1 as the second value presumably means "no upper limit", and
 * the thresholds are presumably message sizes -- confirm against the selector. */
1638 static void init_mv2_scatter_tables_stampede(){
/* Running total of tuning tables over all configurations. */
1640 int agg_table_sum = 0;
/* table_ptrs[k] temporarily refers to the local initializer of configuration k. */
1642 mv2_scatter_tuning_table **table_ptrs = NULL;
1643 mv2_scatter_num_ppn_conf = 3;
1644 mv2_scatter_thresholds_table
1645 = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1646 * mv2_scatter_num_ppn_conf);
1647 table_ptrs = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1648 * mv2_scatter_num_ppn_conf);
1649 mv2_size_scatter_tuning_table = xbt_malloc(sizeof(int) *
1650 mv2_scatter_num_ppn_conf);
1651 mv2_scatter_table_ppn_conf
1652 = xbt_malloc(mv2_scatter_num_ppn_conf * sizeof(int));
/* Configuration 0: 1 process per node, 6 tuning tables. */
1653 mv2_scatter_table_ppn_conf[0] = 1;
1654 mv2_size_scatter_tuning_table[0] = 6;
1655 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_1ppn[] = {
1659 {0, -1, &MPIR_Scatter_MV2_Binomial},
1663 {0, -1, &MPIR_Scatter_MV2_Binomial},
1670 {0, -1, &MPIR_Scatter_MV2_Direct},
1674 {0, -1, &MPIR_Scatter_MV2_Direct},
1681 {0, -1, &MPIR_Scatter_MV2_Direct},
1685 {0, -1, &MPIR_Scatter_MV2_Direct},
1692 {0, -1, &MPIR_Scatter_MV2_Direct},
1696 {0, -1, &MPIR_Scatter_MV2_Direct},
1703 {0, -1, &MPIR_Scatter_MV2_Direct},
1707 {0, -1, &MPIR_Scatter_MV2_Direct},
1714 {0, 32, &MPIR_Scatter_MV2_Binomial},
1715 {32, -1, &MPIR_Scatter_MV2_Direct},
1719 {0, -1, &MPIR_Scatter_MV2_Binomial},
/* The local array only lives until this function returns; its contents are
 * copied into the xbt_malloc'ed buffer further down. */
1723 table_ptrs[0] = mv2_tmp_scatter_thresholds_table_1ppn;
/* Configuration 1: 2 processes per node, 6 tuning tables. */
1724 mv2_scatter_table_ppn_conf[1] = 2;
1725 mv2_size_scatter_tuning_table[1] = 6;
1726 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_2ppn[] = {
1730 {0, 4096, &MPIR_Scatter_MV2_Binomial},
1731 {4096, -1, &MPIR_Scatter_MV2_Direct},
1735 {0, -1, &MPIR_Scatter_MV2_Direct},
1742 {0, 512, &MPIR_Scatter_MV2_two_level_Direct},
1743 {512, -1, &MPIR_Scatter_MV2_Direct},
1747 {0, -1, &MPIR_Scatter_MV2_Binomial},
1754 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1755 {2048, -1, &MPIR_Scatter_MV2_Direct},
1759 {0, -1, &MPIR_Scatter_MV2_Binomial},
1766 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1767 {2048, -1, &MPIR_Scatter_MV2_Direct},
1771 {0, -1, &MPIR_Scatter_MV2_Binomial},
1778 {0, 8192, &MPIR_Scatter_MV2_two_level_Direct},
1779 {8192, -1, &MPIR_Scatter_MV2_Direct},
1783 {0, -1, &MPIR_Scatter_MV2_Binomial},
1790 {0, 16, &MPIR_Scatter_MV2_Binomial},
1791 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1792 {128, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1793 {16384, -1, &MPIR_Scatter_MV2_Direct},
1797 {0, 128, &MPIR_Scatter_MV2_Direct},
1798 {128, -1, &MPIR_Scatter_MV2_Binomial},
1802 table_ptrs[1] = mv2_tmp_scatter_thresholds_table_2ppn;
/* Configuration 2: 16 processes per node, 8 tuning tables. */
1803 mv2_scatter_table_ppn_conf[2] = 16;
1804 mv2_size_scatter_tuning_table[2] = 8;
1805 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_16ppn[] = {
1810 {0, 256, &MPIR_Scatter_MV2_Binomial},
1811 {256, -1, &MPIR_Scatter_MV2_Direct},
1815 { 0, -1, &MPIR_Scatter_MV2_Direct},
1823 {0, 512, &MPIR_Scatter_MV2_Binomial},
1824 {512, -1, &MPIR_Scatter_MV2_Direct},
1828 { 0, -1, &MPIR_Scatter_MV2_Direct},
1836 {0, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1837 {1024, -1, &MPIR_Scatter_MV2_Direct},
1841 { 0, -1, &MPIR_Scatter_MV2_Direct},
/* NOTE(review): from here on several lists start with two entries covering
 * the same 0..16 thresholds (the mcst wrapper first, then a second
 * implementation) -- presumably the second one is the fallback when the
 * wrapper is not applicable; confirm against the selection code. */
1849 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1850 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1851 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1852 {2048, -1, &MPIR_Scatter_MV2_Direct},
1856 { 0, -1, &MPIR_Scatter_MV2_Direct},
1864 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1865 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1866 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1867 {2048, -1, &MPIR_Scatter_MV2_Direct},
1871 { 0, -1, &MPIR_Scatter_MV2_Direct},
1879 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
/* NOTE(review): this entry reads {16, 16, ...} where the sibling lists use
 * {0, 16, ...} for their second entry -- verify against the upstream MVAPICH2
 * tuning data whether this is intended. */
1880 {16, 16, &MPIR_Scatter_MV2_two_level_Direct},
1881 {16, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1882 {4096, -1, &MPIR_Scatter_MV2_Direct},
1886 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1893 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1894 {0, 16, &MPIR_Scatter_MV2_Binomial},
1895 {16, 32, &MPIR_Scatter_MV2_Binomial},
1896 {32, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1897 {4096, -1, &MPIR_Scatter_MV2_Direct},
1901 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1908 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1909 {0, 16, &MPIR_Scatter_MV2_two_level_Binomial},
1910 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1911 {128, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1912 {1024, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1913 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1914 {65536, -1, &MPIR_Scatter_MV2_two_level_Direct},
1918 {0, 16, &MPIR_Scatter_MV2_Binomial},
1919 {16, 128, &MPIR_Scatter_MV2_Binomial},
1920 {128, 1024, &MPIR_Scatter_MV2_Binomial},
1921 {1024, 16384, &MPIR_Scatter_MV2_Direct},
1922 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1923 {65536, -1, &MPIR_Scatter_MV2_Direct},
1927 table_ptrs[2] = mv2_tmp_scatter_thresholds_table_16ppn;
/* Total number of tuning tables, used to size the single shared buffer. */
1929 for (i = 0; i < mv2_scatter_num_ppn_conf; i++) {
1930 agg_table_sum += mv2_size_scatter_tuning_table[i];
/* Only entry [0] owns the allocation; later entries point into the same
 * buffer, so releasing the tables means freeing
 * mv2_scatter_thresholds_table[0] exactly once. */
1932 mv2_scatter_thresholds_table[0] =
1933 xbt_malloc(agg_table_sum * sizeof (mv2_scatter_tuning_table));
1934 memcpy(mv2_scatter_thresholds_table[0], table_ptrs[0],
1935 (sizeof(mv2_scatter_tuning_table)
1936 * mv2_size_scatter_tuning_table[0]));
/* Configuration i starts right after the last table of configuration i - 1. */
1937 for (i = 1; i < mv2_scatter_num_ppn_conf; i++) {
1938 mv2_scatter_thresholds_table[i] =
1939 mv2_scatter_thresholds_table[i - 1]
1940 + mv2_size_scatter_tuning_table[i - 1];
1941 memcpy(mv2_scatter_thresholds_table[i], table_ptrs[i],
1942 (sizeof(mv2_scatter_tuning_table)
1943 * mv2_size_scatter_tuning_table[i]));
/* Releases only the temporary pointer array; the tables it referred to were
 * local arrays and have already been copied. */
1945 xbt_free(table_ptrs);