1 /* Selector for collective algorithms based on the MVAPICH decision logic, with calibration from the Stampede cluster at TACC. */
3 /* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team.
4 * All rights reserved. */
6 /* This is the tuning used by MVAPICH for the Stampede platform, based on (MV2_ARCH_INTEL_XEON_E5_2680_16, MV2_HCA_MLX_CX_FDR). */
10 /************ Alltoall variables and initializers */
/* Capacity of every fixed-size threshold array declared in the tuning-table structs of this file. */
12 #define MV2_MAX_NB_THRESHOLDS 32
/* Function pointer to the alltoall implementation attached to one tuning element. */
16 int (*MV2_pt_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
17 void *recvbuf, int recvcount, MPI_Datatype recvtype,
19 } mv2_alltoall_tuning_element;
/* Each alltoall tuning table carries two element arrays of MV2_MAX_NB_THRESHOLDS slots.
 * NOTE(review): going by the field names, the second one is used for in-place calls -
 * confirm against the selector. */
24 mv2_alltoall_tuning_element algo_table[MV2_MAX_NB_THRESHOLDS];
25 mv2_alltoall_tuning_element in_place_algo_table[MV2_MAX_NB_THRESHOLDS];
26 } mv2_alltoall_tuning_table;
/* Alltoall function pointer, initialised to NULL (presumably assigned by the selector - confirm). */
28 int (*MV2_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm_ptr)=NULL;
30 /* Indicates number of processes per node */
31 int *mv2_alltoall_table_ppn_conf = NULL;
32 /* Indicates total number of configurations */
33 int mv2_alltoall_num_ppn_conf = 1;
/* Number of tables per ppn configuration; allocated and filled by init_mv2_alltoall_tables_stampede(). */
34 int *mv2_size_alltoall_tuning_table = NULL;
/* One pointer per ppn configuration to its tables; allocated and filled by init_mv2_alltoall_tables_stampede(). */
35 mv2_alltoall_tuning_table **mv2_alltoall_thresholds_table = NULL;
/* Bind the MVAPICH2 algorithm names used in the tables to smpi_coll_tuned_* functions.
 * Note that both Scatter_dest and inplace resolve to the ring alltoall. */
38 #define MPIR_Alltoall_bruck_MV2 smpi_coll_tuned_alltoall_bruck
39 #define MPIR_Alltoall_RD_MV2 smpi_coll_tuned_alltoall_rdb
40 #define MPIR_Alltoall_Scatter_dest_MV2 smpi_coll_tuned_alltoall_ring
41 #define MPIR_Alltoall_pairwise_MV2 smpi_coll_tuned_alltoall_pair
42 #define MPIR_Alltoall_inplace_MV2 smpi_coll_tuned_alltoall_ring
/* Build the alltoall tuning tables for Stampede.
 *
 * Declares three processes-per-node configurations (1, 2 and 16) holding 6, 6 and 7
 * tables respectively, then copies the three function-local table arrays into one
 * xbt_malloc'ed buffer. Afterwards mv2_alltoall_thresholds_table[k] points at the
 * first table of configuration k inside that buffer. */
45 static void init_mv2_alltoall_tables_stampede(){
47 int agg_table_sum = 0;
48 mv2_alltoall_tuning_table **table_ptrs = NULL;
49 mv2_alltoall_num_ppn_conf = 3;
/* Per-configuration bookkeeping arrays, one slot per ppn configuration. */
50 mv2_alltoall_thresholds_table = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
51 * mv2_alltoall_num_ppn_conf);
52 table_ptrs = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
53 * mv2_alltoall_num_ppn_conf);
54 mv2_size_alltoall_tuning_table = xbt_malloc(sizeof(int) *
55 mv2_alltoall_num_ppn_conf);
56 mv2_alltoall_table_ppn_conf = xbt_malloc(mv2_alltoall_num_ppn_conf * sizeof(int));
/* Configuration 0: 1 process per node, 6 tables.
 * Threshold entries read as {lower, upper, algorithm}; an upper value of -1
 * presumably means "no upper limit" (TODO confirm against the selector). */
57 mv2_alltoall_table_ppn_conf[0] = 1;
58 mv2_size_alltoall_tuning_table[0] = 6;
59 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_1ppn[] = {
62 {{0, -1, &MPIR_Alltoall_pairwise_MV2},
65 {{0, -1, &MPIR_Alltoall_inplace_MV2},
71 {{0, 262144, &MPIR_Alltoall_Scatter_dest_MV2},
72 {262144, -1, &MPIR_Alltoall_pairwise_MV2},
75 {{0, -1, &MPIR_Alltoall_inplace_MV2},
81 {{0, 8, &MPIR_Alltoall_RD_MV2},
82 {8, -1, &MPIR_Alltoall_Scatter_dest_MV2},
85 {{0, -1, &MPIR_Alltoall_inplace_MV2},
91 {{0, 64, &MPIR_Alltoall_RD_MV2},
92 {64, 512, &MPIR_Alltoall_bruck_MV2},
93 {512, -1, &MPIR_Alltoall_Scatter_dest_MV2},
96 {{0,-1, &MPIR_Alltoall_inplace_MV2},
102 {{0, 32, &MPIR_Alltoall_RD_MV2},
103 {32, 2048, &MPIR_Alltoall_bruck_MV2},
104 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
107 {{0, -1, &MPIR_Alltoall_inplace_MV2},
113 {{0, 8, &MPIR_Alltoall_RD_MV2},
114 {8, 1024, &MPIR_Alltoall_bruck_MV2},
115 {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
118 {{0, -1, &MPIR_Alltoall_inplace_MV2},
122 table_ptrs[0] = mv2_tmp_alltoall_thresholds_table_1ppn;
/* Configuration 1: 2 processes per node, 6 tables. */
123 mv2_alltoall_table_ppn_conf[1] = 2;
124 mv2_size_alltoall_tuning_table[1] = 6;
125 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_2ppn[] = {
128 {{0, 32, &MPIR_Alltoall_RD_MV2},
129 {32, -1, &MPIR_Alltoall_Scatter_dest_MV2},
132 {{0, -1, &MPIR_Alltoall_inplace_MV2},
138 {{0, 64, &MPIR_Alltoall_RD_MV2},
139 {64, -1, &MPIR_Alltoall_Scatter_dest_MV2},
142 {{0, -1, &MPIR_Alltoall_inplace_MV2},
148 {{0, 64, &MPIR_Alltoall_RD_MV2},
149 {64, 2048, &MPIR_Alltoall_bruck_MV2},
150 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
153 {{0,-1, &MPIR_Alltoall_inplace_MV2},
159 {{0, 16, &MPIR_Alltoall_RD_MV2},
160 {16, 2048, &MPIR_Alltoall_bruck_MV2},
161 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
164 {{0, -1, &MPIR_Alltoall_inplace_MV2},
170 {{0, 8, &MPIR_Alltoall_RD_MV2},
171 {8, 1024, &MPIR_Alltoall_bruck_MV2},
172 {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
175 {{0, -1, &MPIR_Alltoall_inplace_MV2},
181 {{0, 4, &MPIR_Alltoall_RD_MV2},
182 {4, 2048, &MPIR_Alltoall_bruck_MV2},
183 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
186 {{0, -1, &MPIR_Alltoall_inplace_MV2},
190 table_ptrs[1] = mv2_tmp_alltoall_thresholds_table_2ppn;
/* Configuration 2: 16 processes per node, 7 tables. */
191 mv2_alltoall_table_ppn_conf[2] = 16;
192 mv2_size_alltoall_tuning_table[2] = 7;
193 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_16ppn[] = {
196 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
197 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
200 {{32768, -1, &MPIR_Alltoall_inplace_MV2},
206 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
207 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
210 {{16384, -1, &MPIR_Alltoall_inplace_MV2},
216 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
217 {2048, 16384, &MPIR_Alltoall_Scatter_dest_MV2},
218 {16384, -1, &MPIR_Alltoall_pairwise_MV2},
221 {{32768, 131072, &MPIR_Alltoall_inplace_MV2},
227 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
228 {2048, -1, &MPIR_Alltoall_pairwise_MV2},
231 {{16384,65536, &MPIR_Alltoall_inplace_MV2},
237 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
238 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
241 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
247 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
248 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
251 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
256 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
257 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
260 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
265 table_ptrs[2] = mv2_tmp_alltoall_thresholds_table_16ppn;
/* Total number of tables across all configurations: sizes the single backing buffer. */
267 for (i = 0; i < mv2_alltoall_num_ppn_conf; i++) {
268 agg_table_sum += mv2_size_alltoall_tuning_table[i];
/* Slot 0 owns the allocation; the first configuration is copied to its start. */
270 mv2_alltoall_thresholds_table[0] =
271 xbt_malloc(agg_table_sum * sizeof (mv2_alltoall_tuning_table));
272 memcpy(mv2_alltoall_thresholds_table[0], table_ptrs[0],
273 (sizeof(mv2_alltoall_tuning_table)
274 * mv2_size_alltoall_tuning_table[0]));
/* Every later configuration starts right after the previous one inside the same buffer. */
275 for (i = 1; i < mv2_alltoall_num_ppn_conf; i++) {
276 mv2_alltoall_thresholds_table[i] =
277 mv2_alltoall_thresholds_table[i - 1]
278 + mv2_size_alltoall_tuning_table[i - 1];
279 memcpy(mv2_alltoall_thresholds_table[i], table_ptrs[i],
280 (sizeof(mv2_alltoall_tuning_table)
281 * mv2_size_alltoall_tuning_table[i]));
/* Only the temporary pointer array is released; the copied tables stay allocated. */
283 xbt_free(table_ptrs);
289 /************ Allgather variables and initializers */
/* Function pointer to the allgather implementation attached to one tuning element. */
294 int (*MV2_pt_Allgather_function)(void *sendbuf,
296 MPI_Datatype sendtype,
299 MPI_Datatype recvtype, MPI_Comm comm_ptr);
300 } mv2_allgather_tuning_element;
/* Per-table data: one two_level flag per threshold slot, the number of inter-leader
 * entries, and the inter-leader entries themselves. */
304 int two_level[MV2_MAX_NB_THRESHOLDS];
305 int size_inter_table;
306 mv2_allgather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
307 } mv2_allgather_tuning_table;
/* Allgather function pointer (presumably assigned by the selector - confirm). */
309 int (*MV2_Allgather_function)(void *sendbuf,
311 MPI_Datatype sendtype,
314 MPI_Datatype recvtype, MPI_Comm comm);
/* Per-ppn bookkeeping, allocated and filled by init_mv2_allgather_tables_stampede(). */
316 int *mv2_allgather_table_ppn_conf = NULL;
317 int mv2_allgather_num_ppn_conf = 1;
318 int *mv2_size_allgather_tuning_table = NULL;
319 mv2_allgather_tuning_table **mv2_allgather_thresholds_table = NULL;
/* Bind the MVAPICH2 allgather names to smpi_coll_tuned_* functions.
 * RD and RD_Allgather_Comm both resolve to the same rdb implementation. */
321 #define MPIR_Allgather_Bruck_MV2 smpi_coll_tuned_allgather_bruck
322 #define MPIR_Allgather_RD_MV2 smpi_coll_tuned_allgather_rdb
323 #define MPIR_Allgather_RD_Allgather_Comm_MV2 smpi_coll_tuned_allgather_rdb
324 #define MPIR_Allgather_Ring_MV2 smpi_coll_tuned_allgather_ring
/* Build the allgather tuning tables for Stampede.
 *
 * Declares three processes-per-node configurations (1, 2 and 16) with 6 tables each,
 * then copies the three function-local table arrays into one xbt_malloc'ed buffer.
 * Afterwards mv2_allgather_thresholds_table[k] points at the first table of
 * configuration k inside that buffer. */
327 static void init_mv2_allgather_tables_stampede(){
329 int agg_table_sum = 0;
330 mv2_allgather_tuning_table **table_ptrs = NULL;
331 mv2_allgather_num_ppn_conf = 3;
/* Per-configuration bookkeeping arrays, one slot per ppn configuration. */
332 mv2_allgather_thresholds_table
333 = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
334 * mv2_allgather_num_ppn_conf);
335 table_ptrs = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
336 * mv2_allgather_num_ppn_conf);
337 mv2_size_allgather_tuning_table = xbt_malloc(sizeof(int) *
338 mv2_allgather_num_ppn_conf);
339 mv2_allgather_table_ppn_conf
340 = xbt_malloc(mv2_allgather_num_ppn_conf * sizeof(int));
/* Configuration 0: 1 process per node, 6 tables.
 * Threshold entries read as {lower, upper, algorithm}; an upper value of -1
 * presumably means "no upper limit" (TODO confirm against the selector). */
341 mv2_allgather_table_ppn_conf[0] = 1;
342 mv2_size_allgather_tuning_table[0] = 6;
343 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_1ppn[] = {
349 {0, -1, &MPIR_Allgather_Ring_MV2},
357 {0, 262144, &MPIR_Allgather_RD_MV2},
358 {262144, -1, &MPIR_Allgather_Ring_MV2},
366 {0, 131072, &MPIR_Allgather_RD_MV2},
367 {131072, -1, &MPIR_Allgather_Ring_MV2},
375 {0, 131072, &MPIR_Allgather_RD_MV2},
376 {131072, -1, &MPIR_Allgather_Ring_MV2},
384 {0, 65536, &MPIR_Allgather_RD_MV2},
385 {65536, -1, &MPIR_Allgather_Ring_MV2},
393 {0, 32768, &MPIR_Allgather_RD_MV2},
394 {32768, -1, &MPIR_Allgather_Ring_MV2},
398 table_ptrs[0] = mv2_tmp_allgather_thresholds_table_1ppn;
/* Configuration 1: 2 processes per node, 6 tables. */
399 mv2_allgather_table_ppn_conf[1] = 2;
400 mv2_size_allgather_tuning_table[1] = 6;
401 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_2ppn[] = {
407 {0, 524288, &MPIR_Allgather_RD_MV2},
408 {524288, -1, &MPIR_Allgather_Ring_MV2},
416 {0, 32768, &MPIR_Allgather_RD_MV2},
417 {32768, 524288, &MPIR_Allgather_Ring_MV2},
418 {524288, -1, &MPIR_Allgather_Ring_MV2},
426 {0, 16384, &MPIR_Allgather_RD_MV2},
427 {16384, 524288, &MPIR_Allgather_Ring_MV2},
428 {524288, -1, &MPIR_Allgather_Ring_MV2},
436 {0, 65536, &MPIR_Allgather_RD_MV2},
437 {65536, 524288, &MPIR_Allgather_Ring_MV2},
438 {524288, -1, &MPIR_Allgather_Ring_MV2},
446 {0, 32768, &MPIR_Allgather_RD_MV2},
447 {32768, 524288, &MPIR_Allgather_Ring_MV2},
448 {524288, -1, &MPIR_Allgather_Ring_MV2},
456 {0, 65536, &MPIR_Allgather_RD_MV2},
457 {65536, 524288, &MPIR_Allgather_Ring_MV2},
458 {524288, -1, &MPIR_Allgather_Ring_MV2},
462 table_ptrs[1] = mv2_tmp_allgather_thresholds_table_2ppn;
/* Configuration 2: 16 processes per node, 6 tables. */
463 mv2_allgather_table_ppn_conf[2] = 16;
464 mv2_size_allgather_tuning_table[2] = 6;
465 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_16ppn[] = {
471 {0, 1024, &MPIR_Allgather_RD_MV2},
472 {1024, -1, &MPIR_Allgather_Ring_MV2},
480 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
481 {1024, -1, &MPIR_Allgather_Ring_MV2},
489 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
490 {1024, -1, &MPIR_Allgather_Ring_MV2},
498 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
499 {1024, -1, &MPIR_Allgather_Ring_MV2},
507 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
508 {1024, -1, &MPIR_Allgather_Ring_MV2},
516 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
517 {1024, -1, &MPIR_Allgather_Ring_MV2},
522 table_ptrs[2] = mv2_tmp_allgather_thresholds_table_16ppn;
/* Total number of tables across all configurations: sizes the single backing buffer. */
524 for (i = 0; i < mv2_allgather_num_ppn_conf; i++) {
525 agg_table_sum += mv2_size_allgather_tuning_table[i];
/* Slot 0 owns the allocation; the first configuration is copied to its start. */
527 mv2_allgather_thresholds_table[0] =
528 xbt_malloc(agg_table_sum * sizeof (mv2_allgather_tuning_table));
529 memcpy(mv2_allgather_thresholds_table[0], table_ptrs[0],
530 (sizeof(mv2_allgather_tuning_table)
531 * mv2_size_allgather_tuning_table[0]));
/* Every later configuration starts right after the previous one inside the same buffer. */
532 for (i = 1; i < mv2_allgather_num_ppn_conf; i++) {
533 mv2_allgather_thresholds_table[i] =
534 mv2_allgather_thresholds_table[i - 1]
535 + mv2_size_allgather_tuning_table[i - 1];
536 memcpy(mv2_allgather_thresholds_table[i], table_ptrs[i],
537 (sizeof(mv2_allgather_tuning_table)
538 * mv2_size_allgather_tuning_table[i]));
/* Only the temporary pointer array is released; the copied tables stay allocated. */
540 xbt_free(table_ptrs);
544 /************ Gather variables and initializers */
/* Function pointer to the gather implementation attached to one tuning element. */
549 int (*MV2_pt_Gather_function)(void *sendbuf, int sendcnt,
550 MPI_Datatype sendtype, void *recvbuf, int recvcnt,
551 MPI_Datatype recvtype, int root, MPI_Comm comm_ptr);
552 } mv2_gather_tuning_element;
/* Each gather table holds an inter-leader list and an intra-node list, each preceded
 * by its entry count. */
557 int size_inter_table;
558 mv2_gather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
559 int size_intra_table;
560 mv2_gather_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
561 } mv2_gather_tuning_table;
/* Number of gather tables and the table array itself; the array is allocated and
 * filled by init_mv2_gather_tables_stampede(). */
563 int mv2_size_gather_tuning_table=7;
564 mv2_gather_tuning_table * mv2_gather_thresholds_table=NULL;
/* Common signature of the gather implementations. */
566 typedef int (*MV2_Gather_function_ptr) (void *sendbuf,
568 MPI_Datatype sendtype,
571 MPI_Datatype recvtype,
572 int root, MPI_Comm comm);
/* Inter-leader and intra-node gather function pointers, both initialised to NULL
 * (presumably assigned by the selector - confirm). */
574 MV2_Gather_function_ptr MV2_Gather_inter_leader_function = NULL;
575 MV2_Gather_function_ptr MV2_Gather_intra_node_function = NULL;
/* Bind the MVAPICH2 gather names to smpi_coll_tuned_* functions.
 * Direct and two_level_Direct both resolve to the same basic linear gather. */
578 #define MPIR_Gather_MV2_Direct smpi_coll_tuned_gather_ompi_basic_linear
579 #define MPIR_Gather_MV2_two_level_Direct smpi_coll_tuned_gather_ompi_basic_linear
580 #define MPIR_Gather_intra smpi_coll_tuned_gather_mpich
/* Build the gather tuning tables for Stampede.
 *
 * Allocates room for 7 tables with xbt_malloc and copies the function-local
 * initializer array into it. Each table lists its inter-leader entry count and
 * entries, followed by its intra-node entry count and entries, matching the field
 * order of mv2_gather_tuning_table. Threshold entries read as
 * {lower, upper, algorithm}; an upper value of -1 presumably means "no upper limit"
 * (TODO confirm against the selector). */
583 static void init_mv2_gather_tables_stampede(){
585 mv2_size_gather_tuning_table=7;
586 mv2_gather_thresholds_table = xbt_malloc(mv2_size_gather_tuning_table*
587 sizeof (mv2_gather_tuning_table));
588 mv2_gather_tuning_table mv2_tmp_gather_thresholds_table[]={
590 2,{{0, 524288, &MPIR_Gather_MV2_Direct},
591 {524288, -1, &MPIR_Gather_intra}},
592 1,{{0, -1, &MPIR_Gather_MV2_Direct}}},
594 3,{{0, 16384, &MPIR_Gather_MV2_Direct},
595 {16384, 131072, &MPIR_Gather_intra},
596 {131072, -1, &MPIR_Gather_MV2_two_level_Direct}},
597 1,{{0, -1, &MPIR_Gather_intra}}},
599 3,{{0, 256, &MPIR_Gather_MV2_two_level_Direct},
600 {256, 16384, &MPIR_Gather_MV2_Direct},
/* NOTE(review): the next lower bound repeats 256 instead of continuing at 16384, so the
 * two ranges overlap. Possibly a typo in the calibration data - confirm before changing. */
601 {256, -1, &MPIR_Gather_MV2_two_level_Direct}},
602 1,{{0, -1, &MPIR_Gather_intra}}},
604 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
605 {512, 16384, &MPIR_Gather_MV2_Direct},
606 {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
607 1,{{0, -1, &MPIR_Gather_intra}}},
609 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
610 {512, 16384, &MPIR_Gather_MV2_Direct},
611 {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
612 1,{{0, -1, &MPIR_Gather_intra}}},
614 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
615 {512, 16384, &MPIR_Gather_MV2_Direct},
/* NOTE(review): 8196 neither matches the previous upper bound (16384) nor is a power
 * of two (8192); the ranges overlap - confirm against the MVAPICH2 reference table. */
616 {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
617 1,{{0, -1, &MPIR_Gather_intra}}},
619 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
620 {512, 16384, &MPIR_Gather_MV2_Direct},
/* NOTE(review): same 8196 lower bound as in the previous table - confirm. */
621 {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
622 1,{{0, -1, &MPIR_Gather_intra}}},
/* Publish the tables: copy all 7 of them into the heap buffer allocated above. */
625 memcpy(mv2_gather_thresholds_table, mv2_tmp_gather_thresholds_table,
626 mv2_size_gather_tuning_table * sizeof (mv2_gather_tuning_table));
631 /************ Allgatherv variables and initializers */
/* Function pointer to the allgatherv implementation attached to one tuning element. */
636 int (*MV2_pt_Allgatherv_function)(void *sendbuf,
638 MPI_Datatype sendtype,
642 MPI_Datatype recvtype,
644 } mv2_allgatherv_tuning_element;
/* Each allgatherv table holds one inter-leader list preceded by its entry count. */
648 int size_inter_table;
649 mv2_allgatherv_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
650 } mv2_allgatherv_tuning_table;
/* Allgatherv function pointer (presumably assigned by the selector - confirm). */
652 int (*MV2_Allgatherv_function)(void *sendbuf,
654 MPI_Datatype sendtype,
658 MPI_Datatype recvtype,
/* Number of allgatherv tables and the table array; both are set by
 * init_mv2_allgatherv_tables_stampede(). */
661 int mv2_size_allgatherv_tuning_table = 0;
662 mv2_allgatherv_tuning_table *mv2_allgatherv_thresholds_table = NULL;
/* Bind the MVAPICH2 allgatherv names to smpi_coll_tuned_* functions. */
664 #define MPIR_Allgatherv_Rec_Doubling_MV2 smpi_coll_tuned_allgatherv_mpich_rdb
665 #define MPIR_Allgatherv_Bruck_MV2 smpi_coll_tuned_allgatherv_ompi_bruck
666 #define MPIR_Allgatherv_Ring_MV2 smpi_coll_tuned_allgatherv_mpich_ring
/* Build the allgatherv tuning tables for Stampede.
 *
 * Allocates room for 6 tables with xbt_malloc and copies the function-local
 * initializer array into it. Every visible table switches from recursive doubling
 * to ring at a single threshold (512 for the first two tables, 256 for the rest).
 * Threshold entries read as {lower, upper, algorithm}; an upper value of -1
 * presumably means "no upper limit" (TODO confirm against the selector). */
669 static void init_mv2_allgatherv_tables_stampede(){
670 mv2_size_allgatherv_tuning_table = 6;
671 mv2_allgatherv_thresholds_table = xbt_malloc(mv2_size_allgatherv_tuning_table *
672 sizeof (mv2_allgatherv_tuning_table));
673 mv2_allgatherv_tuning_table mv2_tmp_allgatherv_thresholds_table[] = {
678 {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
679 {512, -1, &MPIR_Allgatherv_Ring_MV2},
686 {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
687 {512, -1, &MPIR_Allgatherv_Ring_MV2},
694 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
695 {256, -1, &MPIR_Allgatherv_Ring_MV2},
702 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
703 {256, -1, &MPIR_Allgatherv_Ring_MV2},
710 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
711 {256, -1, &MPIR_Allgatherv_Ring_MV2},
718 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
719 {256, -1, &MPIR_Allgatherv_Ring_MV2},
/* Publish the tables: copy all 6 of them into the heap buffer allocated above. */
724 memcpy(mv2_allgatherv_thresholds_table, mv2_tmp_allgatherv_thresholds_table,
725 mv2_size_allgatherv_tuning_table * sizeof (mv2_allgatherv_tuning_table));
729 /************ Allreduce variables and initializers */
/* Function pointer to the allreduce implementation attached to one tuning element. */
734 int (*MV2_pt_Allreduce_function)(void *sendbuf,
737 MPI_Datatype datatype,
738 MPI_Op op, MPI_Comm comm);
739 } mv2_allreduce_tuning_element;
/* Each allreduce table holds one is_two_level_allreduce flag per threshold slot, then an
 * inter-leader list and an intra-node list, each preceded by its entry count. */
744 int is_two_level_allreduce[MV2_MAX_NB_THRESHOLDS];
745 int size_inter_table;
746 mv2_allreduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
747 int size_intra_table;
748 mv2_allreduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
749 } mv2_allreduce_tuning_table;
/* Allreduce function pointer, initialised to NULL (presumably assigned by the selector - confirm). */
752 int (*MV2_Allreduce_function)(void *sendbuf,
755 MPI_Datatype datatype,
756 MPI_Op op, MPI_Comm comm)=NULL;
/* Intra-node allreduce function pointer, initialised to NULL (presumably assigned by the selector - confirm). */
759 int (*MV2_Allreduce_intra_function)( void *sendbuf,
762 MPI_Datatype datatype,
763 MPI_Op op, MPI_Comm comm)=NULL;
/* Number of allreduce tables and the table array; both are set by
 * init_mv2_allreduce_tables_stampede(). */
765 int mv2_size_allreduce_tuning_table = 0;
766 mv2_allreduce_tuning_table *mv2_allreduce_thresholds_table = NULL;
/* Allreduce entry point named after MVAPICH2's multicast two-level helper.
 * NOTE(review): presumably a local stand-in for the MVAPICH2 routine - confirm
 * what it actually performs. */
772 static int MPIR_Allreduce_mcst_reduce_two_level_helper_MV2( void *sendbuf,
775 MPI_Datatype datatype,
776 MPI_Op op, MPI_Comm comm)
/* Allreduce entry point named after MVAPICH2's multicast reduce-scatter/gather variant.
 * NOTE(review): presumably a local stand-in as well - confirm what it actually performs. */
781 static int MPIR_Allreduce_mcst_reduce_redscat_gather_MV2( void *sendbuf,
784 MPI_Datatype datatype,
785 MPI_Op op, MPI_Comm comm)
/* Point-to-point flavoured intra-node step: forwards its arguments to
 * mpi_coll_reduce_fun with rank 0 as the root. */
790 static int MPIR_Allreduce_reduce_p2p_MV2( void *sendbuf,
793 MPI_Datatype datatype,
794 MPI_Op op, MPI_Comm comm)
796 mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
/* Shared-memory flavoured intra-node step: issues the same mpi_coll_reduce_fun call
 * (root 0) as the p2p wrapper. */
800 static int MPIR_Allreduce_reduce_shmem_MV2( void *sendbuf,
803 MPI_Datatype datatype,
804 MPI_Op op, MPI_Comm comm)
806 mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
/* Bind the MVAPICH2 point-to-point allreduce names to smpi_coll_tuned_* functions. */
810 #define MPIR_Allreduce_pt2pt_rd_MV2 smpi_coll_tuned_allreduce_rdb
811 #define MPIR_Allreduce_pt2pt_rs_MV2 smpi_coll_tuned_allreduce_mvapich2_rs
/* Build the allreduce tuning tables for Stampede.
 *
 * Allocates room for 8 tables with xbt_malloc and copies the function-local
 * initializer array into it. Within each table the pt2pt_rd / pt2pt_rs style entries
 * come first and the reduce_shmem / reduce_p2p entries second; presumably these are
 * the inter_leader and intra_node lists of mv2_allreduce_tuning_table (confirm).
 * Threshold entries read as {lower, upper, algorithm}; an upper value of -1
 * presumably means "no upper limit" (TODO confirm against the selector). */
815 static void init_mv2_allreduce_tables_stampede(){
816 mv2_size_allreduce_tuning_table = 8;
817 mv2_allreduce_thresholds_table = xbt_malloc(mv2_size_allreduce_tuning_table *
818 sizeof (mv2_allreduce_tuning_table));
819 mv2_allreduce_tuning_table mv2_tmp_allreduce_thresholds_table[] = {
826 {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
827 {1024, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
831 {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
832 {1024, -1, &MPIR_Allreduce_reduce_p2p_MV2},
841 {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
842 {1024, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
843 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
847 {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
848 {1024, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
857 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
858 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
859 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
863 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
864 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
873 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
874 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
875 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
879 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
880 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
889 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
890 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
891 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
895 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
896 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
905 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
906 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
907 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
911 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
912 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
921 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
922 {512, 8192, &MPIR_Allreduce_pt2pt_rd_MV2},
923 {8192, 65536, &MPIR_Allreduce_pt2pt_rs_MV2},
924 {65536, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
928 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
929 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
938 {0, 64, &MPIR_Allreduce_pt2pt_rd_MV2},
939 {64, 512, &MPIR_Allreduce_reduce_p2p_MV2},
940 {512, 4096, &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2},
941 {4096, 16384, &MPIR_Allreduce_pt2pt_rs_MV2},
942 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
946 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
947 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
/* Publish the tables: copy all 8 of them into the heap buffer allocated above. */
952 memcpy(mv2_allreduce_thresholds_table, mv2_tmp_allreduce_thresholds_table,
953 mv2_size_allreduce_tuning_table * sizeof (mv2_allreduce_tuning_table));
958 Bcast is deactivated for now; it defaults to the MPICH one.
962 int (*MV2_pt_Bcast_function) (void *buf, int count, MPI_Datatype datatype,
963 int root, MPI_Comm comm_ptr);
964 int zcpy_pipelined_knomial_factor;
965 } mv2_bcast_tuning_element;
969 int bcast_segment_size;
970 int intra_node_knomial_factor;
971 int inter_node_knomial_factor;
972 int is_two_level_bcast[MV2_MAX_NB_THRESHOLDS];
973 int size_inter_table;
974 mv2_bcast_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
975 int size_intra_table;
976 mv2_bcast_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
977 } mv2_bcast_tuning_table;
979 int mv2_size_bcast_tuning_table = 0;
980 mv2_bcast_tuning_table *mv2_bcast_thresholds_table = NULL;
983 int (*MV2_Bcast_function) (void *buffer, int count, MPI_Datatype datatype,
984 int root, MPI_Comm comm_ptr) = NULL;
986 int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype datatype,
987 int root, MPI_Comm comm_ptr) = NULL;
994 static void init_mv2_bcast_tables_stampede(){
996 mv2_size_bcast_tuning_table=8;
997 mv2_bcast_thresholds_table = xbt_malloc(mv2_size_bcast_tuning_table *
998 sizeof (mv2_bcast_tuning_table));
1000 mv2_bcast_tuning_table mv2_tmp_bcast_thresholds_table[]={
1004 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
1007 {0, 8, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1008 {8, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1009 {16, 1024, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1010 {1024, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1011 {8192, 16384, &MPIR_Bcast_binomial_MV2, -1},
1012 {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1013 {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1014 {65536, 131072, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1015 {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_MV2, -1},
1016 {262144, 524288, &MPIR_Bcast_scatter_doubling_allgather_MV2, -1},
1017 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1021 {0, 8, &MPIR_Shmem_Bcast_MV2, 2},
1022 {8, 16, &MPIR_Shmem_Bcast_MV2, 4},
1023 {16, 1024, &MPIR_Shmem_Bcast_MV2, 2},
1024 {1024, 8192, &MPIR_Shmem_Bcast_MV2, 4},
1025 {8192, 16384, &MPIR_Shmem_Bcast_MV2, -1},
1026 {16384, 32768, &MPIR_Shmem_Bcast_MV2, 4},
1027 {32768, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1028 {65536, 131072, &MPIR_Shmem_Bcast_MV2, -1},
1029 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1030 {262144, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1031 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1037 {1, 1, 1, 1, 1, 1, 1, 1},
1040 {0, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1041 {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1042 {256, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1043 {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1044 {65536, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1045 {131072, 262144, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1046 {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1047 {524288, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}
1051 {0, 128, &MPIR_Shmem_Bcast_MV2, 2},
1052 {128, 256, &MPIR_Shmem_Bcast_MV2, 4},
1053 {256, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1054 {32768, 65536, &MPIR_Shmem_Bcast_MV2, 4},
1055 {65536, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1056 {131072, 262144, &MPIR_Shmem_Bcast_MV2, 8},
1057 {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1058 {524288, -1, &MPIR_Shmem_Bcast_MV2, 8}
1064 {1, 1, 1, 1, 1, 1, 1, 1, 1},
1067 {0, 2, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1068 {2, 4, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1069 {4, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1070 {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1071 {32, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1072 {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1073 {256, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1074 {4096, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1075 {32768, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}
1079 {0, 2, &MPIR_Shmem_Bcast_MV2, 4},
1080 {2, 4, &MPIR_Shmem_Bcast_MV2, 8},
1081 {4, 16, &MPIR_Shmem_Bcast_MV2, 4},
1082 {16, 32, &MPIR_Shmem_Bcast_MV2, 8},
1083 {32, 128, &MPIR_Shmem_Bcast_MV2, 4},
1084 {128, 256, &MPIR_Shmem_Bcast_MV2, 8},
1085 {256, 4096, &MPIR_Shmem_Bcast_MV2, 4},
1086 {4096, 32768, &MPIR_Shmem_Bcast_MV2, 8},
1087 {32768, -1, &MPIR_Shmem_Bcast_MV2, 2}
1096 {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1097 {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1098 {16384, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1099 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1103 {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1104 {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1105 {16384, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1106 {524288, -1, NULL, -1}
1115 {0, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1116 {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1117 {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1118 {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1119 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1123 {0, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1124 {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1125 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1126 {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1127 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1136 {0, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1137 {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1138 {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1139 {131072, 262144, &MPIR_Pipelined_Bcast_MV2, -1},
1140 {262144, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1144 {0, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1145 {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1146 {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1147 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1148 {262144, -1, &MPIR_Shmem_Bcast_MV2, -1}
1157 {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1158 {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1159 {16384, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1160 {65536, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1161 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1165 {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1166 {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1167 {16384, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1168 {65536, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1169 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1175 {1, 1, 1, 1, 1, 1, 1},
1178 {0, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1179 {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1180 {32, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1181 {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1182 {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1183 {32768, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1184 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1188 {0, 16, &MPIR_Shmem_Bcast_MV2, 8},
1189 {16, 32, &MPIR_Shmem_Bcast_MV2, 4},
1190 {32, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1191 {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1192 {16384, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1193 {32768, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1194 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1199 memcpy(mv2_bcast_thresholds_table, mv2_tmp_bcast_thresholds_table,
1200 mv2_size_bcast_tuning_table * sizeof (mv2_bcast_tuning_table));
1204 /************ Reduce variables and initializers */
/* Function pointer to the reduce implementation attached to one tuning element. */
1209 int (*MV2_pt_Reduce_function)(void *sendbuf,
1212 MPI_Datatype datatype,
1216 } mv2_reduce_tuning_element;
/* Each reduce table holds one is_two_level_reduce flag per threshold slot, then an
 * inter-leader list and an intra-node list, each preceded by its entry count. */
1222 int is_two_level_reduce[MV2_MAX_NB_THRESHOLDS];
1223 int size_inter_table;
1224 mv2_reduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1225 int size_intra_table;
1226 mv2_reduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1227 } mv2_reduce_tuning_table;
/* Number of reduce tables and the table array; both are set by
 * init_mv2_reduce_tables_stampede(). */
1229 int mv2_size_reduce_tuning_table = 0;
1230 mv2_reduce_tuning_table *mv2_reduce_thresholds_table = NULL;
/* Knomial factors for the intra-node and inter-node reduce, both initialised to -1. */
1233 int mv2_reduce_intra_knomial_factor = -1;
1234 int mv2_reduce_inter_knomial_factor = -1;
/* Reduce function pointer, initialised to NULL (presumably assigned by the selector - confirm). */
1236 int (*MV2_Reduce_function)( void *sendbuf,
1239 MPI_Datatype datatype,
1242 MPI_Comm comm_ptr)=NULL;
/* Intra-node reduce function pointer, initialised to NULL (presumably assigned by the selector - confirm). */
1244 int (*MV2_Reduce_intra_function)( void *sendbuf,
1247 MPI_Datatype datatype,
1250 MPI_Comm comm_ptr)=NULL;
/* Bind the MVAPICH2 reduce names to smpi_coll_tuned_* functions.
 * Both knomial wrappers and the binomial name resolve to the same binomial reduce. */
1253 #define MPIR_Reduce_inter_knomial_wrapper_MV2 smpi_coll_tuned_reduce_binomial
1254 #define MPIR_Reduce_intra_knomial_wrapper_MV2 smpi_coll_tuned_reduce_binomial
1255 #define MPIR_Reduce_binomial_MV2 smpi_coll_tuned_reduce_binomial
1256 #define MPIR_Reduce_redscat_gather_MV2 smpi_coll_tuned_reduce_scatter_gather
1257 #define MPIR_Reduce_shmem_MV2 smpi_coll_tuned_reduce_ompi_basic_linear
/* Build the reduce tuning tables for Stampede.
 *
 * Allocates room for 8 tables with xbt_malloc and copies the function-local
 * initializer array into it. Within each table the inter_knomial / binomial /
 * redscat_gather entries come first and the shmem / intra_knomial / binomial entries
 * second; presumably these are the inter_leader and intra_node lists of
 * mv2_reduce_tuning_table (confirm). The rows made only of 0/1 values are presumably
 * the is_two_level_reduce flags (confirm). Threshold entries read as
 * {lower, upper, algorithm}; an upper value of -1 presumably means "no upper limit"
 * (TODO confirm against the selector). */
1261 static void init_mv2_reduce_tables_stampede(){
1263 mv2_size_reduce_tuning_table = 8;
1264 mv2_reduce_thresholds_table = xbt_malloc(mv2_size_reduce_tuning_table *
1265 sizeof (mv2_reduce_tuning_table));
1266 mv2_reduce_tuning_table mv2_tmp_reduce_thresholds_table[] = {
1274 {0, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1275 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1276 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1280 {0, 65536, &MPIR_Reduce_shmem_MV2},
1281 {65536,-1, &MPIR_Reduce_binomial_MV2},
1288 {1, 1, 1, 1, 0, 0, 0},
1291 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1292 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1293 {16384, 32768, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1294 {32768, 65536, &MPIR_Reduce_binomial_MV2},
1295 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1296 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1297 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1301 {0, 8192, &MPIR_Reduce_shmem_MV2},
1302 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1303 {16384, 32768, &MPIR_Reduce_shmem_MV2},
1304 {32768, 65536, &MPIR_Reduce_shmem_MV2},
1305 {65536, 262144, &MPIR_Reduce_shmem_MV2},
1306 {262144,-1, &MPIR_Reduce_binomial_MV2},
1316 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1317 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1318 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1319 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1320 {262144, -1, &MPIR_Reduce_redscat_gather_MV2},
1324 {0, 8192, &MPIR_Reduce_shmem_MV2},
1325 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1326 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1327 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1328 {262144, -1, &MPIR_Reduce_binomial_MV2},
1338 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1339 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1340 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1341 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1342 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1343 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1347 {0, 8192, &MPIR_Reduce_shmem_MV2},
1348 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1349 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1350 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1351 {262144, -1, &MPIR_Reduce_binomial_MV2},
1358 {1, 1, 1, 0, 1, 1, 0},
1361 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1362 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1363 {16384, 32768, &MPIR_Reduce_binomial_MV2},
1364 {32768, 65536, &MPIR_Reduce_binomial_MV2},
1365 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1366 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1367 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1371 {0, 8192, &MPIR_Reduce_shmem_MV2},
1372 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1373 {16384, 32768, &MPIR_Reduce_shmem_MV2},
1374 {32768, 65536, &MPIR_Reduce_shmem_MV2},
1375 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1376 {262144, -1, &MPIR_Reduce_binomial_MV2},
1386 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1387 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1388 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1389 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1390 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1391 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1395 {0, 8192, &MPIR_Reduce_shmem_MV2},
1396 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1397 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1398 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1399 {262144, -1, &MPIR_Reduce_binomial_MV2},
1409 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1410 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1411 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1412 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1413 {262144, -1, &MPIR_Reduce_binomial_MV2},
1417 {0, 8192, &MPIR_Reduce_shmem_MV2},
1418 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1419 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1420 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1421 {262144, -1, &MPIR_Reduce_binomial_MV2},
1431 {0, 2048, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1432 {2048, 4096, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1433 {4096, 16384, &MPIR_Reduce_binomial_MV2},
1434 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1435 {65536, 131072, &MPIR_Reduce_binomial_MV2},
1436 {131072, -1, &MPIR_Reduce_binomial_MV2},
1440 {0, 2048, &MPIR_Reduce_shmem_MV2},
1441 {2048, 4096, &MPIR_Reduce_shmem_MV2},
1442 {4096, 16384, &MPIR_Reduce_shmem_MV2},
1443 {16384, 65536, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1444 {65536, 131072, &MPIR_Reduce_binomial_MV2},
1445 {131072, -1, &MPIR_Reduce_shmem_MV2},
/* Publish the tables: copy all 8 of them into the heap buffer allocated above. */
1450 memcpy(mv2_reduce_thresholds_table, mv2_tmp_reduce_thresholds_table,
1451 mv2_size_reduce_tuning_table * sizeof (mv2_reduce_tuning_table));
1454 /************ Reduce scatter variables and initializers */
/* Function-pointer member of a reduce-scatter threshold entry: the
 * implementation to use for that entry. The table initializers in this file
 * fill an entry as {int, int, &function}.
 * NOTE(review): the two integers are presumably the lower and upper
 * message-size bounds, with -1 meaning "no upper bound" -- confirm against the
 * selector code that reads these tables. */
1459 int (*MV2_pt_Red_scat_function)(void *sendbuf,
1462 MPI_Datatype datatype,
1465 } mv2_red_scat_tuning_element;
/* Count that accompanies inter_leader.
 * NOTE(review): presumably the number of inter_leader entries actually
 * filled in -- confirm against the initializers and the selector. */
1469 int size_inter_table;
/* Threshold entries; the array has room for MV2_MAX_NB_THRESHOLDS of them. */
1470 mv2_red_scat_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1471 } mv2_red_scat_tuning_table;
/* Number of elements in mv2_red_scat_thresholds_table; 0 until
 * init_mv2_reduce_scatter_tables_stampede() assigns it. */
1473 int mv2_size_red_scat_tuning_table = 0;
/* Heap array of reduce-scatter tuning tables; NULL until
 * init_mv2_reduce_scatter_tables_stampede() allocates and fills it. */
1474 mv2_red_scat_tuning_table *mv2_red_scat_thresholds_table = NULL;
/* File-scope function pointer whose visible leading parameters match those of
 * the per-entry MV2_pt_Red_scat_function member.
 * NOTE(review): presumably receives the implementation picked from the tuning
 * table at call time -- confirm against the selector code. */
1477 int (*MV2_Red_scat_function)(void *sendbuf,
1480 MPI_Datatype datatype,
/* Adapter that lets smpi_mpi_reduce_scatter() be stored in a tuning-table
 * entry: it passes sendbuf, recvbuf, recvcnts, datatype, op and comm through
 * unchanged. The result of smpi_mpi_reduce_scatter() is not used in the
 * visible call.
 * NOTE(review): the return statement is not visible here -- confirm what
 * status value this function reports. */
1486 static int MPIR_Reduce_Scatter_Basic_MV2(void *sendbuf,
1489 MPI_Datatype datatype,
1493 smpi_mpi_reduce_scatter(sendbuf,recvbuf,recvcnts,datatype,op,comm);
/* Map the MVAPICH2 reduce-scatter algorithm names used in the tuning tables
 * onto the SMPI collective implementations that stand in for them. */
1496 #define MPIR_Reduce_scatter_non_comm_MV2 smpi_coll_tuned_reduce_scatter_mpich_noncomm
1497 #define MPIR_Reduce_scatter_Rec_Halving_MV2 smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving
1498 #define MPIR_Reduce_scatter_Pair_Wise_MV2 smpi_coll_tuned_reduce_scatter_mpich_pair
/* Builds the reduce-scatter tuning tables and publishes them through
 * mv2_red_scat_thresholds_table / mv2_size_red_scat_tuning_table.
 * The tables are written as a local array initializer and then copied into a
 * heap block obtained from xbt_malloc(). */
1503 static void init_mv2_reduce_scatter_tables_stampede(){
/* This count sizes both the allocation and the memcpy below, so it has to
 * match the number of tables in the initializer (six threshold lists). */
1504 mv2_size_red_scat_tuning_table = 6;
1505 mv2_red_scat_thresholds_table = xbt_malloc(mv2_size_red_scat_tuning_table *
1506 sizeof (mv2_red_scat_tuning_table));
/* Local staging copy of the tables; only its contents outlive this function. */
1507 mv2_red_scat_tuning_table mv2_tmp_red_scat_thresholds_table[] = {
/* First three lists: basic, then recursive halving, then pairwise for the
 * last range. Only the boundaries differ between them. */
1512 {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1513 {64, 65536, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1514 {65536, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1521 {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1522 {64, 131072, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1523 {131072, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1530 {0, 1024, &MPIR_Reduce_Scatter_Basic_MV2},
1531 {1024, 262144, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1532 {262144, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
/* Last three lists: basic, then recursive halving for everything else; the
 * pairwise algorithm is not used. */
1539 {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1540 {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1547 {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1548 {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1555 {0, 256, &MPIR_Reduce_Scatter_Basic_MV2},
1556 {256, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
/* Copy the staged tables into the heap block allocated above. */
1561 memcpy(mv2_red_scat_thresholds_table, mv2_tmp_red_scat_thresholds_table,
1562 mv2_size_red_scat_tuning_table * sizeof (mv2_red_scat_tuning_table));
1565 /************ Scatter variables and initializers */
/* Function-pointer member of a scatter threshold entry: the implementation to
 * use for that entry. The table initializers in this file fill an entry as
 * {int, int, &function}.
 * NOTE(review): the two integers are presumably the lower and upper
 * message-size bounds, with -1 meaning "no upper bound" -- confirm against the
 * selector code that reads these tables. */
1570 int (*MV2_pt_Scatter_function)(void *sendbuf,
1572 MPI_Datatype sendtype,
1575 MPI_Datatype recvtype,
1576 int root, MPI_Comm comm);
1577 } mv2_scatter_tuning_element;
/* Count that accompanies inter_leader.
 * NOTE(review): presumably the number of inter_leader entries in use --
 * confirm against the initializers and the selector. */
1581 int size_inter_table;
/* First list of threshold entries; room for MV2_MAX_NB_THRESHOLDS of them. */
1582 mv2_scatter_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
/* Count that accompanies intra_node (same caveat as size_inter_table). */
1583 int size_intra_table;
/* Second list of threshold entries; room for MV2_MAX_NB_THRESHOLDS of them. */
1584 mv2_scatter_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1585 } mv2_scatter_tuning_table;
/* One value per tabulated configuration (1, 2 and 16 after
 * init_mv2_scatter_tables_stampede()).
 * NOTE(review): presumably the number of processes per node, as documented for
 * the analogous alltoall variable -- confirm. */
1588 int *mv2_scatter_table_ppn_conf = NULL;
/* Number of tabulated configurations; init_mv2_scatter_tables_stampede() sets
 * it to 3. */
1589 int mv2_scatter_num_ppn_conf = 1;
/* Per configuration: number of tuning tables stored for it. */
1590 int *mv2_size_scatter_tuning_table = NULL;
/* Per configuration: pointer to its first tuning table. After
 * init_mv2_scatter_tables_stampede() all entries point into one allocation
 * whose start is entry 0. */
1591 mv2_scatter_tuning_table **mv2_scatter_thresholds_table = NULL;
/* File-scope function pointer with the scatter signature, NULL until assigned.
 * NOTE(review): presumably receives the algorithm picked from an inter_leader
 * list -- confirm against the selector code. */
1593 int (*MV2_Scatter_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1594 void *recvbuf, int recvcount, MPI_Datatype recvtype,
1595 int root, MPI_Comm comm)=NULL;
/* Same signature, also NULL until assigned.
 * NOTE(review): presumably receives the algorithm picked from an intra_node
 * list -- confirm against the selector code. */
1597 int (*MV2_Scatter_intra_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1598 void *recvbuf, int recvcount, MPI_Datatype recvtype,
1599 int root, MPI_Comm comm)=NULL;
/* Prototype, immediately followed by the definition, of the scatter variant
 * that appears as the first inter_leader entry in several of the tables marked
 * with value 16 in mv2_scatter_table_ppn_conf.
 * NOTE(review): the function body is not visible here; the name suggests a
 * wrapper around a multicast-based scatter -- confirm what it does in this
 * port. */
1600 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1602 MPI_Datatype sendtype,
1605 MPI_Datatype recvtype,
1606 int root, MPI_Comm comm_ptr);
1608 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1610 MPI_Datatype sendtype,
1613 MPI_Datatype recvtype,
1614 int root, MPI_Comm comm_ptr)
/* Map the MVAPICH2 scatter algorithm names used in the tuning tables onto SMPI
 * implementations. The "two_level" names expand to the same functions as their
 * flat counterparts, so table entries that differ only in that respect select
 * the same code. */
1619 #define MPIR_Scatter_MV2_Binomial smpi_coll_tuned_scatter_ompi_binomial
1620 #define MPIR_Scatter_MV2_Direct smpi_coll_tuned_scatter_ompi_basic_linear
1621 #define MPIR_Scatter_MV2_two_level_Binomial smpi_coll_tuned_scatter_ompi_binomial
1622 #define MPIR_Scatter_MV2_two_level_Direct smpi_coll_tuned_scatter_ompi_basic_linear
/* Builds the scatter tuning tables for three configurations and publishes them
 * through mv2_scatter_num_ppn_conf, mv2_scatter_table_ppn_conf,
 * mv2_size_scatter_tuning_table and mv2_scatter_thresholds_table.
 * Each configuration is written as a local array initializer; all of them are
 * then copied back to back into a single xbt_malloc() block. */
1627 static void init_mv2_scatter_tables_stampede(){
1629 int agg_table_sum = 0;
1631 mv2_scatter_tuning_table **table_ptrs = NULL;
/* Three configurations are tabulated: values 1, 2 and 16 below. */
1632 mv2_scatter_num_ppn_conf = 3;
/* One "first table" pointer per configuration. */
1633 mv2_scatter_thresholds_table
1634 = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1635 * mv2_scatter_num_ppn_conf);
/* Scratch array remembering each local staging table until it is copied;
 * released at the end of the function. */
1636 table_ptrs = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1637 * mv2_scatter_num_ppn_conf);
1638 mv2_size_scatter_tuning_table = xbt_malloc(sizeof(int) *
1639 mv2_scatter_num_ppn_conf);
1640 mv2_scatter_table_ppn_conf
1641 = xbt_malloc(mv2_scatter_num_ppn_conf * sizeof(int));
/* Configuration 0: value 1, six tuning tables. */
1642 mv2_scatter_table_ppn_conf[0] = 1;
1643 mv2_size_scatter_tuning_table[0] = 6;
/* In every tuning table the first list initializes inter_leader and the second
 * initializes intra_node, following the member order of
 * mv2_scatter_tuning_table. */
1644 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_1ppn[] = {
1648 {0, -1, &MPIR_Scatter_MV2_Binomial},
1652 {0, -1, &MPIR_Scatter_MV2_Binomial},
1659 {0, -1, &MPIR_Scatter_MV2_Direct},
1663 {0, -1, &MPIR_Scatter_MV2_Direct},
1670 {0, -1, &MPIR_Scatter_MV2_Direct},
1674 {0, -1, &MPIR_Scatter_MV2_Direct},
1681 {0, -1, &MPIR_Scatter_MV2_Direct},
1685 {0, -1, &MPIR_Scatter_MV2_Direct},
1692 {0, -1, &MPIR_Scatter_MV2_Direct},
1696 {0, -1, &MPIR_Scatter_MV2_Direct},
1703 {0, 32, &MPIR_Scatter_MV2_Binomial},
1704 {32, -1, &MPIR_Scatter_MV2_Direct},
1708 {0, -1, &MPIR_Scatter_MV2_Binomial},
1712 table_ptrs[0] = mv2_tmp_scatter_thresholds_table_1ppn;
/* Configuration 1: value 2, six tuning tables. */
1713 mv2_scatter_table_ppn_conf[1] = 2;
1714 mv2_size_scatter_tuning_table[1] = 6;
1715 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_2ppn[] = {
1719 {0, 4096, &MPIR_Scatter_MV2_Binomial},
1720 {4096, -1, &MPIR_Scatter_MV2_Direct},
1724 {0, -1, &MPIR_Scatter_MV2_Direct},
1731 {0, 512, &MPIR_Scatter_MV2_two_level_Direct},
1732 {512, -1, &MPIR_Scatter_MV2_Direct},
1736 {0, -1, &MPIR_Scatter_MV2_Binomial},
1743 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1744 {2048, -1, &MPIR_Scatter_MV2_Direct},
1748 {0, -1, &MPIR_Scatter_MV2_Binomial},
1755 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1756 {2048, -1, &MPIR_Scatter_MV2_Direct},
1760 {0, -1, &MPIR_Scatter_MV2_Binomial},
1767 {0, 8192, &MPIR_Scatter_MV2_two_level_Direct},
1768 {8192, -1, &MPIR_Scatter_MV2_Direct},
1772 {0, -1, &MPIR_Scatter_MV2_Binomial},
1779 {0, 16, &MPIR_Scatter_MV2_Binomial},
1780 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1781 {128, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1782 {16384, -1, &MPIR_Scatter_MV2_Direct},
1786 {0, 128, &MPIR_Scatter_MV2_Direct},
1787 {128, -1, &MPIR_Scatter_MV2_Binomial},
1791 table_ptrs[1] = mv2_tmp_scatter_thresholds_table_2ppn;
/* Configuration 2: value 16, eight tuning tables. */
1792 mv2_scatter_table_ppn_conf[2] = 16;
1793 mv2_size_scatter_tuning_table[2] = 8;
1794 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_16ppn[] = {
1799 {0, 256, &MPIR_Scatter_MV2_Binomial},
1800 {256, -1, &MPIR_Scatter_MV2_Direct},
1804 { 0, -1, &MPIR_Scatter_MV2_Direct},
1812 {0, 512, &MPIR_Scatter_MV2_Binomial},
1813 {512, -1, &MPIR_Scatter_MV2_Direct},
1817 { 0, -1, &MPIR_Scatter_MV2_Direct},
1825 {0, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1826 {1024, -1, &MPIR_Scatter_MV2_Direct},
1830 { 0, -1, &MPIR_Scatter_MV2_Direct},
/* From this table on, the first list opens with MPIR_Scatter_mcst_wrap_MV2 and
 * is followed by a second entry for the same small-message range.
 * NOTE(review): the second entry presumably serves as the alternative when the
 * first one cannot be used -- confirm against the selector code. */
1838 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1839 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1840 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1841 {2048, -1, &MPIR_Scatter_MV2_Direct},
1845 { 0, -1, &MPIR_Scatter_MV2_Direct},
1853 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1854 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1855 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1856 {2048, -1, &MPIR_Scatter_MV2_Direct},
1860 { 0, -1, &MPIR_Scatter_MV2_Direct},
1868 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
/* NOTE(review): this entry reads {16, 16, ...} whereas the entry in the same
 * position of the neighbouring tables reads {0, 16, ...} -- confirm whether
 * the first value is intentional. */
1869 {16, 16, &MPIR_Scatter_MV2_two_level_Direct},
1870 {16, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1871 {4096, -1, &MPIR_Scatter_MV2_Direct},
1875 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1882 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1883 {0, 16, &MPIR_Scatter_MV2_Binomial},
1884 {16, 32, &MPIR_Scatter_MV2_Binomial},
1885 {32, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1886 {4096, -1, &MPIR_Scatter_MV2_Direct},
1890 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1897 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1898 {0, 16, &MPIR_Scatter_MV2_two_level_Binomial},
1899 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1900 {128, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1901 {1024, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1902 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1903 {65536, -1, &MPIR_Scatter_MV2_two_level_Direct},
1907 {0, 16, &MPIR_Scatter_MV2_Binomial},
1908 {16, 128, &MPIR_Scatter_MV2_Binomial},
1909 {128, 1024, &MPIR_Scatter_MV2_Binomial},
1910 {1024, 16384, &MPIR_Scatter_MV2_Direct},
1911 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1912 {65536, -1, &MPIR_Scatter_MV2_Direct},
1916 table_ptrs[2] = mv2_tmp_scatter_thresholds_table_16ppn;
/* Total number of tuning tables over all configurations. */
1918 for (i = 0; i < mv2_scatter_num_ppn_conf; i++) {
1919 agg_table_sum += mv2_size_scatter_tuning_table[i];
/* A single allocation holds every table; entry 0 points at its start. */
1921 mv2_scatter_thresholds_table[0] =
1922 xbt_malloc(agg_table_sum * sizeof (mv2_scatter_tuning_table));
1923 memcpy(mv2_scatter_thresholds_table[0], table_ptrs[0],
1924 (sizeof(mv2_scatter_tuning_table)
1925 * mv2_size_scatter_tuning_table[0]));
/* The remaining entries point into that same allocation, right after the
 * tables of the previous configuration. */
1926 for (i = 1; i < mv2_scatter_num_ppn_conf; i++) {
1927 mv2_scatter_thresholds_table[i] =
1928 mv2_scatter_thresholds_table[i - 1]
1929 + mv2_size_scatter_tuning_table[i - 1];
1930 memcpy(mv2_scatter_thresholds_table[i], table_ptrs[i],
1931 (sizeof(mv2_scatter_tuning_table)
1932 * mv2_size_scatter_tuning_table[i]));
/* Only the scratch pointer array is released: the staging tables are locals
 * of this function and the published data lives in the allocation above. */
1934 xbt_free(table_ptrs);