1 /* Selector for collective algorithms based on the MVAPICH decision logic, calibrated on the Stampede cluster at TACC. */
2 /* This is the tuning used by MVAPICH for the Stampede platform, based on (MV2_ARCH_INTEL_XEON_E5_2680_16, MV2_HCA_MLX_CX_FDR). */
4 /* Copyright (c) 2009-2017. The SimGrid Team. All rights reserved. */
6 /* This program is free software; you can redistribute it and/or modify it
7 * under the terms of the license (GNU LGPL) which comes with this package. */
9 /************ Alltoall variables and initializers */
/* Capacity of the fixed-size threshold arrays embedded in every tuning-table struct of this file. */
11 #define MV2_MAX_NB_THRESHOLDS 32
/* Makes the simgrid::smpi names available unqualified in the rest of this file
 * (presumably the Coll_* classes and Colls used below -- confirm against the SMPI headers). */
13 using namespace simgrid::smpi;
/* mv2_alltoall_tuning_element: one row of an alltoall decision table. The visible member is the
 * pointer to the alltoall implementation selected by that row.
 * NOTE(review): the head of this struct is not visible in this excerpt; the initializers further
 * down give two integers before the function pointer, which look like a message-size range with
 * -1 meaning "no upper bound" -- confirm. */
18 int (*MV2_pt_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
19 void *recvbuf, int recvcount, MPI_Datatype recvtype,
21 } mv2_alltoall_tuning_element;
/* mv2_alltoall_tuning_table: one table entry, holding two fixed-capacity row arrays: the regular
 * algorithm rows and the rows used for the in-place case. */
26 mv2_alltoall_tuning_element algo_table[MV2_MAX_NB_THRESHOLDS];
27 mv2_alltoall_tuning_element in_place_algo_table[MV2_MAX_NB_THRESHOLDS];
28 } mv2_alltoall_tuning_table;
/* Pointer to an alltoall implementation; starts out NULL and is not assigned anywhere in this
 * excerpt (presumably set by the selector that consults the tables -- confirm). */
30 int (*MV2_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm_ptr)=NULL;
/* Array with one processes-per-node value per configuration; allocated and filled by
 * init_mv2_alltoall_tables_stampede(). */
32 /* Indicates number of processes per node */
33 int *mv2_alltoall_table_ppn_conf = NULL;
/* Number of processes-per-node configurations; overwritten by the initializer. */
34 /* Indicates total number of configurations */
35 int mv2_alltoall_num_ppn_conf = 1;
/* Number of table entries of each configuration, indexed like mv2_alltoall_table_ppn_conf. */
36 int *mv2_size_alltoall_tuning_table = NULL;
/* One pointer per configuration to the first table entry of that configuration. */
37 mv2_alltoall_tuning_table **mv2_alltoall_thresholds_table = NULL;
/* MVAPICH2-style names used in the alltoall tables, aliased to the SMPI alltoall implementations
 * that stand in for them. Note that the "inplace" name is backed by the ring algorithm. */
40 #define MPIR_Alltoall_bruck_MV2 Coll_alltoall_bruck::alltoall
41 #define MPIR_Alltoall_RD_MV2 Coll_alltoall_rdb::alltoall
42 #define MPIR_Alltoall_Scatter_dest_MV2 Coll_alltoall_mvapich2_scatter_dest::alltoall
43 #define MPIR_Alltoall_pairwise_MV2 Coll_alltoall_pair::alltoall
44 #define MPIR_Alltoall_inplace_MV2 Coll_alltoall_ring::alltoall
/* Builds the Stampede alltoall tuning tables.
 *
 * Three processes-per-node configurations are set up (ppn = 1, 2 and 16, with 6, 6 and 7 table
 * entries respectively). Each configuration is first written as a local array; all of them are
 * then copied back to back into one xbt_malloc'ed buffer, and mv2_alltoall_thresholds_table[k]
 * is made to point at the first entry of configuration k inside that buffer.
 * smpi_coll_cleanup_mvapich2 is installed as cleanup callback when no callback is set yet.
 *
 * In every table entry the first visible group of rows initializes algo_table and the second
 * one in_place_algo_table (entry headers are not visible in this excerpt). */
47 static void init_mv2_alltoall_tables_stampede(){
49 int agg_table_sum = 0;
50 mv2_alltoall_tuning_table **table_ptrs = NULL;
51 mv2_alltoall_num_ppn_conf = 3;
/* Only install the cleanup hook if nobody registered one before. */
52 if(smpi_coll_cleanup_callback==NULL)
53 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
/* Per-configuration bookkeeping arrays, each with mv2_alltoall_num_ppn_conf slots. */
54 mv2_alltoall_thresholds_table = static_cast<mv2_alltoall_tuning_table**>(xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
55 * mv2_alltoall_num_ppn_conf));
/* table_ptrs is a scratch array pointing at the local tables below; it is freed at the end. */
56 table_ptrs = static_cast<mv2_alltoall_tuning_table**>(xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
57 * mv2_alltoall_num_ppn_conf));
58 mv2_size_alltoall_tuning_table = static_cast<int*>(xbt_malloc(sizeof(int) *
59 mv2_alltoall_num_ppn_conf));
60 mv2_alltoall_table_ppn_conf = static_cast<int*>(xbt_malloc(mv2_alltoall_num_ppn_conf * sizeof(int)));
/* Configuration 0: 1 process per node, 6 entries. */
61 mv2_alltoall_table_ppn_conf[0] = 1;
62 mv2_size_alltoall_tuning_table[0] = 6;
63 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_1ppn[] = {
66 {{0, -1, &MPIR_Alltoall_pairwise_MV2},
69 {{0, -1, &MPIR_Alltoall_inplace_MV2},
75 {{0, 262144, &MPIR_Alltoall_Scatter_dest_MV2},
76 {262144, -1, &MPIR_Alltoall_pairwise_MV2},
79 {{0, -1, &MPIR_Alltoall_inplace_MV2},
85 {{0, 8, &MPIR_Alltoall_RD_MV2},
86 {8, -1, &MPIR_Alltoall_Scatter_dest_MV2},
89 {{0, -1, &MPIR_Alltoall_inplace_MV2},
95 {{0, 64, &MPIR_Alltoall_RD_MV2},
96 {64, 512, &MPIR_Alltoall_bruck_MV2},
97 {512, -1, &MPIR_Alltoall_Scatter_dest_MV2},
100 {{0,-1, &MPIR_Alltoall_inplace_MV2},
106 {{0, 32, &MPIR_Alltoall_RD_MV2},
107 {32, 2048, &MPIR_Alltoall_bruck_MV2},
108 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
111 {{0, -1, &MPIR_Alltoall_inplace_MV2},
117 {{0, 8, &MPIR_Alltoall_RD_MV2},
118 {8, 1024, &MPIR_Alltoall_bruck_MV2},
119 {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
122 {{0, -1, &MPIR_Alltoall_inplace_MV2},
126 table_ptrs[0] = mv2_tmp_alltoall_thresholds_table_1ppn;
/* Configuration 1: 2 processes per node, 6 entries. */
127 mv2_alltoall_table_ppn_conf[1] = 2;
128 mv2_size_alltoall_tuning_table[1] = 6;
129 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_2ppn[] = {
132 {{0, 32, &MPIR_Alltoall_RD_MV2},
133 {32, -1, &MPIR_Alltoall_Scatter_dest_MV2},
136 {{0, -1, &MPIR_Alltoall_inplace_MV2},
142 {{0, 64, &MPIR_Alltoall_RD_MV2},
143 {64, -1, &MPIR_Alltoall_Scatter_dest_MV2},
146 {{0, -1, &MPIR_Alltoall_inplace_MV2},
152 {{0, 64, &MPIR_Alltoall_RD_MV2},
153 {64, 2048, &MPIR_Alltoall_bruck_MV2},
154 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
157 {{0,-1, &MPIR_Alltoall_inplace_MV2},
163 {{0, 16, &MPIR_Alltoall_RD_MV2},
164 {16, 2048, &MPIR_Alltoall_bruck_MV2},
165 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
168 {{0, -1, &MPIR_Alltoall_inplace_MV2},
174 {{0, 8, &MPIR_Alltoall_RD_MV2},
175 {8, 1024, &MPIR_Alltoall_bruck_MV2},
176 {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
179 {{0, -1, &MPIR_Alltoall_inplace_MV2},
185 {{0, 4, &MPIR_Alltoall_RD_MV2},
186 {4, 2048, &MPIR_Alltoall_bruck_MV2},
187 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
190 {{0, -1, &MPIR_Alltoall_inplace_MV2},
194 table_ptrs[1] = mv2_tmp_alltoall_thresholds_table_2ppn;
/* Configuration 2: 16 processes per node, 7 entries. Here the in-place rows do not start at 0. */
195 mv2_alltoall_table_ppn_conf[2] = 16;
196 mv2_size_alltoall_tuning_table[2] = 7;
197 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_16ppn[] = {
200 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
201 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
204 {{32768, -1, &MPIR_Alltoall_inplace_MV2},
210 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
211 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
214 {{16384, -1, &MPIR_Alltoall_inplace_MV2},
220 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
221 {2048, 16384, &MPIR_Alltoall_Scatter_dest_MV2},
222 {16384, -1, &MPIR_Alltoall_pairwise_MV2},
225 {{32768, 131072, &MPIR_Alltoall_inplace_MV2},
231 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
232 {2048, -1, &MPIR_Alltoall_pairwise_MV2},
235 {{16384,65536, &MPIR_Alltoall_inplace_MV2},
241 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
242 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
245 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
251 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
252 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
255 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
260 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
261 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
264 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
269 table_ptrs[2] = mv2_tmp_alltoall_thresholds_table_16ppn;
/* Total number of entries over all configurations: size of the single backing buffer. */
271 for (i = 0; i < mv2_alltoall_num_ppn_conf; i++) {
272 agg_table_sum += mv2_size_alltoall_tuning_table[i];
/* Slot 0 owns the whole allocation; the local 1-ppn table is copied to its beginning. */
274 mv2_alltoall_thresholds_table[0] =
275 static_cast<mv2_alltoall_tuning_table*>(xbt_malloc(agg_table_sum * sizeof (mv2_alltoall_tuning_table)));
276 memcpy(mv2_alltoall_thresholds_table[0], table_ptrs[0],
277 (sizeof(mv2_alltoall_tuning_table)
278 * mv2_size_alltoall_tuning_table[0]));
/* Every further configuration starts right after the previous one inside the same buffer. */
279 for (i = 1; i < mv2_alltoall_num_ppn_conf; i++) {
280 mv2_alltoall_thresholds_table[i] =
281 mv2_alltoall_thresholds_table[i - 1]
282 + mv2_size_alltoall_tuning_table[i - 1];
283 memcpy(mv2_alltoall_thresholds_table[i], table_ptrs[i],
284 (sizeof(mv2_alltoall_tuning_table)
285 * mv2_size_alltoall_tuning_table[i]));
/* Only the scratch pointer array is released; the tables it pointed to are locals. */
287 xbt_free(table_ptrs);
293 /************ Allgather variables and initializers */
/* mv2_allgather_tuning_element: one row of an allgather decision table. The visible member is
 * the pointer to the allgather implementation selected by that row.
 * NOTE(review): the head of the struct is not visible in this excerpt; the initializers below
 * give two integers before the function pointer (apparently a message-size range -- confirm).
 * NOTE(review): "Allgatherction" looks like a mangled "Allgather_function"; the name is left
 * untouched because code outside this excerpt may refer to it. */
298 int (*MV2_pt_Allgatherction)(void *sendbuf,
300 MPI_Datatype sendtype,
303 MPI_Datatype recvtype, MPI_Comm comm_ptr);
304 } mv2_allgather_tuning_element;
/* mv2_allgather_tuning_table: one table entry with a per-row two_level flag array, the number of
 * inter_leader rows in use, and the rows themselves. */
308 int two_level[MV2_MAX_NB_THRESHOLDS];
309 int size_inter_table;
310 mv2_allgather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
311 } mv2_allgather_tuning_table;
/* Pointer to an allgather implementation. Unlike its alltoall counterpart it has no explicit
 * initializer here, and nothing in this excerpt assigns it. */
313 int (*MV2_Allgatherction)(void *sendbuf,
315 MPI_Datatype sendtype,
318 MPI_Datatype recvtype, MPI_Comm comm);
/* Processes-per-node value of each configuration; filled by init_mv2_allgather_tables_stampede(). */
320 int *mv2_allgather_table_ppn_conf = NULL;
/* Number of configurations; overwritten by the initializer. */
321 int mv2_allgather_num_ppn_conf = 1;
/* Number of table entries of each configuration. */
322 int *mv2_size_allgather_tuning_table = NULL;
/* One pointer per configuration to the first table entry of that configuration. */
323 mv2_allgather_tuning_table **mv2_allgather_thresholds_table = NULL;
/* File-local allgather entry point whose address is stored in the 16-ppn allgather table for the
 * [0, 1024) rows. Its body is not visible in this excerpt, so its behaviour is not described here. */
325 static int MPIR_Allgather_RD_Allgather_Comm_MV2( void *sendbuf,
327 MPI_Datatype sendtype,
330 MPI_Datatype recvtype, MPI_Comm comm_ptr)
/* MVAPICH2-style names used in the allgather tables, aliased to the SMPI allgather
 * implementations that stand in for them. */
335 #define MPIR_Allgather_Bruck_MV2 Coll_allgather_bruck::allgather
336 #define MPIR_Allgather_RD_MV2 Coll_allgather_rdb::allgather
337 #define MPIR_Allgather_Ring_MV2 Coll_allgather_ring::allgather
338 #define MPIR_2lvl_Allgather_MV2 Coll_allgather_mvapich2_smp::allgather
/* Builds the Stampede allgather tuning tables.
 *
 * Three processes-per-node configurations are set up (ppn = 1, 2 and 16, 6 table entries each).
 * Each configuration is first written as a local array; all of them are then copied back to back
 * into one xbt_malloc'ed buffer, and mv2_allgather_thresholds_table[k] is made to point at the
 * first entry of configuration k inside that buffer.
 * smpi_coll_cleanup_mvapich2 is installed as cleanup callback when no callback is set yet.
 * Only the inter_leader rows of the entries are visible in this excerpt. */
340 static void init_mv2_allgather_tables_stampede(){
342 int agg_table_sum = 0;
/* Only install the cleanup hook if nobody registered one before. */
344 if(smpi_coll_cleanup_callback==NULL)
345 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
346 mv2_allgather_tuning_table **table_ptrs = NULL;
347 mv2_allgather_num_ppn_conf = 3;
/* Per-configuration bookkeeping arrays, each with mv2_allgather_num_ppn_conf slots. */
348 mv2_allgather_thresholds_table
349 = static_cast<mv2_allgather_tuning_table**>(xbt_malloc(sizeof(mv2_allgather_tuning_table *)
350 * mv2_allgather_num_ppn_conf));
/* table_ptrs is a scratch array pointing at the local tables below; it is freed at the end. */
351 table_ptrs = static_cast<mv2_allgather_tuning_table**>(xbt_malloc(sizeof(mv2_allgather_tuning_table *)
352 * mv2_allgather_num_ppn_conf));
353 mv2_size_allgather_tuning_table = static_cast<int*>(xbt_malloc(sizeof(int) *
354 mv2_allgather_num_ppn_conf));
355 mv2_allgather_table_ppn_conf
356 = static_cast<int*>(xbt_malloc(mv2_allgather_num_ppn_conf * sizeof(int)));
/* Configuration 0: 1 process per node, 6 entries. */
357 mv2_allgather_table_ppn_conf[0] = 1;
358 mv2_size_allgather_tuning_table[0] = 6;
359 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_1ppn[] = {
365 {0, -1, &MPIR_Allgather_Ring_MV2},
373 {0, 262144, &MPIR_Allgather_RD_MV2},
374 {262144, -1, &MPIR_Allgather_Ring_MV2},
382 {0, 131072, &MPIR_Allgather_RD_MV2},
383 {131072, -1, &MPIR_Allgather_Ring_MV2},
391 {0, 131072, &MPIR_Allgather_RD_MV2},
392 {131072, -1, &MPIR_Allgather_Ring_MV2},
400 {0, 65536, &MPIR_Allgather_RD_MV2},
401 {65536, -1, &MPIR_Allgather_Ring_MV2},
409 {0, 32768, &MPIR_Allgather_RD_MV2},
410 {32768, -1, &MPIR_Allgather_Ring_MV2},
414 table_ptrs[0] = mv2_tmp_allgather_thresholds_table_1ppn;
/* Configuration 1: 2 processes per node, 6 entries. Several entries list two consecutive ring
 * rows split at 524288. */
415 mv2_allgather_table_ppn_conf[1] = 2;
416 mv2_size_allgather_tuning_table[1] = 6;
417 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_2ppn[] = {
423 {0, 524288, &MPIR_Allgather_RD_MV2},
424 {524288, -1, &MPIR_Allgather_Ring_MV2},
432 {0, 32768, &MPIR_Allgather_RD_MV2},
433 {32768, 524288, &MPIR_Allgather_Ring_MV2},
434 {524288, -1, &MPIR_Allgather_Ring_MV2},
442 {0, 16384, &MPIR_Allgather_RD_MV2},
443 {16384, 524288, &MPIR_Allgather_Ring_MV2},
444 {524288, -1, &MPIR_Allgather_Ring_MV2},
452 {0, 65536, &MPIR_Allgather_RD_MV2},
453 {65536, 524288, &MPIR_Allgather_Ring_MV2},
454 {524288, -1, &MPIR_Allgather_Ring_MV2},
462 {0, 32768, &MPIR_Allgather_RD_MV2},
463 {32768, 524288, &MPIR_Allgather_Ring_MV2},
464 {524288, -1, &MPIR_Allgather_Ring_MV2},
472 {0, 65536, &MPIR_Allgather_RD_MV2},
473 {65536, 524288, &MPIR_Allgather_Ring_MV2},
474 {524288, -1, &MPIR_Allgather_Ring_MV2},
478 table_ptrs[1] = mv2_tmp_allgather_thresholds_table_2ppn;
/* Configuration 2: 16 processes per node, 6 entries. All but the first entry use the file-local
 * MPIR_Allgather_RD_Allgather_Comm_MV2 for the [0, 1024) row. */
479 mv2_allgather_table_ppn_conf[2] = 16;
480 mv2_size_allgather_tuning_table[2] = 6;
481 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_16ppn[] = {
487 {0, 1024, &MPIR_Allgather_RD_MV2},
488 {1024, -1, &MPIR_Allgather_Ring_MV2},
496 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
497 {1024, -1, &MPIR_Allgather_Ring_MV2},
505 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
506 {1024, -1, &MPIR_Allgather_Ring_MV2},
514 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
515 {1024, -1, &MPIR_Allgather_Ring_MV2},
523 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
524 {1024, -1, &MPIR_Allgather_Ring_MV2},
532 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
533 {1024, -1, &MPIR_Allgather_Ring_MV2},
538 table_ptrs[2] = mv2_tmp_allgather_thresholds_table_16ppn;
/* Total number of entries over all configurations: size of the single backing buffer. */
540 for (i = 0; i < mv2_allgather_num_ppn_conf; i++) {
541 agg_table_sum += mv2_size_allgather_tuning_table[i];
/* Slot 0 owns the whole allocation; the local 1-ppn table is copied to its beginning. */
543 mv2_allgather_thresholds_table[0] =
544 static_cast<mv2_allgather_tuning_table*>(xbt_malloc(agg_table_sum * sizeof (mv2_allgather_tuning_table)));
545 memcpy(mv2_allgather_thresholds_table[0], table_ptrs[0],
546 (sizeof(mv2_allgather_tuning_table)
547 * mv2_size_allgather_tuning_table[0]));
/* Every further configuration starts right after the previous one inside the same buffer. */
548 for (i = 1; i < mv2_allgather_num_ppn_conf; i++) {
549 mv2_allgather_thresholds_table[i] =
550 mv2_allgather_thresholds_table[i - 1]
551 + mv2_size_allgather_tuning_table[i - 1];
552 memcpy(mv2_allgather_thresholds_table[i], table_ptrs[i],
553 (sizeof(mv2_allgather_tuning_table)
554 * mv2_size_allgather_tuning_table[i]));
/* Only the scratch pointer array is released; the tables it pointed to are locals. */
556 xbt_free(table_ptrs);
560 /************ Gather variables and initializers */
/* mv2_gather_tuning_element: one row of a gather decision table. The visible member is the
 * pointer to the gather implementation selected by that row.
 * NOTE(review): the head of the struct is not visible in this excerpt; the initializers below
 * give two integers before the function pointer (apparently a message-size range -- confirm). */
565 int (*MV2_pt_Gather_function)(void *sendbuf, int sendcnt,
566 MPI_Datatype sendtype, void *recvbuf, int recvcnt,
567 MPI_Datatype recvtype, int root, MPI_Comm comm_ptr);
568 } mv2_gather_tuning_element;
/* mv2_gather_tuning_table: one table entry with two row arrays, each preceded by the number of
 * rows in use: inter_leader rows and intra_node rows. */
573 int size_inter_table;
574 mv2_gather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
575 int size_intra_table;
576 mv2_gather_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
577 } mv2_gather_tuning_table;
/* Number of entries in mv2_gather_thresholds_table; init_mv2_gather_tables_stampede() sets the
 * same value again before allocating. */
579 int mv2_size_gather_tuning_table=7;
/* Gather decision table; allocated and filled by init_mv2_gather_tables_stampede(). */
580 mv2_gather_tuning_table * mv2_gather_thresholds_table=NULL;
/* Signature shared by the two gather function pointers declared below. */
582 typedef int (*MV2_Gather_function_ptr) (void *sendbuf,
584 MPI_Datatype sendtype,
587 MPI_Datatype recvtype,
588 int root, MPI_Comm comm);
/* Gather function pointers for the inter-leader and intra-node steps; both start out NULL and
 * are not assigned anywhere in this excerpt. */
590 MV2_Gather_function_ptr MV2_Gather_inter_leader_function = NULL;
591 MV2_Gather_function_ptr MV2_Gather_intra_node_function = NULL;
/* MVAPICH2-style names used in the gather table, aliased to the SMPI gather implementations that
 * stand in for them. */
595 #define MPIR_Gather_MV2_Direct Coll_gather_ompi_basic_linear::gather
596 #define MPIR_Gather_MV2_two_level_Direct Coll_gather_mvapich2_two_level::gather
597 #define MPIR_Gather_intra Coll_gather_mpich::gather
/* Builds the Stampede gather tuning table: allocates room for 7 entries with xbt_malloc, writes
 * them as a local array and copies that array into the allocation.
 * smpi_coll_cleanup_mvapich2 is installed as cleanup callback when no callback is set yet.
 *
 * Each entry reads: <inter row count>, {inter_leader rows}, <intra row count>, {intra_node rows}
 * (the leading field of every entry is not visible in this excerpt). */
600 static void init_mv2_gather_tables_stampede(){
/* Only install the cleanup hook if nobody registered one before. */
602 if(smpi_coll_cleanup_callback==NULL)
603 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
604 mv2_size_gather_tuning_table=7;
605 mv2_gather_thresholds_table = static_cast<mv2_gather_tuning_table*>(xbt_malloc(mv2_size_gather_tuning_table*
606 sizeof (mv2_gather_tuning_table)));
607 mv2_gather_tuning_table mv2_tmp_gather_thresholds_table[]={
609 2,{{0, 524288, &MPIR_Gather_MV2_Direct},
610 {524288, -1, &MPIR_Gather_intra}},
611 1,{{0, -1, &MPIR_Gather_MV2_Direct}}},
613 3,{{0, 16384, &MPIR_Gather_MV2_Direct},
614 {16384, 131072, &MPIR_Gather_intra},
615 {131072, -1, &MPIR_Gather_MV2_two_level_Direct}},
616 1,{{0, -1, &MPIR_Gather_intra}}},
618 3,{{0, 256, &MPIR_Gather_MV2_two_level_Direct},
619 {256, 16384, &MPIR_Gather_MV2_Direct},
/* NOTE(review): this row starts at 256 although the previous row already covers [256, 16384);
 * the other entries continue at the previous upper bound. Possibly meant 16384 -- confirm against
 * the upstream MVAPICH2 tuning data before changing. */
620 {256, -1, &MPIR_Gather_MV2_two_level_Direct}},
621 1,{{0, -1, &MPIR_Gather_intra}}},
623 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
624 {512, 16384, &MPIR_Gather_MV2_Direct},
625 {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
626 1,{{0, -1, &MPIR_Gather_intra}}},
628 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
629 {512, 16384, &MPIR_Gather_MV2_Direct},
630 {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
631 1,{{0, -1, &MPIR_Gather_intra}}},
633 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
634 {512, 16384, &MPIR_Gather_MV2_Direct},
/* NOTE(review): 8196 overlaps the previous row and is not a power of two (8192 or 16384 may have
 * been intended) -- confirm against the upstream tuning data. */
635 {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
636 1,{{0, -1, &MPIR_Gather_intra}}},
638 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
639 {512, 16384, &MPIR_Gather_MV2_Direct},
/* NOTE(review): same suspicious 8196 lower bound as in the previous entry. */
640 {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
641 1,{{0, -1, &MPIR_Gather_intra}}},
/* Publish the local table by copying all entries into the heap allocation. */
644 memcpy(mv2_gather_thresholds_table, mv2_tmp_gather_thresholds_table,
645 mv2_size_gather_tuning_table * sizeof (mv2_gather_tuning_table));
650 /************ Allgatherv variables and initializers */
/* mv2_allgatherv_tuning_element: one row of an allgatherv decision table. The visible member is
 * the pointer to the allgatherv implementation selected by that row.
 * NOTE(review): the head of the struct and most of the parameter list are not visible in this
 * excerpt; the initializers below give two integers before the function pointer (apparently a
 * message-size range -- confirm). */
655 int (*MV2_pt_Allgatherv_function)(void *sendbuf,
657 MPI_Datatype sendtype,
661 MPI_Datatype recvtype,
663 } mv2_allgatherv_tuning_element;
/* mv2_allgatherv_tuning_table: one table entry with the number of inter_leader rows in use and
 * the rows themselves. */
667 int size_inter_table;
668 mv2_allgatherv_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
669 } mv2_allgatherv_tuning_table;
/* Pointer to an allgatherv implementation; nothing in this excerpt assigns it (the end of the
 * declaration is not visible here). */
671 int (*MV2_Allgatherv_function)(void *sendbuf,
673 MPI_Datatype sendtype,
677 MPI_Datatype recvtype,
/* Number of entries in the allgatherv table; set by init_mv2_allgatherv_tables_stampede(). */
680 int mv2_size_allgatherv_tuning_table = 0;
/* Allgatherv decision table; allocated and filled by init_mv2_allgatherv_tables_stampede(). */
681 mv2_allgatherv_tuning_table *mv2_allgatherv_thresholds_table = NULL;
/* MVAPICH2-style names used in the allgatherv table, aliased to the SMPI allgatherv
 * implementations that stand in for them. */
683 #define MPIR_Allgatherv_Rec_Doubling_MV2 Coll_allgatherv_mpich_rdb::allgatherv
684 #define MPIR_Allgatherv_Bruck_MV2 Coll_allgatherv_ompi_bruck::allgatherv
685 #define MPIR_Allgatherv_Ring_MV2 Coll_allgatherv_mpich_ring::allgatherv
/* Builds the Stampede allgatherv tuning table: allocates room for 6 entries with xbt_malloc,
 * writes them as a local array and copies that array into the allocation.
 * smpi_coll_cleanup_mvapich2 is installed as cleanup callback when no callback is set yet.
 * Every visible entry pairs recursive doubling for the lower range with ring for the upper one;
 * only the switch-over value (512 or 256) differs. */
688 static void init_mv2_allgatherv_tables_stampede(){
/* Only install the cleanup hook if nobody registered one before. */
689 if(smpi_coll_cleanup_callback==NULL)
690 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
691 mv2_size_allgatherv_tuning_table = 6;
692 mv2_allgatherv_thresholds_table = static_cast<mv2_allgatherv_tuning_table*>(xbt_malloc(mv2_size_allgatherv_tuning_table *
693 sizeof (mv2_allgatherv_tuning_table)));
694 mv2_allgatherv_tuning_table mv2_tmp_allgatherv_thresholds_table[] = {
699 {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
700 {512, -1, &MPIR_Allgatherv_Ring_MV2},
707 {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
708 {512, -1, &MPIR_Allgatherv_Ring_MV2},
715 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
716 {256, -1, &MPIR_Allgatherv_Ring_MV2},
723 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
724 {256, -1, &MPIR_Allgatherv_Ring_MV2},
731 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
732 {256, -1, &MPIR_Allgatherv_Ring_MV2},
739 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
740 {256, -1, &MPIR_Allgatherv_Ring_MV2},
/* Publish the local table by copying all entries into the heap allocation. */
745 memcpy(mv2_allgatherv_thresholds_table, mv2_tmp_allgatherv_thresholds_table,
746 mv2_size_allgatherv_tuning_table * sizeof (mv2_allgatherv_tuning_table));
750 /************ Allreduce variables and initializers */
/* mv2_allreduce_tuning_element: one row of an allreduce decision table. The visible member is
 * the pointer to the allreduce implementation selected by that row.
 * NOTE(review): the head of the struct is not visible in this excerpt; the initializers below
 * give two integers before the function pointer (apparently a message-size range -- confirm).
 * NOTE(review): "Allreducection" looks like a mangled "Allreduce_function"; the name is left
 * untouched because code outside this excerpt may refer to it. */
755 int (*MV2_pt_Allreducection)(void *sendbuf,
758 MPI_Datatype datatype,
759 MPI_Op op, MPI_Comm comm);
760 } mv2_allreduce_tuning_element;
/* mv2_allreduce_tuning_table: one table entry with a per-row two-level flag array followed by
 * two row arrays (inter_leader and intra_node), each preceded by its number of rows in use. */
765 int is_two_level_allreduce[MV2_MAX_NB_THRESHOLDS];
766 int size_inter_table;
767 mv2_allreduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
768 int size_intra_table;
769 mv2_allreduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
770 } mv2_allreduce_tuning_table;
/* Pointer to an allreduce implementation; starts out NULL and is not assigned in this excerpt. */
773 int (*MV2_Allreducection)(void *sendbuf,
776 MPI_Datatype datatype,
777 MPI_Op op, MPI_Comm comm)=NULL;
/* Pointer to an intra-node allreduce function; starts out NULL and is not assigned in this
 * excerpt. */
780 int (*MV2_Allreduce_intra_function)( void *sendbuf,
783 MPI_Datatype datatype,
784 MPI_Op op, MPI_Comm comm)=NULL;
/* Number of entries in the allreduce table; set by init_mv2_allreduce_tables_stampede(). */
786 int mv2_size_allreduce_tuning_table = 0;
/* Allreduce decision table; allocated and filled by init_mv2_allreduce_tables_stampede(). */
787 mv2_allreduce_tuning_table *mv2_allreduce_thresholds_table = NULL;
/* File-local allreduce entry point; its address is stored in one inter_leader row of the
 * allreduce table ([512, 4096) of the last entry). Its body is not visible in this excerpt, so
 * its behaviour is not described here. */
793 static int MPIR_Allreduce_mcst_reduce_two_level_helper_MV2( void *sendbuf,
796 MPI_Datatype datatype,
797 MPI_Op op, MPI_Comm comm)
/* File-local allreduce entry point. Its body is not visible in this excerpt and no table row
 * visible here refers to it, so its behaviour is not described here. */
802 static int MPIR_Allreduce_mcst_reduce_redscat_gather_MV2( void *sendbuf,
805 MPI_Datatype datatype,
806 MPI_Op op, MPI_Comm comm)
/* Intra-node step used by the allreduce table for the larger message rows: forwards its
 * arguments to Colls::reduce with rank 0 as root. The return statement is not visible in this
 * excerpt. MPIR_Allreduce_reduce_shmem_MV2 issues the very same call. */
811 static int MPIR_Allreduce_reduce_p2p_MV2( void *sendbuf,
814 MPI_Datatype datatype,
815 MPI_Op op, MPI_Comm comm)
817 Colls::reduce(sendbuf,recvbuf,count,datatype,op,0,comm);
/* Intra-node step used by the allreduce table for the smaller message rows: forwards its
 * arguments to Colls::reduce with rank 0 as root. The return statement is not visible in this
 * excerpt. MPIR_Allreduce_reduce_p2p_MV2 issues the very same call. */
821 static int MPIR_Allreduce_reduce_shmem_MV2( void *sendbuf,
824 MPI_Datatype datatype,
825 MPI_Op op, MPI_Comm comm)
827 Colls::reduce(sendbuf,recvbuf,count,datatype,op,0,comm);
/* MVAPICH2-style names used in the allreduce table, aliased to the SMPI allreduce
 * implementations that stand in for them. */
831 #define MPIR_Allreduce_pt2pt_rd_MV2 Coll_allreduce_rdb::allreduce
832 #define MPIR_Allreduce_pt2pt_rs_MV2 Coll_allreduce_mvapich2_rs::allreduce
833 #define MPIR_Allreduce_two_level_MV2 Coll_allreduce_mvapich2_two_level::allreduce
/* Builds the Stampede allreduce tuning table: allocates room for 8 entries with xbt_malloc,
 * writes them as a local array and copies that array into the allocation.
 * smpi_coll_cleanup_mvapich2 is installed as cleanup callback when no callback is set yet.
 *
 * In each entry the rows naming pt2pt/mcst functions initialize inter_leader and the rows naming
 * the reduce_shmem/reduce_p2p helpers initialize intra_node (the entry headers, flag arrays and
 * row counts are not visible in this excerpt). */
836 static void init_mv2_allreduce_tables_stampede(){
/* Only install the cleanup hook if nobody registered one before. */
837 if(smpi_coll_cleanup_callback==NULL)
838 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
839 mv2_size_allreduce_tuning_table = 8;
840 mv2_allreduce_thresholds_table = static_cast<mv2_allreduce_tuning_table*>(xbt_malloc(mv2_size_allreduce_tuning_table *
841 sizeof (mv2_allreduce_tuning_table)));
842 mv2_allreduce_tuning_table mv2_tmp_allreduce_thresholds_table[] = {
849 {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
850 {1024, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
854 {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
855 {1024, -1, &MPIR_Allreduce_reduce_p2p_MV2},
864 {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
865 {1024, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
866 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
870 {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
871 {1024, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
880 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
881 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
882 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
886 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
887 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
896 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
897 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
898 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
902 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
903 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
912 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
913 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
914 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
918 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
919 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
928 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
929 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
930 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
934 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
935 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
944 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
945 {512, 8192, &MPIR_Allreduce_pt2pt_rd_MV2},
946 {8192, 65536, &MPIR_Allreduce_pt2pt_rs_MV2},
947 {65536, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
951 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
952 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
961 {0, 64, &MPIR_Allreduce_pt2pt_rd_MV2},
962 {64, 512, &MPIR_Allreduce_reduce_p2p_MV2},
963 {512, 4096, &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2},
964 {4096, 16384, &MPIR_Allreduce_pt2pt_rs_MV2},
965 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
969 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
970 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
/* Publish the local table by copying all entries into the heap allocation. */
975 memcpy(mv2_allreduce_thresholds_table, mv2_tmp_allreduce_thresholds_table,
976 mv2_size_allreduce_tuning_table * sizeof (mv2_allreduce_tuning_table));
/* mv2_bcast_tuning_element: one row of a broadcast decision table. Visible members: the pointer
 * to the broadcast implementation selected by that row, followed by a per-row integer
 * zcpy_pipelined_knomial_factor (the table rows set it to 2, 4, 8 or -1).
 * NOTE(review): the head of the struct is not visible in this excerpt; the initializers give two
 * integers before the function pointer (apparently a message-size range -- confirm). */
985 int (*MV2_pt_Bcast_function) (void *buf, int count, MPI_Datatype datatype,
986 int root, MPI_Comm comm_ptr);
987 int zcpy_pipelined_knomial_factor;
988 } mv2_bcast_tuning_element;
/* mv2_bcast_tuning_table: one table entry with three scalar settings (segment size and the two
 * knomial factors), a per-row two-level flag array, and two row arrays (inter_leader and
 * intra_node), each preceded by its number of rows in use. */
992 int bcast_segment_size;
993 int intra_node_knomial_factor;
994 int inter_node_knomial_factor;
995 int is_two_level_bcast[MV2_MAX_NB_THRESHOLDS];
996 int size_inter_table;
997 mv2_bcast_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
998 int size_intra_table;
999 mv2_bcast_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1000 } mv2_bcast_tuning_table;
/* Number of entries in the broadcast table; set by init_mv2_bcast_tables_stampede(). */
1002 int mv2_size_bcast_tuning_table = 0;
/* Broadcast decision table; allocated and filled by init_mv2_bcast_tables_stampede(). */
1003 mv2_bcast_tuning_table *mv2_bcast_thresholds_table = NULL;
/* Broadcast function pointers (general and intra-node); both start out NULL and are not assigned
 * anywhere in this excerpt. */
1006 int (*MV2_Bcast_function) (void *buffer, int count, MPI_Datatype datatype,
1007 int root, MPI_Comm comm_ptr) = NULL;
1009 int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype datatype,
1010 int root, MPI_Comm comm_ptr) = NULL;
/* Global broadcast parameters with their initial values. Nothing in this excerpt reads or
 * updates them (presumably the selector and the broadcast implementations do -- confirm). */
1012 int zcpy_knomial_factor = 2;
1013 int mv2_pipelined_zcpy_knomial_factor = -1;
1014 int bcast_segment_size = 8192;
1015 int mv2_inter_node_knomial_factor = 4;
1016 int mv2_intra_node_knomial_factor = 4;
/* Compile-time constants for the broadcast selection logic. Their users are outside this
 * excerpt; judging by the names they are a system-size limit, two message-size limits and the
 * intra-node root rank -- confirm against the selector code. */
#define mv2_bcast_two_level_system_size 64
#define mv2_bcast_short_msg 16384
/* Parenthesized so that the macro always expands to one value: with a bare 512*1024 an
 * expression such as `x / mv2_bcast_large_msg` would be parsed as `(x / 512) * 1024`. */
#define mv2_bcast_large_msg (512 * 1024)

#define INTRA_NODE_ROOT 0
/* MVAPICH2-style names used in the broadcast table, aliased to the SMPI broadcast
 * implementations that stand in for them. Several distinct MVAPICH2 names share one SMPI
 * implementation: the two pipelined variants and the shmem variant all map to
 * Coll_bcast_mpich::bcast, and both scatter_ring_allgather variants map to
 * Coll_bcast_scatter_LR_allgather::bcast. */
1023 #define MPIR_Pipelined_Bcast_Zcpy_MV2 Coll_bcast_mpich::bcast
1024 #define MPIR_Pipelined_Bcast_MV2 Coll_bcast_mpich::bcast
1025 #define MPIR_Bcast_binomial_MV2 Coll_bcast_binomial_tree::bcast
1026 #define MPIR_Bcast_scatter_ring_allgather_shm_MV2 Coll_bcast_scatter_LR_allgather::bcast
1027 #define MPIR_Bcast_scatter_doubling_allgather_MV2 Coll_bcast_scatter_rdb_allgather::bcast
1028 #define MPIR_Bcast_scatter_ring_allgather_MV2 Coll_bcast_scatter_LR_allgather::bcast
1029 #define MPIR_Shmem_Bcast_MV2 Coll_bcast_mpich::bcast
1030 #define MPIR_Bcast_tune_inter_node_helper_MV2 Coll_bcast_mvapich2_inter_node::bcast
1031 #define MPIR_Bcast_inter_node_helper_MV2 Coll_bcast_mvapich2_inter_node::bcast
1032 #define MPIR_Knomial_Bcast_intra_node_MV2 Coll_bcast_mvapich2_knomial_intra_node::bcast
1033 #define MPIR_Bcast_intra_MV2 Coll_bcast_mvapich2_intra_node::bcast
/* Builds the Stampede broadcast tuning table: allocates room for 8 entries with xbt_malloc,
 * writes them as a local array and copies that array into the allocation.
 * smpi_coll_cleanup_mvapich2 is installed as cleanup callback when no callback is set yet.
 *
 * Each row ends with the zcpy_pipelined_knomial_factor of that row. Within an entry, the rows
 * naming MPIR_Shmem_Bcast_MV2 initialize intra_node and the rows before them inter_leader; the
 * braces holding only 1s are is_two_level_bcast flag arrays (entry headers, row counts and some
 * of the flag arrays are not visible in this excerpt). */
1035 static void init_mv2_bcast_tables_stampede(){
/* Only install the cleanup hook if nobody registered one before. */
1037 if(smpi_coll_cleanup_callback==NULL)
1038 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1039 mv2_size_bcast_tuning_table=8;
1040 mv2_bcast_thresholds_table = static_cast<mv2_bcast_tuning_table*>(xbt_malloc(mv2_size_bcast_tuning_table *
1041 sizeof (mv2_bcast_tuning_table)));
1043 mv2_bcast_tuning_table mv2_tmp_bcast_thresholds_table[]={
1047 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
1050 {0, 8, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1051 {8, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1052 {16, 1024, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1053 {1024, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1054 {8192, 16384, &MPIR_Bcast_binomial_MV2, -1},
1055 {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1056 {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1057 {65536, 131072, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1058 {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_MV2, -1},
1059 {262144, 524288, &MPIR_Bcast_scatter_doubling_allgather_MV2, -1},
1060 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1064 {0, 8, &MPIR_Shmem_Bcast_MV2, 2},
1065 {8, 16, &MPIR_Shmem_Bcast_MV2, 4},
1066 {16, 1024, &MPIR_Shmem_Bcast_MV2, 2},
1067 {1024, 8192, &MPIR_Shmem_Bcast_MV2, 4},
1068 {8192, 16384, &MPIR_Shmem_Bcast_MV2, -1},
1069 {16384, 32768, &MPIR_Shmem_Bcast_MV2, 4},
1070 {32768, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1071 {65536, 131072, &MPIR_Shmem_Bcast_MV2, -1},
1072 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1073 {262144, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1074 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1080 {1, 1, 1, 1, 1, 1, 1, 1},
1083 {0, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1084 {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1085 {256, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1086 {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1087 {65536, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1088 {131072, 262144, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1089 {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1090 {524288, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}
1094 {0, 128, &MPIR_Shmem_Bcast_MV2, 2},
1095 {128, 256, &MPIR_Shmem_Bcast_MV2, 4},
1096 {256, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1097 {32768, 65536, &MPIR_Shmem_Bcast_MV2, 4},
1098 {65536, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1099 {131072, 262144, &MPIR_Shmem_Bcast_MV2, 8},
1100 {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1101 {524288, -1, &MPIR_Shmem_Bcast_MV2, 8}
1107 {1, 1, 1, 1, 1, 1, 1, 1, 1},
1110 {0, 2, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1111 {2, 4, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1112 {4, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1113 {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1114 {32, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1115 {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1116 {256, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1117 {4096, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1118 {32768, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}
1122 {0, 2, &MPIR_Shmem_Bcast_MV2, 4},
1123 {2, 4, &MPIR_Shmem_Bcast_MV2, 8},
1124 {4, 16, &MPIR_Shmem_Bcast_MV2, 4},
1125 {16, 32, &MPIR_Shmem_Bcast_MV2, 8},
1126 {32, 128, &MPIR_Shmem_Bcast_MV2, 4},
1127 {128, 256, &MPIR_Shmem_Bcast_MV2, 8},
1128 {256, 4096, &MPIR_Shmem_Bcast_MV2, 4},
1129 {4096, 32768, &MPIR_Shmem_Bcast_MV2, 8},
1130 {32768, -1, &MPIR_Shmem_Bcast_MV2, 2}
1139 {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1140 {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1141 {16384, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1142 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1146 {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1147 {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1148 {16384, 524288, &MPIR_Shmem_Bcast_MV2, 2},
/* NOTE(review): this is the only row of the table with a NULL function pointer; every other
 * intra-node row names MPIR_Shmem_Bcast_MV2. Whether the consumer checks for NULL before calling
 * cannot be told from this excerpt -- confirm. */
1149 {524288, -1, NULL, -1}
1158 {0, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1159 {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1160 {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1161 {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1162 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1166 {0, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1167 {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1168 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1169 {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1170 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1179 {0, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1180 {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1181 {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1182 {131072, 262144, &MPIR_Pipelined_Bcast_MV2, -1},
1183 {262144, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1187 {0, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1188 {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1189 {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1190 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1191 {262144, -1, &MPIR_Shmem_Bcast_MV2, -1}
1200 {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1201 {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1202 {16384, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1203 {65536, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1204 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1208 {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1209 {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1210 {16384, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1211 {65536, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1212 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1218 {1, 1, 1, 1, 1, 1, 1},
1221 {0, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1222 {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1223 {32, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1224 {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1225 {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1226 {32768, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1227 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1231 {0, 16, &MPIR_Shmem_Bcast_MV2, 8},
1232 {16, 32, &MPIR_Shmem_Bcast_MV2, 4},
1233 {32, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1234 {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1235 {16384, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1236 {32768, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1237 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
/* Publish the local table by copying all entries into the heap allocation. */
1242 memcpy(mv2_bcast_thresholds_table, mv2_tmp_bcast_thresholds_table,
1243 mv2_size_bcast_tuning_table * sizeof (mv2_bcast_tuning_table));
1247 /************ Reduce variables and initializers */
/* mv2_reduce_tuning_element: one row of a reduce decision table. The visible member is the
 * pointer to the reduce implementation selected by that row.
 * NOTE(review): the head of the struct and most of the parameter list are not visible in this
 * excerpt; the initializers give two integers before the function pointer (apparently a
 * message-size range -- confirm). */
1252 int (*MV2_pt_Reduce_function)(void *sendbuf,
1255 MPI_Datatype datatype,
1259 } mv2_reduce_tuning_element;
/* mv2_reduce_tuning_table: one table entry with a per-row two-level flag array followed by two
 * row arrays (inter_leader and intra_node), each preceded by its number of rows in use. */
1265 int is_two_level_reduce[MV2_MAX_NB_THRESHOLDS];
1266 int size_inter_table;
1267 mv2_reduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1268 int size_intra_table;
1269 mv2_reduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1270 } mv2_reduce_tuning_table;
/* Number of entries in the reduce table; set by init_mv2_reduce_tables_stampede(). */
1272 int mv2_size_reduce_tuning_table = 0;
/* Reduce decision table; allocated by init_mv2_reduce_tables_stampede(). */
1273 mv2_reduce_tuning_table *mv2_reduce_thresholds_table = NULL;
/* Knomial factors for the intra-node and inter-node reduce; both start at -1 and are not
 * assigned anywhere in this excerpt. */
1276 int mv2_reduce_intra_knomial_factor = -1;
1277 int mv2_reduce_inter_knomial_factor = -1;
/* Reduce function pointers (general and intra-node); both start out NULL and are not assigned
 * anywhere in this excerpt. */
1279 int (*MV2_Reduce_function)( void *sendbuf,
1282 MPI_Datatype datatype,
1285 MPI_Comm comm_ptr)=NULL;
1287 int (*MV2_Reduce_intra_function)( void *sendbuf,
1290 MPI_Datatype datatype,
1293 MPI_Comm comm_ptr)=NULL;
/* MVAPICH2-style names used in the reduce table, aliased to the SMPI reduce implementations that
 * stand in for them. The inter- and intra-node knomial wrappers share one implementation. */
1296 #define MPIR_Reduce_inter_knomial_wrapper_MV2 Coll_reduce_mvapich2_knomial::reduce
1297 #define MPIR_Reduce_intra_knomial_wrapper_MV2 Coll_reduce_mvapich2_knomial::reduce
1298 #define MPIR_Reduce_binomial_MV2 Coll_reduce_binomial::reduce
1299 #define MPIR_Reduce_redscat_gather_MV2 Coll_reduce_scatter_gather::reduce
1300 #define MPIR_Reduce_shmem_MV2 Coll_reduce_ompi_basic_linear::reduce
1301 #define MPIR_Reduce_two_level_helper_MV2 Coll_reduce_mvapich2_two_level::reduce
/* Fill the global reduce tuning table with the Stampede calibration.
 *
 * Installs smpi_coll_cleanup_mvapich2 as cleanup callback when no callback is
 * registered yet, sets mv2_size_reduce_tuning_table to 8, allocates
 * mv2_reduce_thresholds_table for that many entries and copies a
 * function-local initializer array into it.
 *
 * NOTE(review): the innermost initializers look like
 * {lower bound, upper bound, algorithm}, with -1 used as an open upper bound;
 * confirm against the definition of mv2_reduce_tuning_table. */
1304 static void init_mv2_reduce_tables_stampede(){
/* Do not override a cleanup callback that is already registered. */
1305 if(smpi_coll_cleanup_callback==NULL)
1306 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
/* This count also sizes the memcpy at the end of the function, so it must not
 * exceed the number of entries in the initializer array below. */
1308 mv2_size_reduce_tuning_table = 8;
1309 mv2_reduce_thresholds_table = static_cast<mv2_reduce_tuning_table*>(xbt_malloc(mv2_size_reduce_tuning_table *
1310 sizeof (mv2_reduce_tuning_table)));
/* Local staging copy of the table; only the heap copy made below is kept. */
1311 mv2_reduce_tuning_table mv2_tmp_reduce_thresholds_table[] = {
1319 {0, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1320 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1321 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1325 {0, 65536, &MPIR_Reduce_shmem_MV2},
1326 {65536,-1, &MPIR_Reduce_binomial_MV2},
1333 {1, 1, 1, 1, 0, 0, 0},
1336 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1337 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1338 {16384, 32768, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1339 {32768, 65536, &MPIR_Reduce_binomial_MV2},
1340 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1341 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1342 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1346 {0, 8192, &MPIR_Reduce_shmem_MV2},
1347 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1348 {16384, 32768, &MPIR_Reduce_shmem_MV2},
1349 {32768, 65536, &MPIR_Reduce_shmem_MV2},
1350 {65536, 262144, &MPIR_Reduce_shmem_MV2},
1351 {262144,-1, &MPIR_Reduce_binomial_MV2},
1361 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1362 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1363 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1364 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1365 {262144, -1, &MPIR_Reduce_redscat_gather_MV2},
1369 {0, 8192, &MPIR_Reduce_shmem_MV2},
1370 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1371 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1372 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1373 {262144, -1, &MPIR_Reduce_binomial_MV2},
1383 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1384 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1385 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1386 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1387 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1388 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1392 {0, 8192, &MPIR_Reduce_shmem_MV2},
1393 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1394 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1395 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1396 {262144, -1, &MPIR_Reduce_binomial_MV2},
1403 {1, 1, 1, 0, 1, 1, 0},
1406 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1407 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1408 {16384, 32768, &MPIR_Reduce_binomial_MV2},
1409 {32768, 65536, &MPIR_Reduce_binomial_MV2},
1410 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1411 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1412 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1416 {0, 8192, &MPIR_Reduce_shmem_MV2},
1417 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1418 {16384, 32768, &MPIR_Reduce_shmem_MV2},
1419 {32768, 65536, &MPIR_Reduce_shmem_MV2},
1420 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1421 {262144, -1, &MPIR_Reduce_binomial_MV2},
1431 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1432 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1433 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1434 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1435 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1436 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1440 {0, 8192, &MPIR_Reduce_shmem_MV2},
1441 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1442 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1443 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1444 {262144, -1, &MPIR_Reduce_binomial_MV2},
1454 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1455 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1456 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1457 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1458 {262144, -1, &MPIR_Reduce_binomial_MV2},
1462 {0, 8192, &MPIR_Reduce_shmem_MV2},
1463 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1464 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1465 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1466 {262144, -1, &MPIR_Reduce_binomial_MV2},
1476 {0, 2048, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1477 {2048, 4096, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1478 {4096, 16384, &MPIR_Reduce_binomial_MV2},
1479 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1480 {65536, 131072, &MPIR_Reduce_binomial_MV2},
1481 {131072, -1, &MPIR_Reduce_binomial_MV2},
1485 {0, 2048, &MPIR_Reduce_shmem_MV2},
1486 {2048, 4096, &MPIR_Reduce_shmem_MV2},
1487 {4096, 16384, &MPIR_Reduce_shmem_MV2},
1488 {16384, 65536, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1489 {65536, 131072, &MPIR_Reduce_binomial_MV2},
1490 {131072, -1, &MPIR_Reduce_shmem_MV2},
/* Publish the staged entries into the heap-allocated global table. */
1495 memcpy(mv2_reduce_thresholds_table, mv2_tmp_reduce_thresholds_table,
1496 mv2_size_reduce_tuning_table * sizeof (mv2_reduce_tuning_table));
1499 /************ Reduce scatter variables and initializers */
/* Tuning entry for reduce_scatter: carries the pointer to the algorithm
 * implementation (MV2_pt_Red_scat_function) selected by that entry. */
1504 int (*MV2_pt_Red_scat_function)(void *sendbuf,
1507 MPI_Datatype datatype,
1510 } mv2_red_scat_tuning_element;
/* One row of the reduce_scatter tuning table: a fixed-capacity array of
 * inter_leader entries (at most MV2_MAX_NB_THRESHOLDS).
 * NOTE(review): size_inter_table presumably holds how many inter_leader slots
 * are actually used; confirm against the selector code. */
1514 int size_inter_table;
1515 mv2_red_scat_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1516 } mv2_red_scat_tuning_table;
/* Entry count and storage of the reduce_scatter tuning table. Both start out
 * empty (0 / NULL) and are filled by init_mv2_reduce_scatter_tables_stampede(). */
1518 int mv2_size_red_scat_tuning_table = 0;
1519 mv2_red_scat_tuning_table *mv2_red_scat_thresholds_table = NULL;
/* Function pointer with the reduce_scatter algorithm signature.
 * NOTE(review): presumably set by the selector to the chosen algorithm; confirm. */
1522 int (*MV2_Red_scat_function)(void *sendbuf,
1525 MPI_Datatype datatype,
/* File-local adapter that forwards its arguments unchanged to
 * Coll_reduce_scatter_default::reduce_scatter, so the default algorithm can be
 * referenced by address from the reduce_scatter tuning table.
 * NOTE(review): the call below is a plain statement, so the callee's result
 * is not what this wrapper returns; confirm that is intended. */
1531 static int MPIR_Reduce_Scatter_Basic_MV2(void *sendbuf,
1534 MPI_Datatype datatype,
1538 Coll_reduce_scatter_default::reduce_scatter(sendbuf,recvbuf,recvcnts,datatype,op,comm);
/* Aliases from the MVAPICH2 reduce_scatter algorithm names to the SMPI
 * implementations (MPICH non-commutative, Open MPI recursive halving,
 * MPICH pairwise). */
1541 #define MPIR_Reduce_scatter_non_comm_MV2 Coll_reduce_scatter_mpich_noncomm::reduce_scatter
1542 #define MPIR_Reduce_scatter_Rec_Halving_MV2 Coll_reduce_scatter_ompi_basic_recursivehalving::reduce_scatter
1543 #define MPIR_Reduce_scatter_Pair_Wise_MV2 Coll_reduce_scatter_mpich_pair::reduce_scatter
/* Fill the global reduce_scatter tuning table with the Stampede calibration.
 *
 * Installs smpi_coll_cleanup_mvapich2 as cleanup callback when no callback is
 * registered yet, sets mv2_size_red_scat_tuning_table to 6, allocates
 * mv2_red_scat_thresholds_table for that many rows and copies a
 * function-local initializer array into it.
 *
 * NOTE(review): the innermost initializers look like
 * {lower bound, upper bound, algorithm}, with -1 used as an open upper bound;
 * confirm against the selector that reads the table. */
1548 static void init_mv2_reduce_scatter_tables_stampede(){
/* Do not override a cleanup callback that is already registered. */
1549 if(smpi_coll_cleanup_callback==NULL)
1550 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
/* This count also sizes the memcpy below, so it must not exceed the number of
 * rows in the initializer array. */
1551 mv2_size_red_scat_tuning_table = 6;
1552 mv2_red_scat_thresholds_table = static_cast<mv2_red_scat_tuning_table*>(xbt_malloc(mv2_size_red_scat_tuning_table *
1553 sizeof (mv2_red_scat_tuning_table)));
/* Local staging copy of the table; only the heap copy made below is kept. */
1554 mv2_red_scat_tuning_table mv2_tmp_red_scat_thresholds_table[] = {
1559 {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1560 {64, 65536, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1561 {65536, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1568 {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1569 {64, 131072, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1570 {131072, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1577 {0, 1024, &MPIR_Reduce_Scatter_Basic_MV2},
1578 {1024, 262144, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1579 {262144, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1586 {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1587 {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1594 {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1595 {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1602 {0, 256, &MPIR_Reduce_Scatter_Basic_MV2},
1603 {256, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
/* Publish the staged rows into the heap-allocated global table. */
1608 memcpy(mv2_red_scat_thresholds_table, mv2_tmp_red_scat_thresholds_table,
1609 mv2_size_red_scat_tuning_table * sizeof (mv2_red_scat_tuning_table));
1612 /************ Scatter variables and initializers */
/* Tuning entry for scatter: carries the pointer to the algorithm
 * implementation (MV2_pt_Scatter_function), which takes the usual
 * send/receive description plus root and communicator. */
1617 int (*MV2_pt_Scatter_function)(void *sendbuf,
1619 MPI_Datatype sendtype,
1622 MPI_Datatype recvtype,
1623 int root, MPI_Comm comm);
1624 } mv2_scatter_tuning_element;
/* One row of the scatter tuning table with two fixed-capacity entry arrays
 * (at most MV2_MAX_NB_THRESHOLDS each): inter_leader and intra_node.
 * NOTE(review): size_inter_table / size_intra_table presumably hold how many
 * slots of the respective array are used; confirm against the selector code. */
1628 int size_inter_table;
1629 mv2_scatter_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1630 int size_intra_table;
1631 mv2_scatter_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1632 } mv2_scatter_tuning_table;
/* Scatter tuning state, filled by init_mv2_scatter_tables_stampede(). */
/* Processes-per-node value of each configuration */
1635 int *mv2_scatter_table_ppn_conf = NULL;
/* Number of processes-per-node configurations */
1636 int mv2_scatter_num_ppn_conf = 1;
/* Number of table rows of each configuration */
1637 int *mv2_size_scatter_tuning_table = NULL;
/* Per-configuration pointer to the first row of that configuration's table */
1638 mv2_scatter_tuning_table **mv2_scatter_thresholds_table = NULL;
/* Function pointers with the scatter algorithm signature, both initially NULL.
 * NOTE(review): presumably set by the selector to the chosen overall and
 * intra-node algorithms respectively; confirm against the selector code. */
1640 int (*MV2_Scatter_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1641 void *recvbuf, int recvcount, MPI_Datatype recvtype,
1642 int root, MPI_Comm comm)=NULL;
1644 int (*MV2_Scatter_intra_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1645 void *recvbuf, int recvcount, MPI_Datatype recvtype,
1646 int root, MPI_Comm comm)=NULL;
/* Prototype of the scatter wrapper that the 16-processes-per-node tuning
 * table references by address as its first inter-leader entry. */
1647 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1649 MPI_Datatype sendtype,
1652 MPI_Datatype recvtype,
1653 int root, MPI_Comm comm_ptr);
/* Definition matching the prototype above (same parameter list). */
1655 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1657 MPI_Datatype sendtype,
1660 MPI_Datatype recvtype,
1661 int root, MPI_Comm comm_ptr)
/* Aliases from the MVAPICH2 scatter algorithm names used in the tuning tables
 * to the SMPI implementations: the flat binomial and direct variants come from
 * the Open MPI collectives, the two-level variants from the MVAPICH2 ones. */
1666 #define MPIR_Scatter_MV2_Binomial Coll_scatter_ompi_binomial::scatter
1667 #define MPIR_Scatter_MV2_Direct Coll_scatter_ompi_basic_linear::scatter
1668 #define MPIR_Scatter_MV2_two_level_Binomial Coll_scatter_mvapich2_two_level_binomial::scatter
1669 #define MPIR_Scatter_MV2_two_level_Direct Coll_scatter_mvapich2_two_level_direct::scatter
/* Fill the global scatter tuning tables with the Stampede calibration.
 *
 * Three processes-per-node configurations are set up (1, 2 and 16 ppn, with
 * 6, 6 and 8 table rows). Each configuration is staged in a function-local
 * array; the rows of all configurations are then copied back to back into a
 * single heap block, and mv2_scatter_thresholds_table[i] is pointed at the
 * first row of configuration i inside that block.
 *
 * NOTE(review): the innermost initializers look like
 * {lower bound, upper bound, algorithm}, with -1 used as an open upper bound;
 * confirm against the selector that reads the tables. */
1674 static void init_mv2_scatter_tables_stampede(){
/* Do not override a cleanup callback that is already registered. */
1675 if(smpi_coll_cleanup_callback==NULL)
1676 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
/* Total number of rows over all configurations, computed further down. */
1678 int agg_table_sum = 0;
/* Scratch array remembering where each configuration's staging array lives. */
1680 mv2_scatter_tuning_table **table_ptrs = NULL;
1681 mv2_scatter_num_ppn_conf = 3;
/* Per-configuration bookkeeping arrays, one slot per ppn configuration. */
1682 mv2_scatter_thresholds_table
1683 = static_cast<mv2_scatter_tuning_table**>(xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1684 * mv2_scatter_num_ppn_conf));
1685 table_ptrs = static_cast<mv2_scatter_tuning_table**>(xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1686 * mv2_scatter_num_ppn_conf));
1687 mv2_size_scatter_tuning_table = static_cast<int*>(xbt_malloc(sizeof(int) *
1688 mv2_scatter_num_ppn_conf));
1689 mv2_scatter_table_ppn_conf
1690 = static_cast<int*>(xbt_malloc(mv2_scatter_num_ppn_conf * sizeof(int)));
/* Configuration 0: 1 process per node, 6 rows. */
1691 mv2_scatter_table_ppn_conf[0] = 1;
1692 mv2_size_scatter_tuning_table[0] = 6;
1693 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_1ppn[] = {
1697 {0, -1, &MPIR_Scatter_MV2_Binomial},
1701 {0, -1, &MPIR_Scatter_MV2_Binomial},
1708 {0, -1, &MPIR_Scatter_MV2_Direct},
1712 {0, -1, &MPIR_Scatter_MV2_Direct},
1719 {0, -1, &MPIR_Scatter_MV2_Direct},
1723 {0, -1, &MPIR_Scatter_MV2_Direct},
1730 {0, -1, &MPIR_Scatter_MV2_Direct},
1734 {0, -1, &MPIR_Scatter_MV2_Direct},
1741 {0, -1, &MPIR_Scatter_MV2_Direct},
1745 {0, -1, &MPIR_Scatter_MV2_Direct},
1752 {0, 32, &MPIR_Scatter_MV2_Binomial},
1753 {32, -1, &MPIR_Scatter_MV2_Direct},
1757 {0, -1, &MPIR_Scatter_MV2_Binomial},
1761 table_ptrs[0] = mv2_tmp_scatter_thresholds_table_1ppn;
/* Configuration 1: 2 processes per node, 6 rows. */
1762 mv2_scatter_table_ppn_conf[1] = 2;
1763 mv2_size_scatter_tuning_table[1] = 6;
1764 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_2ppn[] = {
1768 {0, 4096, &MPIR_Scatter_MV2_Binomial},
1769 {4096, -1, &MPIR_Scatter_MV2_Direct},
1773 {0, -1, &MPIR_Scatter_MV2_Direct},
1780 {0, 512, &MPIR_Scatter_MV2_two_level_Direct},
1781 {512, -1, &MPIR_Scatter_MV2_Direct},
1785 {0, -1, &MPIR_Scatter_MV2_Binomial},
1792 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1793 {2048, -1, &MPIR_Scatter_MV2_Direct},
1797 {0, -1, &MPIR_Scatter_MV2_Binomial},
1804 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1805 {2048, -1, &MPIR_Scatter_MV2_Direct},
1809 {0, -1, &MPIR_Scatter_MV2_Binomial},
1816 {0, 8192, &MPIR_Scatter_MV2_two_level_Direct},
1817 {8192, -1, &MPIR_Scatter_MV2_Direct},
1821 {0, -1, &MPIR_Scatter_MV2_Binomial},
1828 {0, 16, &MPIR_Scatter_MV2_Binomial},
1829 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1830 {128, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1831 {16384, -1, &MPIR_Scatter_MV2_Direct},
1835 {0, 128, &MPIR_Scatter_MV2_Direct},
1836 {128, -1, &MPIR_Scatter_MV2_Binomial},
1840 table_ptrs[1] = mv2_tmp_scatter_thresholds_table_2ppn;
/* Configuration 2: 16 processes per node, 8 rows. */
1841 mv2_scatter_table_ppn_conf[2] = 16;
1842 mv2_size_scatter_tuning_table[2] = 8;
1843 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_16ppn[] = {
1848 {0, 256, &MPIR_Scatter_MV2_Binomial},
1849 {256, -1, &MPIR_Scatter_MV2_Direct},
1853 { 0, -1, &MPIR_Scatter_MV2_Direct},
1861 {0, 512, &MPIR_Scatter_MV2_Binomial},
1862 {512, -1, &MPIR_Scatter_MV2_Direct},
1866 { 0, -1, &MPIR_Scatter_MV2_Direct},
1874 {0, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1875 {1024, -1, &MPIR_Scatter_MV2_Direct},
1879 { 0, -1, &MPIR_Scatter_MV2_Direct},
1887 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1888 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1889 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1890 {2048, -1, &MPIR_Scatter_MV2_Direct},
1894 { 0, -1, &MPIR_Scatter_MV2_Direct},
1902 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1903 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1904 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1905 {2048, -1, &MPIR_Scatter_MV2_Direct},
1909 { 0, -1, &MPIR_Scatter_MV2_Direct},
1917 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
/* NOTE(review): the next range is 16..16, whereas the corresponding second
 * entry of the neighbouring rows is 0..16; confirm against the upstream
 * MVAPICH2 tuning data whether this is intended. */
1918 {16, 16, &MPIR_Scatter_MV2_two_level_Direct},
1919 {16, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1920 {4096, -1, &MPIR_Scatter_MV2_Direct},
1924 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1931 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1932 {0, 16, &MPIR_Scatter_MV2_Binomial},
1933 {16, 32, &MPIR_Scatter_MV2_Binomial},
1934 {32, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1935 {4096, -1, &MPIR_Scatter_MV2_Direct},
1939 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1946 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1947 {0, 16, &MPIR_Scatter_MV2_two_level_Binomial},
1948 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1949 {128, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1950 {1024, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1951 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1952 {65536, -1, &MPIR_Scatter_MV2_two_level_Direct},
1956 {0, 16, &MPIR_Scatter_MV2_Binomial},
1957 {16, 128, &MPIR_Scatter_MV2_Binomial},
1958 {128, 1024, &MPIR_Scatter_MV2_Binomial},
1959 {1024, 16384, &MPIR_Scatter_MV2_Direct},
1960 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1961 {65536, -1, &MPIR_Scatter_MV2_Direct},
1965 table_ptrs[2] = mv2_tmp_scatter_thresholds_table_16ppn;
/* Add up the row counts so that one allocation can hold every configuration. */
1967 for (i = 0; i < mv2_scatter_num_ppn_conf; i++) {
1968 agg_table_sum += mv2_size_scatter_tuning_table[i];
/* Slot 0 receives the single heap block; configuration 0 is copied to its start. */
1970 mv2_scatter_thresholds_table[0] =
1971 static_cast<mv2_scatter_tuning_table*>(xbt_malloc(agg_table_sum * sizeof (mv2_scatter_tuning_table)));
1972 memcpy(mv2_scatter_thresholds_table[0], table_ptrs[0],
1973 (sizeof(mv2_scatter_tuning_table)
1974 * mv2_size_scatter_tuning_table[0]));
/* Every further configuration starts right after the previous one's rows,
 * inside the same block rather than in an allocation of its own. */
1975 for (i = 1; i < mv2_scatter_num_ppn_conf; i++) {
1976 mv2_scatter_thresholds_table[i] =
1977 mv2_scatter_thresholds_table[i - 1]
1978 + mv2_size_scatter_tuning_table[i - 1];
1979 memcpy(mv2_scatter_thresholds_table[i], table_ptrs[i],
1980 (sizeof(mv2_scatter_tuning_table)
1981 * mv2_size_scatter_tuning_table[i]));
/* Only the scratch pointer array is released; it pointed at the local staging arrays. */
1983 xbt_free(table_ptrs);