1 /* selector for collective algorithms based on mvapich decision logic */
3 /* Copyright (c) 2009-2019. The SimGrid Team.
4 * All rights reserved. */
6 /* This program is free software; you can redistribute it and/or modify it
7 * under the terms of the license (GNU LGPL) which comes with this package. */
9 #include "colls_private.hpp"
11 #include "smpi_mvapich2_selector_stampede.hpp"
/* Tuned MPI_Alltoall: walks the mvapich2 "stampede" tuning tables to pick an
 * algorithm based on communicator size and per-rank message size (nbytes),
 * then dispatches to it.  Handles MPI_IN_PLACE separately. */
17 int Coll_alltoall_mvapich2::alltoall( void *sendbuf, int sendcount,
18 MPI_Datatype sendtype,
19 void* recvbuf, int recvcount,
20 MPI_Datatype recvtype,
/* Lazily build the tuning tables on first call. */
24 if(mv2_alltoall_table_ppn_conf==NULL)
25 init_mv2_alltoall_tables_stampede();
27 int sendtype_size, recvtype_size, comm_size;
28 char * tmp_buf = NULL;
29 int mpi_errno=MPI_SUCCESS;
31 int range_threshold = 0;
33 comm_size = comm->size();
35 sendtype_size=sendtype->size();
36 recvtype_size=recvtype->size();
/* Message size in bytes per destination rank, used against table thresholds. */
37 long nbytes = sendtype_size * sendcount;
39 /* check if safe to use partial subscription mode */
41 /* Search for the corresponding system size inside the tuning table */
42 while ((range < (mv2_size_alltoall_tuning_table[conf_index] - 1)) &&
43 (comm_size > mv2_alltoall_thresholds_table[conf_index][range].numproc)) {
46 /* Search for corresponding inter-leader function */
47 while ((range_threshold < (mv2_alltoall_thresholds_table[conf_index][range].size_table - 1))
49 mv2_alltoall_thresholds_table[conf_index][range].algo_table[range_threshold].max)
50 && (mv2_alltoall_thresholds_table[conf_index][range].algo_table[range_threshold].max != -1)) {
/* Selected algorithm for this (comm_size, nbytes) bucket. */
53 MV2_Alltoall_function = mv2_alltoall_thresholds_table[conf_index][range].algo_table[range_threshold]
54 .MV2_pt_Alltoall_function;
/* Out-of-place case: call the selected algorithm directly. */
56 if(sendbuf != MPI_IN_PLACE) {
57 mpi_errno = MV2_Alltoall_function(sendbuf, sendcount, sendtype,
58 recvbuf, recvcount, recvtype,
/* In-place, but nbytes falls outside the tuned in-place range: copy recvbuf
 * into a scratch buffer and run the selected out-of-place algorithm. */
63 mv2_alltoall_thresholds_table[conf_index][range].in_place_algo_table[range_threshold].min
64 ||nbytes > mv2_alltoall_thresholds_table[conf_index][range].in_place_algo_table[range_threshold].max
66 tmp_buf = (char *)smpi_get_tmp_sendbuffer( comm_size * recvcount * recvtype_size );
67 Datatype::copy((char *)recvbuf,
68 comm_size*recvcount, recvtype,
70 comm_size*recvcount, recvtype);
72 mpi_errno = MV2_Alltoall_function(tmp_buf, recvcount, recvtype,
73 recvbuf, recvcount, recvtype,
75 smpi_free_tmp_buffer(tmp_buf);
/* In-place within the tuned range: dedicated in-place algorithm. */
77 mpi_errno = MPIR_Alltoall_inplace_MV2(sendbuf, sendcount, sendtype,
78 recvbuf, recvcount, recvtype,
/* Tuned MPI_Allgather: selects a per-(ppn, comm_size, nbytes) algorithm from
 * the stampede tuning tables.  May use a two-level (intra-node + inter-leader)
 * scheme when the communicator layout allows it. */
87 int Coll_allgather_mvapich2::allgather(void *sendbuf, int sendcount, MPI_Datatype sendtype,
88 void *recvbuf, int recvcount, MPI_Datatype recvtype,
92 int mpi_errno = MPI_SUCCESS;
93 long nbytes = 0, comm_size, recvtype_size;
95 int partial_sub_ok = 0;
97 int range_threshold = 0;
100 //MPI_Comm *shmem_commptr=NULL;
101 /* Get the size of the communicator */
102 comm_size = comm->size();
103 recvtype_size=recvtype->size();
/* Per-rank contribution size in bytes, compared against table thresholds. */
104 nbytes = recvtype_size * recvcount;
106 if(mv2_allgather_table_ppn_conf==NULL)
107 init_mv2_allgather_tables_stampede();
/* Leader communicator must exist for the two-level path. */
109 if(comm->get_leaders_comm()==MPI_COMM_NULL){
/* Partial subscription check: match the local (per-node) size against the
 * per-ppn table configurations. */
113 if (comm->is_uniform()){
114 shmem_comm = comm->get_intra_comm();
115 int local_size = shmem_comm->size();
117 if (mv2_allgather_table_ppn_conf[0] == -1) {
118 // Indicating user defined tuning
123 if (local_size == mv2_allgather_table_ppn_conf[i]) {
129 } while(i < mv2_allgather_num_ppn_conf);
132 if (partial_sub_ok != 1) {
136 /* Search for the corresponding system size inside the tuning table */
137 while ((range < (mv2_size_allgather_tuning_table[conf_index] - 1)) &&
139 mv2_allgather_thresholds_table[conf_index][range].numproc)) {
142 /* Search for corresponding inter-leader function */
143 while ((range_threshold <
144 (mv2_allgather_thresholds_table[conf_index][range].size_inter_table - 1))
145 && (nbytes > mv2_allgather_thresholds_table[conf_index][range].inter_leader[range_threshold].max)
146 && (mv2_allgather_thresholds_table[conf_index][range].inter_leader[range_threshold].max !=
151 /* Set inter-leader pt */
153 mv2_allgather_thresholds_table[conf_index][range].inter_leader[range_threshold].
154 MV2_pt_Allgatherction;
156 is_two_level = mv2_allgather_thresholds_table[conf_index][range].two_level[range_threshold];
158 /* intracommunicator */
/* Two-level path requires both the table flag and a usable node layout;
 * otherwise fall back to MPICH's allgather or recursive doubling. */
159 if(is_two_level ==1){
160 if(partial_sub_ok ==1){
161 if (comm->is_blocked()){
162 mpi_errno = MPIR_2lvl_Allgather_MV2(sendbuf, sendcount, sendtype,
163 recvbuf, recvcount, recvtype,
166 mpi_errno = Coll_allgather_mpich::allgather(sendbuf, sendcount, sendtype,
167 recvbuf, recvcount, recvtype,
171 mpi_errno = MPIR_Allgather_RD_MV2(sendbuf, sendcount, sendtype,
172 recvbuf, recvcount, recvtype,
/* Single-level path: only dispatch to the known-safe algorithm pointers. */
175 } else if(MV2_Allgatherction == &MPIR_Allgather_Bruck_MV2
176 || MV2_Allgatherction == &MPIR_Allgather_RD_MV2
177 || MV2_Allgatherction == &MPIR_Allgather_Ring_MV2) {
178 mpi_errno = MV2_Allgatherction(sendbuf, sendcount, sendtype,
179 recvbuf, recvcount, recvtype,
/* No matching algorithm: report failure. */
182 return MPI_ERR_OTHER;
/* Tuned MPI_Gather: picks intra-node and inter-leader gather functions from
 * the stampede tables, using the two-level scheme on blocked communicators
 * and falling back to the MPICH gather otherwise. */
188 int Coll_gather_mvapich2::gather(void *sendbuf,
190 MPI_Datatype sendtype,
193 MPI_Datatype recvtype,
194 int root, MPI_Comm comm)
196 if(mv2_gather_thresholds_table==NULL)
197 init_mv2_gather_tables_stampede();
199 int mpi_errno = MPI_SUCCESS;
201 int range_threshold = 0;
202 int range_intra_threshold = 0;
204 int comm_size = comm->size();
205 int rank = comm->rank();
/* nbytes is computed from the receive side and from the send side below;
 * NOTE(review): presumably the two are selected by a rank==root test on lines
 * not visible here — confirm against the full file. */
208 int recvtype_size = recvtype->size();
209 nbytes = recvcnt * recvtype_size;
211 int sendtype_size = sendtype->size();
212 nbytes = sendcnt * sendtype_size;
215 /* Search for the corresponding system size inside the tuning table */
216 while ((range < (mv2_size_gather_tuning_table - 1)) &&
217 (comm_size > mv2_gather_thresholds_table[range].numproc)) {
220 /* Search for corresponding inter-leader function */
221 while ((range_threshold < (mv2_gather_thresholds_table[range].size_inter_table - 1))
223 mv2_gather_thresholds_table[range].inter_leader[range_threshold].max)
224 && (mv2_gather_thresholds_table[range].inter_leader[range_threshold].max !=
229 /* Search for corresponding intra node function */
230 while ((range_intra_threshold < (mv2_gather_thresholds_table[range].size_intra_table - 1))
232 mv2_gather_thresholds_table[range].intra_node[range_intra_threshold].max)
233 && (mv2_gather_thresholds_table[range].intra_node[range_intra_threshold].max !=
235 range_intra_threshold++;
/* Two-level gather only works when ranks are laid out contiguously per node. */
238 if (comm->is_blocked() ) {
239 // Set intra-node function pt for gather_two_level
240 MV2_Gather_intra_node_function =
241 mv2_gather_thresholds_table[range].intra_node[range_intra_threshold].
242 MV2_pt_Gather_function;
243 //Set inter-leader pt
244 MV2_Gather_inter_leader_function =
245 mv2_gather_thresholds_table[range].inter_leader[range_threshold].
246 MV2_pt_Gather_function;
247 // We call Gather function
249 MV2_Gather_inter_leader_function(sendbuf, sendcnt, sendtype, recvbuf, recvcnt,
250 recvtype, root, comm);
253 // Direct (non SMP-aware) gather: the MPICH implementation
254 mpi_errno = Coll_gather_mpich::gather(sendbuf, sendcnt, sendtype,
255 recvbuf, recvcnt, recvtype,
/* Tuned MPI_Allgatherv: total payload (sum of recvcounts, in bytes) drives the
 * table lookup; recursive doubling is only used for power-of-two communicator
 * sizes, with Bruck as the non-power-of-two substitute. */
262 int Coll_allgatherv_mvapich2::allgatherv(void *sendbuf, int sendcount, MPI_Datatype sendtype,
263 void *recvbuf, int *recvcounts, int *displs,
264 MPI_Datatype recvtype, MPI_Comm comm )
266 int mpi_errno = MPI_SUCCESS;
267 int range = 0, comm_size, total_count, recvtype_size, i;
268 int range_threshold = 0;
271 if(mv2_allgatherv_thresholds_table==NULL)
272 init_mv2_allgatherv_tables_stampede();
274 comm_size = comm->size();
/* NOTE(review): total_count must be zeroed before this accumulation —
 * presumably done on a line not visible here; confirm against the full file. */
276 for (i = 0; i < comm_size; i++)
277 total_count += recvcounts[i];
279 recvtype_size=recvtype->size();
280 nbytes = total_count * recvtype_size;
282 /* Search for the corresponding system size inside the tuning table */
283 while ((range < (mv2_size_allgatherv_tuning_table - 1)) &&
284 (comm_size > mv2_allgatherv_thresholds_table[range].numproc)) {
287 /* Search for corresponding inter-leader function */
288 while ((range_threshold < (mv2_allgatherv_thresholds_table[range].size_inter_table - 1))
290 comm_size * mv2_allgatherv_thresholds_table[range].inter_leader[range_threshold].max)
291 && (mv2_allgatherv_thresholds_table[range].inter_leader[range_threshold].max !=
295 /* Set inter-leader pt */
296 MV2_Allgatherv_function =
297 mv2_allgatherv_thresholds_table[range].inter_leader[range_threshold].
298 MV2_pt_Allgatherv_function;
/* Recursive doubling requires a power-of-two number of ranks
 * (comm_size & (comm_size - 1)) == 0; otherwise use Bruck. */
300 if (MV2_Allgatherv_function == &MPIR_Allgatherv_Rec_Doubling_MV2)
302 if (not(comm_size & (comm_size - 1))) {
304 MPIR_Allgatherv_Rec_Doubling_MV2(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype, comm);
307 MPIR_Allgatherv_Bruck_MV2(sendbuf, sendcount,
/* Any other selected algorithm is invoked directly. */
314 MV2_Allgatherv_function(sendbuf, sendcount, sendtype,
315 recvbuf, recvcounts, displs,
/* Tuned MPI_Allreduce: selects inter-leader and intra-node reduction
 * functions from the tables.  Multicast-based entries are skipped or replaced
 * (mcast is not available here), and the two-level scheme additionally
 * requires a commutative operation. */
324 int Coll_allreduce_mvapich2::allreduce(void *sendbuf,
327 MPI_Datatype datatype,
328 MPI_Op op, MPI_Comm comm)
331 int mpi_errno = MPI_SUCCESS;
335 comm_size = comm->size();
336 //rank = comm->rank();
342 if (mv2_allreduce_thresholds_table == NULL)
343 init_mv2_allreduce_tables_stampede();
345 /* check if multiple threads are calling this collective function */
347 MPI_Aint sendtype_size = 0;
349 int is_commutative = 0;
350 MPI_Aint true_lb, true_extent;
352 sendtype_size=datatype->size();
353 nbytes = count * sendtype_size;
355 datatype->extent(&true_lb, &true_extent);
/* Two-level reduction reorders operand combination, so it needs commutativity. */
356 is_commutative = op->is_commutative();
359 int range = 0, range_threshold = 0, range_threshold_intra = 0;
360 int is_two_level = 0;
362 /* Search for the corresponding system size inside the tuning table */
363 while ((range < (mv2_size_allreduce_tuning_table - 1)) &&
364 (comm_size > mv2_allreduce_thresholds_table[range].numproc)) {
367 /* Search for corresponding inter-leader function */
368 /* skip mcast pointers if mcast is not available */
369 if(mv2_allreduce_thresholds_table[range].mcast_enabled != 1){
370 while ((range_threshold < (mv2_allreduce_thresholds_table[range].size_inter_table - 1))
371 && ((mv2_allreduce_thresholds_table[range].
372 inter_leader[range_threshold].MV2_pt_Allreducection
373 == &MPIR_Allreduce_mcst_reduce_redscat_gather_MV2) ||
374 (mv2_allreduce_thresholds_table[range].
375 inter_leader[range_threshold].MV2_pt_Allreducection
376 == &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2)
/* Usual threshold walk over nbytes once mcast entries were skipped. */
381 while ((range_threshold < (mv2_allreduce_thresholds_table[range].size_inter_table - 1))
383 mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].max)
384 && (mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].max != -1)) {
387 if(mv2_allreduce_thresholds_table[range].is_two_level_allreduce[range_threshold] == 1){
390 /* Search for corresponding intra-node function */
391 while ((range_threshold_intra <
392 (mv2_allreduce_thresholds_table[range].size_intra_table - 1))
394 mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra].max)
395 && (mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra].max !=
397 range_threshold_intra++;
400 MV2_Allreducection = mv2_allreduce_thresholds_table[range].inter_leader[range_threshold]
401 .MV2_pt_Allreducection;
403 MV2_Allreduce_intra_function = mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra]
404 .MV2_pt_Allreducection;
406 /* check if mcast is ready, otherwise replace mcast with other algorithm */
407 if((MV2_Allreducection == &MPIR_Allreduce_mcst_reduce_redscat_gather_MV2)||
408 (MV2_Allreducection == &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2)){
410 MV2_Allreducection = &MPIR_Allreduce_pt2pt_rd_MV2;
412 if(is_two_level != 1) {
413 MV2_Allreducection = &MPIR_Allreduce_pt2pt_rd_MV2;
417 if(is_two_level == 1){
418 // check if shm is ready, if not use other algorithm first
/* Non-commutative ops (or a missing leader communicator) cannot take the
 * two-level path; use point-to-point recursive doubling instead. */
419 if (is_commutative) {
420 if(comm->get_leaders_comm()==MPI_COMM_NULL){
423 mpi_errno = MPIR_Allreduce_two_level_MV2(sendbuf, recvbuf, count,
426 mpi_errno = MPIR_Allreduce_pt2pt_rd_MV2(sendbuf, recvbuf, count,
430 mpi_errno = MV2_Allreducection(sendbuf, recvbuf, count,
435 //comm->ch.intra_node_done=0;
/* MPI_Alltoallv: no mvapich2 tuning table here.  MPI_IN_PLACE is routed to the
 * basic linear algorithm (which supports it); everything else uses the ring
 * algorithm, kept from the original implementation. */
443 int Coll_alltoallv_mvapich2::alltoallv(void *sbuf, int *scounts, int *sdisps,
445 void *rbuf, int *rcounts, int *rdisps,
451 if (sbuf == MPI_IN_PLACE) {
452 return Coll_alltoallv_ompi_basic_linear::alltoallv(sbuf, scounts, sdisps, sdtype,
453 rbuf, rcounts, rdisps,rdtype,
455 } else /* For starters, just keep the original algorithm. */
456 return Coll_alltoallv_ring::alltoallv(sbuf, scounts, sdisps, sdtype,
457 rbuf, rcounts, rdisps,rdtype,
462 int Coll_barrier_mvapich2::barrier(MPI_Comm comm)
464 return Coll_barrier_mvapich2_pair::barrier(comm);
/* Tuned MPI_Bcast: selects inter-leader and intra-node broadcast functions
 * plus knomial factors and the pipeline segment size from the stampede
 * tables, then runs either a two-level (inter-node then intra-node) broadcast
 * or the selected flat algorithm.  Heterogeneous/non-contiguous pack-unpack
 * handling from the upstream code is kept commented out. */
470 int Coll_bcast_mvapich2::bcast(void *buffer,
472 MPI_Datatype datatype,
473 int root, MPI_Comm comm)
475 int mpi_errno = MPI_SUCCESS;
476 int comm_size/*, rank*/;
477 int two_level_bcast = 1;
480 int range_threshold = 0;
481 int range_threshold_intra = 0;
482 // int is_homogeneous, is_contig;
485 // void *tmp_buf = NULL;
487 //MPID_Datatype *dtp;
/* A leader communicator is needed for the two-level scheme. */
491 if(comm->get_leaders_comm()==MPI_COMM_NULL){
494 if (not mv2_bcast_thresholds_table)
495 init_mv2_bcast_tables_stampede();
496 comm_size = comm->size();
497 //rank = comm->rank();
500 /* if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)*/
503 /* MPID_Datatype_get_ptr(datatype, dtp);*/
504 /* is_contig = dtp->is_contig;*/
507 // is_homogeneous = 1;
509 /* MPI_Type_size() might not give the accurate size of the packed
510 * datatype for heterogeneous systems (because of padding, encoding,
511 * etc). On the other hand, MPI_Pack_size() can become very
512 * expensive, depending on the implementation, especially for
513 * heterogeneous systems. We want to use MPI_Type_size() wherever
514 * possible, and MPI_Pack_size() in other places.
516 //if (is_homogeneous) {
517 type_size=datatype->size();
520 MPIR_Pack_size_impl(1, datatype, &type_size);
/* Broadcast payload in bytes, compared against table thresholds. */
522 nbytes = (count) * (type_size);
524 /* Search for the corresponding system size inside the tuning table */
525 while ((range < (mv2_size_bcast_tuning_table - 1)) &&
526 (comm_size > mv2_bcast_thresholds_table[range].numproc)) {
529 /* Search for corresponding inter-leader function */
530 while ((range_threshold < (mv2_bcast_thresholds_table[range].size_inter_table - 1))
532 mv2_bcast_thresholds_table[range].inter_leader[range_threshold].max)
533 && (mv2_bcast_thresholds_table[range].inter_leader[range_threshold].max != -1)) {
537 /* Search for corresponding intra-node function */
538 while ((range_threshold_intra <
539 (mv2_bcast_thresholds_table[range].size_intra_table - 1))
541 mv2_bcast_thresholds_table[range].intra_node[range_threshold_intra].max)
542 && (mv2_bcast_thresholds_table[range].intra_node[range_threshold_intra].max !=
544 range_threshold_intra++;
548 mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
549 MV2_pt_Bcast_function;
551 MV2_Bcast_intra_node_function =
552 mv2_bcast_thresholds_table[range].
553 intra_node[range_threshold_intra].MV2_pt_Bcast_function;
555 /* if (mv2_user_bcast_intra == NULL && */
556 /* MV2_Bcast_intra_node_function == &MPIR_Knomial_Bcast_intra_node_MV2) {*/
557 /* MV2_Bcast_intra_node_function = &MPIR_Shmem_Bcast_MV2;*/
/* Pipelined zero-copy knomial factor: table entry wins, then the global
 * override, if either is set (-1 means "unset"). */
560 if (mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
561 zcpy_pipelined_knomial_factor != -1) {
562 zcpy_knomial_factor =
563 mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
564 zcpy_pipelined_knomial_factor;
567 if (mv2_pipelined_zcpy_knomial_factor != -1) {
568 zcpy_knomial_factor = mv2_pipelined_zcpy_knomial_factor;
571 if(MV2_Bcast_intra_node_function == NULL) {
572 /* if tuning table do not have any intra selection, set func pointer to
573 ** default one for mcast intra node */
574 MV2_Bcast_intra_node_function = &MPIR_Shmem_Bcast_MV2;
577 /* Set value of pipeline segment size */
578 bcast_segment_size = mv2_bcast_thresholds_table[range].bcast_segment_size;
580 /* Set value of inter node knomial factor */
581 mv2_inter_node_knomial_factor = mv2_bcast_thresholds_table[range].inter_node_knomial_factor;
583 /* Set value of intra node knomial factor */
584 mv2_intra_node_knomial_factor = mv2_bcast_thresholds_table[range].intra_node_knomial_factor;
586 /* Check if we will use a two level algorithm or not */
588 #if defined(_MCST_SUPPORT_)
589 mv2_bcast_thresholds_table[range].is_two_level_bcast[range_threshold]
590 || comm->ch.is_mcast_ok;
592 mv2_bcast_thresholds_table[range].is_two_level_bcast[range_threshold];
594 if (two_level_bcast == 1) {
595 // if (not is_contig || not is_homogeneous) {
596 // tmp_buf = (void*)smpi_get_tmp_sendbuffer(nbytes);
599 /* if (rank == root) {*/
601 /* MPIR_Pack_impl(buffer, count, datatype, tmp_buf, nbytes, &position);*/
603 /* MPIU_ERR_POP(mpi_errno);*/
606 #ifdef CHANNEL_MRAIL_GEN2
607 if ((mv2_enable_zcpy_bcast == 1) &&
608 (&MPIR_Pipelined_Bcast_Zcpy_MV2 == MV2_Bcast_function)) {
609 // if (not is_contig || not is_homogeneous) {
610 // mpi_errno = MPIR_Pipelined_Bcast_Zcpy_MV2(tmp_buf, nbytes, MPI_BYTE, root, comm);
612 mpi_errno = MPIR_Pipelined_Bcast_Zcpy_MV2(buffer, count, datatype,
616 #endif /* defined(CHANNEL_MRAIL_GEN2) */
/* Two-level phase 1: inter-node broadcast among node leaders. */
618 shmem_comm = comm->get_intra_comm();
619 // if (not is_contig || not is_homogeneous) {
620 // MPIR_Bcast_tune_inter_node_helper_MV2(tmp_buf, nbytes, MPI_BYTE, root, comm);
622 MPIR_Bcast_tune_inter_node_helper_MV2(buffer, count, datatype, root, comm);
625 /* We are now done with the inter-node phase */
/* Two-level phase 2: intra-node broadcast rooted at the node leader. */
628 root = INTRA_NODE_ROOT;
630 // if (not is_contig || not is_homogeneous) {
631 // mpi_errno = MV2_Bcast_intra_node_function(tmp_buf, nbytes, MPI_BYTE, root, shmem_comm);
633 mpi_errno = MV2_Bcast_intra_node_function(buffer, count,
634 datatype, root, shmem_comm);
638 /* if (not is_contig || not is_homogeneous) {*/
639 /* if (rank != root) {*/
641 /* mpi_errno = MPIR_Unpack_impl(tmp_buf, nbytes, &position, buffer,*/
642 /* count, datatype);*/
646 /* We use Knomial for intra node */
647 MV2_Bcast_intra_node_function = &MPIR_Knomial_Bcast_intra_node_MV2;
648 /* if (mv2_enable_shmem_bcast == 0) {*/
649 /* Fall back to non-tuned version */
650 /* MPIR_Bcast_intra_MV2(buffer, count, datatype, root, comm);*/
/* Flat (single-level) path: invoke the selected broadcast directly. */
652 mpi_errno = MV2_Bcast_function(buffer, count, datatype, root,
/* Tuned MPI_Reduce: selects intra-node and inter-leader reduce functions and
 * knomial degrees from the stampede tables; two-level, knomial, and
 * reduce-scatter+gather paths all fall back to binomial when their
 * preconditions (commutativity, leader comm, count >= pof2) do not hold. */
665 int Coll_reduce_mvapich2::reduce( void *sendbuf,
668 MPI_Datatype datatype,
669 MPI_Op op, int root, MPI_Comm comm)
671 if(mv2_reduce_thresholds_table == NULL)
672 init_mv2_reduce_tables_stampede();
674 int mpi_errno = MPI_SUCCESS;
676 int range_threshold = 0;
677 int range_intra_threshold = 0;
678 int is_commutative, pof2;
682 int is_two_level = 0;
684 comm_size = comm->size();
685 sendtype_size=datatype->size();
686 nbytes = count * sendtype_size;
/* MPI_OP_NULL is treated as commutative for algorithm-selection purposes. */
691 is_commutative = (op==MPI_OP_NULL || op->is_commutative());
693 /* find nearest power-of-two less than or equal to comm_size */
/* NOTE(review): this loop exits with pof2 > comm_size; presumably it is
 * halved once on a line not visible here — confirm against the full file. */
694 for( pof2 = 1; pof2 <= comm_size; pof2 <<= 1 );
698 /* Search for the corresponding system size inside the tuning table */
699 while ((range < (mv2_size_reduce_tuning_table - 1)) &&
700 (comm_size > mv2_reduce_thresholds_table[range].numproc)) {
703 /* Search for corresponding inter-leader function */
704 while ((range_threshold < (mv2_reduce_thresholds_table[range].size_inter_table - 1))
706 mv2_reduce_thresholds_table[range].inter_leader[range_threshold].max)
707 && (mv2_reduce_thresholds_table[range].inter_leader[range_threshold].max !=
712 /* Search for corresponding intra node function */
713 while ((range_intra_threshold < (mv2_reduce_thresholds_table[range].size_intra_table - 1))
715 mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].max)
716 && (mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].max !=
718 range_intra_threshold++;
721 /* Set intra-node function pt for reduce_two_level */
722 MV2_Reduce_intra_function =
723 mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].
724 MV2_pt_Reduce_function;
725 /* Set inter-leader pt */
726 MV2_Reduce_function =
727 mv2_reduce_thresholds_table[range].inter_leader[range_threshold].
728 MV2_pt_Reduce_function;
/* Only take the knomial degrees from the table if not overridden (< 0 means
 * "unset"). */
730 if(mv2_reduce_intra_knomial_factor<0)
732 mv2_reduce_intra_knomial_factor = mv2_reduce_thresholds_table[range].intra_k_degree;
734 if(mv2_reduce_inter_knomial_factor<0)
736 mv2_reduce_inter_knomial_factor = mv2_reduce_thresholds_table[range].inter_k_degree;
738 if(mv2_reduce_thresholds_table[range].is_two_level_reduce[range_threshold] == 1){
741 /* We call Reduce function */
742 if(is_two_level == 1)
744 if (is_commutative == 1) {
745 if(comm->get_leaders_comm()==MPI_COMM_NULL){
748 mpi_errno = MPIR_Reduce_two_level_helper_MV2(sendbuf, recvbuf, count,
749 datatype, op, root, comm);
751 mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count,
752 datatype, op, root, comm);
/* Knomial wrapper also reorders operands, so it needs commutativity too. */
754 } else if(MV2_Reduce_function == &MPIR_Reduce_inter_knomial_wrapper_MV2 ){
755 if(is_commutative ==1)
757 mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count,
758 datatype, op, root, comm);
760 mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count,
761 datatype, op, root, comm);
763 } else if(MV2_Reduce_function == &MPIR_Reduce_redscat_gather_MV2){
764 if (/*(HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN) &&*/ (count >= pof2))
766 mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count,
767 datatype, op, root, comm);
769 mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count,
770 datatype, op, root, comm);
773 mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count,
774 datatype, op, root, comm);
/* Tuned MPI_Reduce_scatter: commutative ops use the table-selected algorithm;
 * non-commutative ops use the dedicated non-commutative algorithm when the
 * communicator is a power of two with equal block sizes, else the MPICH
 * recursive-doubling fallback. */
783 int Coll_reduce_scatter_mvapich2::reduce_scatter(void *sendbuf, void *recvbuf, int *recvcnts,
784 MPI_Datatype datatype, MPI_Op op,
787 int mpi_errno = MPI_SUCCESS;
788 int i = 0, comm_size = comm->size(), total_count = 0, type_size =
790 int is_commutative = 0;
/* NOTE(review): heap-allocated; ensure disps is delete[]d on every return
 * path — the freeing lines are not visible here. */
791 int* disps = new int[comm_size];
793 if(mv2_red_scat_thresholds_table==NULL)
794 init_mv2_reduce_scatter_tables_stampede();
796 is_commutative=(op==MPI_OP_NULL || op->is_commutative());
/* Build the displacement array and the total element count. */
797 for (i = 0; i < comm_size; i++) {
798 disps[i] = total_count;
799 total_count += recvcnts[i];
802 type_size=datatype->size();
803 nbytes = total_count * type_size;
805 if (is_commutative) {
807 int range_threshold = 0;
809 /* Search for the corresponding system size inside the tuning table */
810 while ((range < (mv2_size_red_scat_tuning_table - 1)) &&
811 (comm_size > mv2_red_scat_thresholds_table[range].numproc)) {
814 /* Search for corresponding inter-leader function */
815 while ((range_threshold < (mv2_red_scat_thresholds_table[range].size_inter_table - 1))
817 mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].max)
818 && (mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].max !=
823 /* Set inter-leader pt */
824 MV2_Red_scat_function =
825 mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].
826 MV2_pt_Red_scat_function;
828 mpi_errno = MV2_Red_scat_function(sendbuf, recvbuf,
/* Non-commutative path: check for equal block sizes ("block regular"). */
832 int is_block_regular = 1;
833 for (i = 0; i < (comm_size - 1); ++i) {
834 if (recvcnts[i] != recvcnts[i+1]) {
835 is_block_regular = 0;
840 while (pof2 < comm_size) pof2 <<= 1;
841 if (pof2 == comm_size && is_block_regular) {
842 /* noncommutative, pof2 size, and block regular */
843 MPIR_Reduce_scatter_non_comm_MV2(sendbuf, recvbuf,
847 mpi_errno = Coll_reduce_scatter_mpich_rdb::reduce_scatter(sendbuf, recvbuf,
/* Tuned MPI_Scatter: per-(ppn, comm_size, nbytes) table lookup selecting
 * inter-leader and intra-node scatter functions.  The multicast wrapper is
 * replaced (next table entry or binomial) since mcast may be unavailable, and
 * the two-level variants require a blocked communicator. */
858 int Coll_scatter_mvapich2::scatter(void *sendbuf,
860 MPI_Datatype sendtype,
863 MPI_Datatype recvtype,
864 int root, MPI_Comm comm)
866 int range = 0, range_threshold = 0, range_threshold_intra = 0;
867 int mpi_errno = MPI_SUCCESS;
868 // int mpi_errno_ret = MPI_SUCCESS;
869 int rank, nbytes, comm_size;
870 int partial_sub_ok = 0;
873 // MPID_Comm *shmem_commptr=NULL;
874 if(mv2_scatter_thresholds_table==NULL)
875 init_mv2_scatter_tables_stampede();
/* A leader communicator is needed for the two-level variants. */
877 if(comm->get_leaders_comm()==MPI_COMM_NULL){
881 comm_size = comm->size();
/* nbytes is computed from the send side and the receive side below;
 * NOTE(review): presumably selected by a rank==root test on lines not
 * visible here — confirm against the full file. */
886 int sendtype_size = sendtype->size();
887 nbytes = sendcnt * sendtype_size;
889 int recvtype_size = recvtype->size();
890 nbytes = recvcnt * recvtype_size;
893 // check if safe to use partial subscription mode
894 if (comm->is_uniform()) {
896 shmem_comm = comm->get_intra_comm();
897 if (mv2_scatter_table_ppn_conf[0] == -1) {
898 // Indicating user defined tuning
901 int local_size = shmem_comm->size();
904 if (local_size == mv2_scatter_table_ppn_conf[i]) {
910 } while(i < mv2_scatter_num_ppn_conf);
914 if (partial_sub_ok != 1) {
918 /* Search for the corresponding system size inside the tuning table */
919 while ((range < (mv2_size_scatter_tuning_table[conf_index] - 1)) &&
920 (comm_size > mv2_scatter_thresholds_table[conf_index][range].numproc)) {
923 /* Search for corresponding inter-leader function */
924 while ((range_threshold < (mv2_scatter_thresholds_table[conf_index][range].size_inter_table - 1))
926 mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold].max)
927 && (mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold].max != -1)) {
931 /* Search for corresponding intra-node function */
932 while ((range_threshold_intra <
933 (mv2_scatter_thresholds_table[conf_index][range].size_intra_table - 1))
935 mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].max)
936 && (mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].max !=
938 range_threshold_intra++;
941 MV2_Scatter_function = mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold]
942 .MV2_pt_Scatter_function;
/* The mcast wrapper is only usable when mcast support is compiled in and
 * ready; otherwise substitute the next table entry or binomial. */
944 if(MV2_Scatter_function == &MPIR_Scatter_mcst_wrap_MV2) {
945 #if defined(_MCST_SUPPORT_)
946 if(comm->ch.is_mcast_ok == 1
947 && mv2_use_mcast_scatter == 1
948 && comm->ch.shmem_coll_ok == 1) {
949 MV2_Scatter_function = &MPIR_Scatter_mcst_MV2;
951 #endif /*#if defined(_MCST_SUPPORT_) */
953 if(mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold + 1].
954 MV2_pt_Scatter_function != NULL) {
955 MV2_Scatter_function = mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold + 1]
956 .MV2_pt_Scatter_function;
959 MV2_Scatter_function = &MPIR_Scatter_MV2_Binomial;
/* Two-level variants need a blocked rank layout; otherwise use binomial. */
964 if( (MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Direct) ||
965 (MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Binomial)) {
966 if( comm->is_blocked()) {
967 MV2_Scatter_intra_function = mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra]
968 .MV2_pt_Scatter_function;
971 MV2_Scatter_function(sendbuf, sendcnt, sendtype,
972 recvbuf, recvcnt, recvtype, root,
975 mpi_errno = MPIR_Scatter_MV2_Binomial(sendbuf, sendcnt, sendtype,
976 recvbuf, recvcnt, recvtype, root,
981 mpi_errno = MV2_Scatter_function(sendbuf, sendcnt, sendtype,
982 recvbuf, recvcnt, recvtype, root,
/* Frees all tuning tables allocated by the init_mv2_*_tables_stampede()
 * functions.  The alltoall/allgather/scatter tables are 2-D (per-ppn
 * configuration), hence the extra [0] deletion guarded by a null check. */
void smpi_coll_cleanup_mvapich2()
993 if (mv2_alltoall_thresholds_table)
994 delete[] mv2_alltoall_thresholds_table[0];
995 delete[] mv2_alltoall_thresholds_table;
996 delete[] mv2_size_alltoall_tuning_table;
997 delete[] mv2_alltoall_table_ppn_conf;
999 delete[] mv2_gather_thresholds_table;
1000 if (mv2_allgather_thresholds_table)
1001 delete[] mv2_allgather_thresholds_table[0];
1002 delete[] mv2_size_allgather_tuning_table;
1003 delete[] mv2_allgather_table_ppn_conf;
1004 delete[] mv2_allgather_thresholds_table;
1006 delete[] mv2_allgatherv_thresholds_table;
1007 delete[] mv2_reduce_thresholds_table;
1008 delete[] mv2_red_scat_thresholds_table;
1009 delete[] mv2_allreduce_thresholds_table;
1010 delete[] mv2_bcast_thresholds_table;
1011 if (mv2_scatter_thresholds_table)
1012 delete[] mv2_scatter_thresholds_table[0];
1013 delete[] mv2_scatter_thresholds_table;
1014 delete[] mv2_size_scatter_tuning_table;
1015 delete[] mv2_scatter_table_ppn_conf;