1 /* selector for collective algorithms based on mvapich decision logic */
3 /* Copyright (c) 2009-2010, 2013-2017. The SimGrid Team.
4 * All rights reserved. */
6 /* This program is free software; you can redistribute it and/or modify it
7 * under the terms of the license (GNU LGPL) which comes with this package. */
9 #include "colls_private.h"
11 #include "smpi_mvapich2_selector_stampede.h"
// Selects and runs an MVAPICH2-tuned MPI_Alltoall algorithm: first lazily
// initializes the Stampede tuning tables, then walks the table row matching
// comm_size and picks the algorithm whose message-size threshold covers nbytes.
// NOTE(review): the embedded original line numbers jump (21-23, 26, 30, ...),
// so loop increments, closing braces and parts of conditions are missing from
// this view; comments below describe only what is visible.
17 int Coll_alltoall_mvapich2::alltoall( void *sendbuf, int sendcount,
18 MPI_Datatype sendtype,
19 void* recvbuf, int recvcount,
20 MPI_Datatype recvtype,
// Lazy one-time initialization of the alltoall tuning tables.
24 if(mv2_alltoall_table_ppn_conf==NULL)
25 init_mv2_alltoall_tables_stampede();
27 int sendtype_size, recvtype_size, comm_size;
28 char * tmp_buf = NULL;
29 int mpi_errno=MPI_SUCCESS;
31 int range_threshold = 0;
33 comm_size = comm->size();
35 sendtype_size=sendtype->size();
36 recvtype_size=recvtype->size();
// Per-rank payload size in bytes; drives the threshold search below.
37 long nbytes = sendtype_size * sendcount;
39 /* check if safe to use partial subscription mode */
41 /* Search for the corresponding system size inside the tuning table */
42 while ((range < (mv2_size_alltoall_tuning_table[conf_index] - 1)) &&
43 (comm_size > mv2_alltoall_thresholds_table[conf_index][range].numproc)) {
46 /* Search for corresponding inter-leader function */
// Advance range_threshold while nbytes exceeds the entry's max (a max of -1
// marks an unbounded/last entry). Part of this condition is not visible here.
47 while ((range_threshold < (mv2_alltoall_thresholds_table[conf_index][range].size_table - 1))
49 mv2_alltoall_thresholds_table[conf_index][range].algo_table[range_threshold].max)
50 && (mv2_alltoall_thresholds_table[conf_index][range].algo_table[range_threshold].max != -1)) {
// Function pointer to the chosen alltoall implementation.
53 MV2_Alltoall_function = mv2_alltoall_thresholds_table[conf_index][range].algo_table[range_threshold]
54 .MV2_pt_Alltoall_function;
// Out-of-place case: call the selected algorithm directly.
56 if(sendbuf != MPI_IN_PLACE) {
57 mpi_errno = MV2_Alltoall_function(sendbuf, sendcount, sendtype,
58 recvbuf, recvcount, recvtype,
// In-place case (condition lines partially missing): if nbytes falls outside
// the in-place algorithm's [min, max] window, emulate MPI_IN_PLACE by copying
// recvbuf into a temporary send buffer and running the selected algorithm.
63 mv2_alltoall_thresholds_table[conf_index][range].in_place_algo_table[range_threshold].min
64 ||nbytes > mv2_alltoall_thresholds_table[conf_index][range].in_place_algo_table[range_threshold].max
66 tmp_buf = (char *)smpi_get_tmp_sendbuffer( comm_size * recvcount * recvtype_size );
67 mpi_errno = Datatype::copy((char *)recvbuf,
68 comm_size*recvcount, recvtype,
70 comm_size*recvcount, recvtype);
71 mpi_errno = MV2_Alltoall_function(tmp_buf, recvcount, recvtype,
72 // (typo guard: embedded numbering resumes below)
72 mpi_errno = MV2_Alltoall_function(tmp_buf, recvcount, recvtype,
73 recvbuf, recvcount, recvtype,
75 smpi_free_tmp_buffer(tmp_buf);
// Otherwise use the dedicated in-place alltoall implementation.
77 mpi_errno = MPIR_Alltoall_inplace_MV2(sendbuf, sendcount, sendtype,
78 recvbuf, recvcount, recvtype,
// Selects an MVAPICH2-tuned MPI_Allgather algorithm. Handles "partial
// subscription" (fewer ranks per node than the tuning assumed) by matching the
// intra-node communicator size against the per-ppn configuration table, then
// chooses a one- or two-level algorithm based on comm_size and nbytes.
// NOTE(review): embedded original line numbers jump throughout; loop bodies,
// increments and closing braces are missing from this view.
87 int Coll_allgather_mvapich2::allgather(void *sendbuf, int sendcount, MPI_Datatype sendtype,
88 void *recvbuf, int recvcount, MPI_Datatype recvtype,
92 int mpi_errno = MPI_SUCCESS;
93 long nbytes = 0, comm_size, recvtype_size;
95 int partial_sub_ok = 0;
97 int range_threshold = 0;
101 //MPI_Comm *shmem_commptr=NULL;
102 /* Get the size of the communicator */
103 comm_size = comm->size();
104 recvtype_size=recvtype->size();
// Per-rank receive payload in bytes.
105 nbytes = recvtype_size * recvcount;
// Lazy one-time initialization of the allgather tuning tables.
107 if(mv2_allgather_table_ppn_conf==NULL)
108 init_mv2_allgather_tables_stampede();
// Leader communicator must exist before the SMP-aware paths can be used;
// the action taken when it is NULL is on lines not visible here.
110 if(comm->get_leaders_comm()==MPI_COMM_NULL){
// Uniform = same number of processes on every node; required for the
// per-ppn table lookup below.
115 if (comm->is_uniform()){
116 shmem_comm = comm->get_intra_comm();
117 local_size = shmem_comm->size();
119 if (mv2_allgather_table_ppn_conf[0] == -1) {
120 // Indicating user defined tuning
// Scan ppn configurations for one matching the local (per-node) size.
125 if (local_size == mv2_allgather_table_ppn_conf[i]) {
131 } while(i < mv2_allgather_num_ppn_conf);
134 if (partial_sub_ok != 1) {
138 /* Search for the corresponding system size inside the tuning table */
139 while ((range < (mv2_size_allgather_tuning_table[conf_index] - 1)) &&
141 mv2_allgather_thresholds_table[conf_index][range].numproc)) {
144 /* Search for corresponding inter-leader function */
// max == -1 marks the open-ended last threshold entry.
145 while ((range_threshold <
146 (mv2_allgather_thresholds_table[conf_index][range].size_inter_table - 1))
147 && (nbytes > mv2_allgather_thresholds_table[conf_index][range].inter_leader[range_threshold].max)
148 && (mv2_allgather_thresholds_table[conf_index][range].inter_leader[range_threshold].max !=
153 /* Set inter-leader pt */
155 mv2_allgather_thresholds_table[conf_index][range].inter_leader[range_threshold].
156 MV2_pt_Allgatherction;
158 is_two_level = mv2_allgather_thresholds_table[conf_index][range].two_level[range_threshold];
160 /* intracommunicator */
// Two-level (SMP-aware) path: only valid with partial-subscription match and
// a blocked rank layout; otherwise fall back to MPICH or recursive doubling.
161 if(is_two_level ==1){
162 if(partial_sub_ok ==1){
163 if (comm->is_blocked()){
164 mpi_errno = MPIR_2lvl_Allgather_MV2(sendbuf, sendcount, sendtype,
165 recvbuf, recvcount, recvtype,
168 mpi_errno = Coll_allgather_mpich::allgather(sendbuf, sendcount, sendtype,
169 recvbuf, recvcount, recvtype,
173 mpi_errno = MPIR_Allgather_RD_MV2(sendbuf, sendcount, sendtype,
174 recvbuf, recvcount, recvtype,
// Single-level path: call the selected pointer only if it is one of the
// known safe implementations (Bruck, recursive doubling, ring).
177 } else if(MV2_Allgatherction == &MPIR_Allgather_Bruck_MV2
178 || MV2_Allgatherction == &MPIR_Allgather_RD_MV2
179 || MV2_Allgatherction == &MPIR_Allgather_Ring_MV2) {
180 mpi_errno = MV2_Allgatherction(sendbuf, sendcount, sendtype,
181 recvbuf, recvcount, recvtype,
// Unknown selection: report an error rather than dereference it.
184 return MPI_ERR_OTHER;
// Selects an MVAPICH2-tuned MPI_Gather algorithm: chooses an inter-leader and
// an intra-node function from the tuning tables (by comm_size and nbytes) and
// uses the two-level scheme when the rank layout is blocked, otherwise falls
// back to the MPICH gather.
// NOTE(review): embedded original line numbers jump; the rank==root /
// rank!=root branching around the two nbytes computations is not visible here.
190 int Coll_gather_mvapich2::gather(void *sendbuf,
192 MPI_Datatype sendtype,
195 MPI_Datatype recvtype,
196 int root, MPI_Comm comm)
// Lazy one-time initialization of the gather tuning tables.
198 if(mv2_gather_thresholds_table==NULL)
199 init_mv2_gather_tables_stampede();
201 int mpi_errno = MPI_SUCCESS;
203 int range_threshold = 0;
204 int range_intra_threshold = 0;
207 int recvtype_size, sendtype_size;
209 comm_size = comm->size();
// nbytes is computed from recvcnt and then from sendcnt; presumably these sit
// in a root / non-root conditional on the missing lines — TODO confirm.
213 recvtype_size=recvtype->size();
214 nbytes = recvcnt * recvtype_size;
216 sendtype_size=sendtype->size();
217 nbytes = sendcnt * sendtype_size;
220 /* Search for the corresponding system size inside the tuning table */
221 while ((range < (mv2_size_gather_tuning_table - 1)) &&
222 (comm_size > mv2_gather_thresholds_table[range].numproc)) {
225 /* Search for corresponding inter-leader function */
// max == -1 marks the open-ended last threshold entry.
226 while ((range_threshold < (mv2_gather_thresholds_table[range].size_inter_table - 1))
228 mv2_gather_thresholds_table[range].inter_leader[range_threshold].max)
229 && (mv2_gather_thresholds_table[range].inter_leader[range_threshold].max !=
234 /* Search for corresponding intra node function */
235 while ((range_intra_threshold < (mv2_gather_thresholds_table[range].size_intra_table - 1))
237 mv2_gather_thresholds_table[range].intra_node[range_intra_threshold].max)
238 && (mv2_gather_thresholds_table[range].intra_node[range_intra_threshold].max !=
240 range_intra_threshold++;
// Two-level gather requires a blocked rank-to-node mapping.
243 if (comm->is_blocked() ) {
244 // Set intra-node function pt for gather_two_level
245 MV2_Gather_intra_node_function =
246 mv2_gather_thresholds_table[range].intra_node[range_intra_threshold].
247 MV2_pt_Gather_function;
248 //Set inter-leader pt
249 MV2_Gather_inter_leader_function =
250 mv2_gather_thresholds_table[range].inter_leader[range_threshold].
251 MV2_pt_Gather_function;
252 // We call Gather function
254 MV2_Gather_inter_leader_function(sendbuf, sendcnt, sendtype, recvbuf, recvcnt,
255 recvtype, root, comm);
// Non-blocked layout: use the plain (non SMP-aware) MPICH gather.
258 // Indeed, direct (non SMP-aware)gather is MPICH one
259 mpi_errno = Coll_gather_mpich::gather(sendbuf, sendcnt, sendtype,
260 recvbuf, recvcnt, recvtype,
// Selects an MVAPICH2-tuned MPI_Allgatherv algorithm. Computes the total
// payload from recvcounts, looks up the inter-leader function by comm_size and
// per-rank nbytes, and special-cases recursive doubling (only valid for
// power-of-two communicator sizes; Bruck is the fallback).
// NOTE(review): embedded original line numbers jump; braces, increments and
// else-branches are missing from this view.
267 int Coll_allgatherv_mvapich2::allgatherv(void *sendbuf, int sendcount, MPI_Datatype sendtype,
268 void *recvbuf, int *recvcounts, int *displs,
269 MPI_Datatype recvtype, MPI_Comm comm )
271 int mpi_errno = MPI_SUCCESS;
272 int range = 0, comm_size, total_count, recvtype_size, i;
273 int range_threshold = 0;
// Lazy one-time initialization of the allgatherv tuning tables.
276 if(mv2_allgatherv_thresholds_table==NULL)
277 init_mv2_allgatherv_tables_stampede();
279 comm_size = comm->size();
// total_count accumulates all receive counts. NOTE(review): no initialization
// of total_count is visible here — presumably `total_count = 0;` sits on one
// of the missing lines (original 280); verify against upstream.
281 for (i = 0; i < comm_size; i++)
282 total_count += recvcounts[i];
284 recvtype_size=recvtype->size();
285 nbytes = total_count * recvtype_size;
287 /* Search for the corresponding system size inside the tuning table */
288 while ((range < (mv2_size_allgatherv_tuning_table - 1)) &&
289 (comm_size > mv2_allgatherv_thresholds_table[range].numproc)) {
292 /* Search for corresponding inter-leader function */
// Threshold comparison scales the table max by comm_size (per-rank bytes);
// max == -1 marks the open-ended last entry.
293 while ((range_threshold < (mv2_allgatherv_thresholds_table[range].size_inter_table - 1))
295 comm_size * mv2_allgatherv_thresholds_table[range].inter_leader[range_threshold].max)
296 && (mv2_allgatherv_thresholds_table[range].inter_leader[range_threshold].max !=
300 /* Set inter-leader pt */
301 MV2_Allgatherv_function =
302 mv2_allgatherv_thresholds_table[range].inter_leader[range_threshold].
303 MV2_pt_Allgatherv_function;
// Recursive doubling only works when comm_size is a power of two; the
// (comm_size & (comm_size - 1)) == 0 test checks exactly that.
305 if (MV2_Allgatherv_function == &MPIR_Allgatherv_Rec_Doubling_MV2)
307 if (not(comm_size & (comm_size - 1))) {
309 MPIR_Allgatherv_Rec_Doubling_MV2(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype, comm);
// Non power-of-two: fall back to Bruck's algorithm.
312 MPIR_Allgatherv_Bruck_MV2(sendbuf, sendcount,
// Any other selected function is called directly.
319 MV2_Allgatherv_function(sendbuf, sendcount, sendtype,
320 recvbuf, recvcounts, displs,
// Selects an MVAPICH2-tuned MPI_Allreduce algorithm. Skips multicast-based
// entries when mcast is unavailable, chooses inter-leader and intra-node
// functions by comm_size/nbytes, and uses the two-level scheme only for
// commutative operations with a valid leader communicator.
// NOTE(review): embedded original line numbers jump; increments, braces and
// several condition halves are missing from this view.
329 int Coll_allreduce_mvapich2::allreduce(void *sendbuf,
332 MPI_Datatype datatype,
333 MPI_Op op, MPI_Comm comm)
336 int mpi_errno = MPI_SUCCESS;
340 comm_size = comm->size();
341 //rank = comm->rank();
// Lazy one-time initialization of the allreduce tuning tables.
347 if (mv2_allreduce_thresholds_table == NULL)
348 init_mv2_allreduce_tables_stampede();
350 /* check if multiple threads are calling this collective function */
352 MPI_Aint sendtype_size = 0;
354 int range = 0, range_threshold = 0, range_threshold_intra = 0;
355 int is_two_level = 0;
// NOTE(review): is_commutative is initialized to 0 and the line that would
// set it (via op->is_commutative()) is commented out below — so with the
// visible code the two-level/commutative branch never triggers; confirm
// whether an assignment exists on the missing lines.
356 int is_commutative = 0;
357 MPI_Aint true_lb, true_extent;
359 sendtype_size=datatype->size();
360 nbytes = count * sendtype_size;
362 datatype->extent(&true_lb, &true_extent);
364 //is_commutative = op->is_commutative();
367 /* Search for the corresponding system size inside the tuning table */
368 while ((range < (mv2_size_allreduce_tuning_table - 1)) &&
369 (comm_size > mv2_allreduce_thresholds_table[range].numproc)) {
372 /* Search for corresponding inter-leader function */
373 /* skip mcast poiters if mcast is not available */
// Without mcast support, skip table entries pointing at mcast algorithms.
374 if(mv2_allreduce_thresholds_table[range].mcast_enabled != 1){
375 while ((range_threshold < (mv2_allreduce_thresholds_table[range].size_inter_table - 1))
376 && ((mv2_allreduce_thresholds_table[range].
377 inter_leader[range_threshold].MV2_pt_Allreducection
378 == &MPIR_Allreduce_mcst_reduce_redscat_gather_MV2) ||
379 (mv2_allreduce_thresholds_table[range].
380 inter_leader[range_threshold].MV2_pt_Allreducection
381 == &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2)
// Advance by message size; max == -1 marks the open-ended last entry.
386 while ((range_threshold < (mv2_allreduce_thresholds_table[range].size_inter_table - 1))
388 mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].max)
389 && (mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].max != -1)) {
392 if(mv2_allreduce_thresholds_table[range].is_two_level_allreduce[range_threshold] == 1){
395 /* Search for corresponding intra-node function */
396 while ((range_threshold_intra <
397 (mv2_allreduce_thresholds_table[range].size_intra_table - 1))
399 mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra].max)
400 && (mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra].max !=
402 range_threshold_intra++;
405 MV2_Allreducection = mv2_allreduce_thresholds_table[range].inter_leader[range_threshold]
406 .MV2_pt_Allreducection;
408 MV2_Allreduce_intra_function = mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra]
409 .MV2_pt_Allreducection;
411 /* check if mcast is ready, otherwise replace mcast with other algorithm */
// Mcast algorithms are never ready here: substitute point-to-point
// recursive doubling.
412 if((MV2_Allreducection == &MPIR_Allreduce_mcst_reduce_redscat_gather_MV2)||
413 (MV2_Allreducection == &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2)){
415 MV2_Allreducection = &MPIR_Allreduce_pt2pt_rd_MV2;
417 if(is_two_level != 1) {
418 MV2_Allreducection = &MPIR_Allreduce_pt2pt_rd_MV2;
// Two-level path: requires a commutative op and a valid leader communicator;
// otherwise drop to point-to-point recursive doubling.
422 if(is_two_level == 1){
423 // check if shm is ready, if not use other algorithm first
424 if (is_commutative) {
425 if(comm->get_leaders_comm()==MPI_COMM_NULL){
428 mpi_errno = MPIR_Allreduce_two_level_MV2(sendbuf, recvbuf, count,
431 mpi_errno = MPIR_Allreduce_pt2pt_rd_MV2(sendbuf, recvbuf, count,
435 mpi_errno = MV2_Allreducection(sendbuf, recvbuf, count,
440 //comm->ch.intra_node_done=0;
448 int Coll_alltoallv_mvapich2::alltoallv(void *sbuf, int *scounts, int *sdisps,
450 void *rbuf, int *rcounts, int *rdisps,
456 if (sbuf == MPI_IN_PLACE) {
457 return Coll_alltoallv_ompi_basic_linear::alltoallv(sbuf, scounts, sdisps, sdtype,
458 rbuf, rcounts, rdisps,rdtype,
460 } else /* For starters, just keep the original algorithm. */
461 return Coll_alltoallv_ring::alltoallv(sbuf, scounts, sdisps, sdtype,
462 rbuf, rcounts, rdisps,rdtype,
467 int Coll_barrier_mvapich2::barrier(MPI_Comm comm)
469 return Coll_barrier_mvapich2_pair::barrier(comm);
// Selects an MVAPICH2-tuned MPI_Bcast algorithm. Chooses inter-leader and
// intra-node functions plus knomial factors and pipeline segment size from
// the tuning tables, then runs either a two-level (inter-node + intra-node)
// broadcast or a single-level one. Non-contiguous / heterogeneous data is
// staged through a packed temporary buffer of MPI_BYTE.
// NOTE(review): embedded original line numbers jump heavily; initializations
// of is_homogeneous/is_contig, several braces, increments and else-branches
// are missing from this view.
475 int Coll_bcast_mvapich2::bcast(void *buffer,
477 MPI_Datatype datatype,
478 int root, MPI_Comm comm)
480 int mpi_errno = MPI_SUCCESS;
481 int comm_size/*, rank*/;
482 int two_level_bcast = 1;
485 int range_threshold = 0;
486 int range_threshold_intra = 0;
// NOTE(review): no visible initialization of is_homogeneous / is_contig —
// presumably set on the missing lines; verify against upstream.
487 int is_homogeneous, is_contig;
490 void *tmp_buf = NULL;
492 //MPID_Datatype *dtp;
// Leader communicator must exist; action when NULL is on missing lines.
496 if(comm->get_leaders_comm()==MPI_COMM_NULL){
// Lazy one-time initialization of the bcast tuning tables.
499 if (not mv2_bcast_thresholds_table)
500 init_mv2_bcast_tables_stampede();
501 comm_size = comm->size();
502 //rank = comm->rank();
505 /* if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)*/
508 /* MPID_Datatype_get_ptr(datatype, dtp);*/
509 /* is_contig = dtp->is_contig;*/
514 /* MPI_Type_size() might not give the accurate size of the packed
515 * datatype for heterogeneous systems (because of padding, encoding,
516 * etc). On the other hand, MPI_Pack_size() can become very
517 * expensive, depending on the implementation, especially for
518 * heterogeneous systems. We want to use MPI_Type_size() wherever
519 * possible, and MPI_Pack_size() in other places.
521 //if (is_homogeneous) {
522 type_size=datatype->size();
// Heterogeneous path (branch structure partially missing in this view).
525 MPIR_Pack_size_impl(1, datatype, &type_size);
527 nbytes = (count) * (type_size);
529 /* Search for the corresponding system size inside the tuning table */
530 while ((range < (mv2_size_bcast_tuning_table - 1)) &&
531 (comm_size > mv2_bcast_thresholds_table[range].numproc)) {
534 /* Search for corresponding inter-leader function */
// max == -1 marks the open-ended last threshold entry.
535 while ((range_threshold < (mv2_bcast_thresholds_table[range].size_inter_table - 1))
537 mv2_bcast_thresholds_table[range].inter_leader[range_threshold].max)
538 && (mv2_bcast_thresholds_table[range].inter_leader[range_threshold].max != -1)) {
542 /* Search for corresponding intra-node function */
543 while ((range_threshold_intra <
544 (mv2_bcast_thresholds_table[range].size_intra_table - 1))
546 mv2_bcast_thresholds_table[range].intra_node[range_threshold_intra].max)
547 && (mv2_bcast_thresholds_table[range].intra_node[range_threshold_intra].max !=
549 range_threshold_intra++;
// Inter-leader function pointer (assignment target on a missing line).
553 mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
554 MV2_pt_Bcast_function;
556 MV2_Bcast_intra_node_function =
557 mv2_bcast_thresholds_table[range].
558 intra_node[range_threshold_intra].MV2_pt_Bcast_function;
560 /* if (mv2_user_bcast_intra == NULL && */
561 /* MV2_Bcast_intra_node_function == &MPIR_Knomial_Bcast_intra_node_MV2) {*/
562 /* MV2_Bcast_intra_node_function = &MPIR_Shmem_Bcast_MV2;*/
// Zero-copy pipelined knomial factor: table value wins, then a global
// override (mv2_pipelined_zcpy_knomial_factor) if set.
565 if (mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
566 zcpy_pipelined_knomial_factor != -1) {
567 zcpy_knomial_factor =
568 mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
569 zcpy_pipelined_knomial_factor;
572 if (mv2_pipelined_zcpy_knomial_factor != -1) {
573 zcpy_knomial_factor = mv2_pipelined_zcpy_knomial_factor;
576 if(MV2_Bcast_intra_node_function == NULL) {
577 /* if tuning table do not have any intra selection, set func pointer to
578 ** default one for mcast intra node */
579 MV2_Bcast_intra_node_function = &MPIR_Shmem_Bcast_MV2;
582 /* Set value of pipeline segment size */
583 bcast_segment_size = mv2_bcast_thresholds_table[range].bcast_segment_size;
585 /* Set value of inter node knomial factor */
586 mv2_inter_node_knomial_factor = mv2_bcast_thresholds_table[range].inter_node_knomial_factor;
588 /* Set value of intra node knomial factor */
589 mv2_intra_node_knomial_factor = mv2_bcast_thresholds_table[range].intra_node_knomial_factor;
591 /* Check if we will use a two level algorithm or not */
593 #if defined(_MCST_SUPPORT_)
594 mv2_bcast_thresholds_table[range].is_two_level_bcast[range_threshold]
595 || comm->ch.is_mcast_ok;
597 mv2_bcast_thresholds_table[range].is_two_level_bcast[range_threshold];
599 if (two_level_bcast == 1) {
// Non-contiguous or heterogeneous data: broadcast a packed byte buffer.
600 if (not is_contig || not is_homogeneous) {
601 tmp_buf = (void*)smpi_get_tmp_sendbuffer(nbytes);
604 /* if (rank == root) {*/
606 /* MPIR_Pack_impl(buffer, count, datatype, tmp_buf, nbytes, &position);*/
608 /* MPIU_ERR_POP(mpi_errno);*/
611 #ifdef CHANNEL_MRAIL_GEN2
// Zero-copy pipelined broadcast path (InfiniBand Gen2 channel only).
612 if ((mv2_enable_zcpy_bcast == 1) &&
613 (&MPIR_Pipelined_Bcast_Zcpy_MV2 == MV2_Bcast_function)) {
614 if (not is_contig || not is_homogeneous) {
615 mpi_errno = MPIR_Pipelined_Bcast_Zcpy_MV2(tmp_buf, nbytes, MPI_BYTE, root, comm);
617 mpi_errno = MPIR_Pipelined_Bcast_Zcpy_MV2(buffer, count, datatype,
621 #endif /* defined(CHANNEL_MRAIL_GEN2) */
// Standard two-level path: inter-node phase first, then intra-node phase
// rooted at the node-local leader.
623 shmem_comm = comm->get_intra_comm();
624 if (not is_contig || not is_homogeneous) {
625 mpi_errno = MPIR_Bcast_tune_inter_node_helper_MV2(tmp_buf, nbytes, MPI_BYTE, root, comm);
628 MPIR_Bcast_tune_inter_node_helper_MV2(buffer, count, datatype, root,
632 /* We are now done with the inter-node phase */
635 root = INTRA_NODE_ROOT;
637 if (not is_contig || not is_homogeneous) {
638 mpi_errno = MV2_Bcast_intra_node_function(tmp_buf, nbytes, MPI_BYTE, root, shmem_comm);
640 mpi_errno = MV2_Bcast_intra_node_function(buffer, count,
641 datatype, root, shmem_comm);
645 /* if (not is_contig || not is_homogeneous) {*/
646 /* if (rank != root) {*/
648 /* mpi_errno = MPIR_Unpack_impl(tmp_buf, nbytes, &position, buffer,*/
649 /* count, datatype);*/
// Single-level path: force knomial intra-node and call the selected function.
653 /* We use Knomial for intra node */
654 MV2_Bcast_intra_node_function = &MPIR_Knomial_Bcast_intra_node_MV2;
655 /* if (mv2_enable_shmem_bcast == 0) {*/
656 /* Fall back to non-tuned version */
657 /* MPIR_Bcast_intra_MV2(buffer, count, datatype, root, comm);*/
659 mpi_errno = MV2_Bcast_function(buffer, count, datatype, root,
// Selects an MVAPICH2-tuned MPI_Reduce algorithm. Picks inter-leader and
// intra-node functions by comm_size/nbytes, then dispatches: two-level for
// commutative ops with a valid leader communicator, knomial and
// redscat-gather with their own preconditions, binomial as the safe fallback.
// NOTE(review): embedded original line numbers jump; increments, braces and
// some else-branches are missing from this view.
672 int Coll_reduce_mvapich2::reduce( void *sendbuf,
675 MPI_Datatype datatype,
676 MPI_Op op, int root, MPI_Comm comm)
// Lazy one-time initialization of the reduce tuning tables.
678 if(mv2_reduce_thresholds_table == NULL)
679 init_mv2_reduce_tables_stampede();
681 int mpi_errno = MPI_SUCCESS;
683 int range_threshold = 0;
684 int range_intra_threshold = 0;
685 int is_commutative, pof2;
689 int is_two_level = 0;
691 comm_size = comm->size();
692 sendtype_size=datatype->size();
693 nbytes = count * sendtype_size;
// MPI_OP_NULL is treated as commutative.
698 is_commutative = (op==MPI_OP_NULL || op->is_commutative());
700 /* find nearest power-of-two less than or equal to comm_size */
// NOTE(review): this loop exits with pof2 > comm_size; presumably a
// `pof2 >>= 1;` follows on a missing line (original 702) — verify.
701 for( pof2 = 1; pof2 <= comm_size; pof2 <<= 1 );
705 /* Search for the corresponding system size inside the tuning table */
706 while ((range < (mv2_size_reduce_tuning_table - 1)) &&
707 (comm_size > mv2_reduce_thresholds_table[range].numproc)) {
710 /* Search for corresponding inter-leader function */
// max == -1 marks the open-ended last threshold entry.
711 while ((range_threshold < (mv2_reduce_thresholds_table[range].size_inter_table - 1))
713 mv2_reduce_thresholds_table[range].inter_leader[range_threshold].max)
714 && (mv2_reduce_thresholds_table[range].inter_leader[range_threshold].max !=
719 /* Search for corresponding intra node function */
720 while ((range_intra_threshold < (mv2_reduce_thresholds_table[range].size_intra_table - 1))
722 mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].max)
723 && (mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].max !=
725 range_intra_threshold++;
728 /* Set intra-node function pt for reduce_two_level */
729 MV2_Reduce_intra_function =
730 mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].
731 MV2_pt_Reduce_function;
732 /* Set inter-leader pt */
733 MV2_Reduce_function =
734 mv2_reduce_thresholds_table[range].inter_leader[range_threshold].
735 MV2_pt_Reduce_function;
// Negative knomial factors mean "take the table's degree".
737 if(mv2_reduce_intra_knomial_factor<0)
739 mv2_reduce_intra_knomial_factor = mv2_reduce_thresholds_table[range].intra_k_degree;
741 if(mv2_reduce_inter_knomial_factor<0)
743 mv2_reduce_inter_knomial_factor = mv2_reduce_thresholds_table[range].inter_k_degree;
745 if(mv2_reduce_thresholds_table[range].is_two_level_reduce[range_threshold] == 1){
748 /* We call Reduce function */
// Two-level reduce needs a commutative op and a valid leader communicator;
// binomial is the fallback in every unsupported case.
749 if(is_two_level == 1)
751 if (is_commutative == 1) {
752 if(comm->get_leaders_comm()==MPI_COMM_NULL){
755 mpi_errno = MPIR_Reduce_two_level_helper_MV2(sendbuf, recvbuf, count,
756 datatype, op, root, comm);
758 mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count,
759 datatype, op, root, comm);
// Inter-knomial reduce also requires commutativity.
761 } else if(MV2_Reduce_function == &MPIR_Reduce_inter_knomial_wrapper_MV2 ){
762 if(is_commutative ==1)
764 mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count,
765 datatype, op, root, comm);
767 mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count,
768 datatype, op, root, comm);
// Reduce-scatter + gather requires count >= nearest power of two.
770 } else if(MV2_Reduce_function == &MPIR_Reduce_redscat_gather_MV2){
771 if (/*(HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN) &&*/ (count >= pof2))
773 mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count,
774 datatype, op, root, comm);
776 mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count,
777 datatype, op, root, comm);
// Any other selected function is called directly.
780 mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count,
781 datatype, op, root, comm);
// Selects an MVAPICH2-tuned MPI_Reduce_scatter algorithm. Commutative ops use
// the tuning-table lookup; non-commutative ops use the specialized
// non-commutative algorithm when comm_size is a power of two and the counts
// are uniform, otherwise the MPICH recursive-doubling fallback.
// NOTE(review): embedded original line numbers jump; braces, increments and
// the freeing of `disps` are not visible in this view — confirm `disps` is
// released (xbt_free) on the missing lines to avoid a leak.
790 int Coll_reduce_scatter_mvapich2::reduce_scatter(void *sendbuf, void *recvbuf, int *recvcnts,
791 MPI_Datatype datatype, MPI_Op op,
794 int mpi_errno = MPI_SUCCESS;
795 int i = 0, comm_size = comm->size(), total_count = 0, type_size =
798 int range_threshold = 0;
799 int is_commutative = 0;
// Per-rank displacement table, built as a prefix sum of recvcnts.
800 int *disps = static_cast<int*>(xbt_malloc(comm_size * sizeof (int)));
// Lazy one-time initialization of the reduce_scatter tuning tables.
802 if(mv2_red_scat_thresholds_table==NULL)
803 init_mv2_reduce_scatter_tables_stampede();
// MPI_OP_NULL is treated as commutative.
805 is_commutative=(op==MPI_OP_NULL || op->is_commutative());
806 for (i = 0; i < comm_size; i++) {
807 disps[i] = total_count;
808 total_count += recvcnts[i];
811 type_size=datatype->size();
812 nbytes = total_count * type_size;
814 if (is_commutative) {
816 /* Search for the corresponding system size inside the tuning table */
817 while ((range < (mv2_size_red_scat_tuning_table - 1)) &&
818 (comm_size > mv2_red_scat_thresholds_table[range].numproc)) {
821 /* Search for corresponding inter-leader function */
// max == -1 marks the open-ended last threshold entry.
822 while ((range_threshold < (mv2_red_scat_thresholds_table[range].size_inter_table - 1))
824 mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].max)
825 && (mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].max !=
830 /* Set inter-leader pt */
831 MV2_Red_scat_function =
832 mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].
833 MV2_pt_Red_scat_function;
835 mpi_errno = MV2_Red_scat_function(sendbuf, recvbuf,
// Non-commutative path: check whether all counts are equal.
839 int is_block_regular = 1;
840 for (i = 0; i < (comm_size - 1); ++i) {
841 if (recvcnts[i] != recvcnts[i+1]) {
842 is_block_regular = 0;
// Smallest power of two >= comm_size (pof2 declared on a missing line).
847 while (pof2 < comm_size) pof2 <<= 1;
848 if (pof2 == comm_size && is_block_regular) {
849 /* noncommutative, pof2 size, and block regular */
850 mpi_errno = MPIR_Reduce_scatter_non_comm_MV2(sendbuf, recvbuf,
854 mpi_errno = Coll_reduce_scatter_mpich_rdb::reduce_scatter(sendbuf, recvbuf,
// Selects an MVAPICH2-tuned MPI_Scatter algorithm. Like allgather, it checks
// partial subscription via the intra-node communicator size, picks inter- and
// intra-node functions from the tuning tables, replaces mcast selections when
// multicast is unavailable, and uses the two-level scheme only for blocked
// rank layouts.
// NOTE(review): embedded original line numbers jump; increments, braces and
// the rank==root branching around the two nbytes computations are missing
// from this view.
865 int Coll_scatter_mvapich2::scatter(void *sendbuf,
867 MPI_Datatype sendtype,
870 MPI_Datatype recvtype,
871 int root, MPI_Comm comm)
873 int range = 0, range_threshold = 0, range_threshold_intra = 0;
874 int mpi_errno = MPI_SUCCESS;
875 // int mpi_errno_ret = MPI_SUCCESS;
876 int rank, nbytes, comm_size;
877 int recvtype_size, sendtype_size;
878 int partial_sub_ok = 0;
883 // MPID_Comm *shmem_commptr=NULL;
// Lazy one-time initialization of the scatter tuning tables.
884 if(mv2_scatter_thresholds_table==NULL)
885 init_mv2_scatter_tables_stampede();
// Leader communicator must exist; action when NULL is on missing lines.
887 if(comm->get_leaders_comm()==MPI_COMM_NULL){
891 comm_size = comm->size();
// nbytes computed from sendcnt then recvcnt; presumably inside a root /
// non-root conditional on the missing lines — TODO confirm.
896 sendtype_size=sendtype->size();
897 nbytes = sendcnt * sendtype_size;
899 recvtype_size=recvtype->size();
900 nbytes = recvcnt * recvtype_size;
903 // check if safe to use partial subscription mode
904 if (comm->is_uniform()) {
906 shmem_comm = comm->get_intra_comm();
907 local_size = shmem_comm->size();
909 if (mv2_scatter_table_ppn_conf[0] == -1) {
910 // Indicating user defined tuning
// Scan ppn configurations for one matching the local (per-node) size.
914 if (local_size == mv2_scatter_table_ppn_conf[i]) {
920 } while(i < mv2_scatter_num_ppn_conf);
924 if (partial_sub_ok != 1) {
928 /* Search for the corresponding system size inside the tuning table */
929 while ((range < (mv2_size_scatter_tuning_table[conf_index] - 1)) &&
930 (comm_size > mv2_scatter_thresholds_table[conf_index][range].numproc)) {
933 /* Search for corresponding inter-leader function */
// max == -1 marks the open-ended last threshold entry.
934 while ((range_threshold < (mv2_scatter_thresholds_table[conf_index][range].size_inter_table - 1))
936 mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold].max)
937 && (mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold].max != -1)) {
941 /* Search for corresponding intra-node function */
942 while ((range_threshold_intra <
943 (mv2_scatter_thresholds_table[conf_index][range].size_intra_table - 1))
945 mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].max)
946 && (mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].max !=
948 range_threshold_intra++;
951 MV2_Scatter_function = mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold]
952 .MV2_pt_Scatter_function;
// Mcast selections are replaced: without mcast support, fall through to the
// next table entry if present, otherwise binomial.
954 if(MV2_Scatter_function == &MPIR_Scatter_mcst_wrap_MV2) {
955 #if defined(_MCST_SUPPORT_)
956 if(comm->ch.is_mcast_ok == 1
957 && mv2_use_mcast_scatter == 1
958 && comm->ch.shmem_coll_ok == 1) {
959 MV2_Scatter_function = &MPIR_Scatter_mcst_MV2;
961 #endif /*#if defined(_MCST_SUPPORT_) */
962 // NOTE(review): indexing [range_threshold + 1] assumes a following table
963 // entry exists; any bounds guard would be on the missing lines — verify.
963 if(mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold + 1].
964 MV2_pt_Scatter_function != NULL) {
965 MV2_Scatter_function = mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold + 1]
966 .MV2_pt_Scatter_function;
969 MV2_Scatter_function = &MPIR_Scatter_MV2_Binomial;
// Two-level scatter needs a blocked layout; otherwise binomial fallback.
974 if( (MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Direct) ||
975 (MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Binomial)) {
976 if( comm->is_blocked()) {
977 MV2_Scatter_intra_function = mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra]
978 .MV2_pt_Scatter_function;
981 MV2_Scatter_function(sendbuf, sendcnt, sendtype,
982 recvbuf, recvcnt, recvtype, root,
985 mpi_errno = MPIR_Scatter_MV2_Binomial(sendbuf, sendcnt, sendtype,
986 recvbuf, recvcnt, recvtype, root,
991 mpi_errno = MV2_Scatter_function(sendbuf, sendcnt, sendtype,
992 recvbuf, recvcnt, recvtype, root,
// Releases all heap-allocated MVAPICH2 tuning tables created by the
// init_mv2_*_tables_stampede() functions. Two-dimensional tables free their
// inner rows before the outer array.
// NOTE(review): the loop variable `i` used below has no visible declaration
// or loop header — presumably a `for` over the alltoall ppn configurations
// sits on the missing lines (original 1002-1003); verify against upstream.
// Also note the asymmetry: alltoall frees row [i] (per-configuration) while
// allgather/scatter free only row [0] — confirm this matches how the tables
// were allocated.
1001 void smpi_coll_cleanup_mvapich2()
1004 if (mv2_alltoall_thresholds_table)
1005 xbt_free(mv2_alltoall_thresholds_table[i]);
1006 xbt_free(mv2_alltoall_thresholds_table);
1007 xbt_free(mv2_size_alltoall_tuning_table);
1008 xbt_free(mv2_alltoall_table_ppn_conf);
1010 xbt_free(mv2_gather_thresholds_table);
1011 if (mv2_allgather_thresholds_table)
1012 xbt_free(mv2_allgather_thresholds_table[0]);
1013 xbt_free(mv2_size_allgather_tuning_table);
1014 xbt_free(mv2_allgather_table_ppn_conf);
1015 xbt_free(mv2_allgather_thresholds_table);
1017 xbt_free(mv2_allgatherv_thresholds_table);
1018 xbt_free(mv2_reduce_thresholds_table);
1019 xbt_free(mv2_red_scat_thresholds_table);
1020 xbt_free(mv2_allreduce_thresholds_table);
1021 xbt_free(mv2_bcast_thresholds_table);
1022 if (mv2_scatter_thresholds_table)
1023 xbt_free(mv2_scatter_thresholds_table[0]);
1024 xbt_free(mv2_scatter_thresholds_table);
1025 xbt_free(mv2_size_scatter_tuning_table);
1026 xbt_free(mv2_scatter_table_ppn_conf);