#include "colls_private.h"
/*****************************************************************************

Copyright (c) 2006, Ahmad Faraj & Xin Yuan,
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

  * Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

  * Redistributions in binary form must reproduce the above copyright notice,
    this list of conditions and the following disclaimer in the documentation
    and/or other materials provided with the distribution.

  * Neither the name of the Florida State University nor the names of its
    contributors may be used to endorse or promote products derived from this
    software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

  *************************************************************************
  *     Any results obtained from executing this software require the     *
  *     acknowledgment and citation of the software and its owners.       *
  *     The full citation is given below:                                 *
  *                                                                       *
  *     A. Faraj and X. Yuan. "Automatic Generation and Tuning of MPI     *
  *     Collective Communication Routines." The 19th ACM International    *
  *     Conference on Supercomputing (ICS), Cambridge, Massachusetts,     *
  *     June 20-22, 2005.                                                 *
  *************************************************************************

*****************************************************************************/
/*****************************************************************************
 * Function: bcast_scatter_rdb_allgather
 * Return: int
 * Inputs:
     buff: send input buffer
     count: number of elements to send
     data_type: data type of elements being sent
     root: source of data
     comm: communicator
 * Descrp: broadcasts using a scatter followed by a recursive-doubling (rdb)
 *         allgather.
 * Author: MPICH / modified by Ahmad Faraj
 ****************************************************************************/
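/* Illustrative walk-through (assuming num_procs = 4 and root = 0):
 *   scatter:   rank 0 keeps block 0, sends blocks 2-3 to rank 2 and
 *              block 1 to rank 1; rank 2 forwards block 3 to rank 3.
 *   allgather: step 0 pairs (0,1) and (2,3); step 1 pairs (0,2) and (1,3);
 *              after log2(4) = 2 steps every rank holds all four blocks.
 */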
int
smpi_coll_tuned_bcast_scatter_rdb_allgather(void *buff, int count, MPI_Datatype
                                            data_type, int root, MPI_Comm comm)
{
  MPI_Aint extent;
  MPI_Status status;
  int i, j, k, src, dst, rank, num_procs, send_offset, recv_offset;
  int mask, relative_rank, curr_size, recv_size = 0, send_size, nbytes;
  int scatter_size, tree_root, relative_dst, dst_tree_root;
  int my_tree_root, offset, tmp_mask, num_procs_completed;
  int tag = COLL_TAG_BCAST;
  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);
  extent = smpi_datatype_get_extent(data_type);

  nbytes = extent * count;
  scatter_size = (nbytes + num_procs - 1) / num_procs;  // ceiling division
  curr_size = (rank == root) ? nbytes : 0;  // root starts with all the data
  relative_rank = (rank >= root) ? rank - root : rank - root + num_procs;
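  // relative_rank rotates the numbering so that the root becomes rank 0; the
  // binomial-tree scatter and recursive-doubling allgather below both operate
  // in this rotated space and translate back via "+ root" when addressing.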
  mask = 0x1;
  while (mask < num_procs) {
    if (relative_rank & mask) {
      src = rank - mask;
      if (src < 0)
        src += num_procs;
      recv_size = nbytes - relative_rank * scatter_size;
      // recv_size is larger than what might actually be sent by the
      // sender. We don't need to compute the exact value because MPI
      // allows you to post a larger recv.
      if (recv_size <= 0)
        curr_size = 0;  // this process doesn't receive any data
                        // because of uneven division
      else {
        smpi_mpi_recv((char *)buff + relative_rank * scatter_size, recv_size,
                      MPI_BYTE, src, tag, comm, &status);
        curr_size = smpi_mpi_get_count(&status, MPI_BYTE);
      }
      break;
    }
    mask <<= 1;
  }
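  // A non-root process blocks above until its parent in the binomial tree
  // delivers the data for its whole subtree; part of that data is forwarded
  // to its own children in the loop below.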
  // This process is responsible for all processes that have bits
  // set from the LSB up to (but not including) mask. Because of
  // the "not including", we start by shifting mask back down one.

  mask >>= 1;
  while (mask > 0) {
    if (relative_rank + mask < num_procs) {
      send_size = curr_size - scatter_size * mask;
      // mask is also the size of this process's subtree

      if (send_size > 0) {
        dst = rank + mask;
        if (dst >= num_procs)
          dst -= num_procs;
        smpi_mpi_send((char *)buff + scatter_size * (relative_rank + mask),
                      send_size, MPI_BYTE, dst, tag, comm);
        curr_size -= send_size;
      }
    }
    mask >>= 1;
  }
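  // At this point the scatter is complete: every process holds its own block
  // of (at most) scatter_size bytes at offset relative_rank * scatter_size,
  // and curr_size is the actual size of that block.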
  // scatter phase done; now do the recursive-doubling allgather

  mask = 0x1;
  i = 0;
  while (mask < num_procs) {
    relative_dst = relative_rank ^ mask;
    dst = (relative_dst + root) % num_procs;
    /* Find the offset into the send and recv buffers. Zero out the least
       significant "i" bits of relative_rank and relative_dst to find the
       roots of the src and dst subtrees. Use the ranks of the roots as
       indices into the buffer to send from and recv into. */
    dst_tree_root = relative_dst >> i;
    dst_tree_root <<= i;

    my_tree_root = relative_rank >> i;
    my_tree_root <<= i;
    send_offset = my_tree_root * scatter_size;
    recv_offset = dst_tree_root * scatter_size;
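    // Example (illustrative): at step i = 1 (mask = 2), relative ranks 2 and
    // 3 share my_tree_root = 2, so both send their subtree's data starting at
    // offset 2 * scatter_size and receive the partner subtree's blocks at
    // dst_tree_root * scatter_size.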
    if (relative_dst < num_procs) {
      smpi_mpi_sendrecv((char *)buff + send_offset, curr_size, MPI_BYTE, dst,
                        tag, (char *)buff + recv_offset, scatter_size * mask,
                        MPI_BYTE, dst, tag, comm, &status);
      recv_size = smpi_mpi_get_count(&status, MPI_BYTE);
      curr_size += recv_size;
    }
    /* If some processes in this process's subtree in this step
       did not have any destination process to communicate with
       because of non-power-of-two, we need to send them the
       data that they would normally have received from those
       processes. That is, the haves in this subtree must send to
       the have-nots. We use a logarithmic recursive-halving
       algorithm for this. */
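    /* Example (illustrative): with num_procs = 6 at the mask = 4 step,
       relative ranks 2 and 3 have no partner (it would be rank 6 or 7), so
       relative ranks 0 and 1, which just exchanged with ranks 4 and 5,
       re-send the newly received data to ranks 2 and 3 in the fix-up
       below. */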
    if (dst_tree_root + mask > num_procs) {
      num_procs_completed = num_procs - my_tree_root - mask;
      /* num_procs_completed is the number of processes in this
         subtree that have all the data. Send data to others
         in a tree fashion. First find root of current tree
         that is being divided into two. k is the number of
         least-significant bits in this process's rank that
         must be zeroed out to find the rank of the root */

      j = mask;
      k = 0;
      while (j) {
        j >>= 1;
        k++;
      }
      k--;

      offset = scatter_size * (my_tree_root + mask);
      tmp_mask = mask >> 1;

      while (tmp_mask) {
        relative_dst = relative_rank ^ tmp_mask;
        dst = (relative_dst + root) % num_procs;

        tree_root = relative_rank >> k;
        tree_root <<= k;
        /* send only if this proc has data and destination
           doesn't have data. */
        if ((relative_dst > relative_rank)
            && (relative_rank < tree_root + num_procs_completed)
            && (relative_dst >= tree_root + num_procs_completed)) {
          smpi_mpi_send((char *)buff + offset, recv_size, MPI_BYTE, dst, tag,
                        comm);
          /* recv_size was set in the previous
             receive. That's the amount of data to be
             sent now. */
        }
        /* recv only if this proc doesn't have data and the sender
           has data */
        else if ((relative_dst < relative_rank)
                 && (relative_dst < tree_root + num_procs_completed)
                 && (relative_rank >= tree_root + num_procs_completed)) {
          smpi_mpi_recv((char *)buff + offset,
                        scatter_size * num_procs_completed,
                        MPI_BYTE, dst, tag, comm, &status);
          /* num_procs_completed is also equal to the no. of processes
             whose data we don't have */
          recv_size = smpi_mpi_get_count(&status, MPI_BYTE);
          curr_size += recv_size;
        }
        tmp_mask >>= 1;
        k--;
      }
    }

    mask <<= 1;
    i++;
  }

  return MPI_SUCCESS;
}