2 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
3 * University Research and Technology
4 * Corporation. All rights reserved.
5 * Copyright (c) 2004-2009 The University of Tennessee and The University
6 * of Tennessee Research Foundation. All rights
8 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9 * University of Stuttgart. All rights reserved.
10 * Copyright (c) 2004-2005 The Regents of the University of California.
11 * All rights reserved.
12 * Copyright (c) 2009 University of Houston. All rights reserved.
15 * Additional copyrights may follow
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions are
22 * - Redistributions of source code must retain the above copyright
23 * notice, this list of conditions and the following disclaimer.
25 * - Redistributions in binary form must reproduce the above copyright
26 * notice, this list of conditions and the following disclaimer listed
27 * in this license in the documentation and/or other materials
28 * provided with the distribution.
30 * - Neither the name of the copyright holders nor the names of its
31 * contributors may be used to endorse or promote products derived from
32 * this software without specific prior written permission.
34 * The copyright holders provide no reassurances that the source code
35 * provided does not infringe any patent, copyright, or any other
36 * intellectual property rights of third parties. The copyright holders
37 * disclaim any liability to any recipient for claims brought against
38 * recipient by any third party for infringement of that parties
39 * intellectual property rights.
41 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
42 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
43 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
44 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
45 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
46 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
47 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
48 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
49 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
50 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
51 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
55 #include "colls_private.h"
56 #include "coll_tuned_topo.h"
57 #define MAXTREEFANOUT 32
60 smpi_coll_tuned_bcast_ompi_split_bintree ( void* buffer,
62 MPI_Datatype datatype,
68 int segindex, i, lr, pair;
69 int segcount[2]; /* Number ompi_request_wait_allof elements sent with each segment */
71 int num_segments[2]; /* Number of segmenets */
72 int sendcount[2]; /* the same like segcount, except for the last segment */
73 size_t realsegsize[2];
76 ptrdiff_t type_extent;
79 MPI_Request base_req, new_req;
80 ompi_coll_tree_t *tree;
81 // mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
82 // mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
84 size = smpi_comm_size(comm);
85 rank = smpi_comm_rank(comm);
88 //compute again segsize
89 const size_t intermediate_message_size = 370728;
90 size_t message_size = smpi_datatype_size(datatype) * (unsigned long)count;
91 if(message_size < intermediate_message_size)
96 XBT_DEBUG("ompi_coll_tuned_bcast_intra_split_bintree rank %d root %d ss %5d", rank, root, segsize);
102 /* setup the binary tree topology. */
103 tree = ompi_coll_tuned_topo_build_tree(2,comm,root);
105 type_size = smpi_datatype_size( datatype );
107 /* Determine number of segments and number of elements per segment */
109 if (count % 2 != 0) counts[0]++;
110 counts[1] = count - counts[0];
112 /* Note that ompi_datatype_type_size() will never return a negative
113 value in typelng; it returns an int [vs. an unsigned type]
114 because of the MPI spec. */
115 if (segsize < ((uint32_t) type_size)) {
116 segsize = type_size; /* push segsize up to hold one type */
118 segcount[0] = segcount[1] = segsize / type_size;
119 num_segments[0] = counts[0]/segcount[0];
120 if ((counts[0] % segcount[0]) != 0) num_segments[0]++;
121 num_segments[1] = counts[1]/segcount[1];
122 if ((counts[1] % segcount[1]) != 0) num_segments[1]++;
124 segcount[0] = counts[0];
125 segcount[1] = counts[1];
126 num_segments[0] = num_segments[1] = 1;
129 /* if the message is too small to be split into segments */
130 if( (counts[0] == 0 || counts[1] == 0) ||
131 (segsize > counts[0] * type_size) ||
132 (segsize > counts[1] * type_size) ) {
133 /* call linear version here ! */
134 return (smpi_coll_tuned_bcast_SMP_linear ( buffer, count, datatype,
137 type_extent = smpi_datatype_get_extent(datatype);
140 /* Determine real segment size */
141 realsegsize[0] = segcount[0] * type_extent;
142 realsegsize[1] = segcount[1] * type_extent;
144 /* set the buffer pointers */
145 tmpbuf[0] = (char *) buffer;
146 tmpbuf[1] = (char *) buffer+counts[0] * type_extent;
149 Root splits the buffer in 2 and sends segmented message down the branches.
150 Left subtree of the tree receives first half of the buffer, while right
151 subtree receives the remaining message.
154 /* determine if I am left (0) or right (1), (root is right) */
155 lr = ((rank + size - root)%size + 1)%2;
159 /* determine segment count */
160 sendcount[0] = segcount[0];
161 sendcount[1] = segcount[1];
162 /* for each segment */
163 for (segindex = 0; segindex < num_segments[0]; segindex++) {
165 for( i = 0; i < tree->tree_nextsize && i < 2; i++ ) {
166 if (segindex >= num_segments[i]) { /* no more segments */
169 /* determine how many elements are being sent in this round */
170 if(segindex == (num_segments[i] - 1))
171 sendcount[i] = counts[i] - segindex*segcount[i];
173 smpi_mpi_send(tmpbuf[i], sendcount[i], datatype,
174 tree->tree_next[i], COLL_TAG_BCAST, comm);
175 /* update tmp buffer */
176 tmpbuf[i] += realsegsize[i];
181 /* intermediate nodes code */
182 else if( tree->tree_nextsize > 0 ) {
183 /* Intermediate nodes:
184 * It will receive segments only from one half of the data.
185 * Which one is determined by whether the node belongs to the "left" or "right"
186 * subtree. Topoloby building function builds binary tree such that
187 * odd "shifted ranks" ((rank + size - root)%size) are on the left subtree,
188 * and even on the right subtree.
190 * Create the pipeline. We first post the first receive, then in the loop we
191 * post the next receive and after that wait for the previous receive to complete
192 * and we disseminating the data to all children.
194 sendcount[lr] = segcount[lr];
195 base_req=smpi_mpi_irecv(tmpbuf[lr], sendcount[lr], datatype,
196 tree->tree_prev, COLL_TAG_BCAST,
199 for( segindex = 1; segindex < num_segments[lr]; segindex++ ) {
200 /* determine how many elements to expect in this round */
201 if( segindex == (num_segments[lr] - 1))
202 sendcount[lr] = counts[lr] - segindex*segcount[lr];
204 new_req = smpi_mpi_irecv( tmpbuf[lr] + realsegsize[lr], sendcount[lr],
205 datatype, tree->tree_prev, COLL_TAG_BCAST,
208 /* wait for and forward current segment */
209 smpi_mpi_waitall( 1, &base_req, MPI_STATUSES_IGNORE );
210 for( i = 0; i < tree->tree_nextsize; i++ ) { /* send data to children (segcount[lr]) */
211 smpi_mpi_send( tmpbuf[lr], segcount[lr], datatype,
212 tree->tree_next[i], COLL_TAG_BCAST,
214 } /* end of for each child */
216 /* upate the base request */
218 /* go to the next buffer (ie. the one corresponding to the next recv) */
219 tmpbuf[lr] += realsegsize[lr];
220 } /* end of for segindex */
222 /* wait for the last segment and forward current segment */
223 smpi_mpi_waitall( 1, &base_req, MPI_STATUSES_IGNORE );
224 for( i = 0; i < tree->tree_nextsize; i++ ) { /* send data to children */
225 smpi_mpi_send(tmpbuf[lr], sendcount[lr], datatype,
226 tree->tree_next[i], COLL_TAG_BCAST, comm);
227 } /* end of for each child */
232 /* Just consume segments as fast as possible */
233 sendcount[lr] = segcount[lr];
234 for (segindex = 0; segindex < num_segments[lr]; segindex++) {
235 /* determine how many elements to expect in this round */
236 if (segindex == (num_segments[lr] - 1)) sendcount[lr] = counts[lr] - segindex*segcount[lr];
237 /* receive segments */
238 smpi_mpi_recv(tmpbuf[lr], sendcount[lr], datatype,
239 tree->tree_prev, COLL_TAG_BCAST,
240 comm, MPI_STATUS_IGNORE);
241 /* update the initial pointer to the buffer */
242 tmpbuf[lr] += realsegsize[lr];
246 /* reset the buffer pointers */
247 tmpbuf[0] = (char *) buffer;
248 tmpbuf[1] = (char *) buffer+counts[0] * type_extent;
251 Find your immediate pair (identical node in opposite subtree) and SendRecv
252 data buffer with them.
253 The tree building function ensures that
255 if we are in the left subtree (lr == 0) our pair is (rank+1)%size.
256 if we are in the right subtree (lr == 1) our pair is (rank-1)%size
257 If we have even number of nodes the rank (size-1) will pair up with root.
260 pair = (rank+1)%size;
262 pair = (rank+size-1)%size;
265 if ( (size%2) != 0 && rank != root) {
267 smpi_mpi_sendrecv( tmpbuf[lr], counts[lr], datatype,
268 pair, COLL_TAG_BCAST,
269 tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
270 pair, COLL_TAG_BCAST,
271 comm, MPI_STATUS_IGNORE);
272 } else if ( (size%2) == 0 ) {
273 /* root sends right buffer to the last node */
275 smpi_mpi_send(tmpbuf[1], counts[1], datatype,
276 (root+size-1)%size, COLL_TAG_BCAST, comm);
279 /* last node receives right buffer from the root */
280 else if (rank == (root+size-1)%size) {
281 smpi_mpi_recv(tmpbuf[1], counts[1], datatype,
282 root, COLL_TAG_BCAST,
283 comm, MPI_STATUS_IGNORE);
285 /* everyone else exchanges buffers */
287 smpi_mpi_sendrecv( tmpbuf[lr], counts[lr], datatype,
288 pair, COLL_TAG_BCAST,
289 tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
290 pair, COLL_TAG_BCAST,
291 comm, MPI_STATUS_IGNORE);
295 return (MPI_SUCCESS);