1 /* Copyright (c) 2013-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
8 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
9 * University Research and Technology
10 * Corporation. All rights reserved.
11 * Copyright (c) 2004-2009 The University of Tennessee and The University
12 * of Tennessee Research Foundation. All rights
14 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
15 * University of Stuttgart. All rights reserved.
16 * Copyright (c) 2004-2005 The Regents of the University of California.
17 * All rights reserved.
18 * Copyright (c) 2009 University of Houston. All rights reserved.
20 * Additional copyrights may follow
22 * Redistribution and use in source and binary forms, with or without
23 * modification, are permitted provided that the following conditions are
26 * - Redistributions of source code must retain the above copyright
27 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer listed
31 * in this license in the documentation and/or other materials
32 * provided with the distribution.
34 * - Neither the name of the copyright holders nor the names of its
35 * contributors may be used to endorse or promote products derived from
36 * this software without specific prior written permission.
38 * The copyright holders provide no reassurances that the source code
39 * provided does not infringe any patent, copyright, or any other
40 * intellectual property rights of third parties. The copyright holders
41 * disclaim any liability to any recipient for claims brought against
42 * recipient by any third party for infringement of that parties
43 * intellectual property rights.
45 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
46 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
47 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
48 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
49 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
50 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
51 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
52 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
53 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
54 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
55 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
58 #include "colls_private.h"
59 #include "coll_tuned_topo.h"
60 #define MAXTREEFANOUT 32
/*
 * Split-binary-tree broadcast, ported into SimGrid/SMPI from Open MPI's
 * "tuned" collective component.
 *
 * Algorithm (per the comments below): the root splits the buffer into two
 * halves and pipelines segmented messages down the left and right subtrees
 * of a binary tree; intermediate nodes forward segments of their half as
 * they arrive, leaves only receive.  A final pairwise exchange between the
 * two subtrees gives every rank the complete buffer.
 *
 * NOTE(review): this excerpt has lines elided — the embedded original line
 * numbers are non-contiguous, and several parameters, braces, guards and
 * statements are visibly missing.  The comments below describe only what
 * is visible; do not treat this excerpt as compilable on its own.
 */
63 smpi_coll_tuned_bcast_ompi_split_bintree ( void* buffer,
65 MPI_Datatype datatype,
/* NOTE(review): the remaining parameters (presumably count, root, comm,
 * and possibly segsize) are elided from this excerpt — confirm against the
 * full file. */
71 int segindex, i, lr, pair;
72 int segcount[2]; /* Number of elements sent with each segment */
74 int num_segments[2]; /* Number of segments */
75 int sendcount[2]; /* the same as segcount, except for the last segment */
76 size_t realsegsize[2]; /* segment size in bytes (segcount * type extent) */
79 ptrdiff_t type_extent;
82 MPI_Request base_req, new_req; /* two-deep receive pipeline for intermediate nodes */
83 ompi_coll_tree_t *tree;
84 // mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
85 // mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
87 size = smpi_comm_size(comm);
88 rank = smpi_comm_rank(comm);
/* Pick the segment size from the total message size in bytes; the
 * threshold constant is inherited from Open MPI's tuned decision rules. */
91 //compute again segsize
92 const size_t intermediate_message_size = 370728;
93 size_t message_size = smpi_datatype_size(datatype) * (unsigned long)count;
94 if(message_size < intermediate_message_size)
/* NOTE(review): the branch bodies that actually assign segsize are elided
 * here — confirm the chosen values against the full file. */
99 XBT_DEBUG("ompi_coll_tuned_bcast_intra_split_bintree rank %d root %d ss %5d", rank, root, segsize);
105 /* setup the binary tree topology. */
106 tree = ompi_coll_tuned_topo_build_tree(2,comm,root);
108 type_size = smpi_datatype_size( datatype );
110 /* Determine number of segments and number of elements per segment */
/* Split the element count into two halves; when count is odd the first
 * (left-subtree) half gets the extra element.
 * NOTE(review): the initialisation "counts[0] = count/2;" appears elided. */
112 if (count % 2 != 0) counts[0]++;
113 counts[1] = count - counts[0];
115 /* Note that ompi_datatype_type_size() will never return a negative
116 value in typelng; it returns an int [vs. an unsigned type]
117 because of the MPI spec. */
118 if (segsize < ((uint32_t) type_size)) {
119 segsize = type_size; /* push segsize up to hold one type */
/* Segmented case: per-half segment element count and segment tally,
 * rounding the segment count up for a partial trailing segment. */
121 segcount[0] = segcount[1] = segsize / type_size;
122 num_segments[0] = counts[0]/segcount[0];
123 if ((counts[0] % segcount[0]) != 0) num_segments[0]++;
124 num_segments[1] = counts[1]/segcount[1];
125 if ((counts[1] % segcount[1]) != 0) num_segments[1]++;
/* NOTE(review): presumably the else-branch (segsize == 0: no segmentation,
 * one segment per half) — the "} else {" line appears elided. */
127 segcount[0] = counts[0];
128 segcount[1] = counts[1];
129 num_segments[0] = num_segments[1] = 1;
132 /* if the message is too small to be split into segments */
133 if( (counts[0] == 0 || counts[1] == 0) ||
134 (segsize > counts[0] * type_size) ||
135 (segsize > counts[1] * type_size) ) {
136 /* call linear version here ! */
137 return (smpi_coll_tuned_bcast_SMP_linear ( buffer, count, datatype,
/* NOTE(review): the remaining arguments of the fallback call (root, comm)
 * are elided here. */
140 type_extent = smpi_datatype_get_extent(datatype);
143 /* Determine real segment size */
144 realsegsize[0] = segcount[0] * type_extent;
145 realsegsize[1] = segcount[1] * type_extent;
147 /* set the buffer pointers */
148 tmpbuf[0] = (char *) buffer;
149 tmpbuf[1] = (char *) buffer+counts[0] * type_extent;
/* --- Phase 1: segmented pipeline down the two subtrees --------------- */
152 Root splits the buffer in 2 and sends segmented message down the branches.
153 Left subtree of the tree receives first half of the buffer, while right
154 subtree receives the remaining message.
157 /* determine if I am left (0) or right (1), (root is right) */
158 lr = ((rank + size - root)%size + 1)%2;
/* Root: push segments of both halves to its (at most two) children.
 * NOTE(review): the "if (rank == root) {" guard appears elided. */
162 /* determine segment count */
163 sendcount[0] = segcount[0];
164 sendcount[1] = segcount[1];
165 /* for each segment */
166 for (segindex = 0; segindex < num_segments[0]; segindex++) {
168 for( i = 0; i < tree->tree_nextsize && i < 2; i++ ) {
169 if (segindex >= num_segments[i]) { /* no more segments */
172 /* determine how many elements are being sent in this round */
/* The last segment of a half may be shorter than segcount[i]. */
173 if(segindex == (num_segments[i] - 1))
174 sendcount[i] = counts[i] - segindex*segcount[i];
176 smpi_mpi_send(tmpbuf[i], sendcount[i], datatype,
177 tree->tree_next[i], COLL_TAG_BCAST, comm);
178 /* update tmp buffer */
179 tmpbuf[i] += realsegsize[i];
184 /* intermediate nodes code */
185 else if( tree->tree_nextsize > 0 ) {
186 /* Intermediate nodes:
187 * It will receive segments only from one half of the data.
188 * Which one is determined by whether the node belongs to the "left" or "right"
189 * subtree. Topology building function builds binary tree such that
190 * odd "shifted ranks" ((rank + size - root)%size) are on the left subtree,
191 * and even on the right subtree.
193 * Create the pipeline. We first post the first receive, then in the loop we
194 * post the next receive and after that wait for the previous receive to complete
195 * and we disseminate the data to all children.
197 sendcount[lr] = segcount[lr];
/* Prime the pipeline with the receive for segment 0 of our half. */
198 base_req=smpi_mpi_irecv(tmpbuf[lr], sendcount[lr], datatype,
199 tree->tree_prev, COLL_TAG_BCAST,
202 for( segindex = 1; segindex < num_segments[lr]; segindex++ ) {
203 /* determine how many elements to expect in this round */
204 if( segindex == (num_segments[lr] - 1))
205 sendcount[lr] = counts[lr] - segindex*segcount[lr];
/* Post the receive for the NEXT segment before forwarding the current
 * one, so receive and sends overlap. */
207 new_req = smpi_mpi_irecv( tmpbuf[lr] + realsegsize[lr], sendcount[lr],
208 datatype, tree->tree_prev, COLL_TAG_BCAST,
211 /* wait for and forward current segment */
212 smpi_mpi_waitall( 1, &base_req, MPI_STATUSES_IGNORE );
213 for( i = 0; i < tree->tree_nextsize; i++ ) { /* send data to children (segcount[lr]) */
214 smpi_mpi_send( tmpbuf[lr], segcount[lr], datatype,
215 tree->tree_next[i], COLL_TAG_BCAST,
217 } /* end of for each child */
219 /* update the base request */
/* NOTE(review): the "base_req = new_req;" statement appears elided here;
 * without it the pipeline would re-wait on the same request — confirm
 * against the full file. */
221 /* go to the next buffer (ie. the one corresponding to the next recv) */
222 tmpbuf[lr] += realsegsize[lr];
223 } /* end of for segindex */
225 /* wait for the last segment and forward current segment */
226 smpi_mpi_waitall( 1, &base_req, MPI_STATUSES_IGNORE );
227 for( i = 0; i < tree->tree_nextsize; i++ ) { /* send data to children */
228 smpi_mpi_send(tmpbuf[lr], sendcount[lr], datatype,
229 tree->tree_next[i], COLL_TAG_BCAST, comm);
230 } /* end of for each child */
/* Leaf nodes: no children to forward to, so just receive our half,
 * segment by segment, with blocking receives. */
235 /* Just consume segments as fast as possible */
236 sendcount[lr] = segcount[lr];
237 for (segindex = 0; segindex < num_segments[lr]; segindex++) {
238 /* determine how many elements to expect in this round */
239 if (segindex == (num_segments[lr] - 1)) sendcount[lr] = counts[lr] - segindex*segcount[lr];
240 /* receive segments */
241 smpi_mpi_recv(tmpbuf[lr], sendcount[lr], datatype,
242 tree->tree_prev, COLL_TAG_BCAST,
243 comm, MPI_STATUS_IGNORE);
244 /* update the initial pointer to the buffer */
245 tmpbuf[lr] += realsegsize[lr];
/* --- Phase 2: exchange halves between the two subtrees --------------- */
249 /* reset the buffer pointers */
250 tmpbuf[0] = (char *) buffer;
251 tmpbuf[1] = (char *) buffer+counts[0] * type_extent;
254 Find your immediate pair (identical node in opposite subtree) and SendRecv
255 data buffer with them.
256 The tree building function ensures that
258 if we are in the left subtree (lr == 0) our pair is (rank+1)%size.
259 if we are in the right subtree (lr == 1) our pair is (rank-1)%size
260 If we have even number of nodes the rank (size-1) will pair up with root.
/* NOTE(review): the lr-based selection between the two pair formulas
 * (presumably "if (lr == 0) { ... } else { ... }") is elided here. */
263 pair = (rank+1)%size;
265 pair = (rank+size-1)%size;
/* Odd communicator size: every rank except the root has a pair in the
 * opposite subtree; swap halves with it. */
268 if ( (size%2) != 0 && rank != root) {
270 smpi_mpi_sendrecv( tmpbuf[lr], counts[lr], datatype,
271 pair, COLL_TAG_BCAST,
272 tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
273 pair, COLL_TAG_BCAST,
274 comm, MPI_STATUS_IGNORE);
275 } else if ( (size%2) == 0 ) {
276 /* root sends right buffer to the last node */
/* NOTE(review): presumably guarded by "if (rank == root) {" — the guard
 * line appears elided; confirm against the full file. */
278 smpi_mpi_send(tmpbuf[1], counts[1], datatype,
279 (root+size-1)%size, COLL_TAG_BCAST, comm);
282 /* last node receives right buffer from the root */
283 else if (rank == (root+size-1)%size) {
284 smpi_mpi_recv(tmpbuf[1], counts[1], datatype,
285 root, COLL_TAG_BCAST,
286 comm, MPI_STATUS_IGNORE);
288 /* everyone else exchanges buffers */
290 smpi_mpi_sendrecv( tmpbuf[lr], counts[lr], datatype,
291 pair, COLL_TAG_BCAST,
292 tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
293 pair, COLL_TAG_BCAST,
294 comm, MPI_STATUS_IGNORE);
298 return (MPI_SUCCESS);