2 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
3 * University Research and Technology
4 * Corporation. All rights reserved.
5 * Copyright (c) 2004-2009 The University of Tennessee and The University
6 * of Tennessee Research Foundation. All rights
8 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9 * University of Stuttgart. All rights reserved.
10 * Copyright (c) 2004-2005 The Regents of the University of California.
11 * All rights reserved.
14 * Additional copyrights may follow
19 #include "colls_private.h"
20 #include "coll_tuned_topo.h"
22 /* Todo: gather_intra_generic, gather_intra_binary, gather_intra_chain,
23 * gather_intra_pipeline, segmentation? */
/*
 * Gather over an in-order binomial tree.
 *
 * NOTE(review): this listing is fragmentary (part of the signature and
 * several statements are not visible), so the notes below describe only
 * the statements that are shown.
 *
 * Visible flow:
 *  - build an in-order binomial tree for (comm, root) and derive a
 *    virtual rank in which the root maps to 0;
 *  - stage the local contribution behind ptmp (a malloc'ed tempbuf for
 *    the cases shown, or sbuf itself for leaf nodes);
 *  - receive from every child listed in bmtree->tree_next, adding each
 *    child's count to total_recv;
 *  - send total_recv elements starting at ptmp (the debug message prints
 *    bmtree->tree_prev as the peer; the destination argument itself is
 *    not visible here);
 *  - copy the staged data into rbuf in two pieces ("rotate").
 * A failing smpi_datatype_copy() or malloc() records __LINE__ in `line`
 * and jumps to err_hndl, which logs file, line, error code and rank.
 */
25 smpi_coll_tuned_gather_ompi_binomial(void *sbuf, int scount,
27 void *rbuf, int rcount,
41 ompi_coll_tree_t* bmtree;
43 MPI_Aint sextent, slb, strue_lb, strue_extent;
44 MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent;
47 size = smpi_comm_size(comm);
48 rank = smpi_comm_rank(comm);
51 "smpi_coll_tuned_gather_ompi_binomial rank %d", rank);
53 /* create the binomial tree */
54 // COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root );
55 bmtree = ompi_coll_tuned_topo_build_in_order_bmtree(comm, root);
56 // data->cached_in_order_bmtree;
/* NOTE(review): both the (lb, extent) pair and the "true" pair are filled
 * by the same smpi_datatype_extent() call, so they hold identical values
 * here; confirm whether a true-extent query was intended. */
58 smpi_datatype_extent(sdtype, &slb, &sextent);
59 smpi_datatype_extent(sdtype, &strue_lb, &strue_extent);
/* virtual rank: renumber the ranks so that root becomes 0 */
61 vrank = (rank - root + size) % size;
/* NOTE(review): same observation as for sdtype above, rtrue_* and r* come
 * from the same call. */
64 smpi_datatype_extent(rdtype, &rlb, &rextent);
65 smpi_datatype_extent(rdtype, &rtrue_lb, &rtrue_extent);
67 /* root on 0, just use the recv buffer */
69 if (sbuf != MPI_IN_PLACE) {
70 err = smpi_datatype_copy(sbuf, scount, sdtype,
71 ptmp, rcount, rdtype);
72 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
75 /* root is not on 0, allocate temp buffer for recv,
76 * rotate data at the end */
/* sized for rcount*size elements: one true extent for the last element
 * plus (rcount*size - 1) strides of rextent */
77 tempbuf = (char *) malloc(rtrue_extent + (rcount*size - 1) * rextent);
78 if (NULL == tempbuf) {
79 err= MPI_ERR_OTHER; line = __LINE__; goto err_hndl;
83 if (sbuf != MPI_IN_PLACE) {
84 /* copy from sbuf to temp buffer */
85 err = smpi_datatype_copy(sbuf, scount, sdtype,
86 ptmp, rcount, rdtype);
87 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
89 /* copy from rbuf to temp buffer */
/* the source is this rank's own block inside rbuf (block index = rank) */
90 err = smpi_datatype_copy((char *) rbuf + rank*rextent*rcount, rcount, rdtype, ptmp, rcount, rdtype );
91 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
95 } else if (!(vrank % 2)) {
96 /* other non-leaf nodes, allocate temp buffer for data received from
97 * children, the most we need is half of the total data elements due
98 * to the property of binomial tree */
/* NOTE(review): despite the remark above, the allocation below is sized
 * for scount*size elements, not half of them. */
99 tempbuf = (char *) malloc(strue_extent + (scount*size - 1) * sextent);
100 if (NULL == tempbuf) {
101 err= MPI_ERR_OTHER; line = __LINE__; goto err_hndl;
/* offset the working pointer by the send type's lower bound */
104 ptmp = tempbuf - slb;
105 /* local copy to tempbuf */
106 err = smpi_datatype_copy(sbuf, scount, sdtype,
107 ptmp, scount, sdtype);
108 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
110 /* use sdtype,scount as rdtype,rdcount since they are ignored on
117 /* leaf nodes, no temp buffer needed, use sdtype,scount as
118 * rdtype,rdcount since they are ignored on non-root procs */
119 ptmp = (char *) sbuf;
124 /* all non-leaf nodes recv from children */
125 for (i = 0; i < bmtree->tree_nextsize; i++) {
126 int mycount = 0, vkid;
127 /* figure out how much data I have to receive from this child:
 * start from the virtual-rank distance to the child and cap it at
 * size - vkid */
128 vkid = (bmtree->tree_next[i] - root + size) % size;
129 mycount = vkid - vrank;
130 if (mycount > (size - vkid))
131 mycount = size - vkid;
135 "smpi_coll_tuned_gather_ompi_binomial rank %d recv %d mycount = %d",
136 rank, bmtree->tree_next[i], mycount);
/* append behind what has been received so far (offset total_recv*rextent) */
138 smpi_mpi_recv(ptmp + total_recv*rextent, mycount, rdtype,
139 bmtree->tree_next[i], COLL_TAG_GATHER,
142 total_recv += mycount;
147 /* all nodes except root send to parents */
149 "smpi_coll_tuned_gather_ompi_binomial rank %d send %d count %d\n",
150 rank, bmtree->tree_prev, total_recv);
152 smpi_mpi_send(ptmp, total_recv, sdtype,
159 /* rotate received data on root if root != 0 */
/* first piece: the leading (size - root) blocks of ptmp land in rbuf
 * starting at block index root */
160 err = smpi_datatype_copy(ptmp, rcount*(size - root), rdtype,
161 (char *) rbuf + rextent*root*rcount, rcount*(size - root), rdtype );
162 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
/* second piece: the remaining root blocks of ptmp land at the start of rbuf */
165 err = smpi_datatype_copy( ptmp + rextent*rcount*(size-root), rcount*root,rdtype,
166 (char *) rbuf,rcount*root,rdtype);
167 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
171 } else if (!(vrank % 2)) {
172 /* other non-leaf nodes */
181 XBT_DEBUG( "%s:%4d\tError occurred %d, rank %2d",
182 __FILE__, line, err, rank);
187 * gather_intra_linear_sync
189 * Function: - synchronized gather operation in which the root signals each sender with a zero-byte message
190 * Accepts: - same arguments as MPI_Gather(), first segment size
191 * Returns: - MPI_SUCCESS or error code
/*
 * Linear gather with a per-sender synchronization step.
 *
 * NOTE(review): this listing is fragmentary (part of the signature, the
 * root/non-root branch headers and several call arguments are not
 * visible), so the notes describe only the statements that are shown.
 *
 * Visible behaviour:
 *  - a first-segment size is picked from the per-process block size;
 *  - one path receives a zero-count MPI_BYTE message from root, then
 *    sends first_segment_count elements of sbuf followed by the remaining
 *    scount - first_segment_count elements;
 *  - the other path allocates `size` requests and, per peer i, posts an
 *    irecv for the first segment, sends a zero-count MPI_BYTE message,
 *    posts an irecv for the second segment into reqs[i], and waits for
 *    the first segment; it then copies sbuf into this rank's block of
 *    rbuf unless sbuf is MPI_IN_PLACE and waits on all of reqs.
 * Failures visible here (calloc, smpi_datatype_copy) record __LINE__ and
 * jump to error_hndl, which logs rank, file, line and error code.
 */
194 smpi_coll_tuned_gather_ompi_linear_sync(void *sbuf, int scount,
196 void *rbuf, int rcount,
204 int first_segment_count;
209 int first_segment_size=0;
210 size = smpi_comm_size(comm);
211 rank = smpi_comm_rank(comm);
213 size_t dsize, block_size;
/* block_size is computed either from (rdtype, rcount) or from
 * (sdtype, scount); the condition choosing between the two is not
 * visible in this listing, presumably root vs. non-root (confirm). */
215 dsize= smpi_datatype_size(rdtype);
216 block_size = dsize * rcount;
218 dsize=smpi_datatype_size(sdtype);
219 block_size = dsize * scount;
/* blocks larger than 92160 bytes get a 32768-byte first segment;
 * 1024 is the other value assigned below */
222 if (block_size > 92160){
223 first_segment_size = 32768;
225 first_segment_size = 1024;
229 "smpi_coll_tuned_gather_ompi_linear_sync rank %d, segment %d", rank, first_segment_size);
232 /* Non-root processes:
233 - receive zero byte message from the root,
234 - send the first segment of the data synchronously,
235 - send the second segment of the data.
238 typelng= smpi_datatype_size(sdtype);
239 smpi_datatype_extent(sdtype, &lb, &extent);
240 first_segment_count = scount;
241 COLL_TUNED_COMPUTED_SEGCOUNT( (size_t) first_segment_size, typelng,
242 first_segment_count );
244 smpi_mpi_recv(sbuf, 0, MPI_BYTE, root,
246 comm, MPI_STATUS_IGNORE);
248 smpi_mpi_send(sbuf, first_segment_count, sdtype, root,
252 smpi_mpi_send((char*)sbuf + extent * first_segment_count,
253 (scount - first_segment_count), sdtype,
254 root, COLL_TAG_GATHER,
260 - For every non-root node:
261 - post irecv for the first segment of the message
262 - send zero byte message to signal node to send the message
263 - post irecv for the second segment of the message
264 - wait for the first segment to complete
265 - Copy local data if necessary
266 - Waitall for all the second segments to complete.
269 MPI_Request *reqs = NULL, first_segment_req;
270 reqs = (MPI_Request *) calloc(size, sizeof(MPI_Request ));
271 if (NULL == reqs) { ret = -1; line = __LINE__; goto error_hndl; }
273 typelng=smpi_datatype_size(rdtype);
274 smpi_datatype_extent(rdtype, &lb, &extent);
275 first_segment_count = rcount;
276 COLL_TUNED_COMPUTED_SEGCOUNT( (size_t)first_segment_size, typelng,
277 first_segment_count );
279 ptmp = (char *) rbuf;
280 for (i = 0; i < size; ++i) {
283 reqs[i] = MPI_REQUEST_NULL;
287 /* irecv for the first segment from i */
/* peer i's block starts i * rcount elements into rbuf */
288 ptmp = (char*)rbuf + i * rcount * extent;
289 first_segment_req = smpi_mpi_irecv(ptmp, first_segment_count, rdtype, i,
290 COLL_TAG_GATHER, comm
293 /* send sync message */
294 smpi_mpi_send(rbuf, 0, MPI_BYTE, i,
298 /* irecv for the second segment */
/* lands right behind the first segment inside peer i's block and covers
 * the remaining rcount - first_segment_count elements */
299 ptmp = (char*)rbuf + (i * rcount + first_segment_count) * extent;
300 reqs[i]=smpi_mpi_irecv(ptmp, (rcount - first_segment_count),
301 rdtype, i, COLL_TAG_GATHER, comm
304 /* wait on the first segment to complete */
305 smpi_mpi_wait(&first_segment_req, MPI_STATUS_IGNORE);
308 /* copy local data if necessary */
309 if (MPI_IN_PLACE != sbuf) {
310 ret = smpi_datatype_copy(sbuf, scount, sdtype,
311 (char*)rbuf + rank * rcount * extent,
313 if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
316 /* wait all second segments to complete */
/* covers all `size` entries of reqs, including any slot that was set to
 * MPI_REQUEST_NULL in the loop above */
317 ret = smpi_mpi_waitall(size, reqs, MPI_STATUSES_IGNORE);
327 "ERROR_HNDL: node %d file %s line %d error %d\n",
328 rank, __FILE__, line, ret );
333 * Linear functions are copied from the BASIC coll module
334 * they do not segment the message and are simple implementations
335 * but for some small number of nodes and/or small data sizes they
336 * are just as fast as tuned/tree based segmenting operations
337 * and as such may be selected by the decision functions
338 * These are copied into this module due to the way we select modules
339 * in V1; in V2 we will handle this differently and so will not
340 * have to duplicate code.
341 * JPG following the examples from other coll_tuned implementations. Dec06.
344 /* copied function (with appropriate renaming) starts here */
348 * Function: - basic gather operation
349 * Accepts: - same arguments as MPI_Gather()
350 * Returns: - MPI_SUCCESS or error code
353 smpi_coll_tuned_gather_ompi_basic_linear(void *sbuf, int scount,
355 void *rbuf, int rcount,
369 size = smpi_comm_size(comm);
370 rank = smpi_comm_rank(comm);
372 /* Everyone but root sends data and returns. */
374 "ompi_coll_tuned_gather_intra_basic_linear rank %d", rank);
377 smpi_mpi_send(sbuf, scount, sdtype, root,
383 /* I am the root, loop receiving the data. */
385 smpi_datatype_extent(rdtype, &lb, &extent);
386 incr = extent * rcount;
387 for (i = 0, ptmp = (char *) rbuf; i < size; ++i, ptmp += incr) {
389 if (MPI_IN_PLACE != sbuf) {
390 err = smpi_datatype_copy(sbuf, scount, sdtype,
391 ptmp, rcount, rdtype);
396 smpi_mpi_recv(ptmp, rcount, rdtype, i,
398 comm, MPI_STATUS_IGNORE);
401 if (MPI_SUCCESS != err) {