1 /* Copyright (c) 2013-2022. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
8 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
9 * University Research and Technology
10 * Corporation. All rights reserved.
11 * Copyright (c) 2004-2006 The University of Tennessee and The University
12 * of Tennessee Research Foundation. All rights
14 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
15 * University of Stuttgart. All rights reserved.
16 * Copyright (c) 2004-2005 The Regents of the University of California.
17 * All rights reserved.
19 * Additional copyrights may follow
22 #include "../coll_tuned_topo.hpp"
23 #include "../colls_private.hpp"
25 namespace simgrid::smpi {
/* Binomial-tree scatter, adapted from Open MPI's "tuned" collective module.
 *
 * The root walks an in-order binomial tree: non-leaf nodes receive a block of
 * data covering their whole subtree from their parent, keep their own chunk,
 * and forward the remainder to each child; leaf nodes receive exactly their
 * own rcount elements directly into rbuf.
 *
 * NOTE(review): this listing is elided — the embedded original line numbers
 * jump (e.g. 71->77, 79->88, 106->113), so error-handling branches, some
 * closing braces and the returns are not visible here. Comments below only
 * describe what the visible lines establish. */
27 int scatter__ompi_binomial(const void* sbuf, int scount, MPI_Datatype sdtype, void* rbuf, int rcount,
28 MPI_Datatype rdtype, int root, MPI_Comm comm)
/* ptmp: working (writable) buffer pointer; tempbuf: owned scratch allocation
 * (freed via smpi_free_tmp_buffer on both success and error paths below). */
36 unsigned char* ptmp = nullptr;
37 unsigned char* tempbuf = nullptr;
38 const unsigned char* cptmp; // const ptmp
40 ompi_coll_tree_t* bmtree;
/* Both the declared extent (lb/extent) and the "true" extent of each datatype
 * are queried; the true extent sizes the temp buffer, the declared extent is
 * used for element arithmetic. */
42 MPI_Aint sextent, slb, strue_lb, strue_extent;
43 MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent;
49 "Coll_scatter_ompi_binomial::scatter rank %d", rank);
51 /* create the binomial tree */
53 // COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root );
/* Tree is rebuilt on every call instead of being cached (see FIXME near the
 * end of this function); it is destroyed before returning. */
54 bmtree = ompi_coll_tuned_topo_build_in_order_bmtree( comm, root);//ompi_ data->cached_in_order_bmtree;
56 sdtype->extent(&slb, &sextent);
57 sdtype->extent(&strue_lb, &strue_extent);
58 rdtype->extent(&rlb, &rextent);
59 rdtype->extent(&rtrue_lb, &rtrue_extent);
/* vrank: rank rotated so the root maps to virtual rank 0; all tree math
 * below is done in this virtual numbering. */
61 vrank = (rank - root + size) % size;
65 /* root on 0, just use the send buffer */
66 ptmp = nullptr; // unused
67 cptmp = static_cast<const unsigned char*>(sbuf);
68 if (rbuf != MPI_IN_PLACE) {
69 /* local copy to rbuf */
70 err = Datatype::copy(sbuf, scount, sdtype, rbuf, rcount, rdtype);
71 if (MPI_SUCCESS != err) {
77 /* root is not on 0, allocate temp buffer for send */
/* Sized with the true extent plus (count-1) declared extents, the standard
 * formula for a buffer holding scount*size elements of sdtype. */
78 tempbuf = smpi_get_tmp_sendbuffer(strue_extent + (scount * size - 1) * sextent);
79 if (nullptr == tempbuf) {
88 /* and rotate data so they will eventually in the right place */
/* Two copies rotate sbuf by `root` blocks: ranks root..size-1 first, then
 * ranks 0..root-1, so that virtual rank order matches buffer order. */
89 err = Datatype::copy((char*)sbuf + sextent * root * scount, scount * (size - root), sdtype, ptmp,
90 scount * (size - root), sdtype);
91 if (MPI_SUCCESS != err) {
96 err = Datatype::copy((char*)sbuf, scount * root, sdtype, ptmp + sextent * scount * (size - root), scount * root,
98 if (MPI_SUCCESS != err) {
103 if (rbuf != MPI_IN_PLACE) {
104 /* local copy to rbuf */
105 err = Datatype::copy(ptmp, scount, sdtype, rbuf, rcount, rdtype);
106 if (MPI_SUCCESS != err) {
/* Even virtual ranks are interior (non-leaf) nodes of the binomial tree. */
113 } else if (not(vrank % 2)) {
114 /* non-root, non-leaf nodes, allocate temp buffer for recv
115 * the most we need is rcount*size/2 */
116 tempbuf = smpi_get_tmp_recvbuffer(rtrue_extent + (rcount * size - 1) * rextent);
117 if (nullptr == tempbuf) {
/* Offset by -rlb so that indexing with the declared lower bound lands at the
 * start of the allocation. */
123 ptmp = tempbuf - rlb;
131 /* leaf nodes, just use rbuf */
132 ptmp = static_cast<unsigned char*>(rbuf);
136 if (not(vrank % 2)) {
138 /* recv from parent on non-root */
/* NOTE(review): posts rcount*size here although a subtree never needs more
 * than rcount*size/2 (per the comment above) — presumably the actual message
 * is shorter and recv truncates; elided lines may clarify. */
139 Request::recv(ptmp, rcount * size, rdtype, bmtree->tree_prev, COLL_TAG_SCATTER, comm, &status);
140 /* local copy to rbuf */
141 Datatype::copy(ptmp, scount, sdtype, rbuf, rcount, rdtype);
143 /* send to children on all non-leaf */
144 for (i = 0; i < bmtree->tree_nextsize; i++) {
145 int mycount = 0, vkid;
146 /* figure out how much data I have to send to this child */
/* Child's subtree gets everything from its virtual rank up to either my next
 * sibling's range or the end of the communicator, whichever comes first. */
147 vkid = (bmtree->tree_next[i] - root + size) % size;
148 mycount = vkid - vrank;
149 if (mycount > (size - vkid))
150 mycount = size - vkid;
/* total_send tracks how many blocks have already been forwarded, so each
 * child's data is taken from consecutive positions after our own chunk. */
153 Request::send(cptmp + total_send * sextent, mycount, sdtype, bmtree->tree_next[i], COLL_TAG_SCATTER, comm);
155 total_send += mycount;
159 /* recv from parent on leaf nodes */
160 Request::recv(ptmp, rcount, rdtype, bmtree->tree_prev, COLL_TAG_SCATTER, comm, &status);
/* Success path: release the scratch buffer and the per-call tree. */
163 smpi_free_tmp_buffer(tempbuf);
164 // not FIXME : store the tree, as done in ompi, instead of calculating it each time ?
165 ompi_coll_tuned_topo_destroy_tree(&bmtree);
/* Error path (label elided from this view): free scratch and log before
 * returning the error code. */
170 smpi_free_tmp_buffer(tempbuf);
172 XBT_DEBUG("%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank);
177 * Linear functions are copied from the BASIC coll module
178 * they do not segment the message and are simple implementations
179 * but for some small number of nodes and/or small data sizes they
180 * are just as fast as tuned/tree based segmenting operations
181 * and as such may be selected by the decision functions
182 * These are copied into this module due to the way we select modules
183 * in V1. i.e. in V2 we will handle this differently and so will not
184 * have to duplicate code.
185 * JPG following the examples from other coll_tuned implementations. Dec06.
188 /* copied function (with appropriate renaming) starts here */
192 * Function: - basic scatter operation
193 * Accepts: - same arguments as MPI_Scatter()
194 * Returns: - MPI_SUCCESS or error code
/* Linear scatter copied from the "basic" coll module (see the comment block
 * above): the root sends each rank its chunk with one blocking send; every
 * other rank posts a single blocking receive. No segmentation, no tree.
 *
 * NOTE(review): this listing is elided (embedded line numbers jump, e.g.
 * 199->208, 213->217, 240->...), so the rank/size setup, some closing braces
 * and the returns are not visible here. */
196 int scatter__ompi_basic_linear(const void* sbuf, int scount, MPI_Datatype sdtype, void* rbuf, int rcount,
197 MPI_Datatype rdtype, int root, MPI_Comm comm)
199 int i, rank, size, err;
208 /* If not root, receive data. */
211 Request::recv(rbuf, rcount, rdtype, root,
213 comm, MPI_STATUS_IGNORE);
217 /* I am the root, loop sending data. */
/* incr is the declared extent of sdtype: the per-rank stride through sbuf. */
219 err = sdtype->extent(&lb, &incr);
220 if (MPI_SUCCESS != err) {
221 return MPI_ERR_OTHER;
/* One pass over all ranks; ptmp advances by one chunk (scount elements,
 * i.e. `incr` covers it — presumably incr is scaled by scount in an elided
 * line; verify against the full file). */
225 for (i = 0, ptmp = (char *) sbuf; i < size; ++i, ptmp += incr) {
227 /* simple optimization */
/* For i == rank (the root itself): local copy instead of a self-send,
 * skipped entirely when MPI_IN_PLACE was given. */
230 if (MPI_IN_PLACE != rbuf) {
232 Datatype::copy(ptmp, scount, sdtype, rbuf, rcount,
236 Request::send(ptmp, scount, sdtype, i,
240 if (MPI_SUCCESS != err) {
251 * Use isends for distributing the data with periodic sync by blocking send.
252 * Blocking send acts like a local resources flush, because it ensures
253 * progression until the message is sent/(copied to some sort of transmit buffer).
/* Linear scatter using non-blocking sends (isend) from the root, per the
 * comment above: isends distribute the data, with a periodic blocking send
 * acting as a resource flush (the blocking-send branch itself is elided from
 * this view). All outstanding isends are completed with a single waitall.
 *
 * NOTE(review): elided listing — embedded line numbers jump (266->271,
 * 284->287, 310->321), hiding the rank/size setup, the err_hndl label, the
 * delete[] of reqs and the returns. Comments only cover visible lines. */
255 int scatter__ompi_linear_nb(const void *sbuf, int scount,
257 void *rbuf, int rcount,
262 int i, rank, size, err, line, nreqs;
/* reqs: array of isend requests owned by this call (allocated with new[]
 * below; presumably released near err_hndl/return in elided lines — TODO
 * confirm against the full file). preq: cursor into that array. */
265 MPI_Request *reqs = nullptr;
266 MPI_Request *preq = nullptr;
271 /* If not root, receive data. */
273 Request::recv(rbuf, rcount, rdtype, root,
275 comm, MPI_STATUS_IGNORE);
279 nreqs = size - 1; /* no send for myself */
281 reqs = new MPI_Request[nreqs];
/* Allocation-failure path: record the line and jump to the common handler. */
284 line = __LINE__; goto err_hndl;
/* Per-rank stride through sbuf (declared extent of sdtype; presumably
 * scaled by scount in an elided line — verify against the full file). */
287 incr = sdtype->get_extent();
290 /* I am the root, loop sending data. */
291 for (i = 0, ptmp = (char *)sbuf, preq = reqs; i < size; ++i, ptmp += incr) {
292 /* simple optimization */
/* i == rank: local copy of the root's own chunk instead of a self-send,
 * skipped when MPI_IN_PLACE was given. */
294 if (MPI_IN_PLACE != rbuf) {
295 err = Datatype::copy(ptmp, scount, sdtype, rbuf, rcount,
/* Other ranks: fire-and-collect isend; preq advances once per posted send. */
299 *preq = Request::isend(ptmp, scount, sdtype, i,
300 COLL_TAG_SCATTER, comm);
303 if (MPI_SUCCESS != err) {
304 line = __LINE__; goto err_hndl;
/* preq - reqs == number of isends actually posted (may be < nreqs since the
 * self-send is skipped). */
308 err = Request::waitall(preq - reqs, reqs, MPI_STATUSES_IGNORE);
309 if (MPI_SUCCESS != err) {
310 line = __LINE__; goto err_hndl;
/* err_hndl tail (label elided): log the failing line, then discard `line`
 * in builds where XBT_DEBUG compiles away. */
321 XBT_DEBUG("%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank);
322 (void)line; /* silence compiler warning */
326 } // namespace simgrid::smpi