1 /* smpi_coll.c -- various optimized routing for collectives */
3 /* Copyright (c) 2009, 2010. The SimGrid Team.
4 * All rights reserved. */
6 /* This program is free software; you can redistribute it and/or modify it
7 * under the terms of the license (GNU LGPL) which comes with this package. */
14 #include "smpi_coll_private.h"
16 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_coll, smpi,
17 "Logging specific to SMPI (coll)");
28 typedef struct s_proc_tree *proc_tree_t;
/* Allocate and initialize an s_proc_tree descriptor for an n-ary tree of
 * the given arity.  The caller releases it with free_tree().
 * NOTE(review): this excerpt is missing lines (local declarations, the body
 * of the child-initialization loop — presumably setting child[i] to the -1
 * sentinel used by tree_bcast/tree_antibcast — and the return statement).
 * Confirm against the full file. */
33 static proc_tree_t alloc_tree(int arity)
38 tree = xbt_new(struct s_proc_tree, 1);
39 tree->PROCTREE_A = arity;
41 tree->numChildren = 0;
42 tree->child = xbt_new(int, arity);     /* one slot per potential child */
43 for (i = 0; i < arity; i++) {
/* Release a tree descriptor created by alloc_tree().
 * NOTE(review): the matching xbt_free(tree) and closing brace are outside
 * this excerpt — verify the descriptor itself is freed in the full file. */
54 static void free_tree(proc_tree_t tree)
56 xbt_free(tree->child);
61  * Build the tree depending on a process rank (index) and the group size (extent)
62  * @param index the rank of the calling process
63  * @param extent the total number of processes
/* Fills in the child[] slots, numChildren, and (for non-root ranks) the
 * parent of an already-allocated tree descriptor.
 * NOTE(review): this excerpt is missing lines (locals, the lines between
 * the child computation and the `if (places < extent)` test, the root /
 * non-root branch bodies, and closing braces) — confirm against the full
 * file. */
65 static void build_tree(int index, int extent, proc_tree_t * tree)
67 int places = (*tree)->PROCTREE_A * index;
/* candidate children of rank `index` are the next PROCTREE_A ranks after
 * arity*index (offset by the tree root) */
72 for (i = 1; i <= (*tree)->PROCTREE_A; i++) {
74 ch = (*tree)->PROCTREE_A * index + i + (*tree)->root;
76 if (places < extent) {        /* only record children that exist */
77 (*tree)->child[i - 1] = ch;
78 (*tree)->numChildren++;
81 if (index == (*tree)->root) {
/* non-root: parent is the rank whose child range contains `index` */
85 pr = (index - 1) / (*tree)->PROCTREE_A;
/* Broadcast `buf` down an already-built n-ary tree: each non-root rank
 * first blocks on a receive from its parent, then forwards the payload to
 * every child with non-blocking sends and waits for them all.
 * NOTE(review): this excerpt is missing lines (locals, the root test
 * guarding the recv, the `requests[i] =` assignment feeding smpi_isend_init
 * results into the array, closing braces, and any cleanup/xbt_free of
 * `requests`) — confirm against the full file. */
93 static void tree_bcast(void *buf, int count, MPI_Datatype datatype,
94 int root, MPI_Comm comm, proc_tree_t tree)
96 int system_tag = 999; // used negative int but smpi_create_request() declares this illegal (to be checked)
98 MPI_Request *requests;
100 rank = smpi_comm_rank(comm);
101 /* wait for data from my parent in the tree */
103 DEBUG3("<%d> tree_bcast(): i am not root: recv from %d, tag=%d)",
104 rank, tree->parent, system_tag + rank);
/* tag is offset by the receiver's rank so each edge gets a distinct tag */
105 smpi_mpi_recv(buf, count, datatype, tree->parent, system_tag + rank,
106 comm, MPI_STATUS_IGNORE);
108 requests = xbt_new(MPI_Request, tree->numChildren);
109 DEBUG2("<%d> creates %d requests (1 per child)", rank,
111 /* initiates sends to ranks lower in the tree */
112 for (i = 0; i < tree->numChildren; i++) {
113 if (tree->child[i] == -1) {
/* empty child slot: park a null request so startall/waitall stay uniform */
114 requests[i] = MPI_REQUEST_NULL;
116 DEBUG3("<%d> send to <%d>, tag=%d", rank, tree->child[i],
117 system_tag + tree->child[i]);
119 smpi_isend_init(buf, count, datatype, tree->child[i],
120 system_tag + tree->child[i], comm);
123 smpi_mpi_startall(tree->numChildren, requests);
124 smpi_mpi_waitall(tree->numChildren, requests, MPI_STATUS_IGNORE);
/* Mirror image of tree_bcast: data flows UP the tree.  Each non-root rank
 * sends to its parent; every rank posts one non-blocking receive per child
 * and waits for them all.  Used as the gather half of nary_tree_barrier.
 * NOTE(review): this excerpt is missing lines (locals, the root test
 * guarding the send, the `comm` argument closing the smpi_mpi_send call,
 * the `requests[i] =` assignment from smpi_irecv_init, closing braces, and
 * any cleanup of `requests`) — confirm against the full file. */
131 static void tree_antibcast(void *buf, int count, MPI_Datatype datatype,
132 int root, MPI_Comm comm, proc_tree_t tree)
134 int system_tag = 999; // used negative int but smpi_create_request() declares this illegal (to be checked)
136 MPI_Request *requests;
138 rank = smpi_comm_rank(comm);
139 // everyone sends to its parent, except root.
141 DEBUG3("<%d> tree_antibcast(): i am not root: send to %d, tag=%d)",
142 rank, tree->parent, system_tag + rank);
143 smpi_mpi_send(buf, count, datatype, tree->parent, system_tag + rank,
146 //every one receives as many messages as it has children
147 requests = xbt_new(MPI_Request, tree->numChildren);
148 DEBUG2("<%d> creates %d requests (1 per child)", rank,
150 for (i = 0; i < tree->numChildren; i++) {
151 if (tree->child[i] == -1) {
/* empty child slot: null request keeps the requests array dense */
152 requests[i] = MPI_REQUEST_NULL;
154 DEBUG3("<%d> recv from <%d>, tag=%d", rank, tree->child[i],
155 system_tag + tree->child[i]);
157 smpi_irecv_init(buf, count, datatype, tree->child[i],
158 system_tag + tree->child[i], comm);
161 smpi_mpi_startall(tree->numChildren, requests);
162 smpi_mpi_waitall(tree->numChildren, requests, MPI_STATUS_IGNORE);
167  * bcast with a binary, ternary, or whatever tree ..
/* Public entry point: builds an arity-wide tree over `comm` for this rank
 * and runs the top-down broadcast of tree_bcast().
 * NOTE(review): lines are missing from this excerpt (locals and, after the
 * tree_bcast call, presumably a free_tree(tree) — verify the tree is
 * released in the full file). */
169 void nary_tree_bcast(void *buf, int count, MPI_Datatype datatype, int root,
170 MPI_Comm comm, int arity)
172 proc_tree_t tree = alloc_tree(arity);
175 rank = smpi_comm_rank(comm);
176 size = smpi_comm_size(comm);
177 build_tree(rank, size, &tree);
178 tree_bcast(buf, count, datatype, root, comm, tree);
183  * barrier with a binary, ternary, or whatever tree ..
/* Barrier = gather-to-root (tree_antibcast) followed by broadcast-from-root
 * (tree_bcast) of a one-byte dummy payload.
 * NOTE(review): lines are missing from this excerpt (declaration of
 * `dummy`, locals, and presumably a trailing free_tree(tree)) — confirm
 * against the full file. */
185 void nary_tree_barrier(MPI_Comm comm, int arity)
187 proc_tree_t tree = alloc_tree(arity);
191 rank = smpi_comm_rank(comm);
192 size = smpi_comm_size(comm);
193 build_tree(rank, size, &tree);
194 tree_antibcast(&dummy, 1, MPI_CHAR, 0, comm, tree);
195 tree_bcast(&dummy, 1, MPI_CHAR, 0, comm, tree);
202  * Openmpi calls this routine when the message size sent to each rank < 2000 bytes and size < 12
/* All-to-all: copy own slice locally, then post one irecv and one isend per
 * remote rank, start them all, and wait.
 * NOTE(review): this excerpt is missing lines, including the `MPI_Comm
 * comm` parameter line, `MPI_Aint lb` declaration, the skip test and the
 * `requests[count] = ...` / `count++` bookkeeping inside both loops, and
 * any trailing request cleanup — confirm against the full file. */
204 int smpi_coll_tuned_alltoall_bruck(void *sendbuf, int sendcount,
205 MPI_Datatype sendtype, void *recvbuf,
206 int recvcount, MPI_Datatype recvtype,
209 int system_tag = 777;
210 int i, rank, size, err, count;
212 MPI_Aint sendextent = 0;
213 MPI_Aint recvextent = 0;
214 MPI_Request *requests;
216 // FIXME: check implementation
217 rank = smpi_comm_rank(comm);
218 size = smpi_comm_size(comm);
219 DEBUG1("<%d> algorithm alltoall_bruck() called.", rank);
/* NOTE(review): err from the sendtype call is overwritten by the recvtype
 * call on the next line, so a sendtype failure goes undetected here. */
220 err = smpi_datatype_extent(sendtype, &lb, &sendextent);
221 err = smpi_datatype_extent(recvtype, &lb, &recvextent);
222 /* Local copy from self */
224 smpi_datatype_copy(&((char *) sendbuf)[rank * sendextent], sendcount,
225 sendtype, &((char *) recvbuf)[rank * recvextent],
226 recvcount, recvtype);
227 if (err == MPI_SUCCESS && size > 1) {
228 /* Initiate all send/recv to/from others. */
229 requests = xbt_new(MPI_Request, 2 * (size - 1));     /* one recv + one send per peer */
231 /* Create all receives that will be posted first */
232 for (i = 0; i < size; ++i) {
234 DEBUG3("<%d> skip request creation [src = %d, recvcount = %d]",
239 smpi_irecv_init(&((char *) recvbuf)[i * recvextent], recvcount,
240 recvtype, i, system_tag, comm);
243 /* Now create all sends */
244 for (i = 0; i < size; ++i) {
246 DEBUG3("<%d> skip request creation [dst = %d, sendcount = %d]",
251 smpi_isend_init(&((char *) sendbuf)[i * sendextent], sendcount,
252 sendtype, i, system_tag, comm);
255 /* Wait for them all. */
256 smpi_mpi_startall(count, requests);
257 DEBUG2("<%d> wait for %d requests", rank, count);
258 smpi_mpi_waitall(count, requests, MPI_STATUS_IGNORE);
265  * Alltoall basic_linear
/* All-to-all, linear scheme: copy own slice, post receives starting from
 * the next rank upward, then sends in the reverse ring order, start and
 * wait on all of them.
 * NOTE(review): this excerpt is missing lines, including the `MPI_Comm
 * comm` parameter line, `MPI_Aint lb`, the `requests[count] = ...` /
 * `count++` bookkeeping in both loops, closing braces, and any trailing
 * request cleanup — confirm against the full file. */
267 int smpi_coll_tuned_alltoall_basic_linear(void *sendbuf, int sendcount,
268 MPI_Datatype sendtype,
269 void *recvbuf, int recvcount,
270 MPI_Datatype recvtype,
273 int system_tag = 888;
274 int i, rank, size, err, count;
276 MPI_Aint sendinc = 0;
277 MPI_Aint recvinc = 0;
278 MPI_Request *requests;
281 rank = smpi_comm_rank(comm);
282 size = smpi_comm_size(comm);
283 DEBUG1("<%d> algorithm alltoall_basic_linear() called.", rank);
/* NOTE(review): err from the sendtype call is overwritten by the recvtype
 * call on the next line, so a sendtype failure goes undetected here. */
284 err = smpi_datatype_extent(sendtype, &lb, &sendinc);
285 err = smpi_datatype_extent(recvtype, &lb, &recvinc);
/* pre-scale the per-rank byte strides once instead of in every iteration */
286 sendinc *= sendcount;
287 recvinc *= recvcount;
288 /* simple optimization */
290 smpi_datatype_copy(&((char *) sendbuf)[rank * sendinc], sendcount,
291 sendtype, &((char *) recvbuf)[rank * recvinc],
292 recvcount, recvtype);
293 if (err == MPI_SUCCESS && size > 1) {
294 /* Initiate all send/recv to/from others. */
295 requests = xbt_new(MPI_Request, 2 * (size - 1));
296 /* Post all receives first -- a simple optimization */
298 for (i = (rank + 1) % size; i != rank; i = (i + 1) % size) {
300 smpi_irecv_init(&((char *) recvbuf)[i * recvinc], recvcount,
301 recvtype, i, system_tag, comm);
304 /* Now post all sends in reverse order
305  * - We would like to minimize the search time through message queue
306  * when messages actually arrive in the order in which they were posted.
307  * TODO: check the previous assertion
309 for (i = (rank + size - 1) % size; i != rank;
310 i = (i + size - 1) % size) {
312 smpi_isend_init(&((char *) sendbuf)[i * sendinc], sendcount,
313 sendtype, i, system_tag, comm);
316 /* Wait for them all. */
317 smpi_mpi_startall(count, requests);
318 DEBUG2("<%d> wait for %d requests", rank, count);
319 smpi_mpi_waitall(count, requests, MPI_STATUS_IGNORE);
328  * this algorithm performs size steps (1<=s<=size) and
329  * at each step s, a process p sends to and receives from a unique distinct remote process
330  * size=5 : s=1: 4->0->1, 0->1->2, 1->2->3, ...
331  *          s=2: 3->0->2, 4->1->3, 0->2->4, 1->3->0 , 2->4->1
333  * Openmpi calls this routine when the message size sent to each rank is greater than 3000 bytes
/* All-to-all, pairwise exchange: one blocking sendrecv per step, pairing
 * each rank with (rank+step) for sends and (rank-step) for receives.
 * NOTE(review): this excerpt is missing lines, including the `MPI_Comm
 * comm` parameter line, the status argument closing the sendrecv call, the
 * loop's closing brace, and the return — confirm against the full file. */
335 int smpi_coll_tuned_alltoall_pairwise(void *sendbuf, int sendcount,
336 MPI_Datatype sendtype, void *recvbuf,
337 int recvcount, MPI_Datatype recvtype,
340 int system_tag = 999;
341 int rank, size, step, sendto, recvfrom, sendsize, recvsize;
343 rank = smpi_comm_rank(comm);
344 size = smpi_comm_size(comm);
345 DEBUG1("<%d> algorithm alltoall_pairwise() called.", rank);
346 sendsize = smpi_datatype_size(sendtype);
347 recvsize = smpi_datatype_size(recvtype);
348 /* Perform pairwise exchange - starting from 1 so the local copy is last */
349 for (step = 1; step < size + 1; step++) {
350 /* who do we talk to in this step? */
351 sendto = (rank + step) % size;
352 recvfrom = (rank + size - step) % size;
353 /* send and receive */
354 smpi_mpi_sendrecv(&((char *) sendbuf)[sendto * sendsize * sendcount],
355 sendcount, sendtype, sendto, system_tag,
356 &((char *) recvbuf)[recvfrom * recvsize * recvcount],
357 recvcount, recvtype, recvfrom, system_tag, comm,
/* All-to-all-v: like the bruck/linear all-to-all above but with per-rank
 * counts and displacements.  Copies the local slice, then posts one irecv
 * and one isend per remote rank with a non-zero count, starts and waits.
 * NOTE(review): this excerpt is missing lines, including the `MPI_Comm
 * comm` parameter line, `MPI_Aint lb`, the DEBUG macro names opening the
 * skip messages, the `requests[count] = ...` / `count++` bookkeeping in
 * both loops, closing braces, and any trailing request cleanup — confirm
 * against the full file. */
363 int smpi_coll_basic_alltoallv(void *sendbuf, int *sendcounts,
364 int *senddisps, MPI_Datatype sendtype,
365 void *recvbuf, int *recvcounts,
366 int *recvdisps, MPI_Datatype recvtype,
369 int system_tag = 889;
370 int i, rank, size, err, count;
372 MPI_Aint sendextent = 0;
373 MPI_Aint recvextent = 0;
374 MPI_Request *requests;
377 rank = smpi_comm_rank(comm);
378 size = smpi_comm_size(comm);
379 DEBUG1("<%d> algorithm basic_alltoallv() called.", rank);
/* NOTE(review): err from the sendtype call is overwritten by the recvtype
 * call on the next line, so a sendtype failure goes undetected here. */
380 err = smpi_datatype_extent(sendtype, &lb, &sendextent);
381 err = smpi_datatype_extent(recvtype, &lb, &recvextent);
382 /* Local copy from self */
384 smpi_datatype_copy(&((char *) sendbuf)[senddisps[rank] * sendextent],
385 sendcounts[rank], sendtype,
386 &((char *) recvbuf)[recvdisps[rank] * recvextent],
387 recvcounts[rank], recvtype);
388 if (err == MPI_SUCCESS && size > 1) {
389 /* Initiate all send/recv to/from others. */
390 requests = xbt_new(MPI_Request, 2 * (size - 1));
392 /* Create all receives that will be posted first */
393 for (i = 0; i < size; ++i) {
/* skip self (already copied) and zero-length transfers */
394 if (i == rank || recvcounts[i] == 0) {
396 ("<%d> skip request creation [src = %d, recvcounts[src] = %d]",
397 rank, i, recvcounts[i]);
401 smpi_irecv_init(&((char *) recvbuf)[recvdisps[i] * recvextent],
402 recvcounts[i], recvtype, i, system_tag, comm);
405 /* Now create all sends */
406 for (i = 0; i < size; ++i) {
407 if (i == rank || sendcounts[i] == 0) {
409 ("<%d> skip request creation [dst = %d, sendcounts[dst] = %d]",
410 rank, i, sendcounts[i]);
414 smpi_isend_init(&((char *) sendbuf)[senddisps[i] * sendextent],
415 sendcounts[i], sendtype, i, system_tag, comm);
418 /* Wait for them all. */
419 smpi_mpi_startall(count, requests);
420 DEBUG2("<%d> wait for %d requests", rank, count);
421 smpi_mpi_waitall(count, requests, MPI_STATUS_IGNORE);