2 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
3 * University Research and Technology
4 * Corporation. All rights reserved.
5 * Copyright (c) 2004-2009 The University of Tennessee and The University
6 * of Tennessee Research Foundation. All rights
8 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9 * University of Stuttgart. All rights reserved.
10 * Copyright (c) 2004-2005 The Regents of the University of California.
11 * All rights reserved.
12 * Copyright (c) 2009 University of Houston. All rights reserved.
15 * Additional copyrights may follow
20 #include "colls_private.h"
22 * ompi_coll_tuned_allgatherv_intra_bruck
24 * Function: allgather using O(log(N)) steps.
25 * Accepts: Same arguments as MPI_Allgather
26 * Returns: MPI_SUCCESS or error code
28 * Description: Variation to All-to-all algorithm described by Bruck et al.in
29 * "Efficient Algorithms for All-to-all Communications
30 * in Multiport Message-Passing Systems"
31 * Note: Unlike in case of allgather implementation, we relay on
32 * indexed datatype to select buffers appropriately.
33 * The only additional memory requirement is for creation of
34 * temporary datatypes.
35 * Example on 7 nodes (memory lay out need not be in-order)
38 * [0] [ ] [ ] [ ] [ ] [ ] [ ]
39 * [ ] [1] [ ] [ ] [ ] [ ] [ ]
40 * [ ] [ ] [2] [ ] [ ] [ ] [ ]
41 * [ ] [ ] [ ] [3] [ ] [ ] [ ]
42 * [ ] [ ] [ ] [ ] [4] [ ] [ ]
43 * [ ] [ ] [ ] [ ] [ ] [5] [ ]
44 * [ ] [ ] [ ] [ ] [ ] [ ] [6]
45 * Step 0: send message to (rank - 2^0), receive message from (rank + 2^0)
47 * [0] [ ] [ ] [ ] [ ] [ ] [0]
48 * [1] [1] [ ] [ ] [ ] [ ] [ ]
49 * [ ] [2] [2] [ ] [ ] [ ] [ ]
50 * [ ] [ ] [3] [3] [ ] [ ] [ ]
51 * [ ] [ ] [ ] [4] [4] [ ] [ ]
52 * [ ] [ ] [ ] [ ] [5] [5] [ ]
53 * [ ] [ ] [ ] [ ] [ ] [6] [6]
54 * Step 1: send message to (rank - 2^1), receive message from (rank + 2^1).
55 * message contains all blocks from (rank) .. (rank + 2^2) with
58 * [0] [ ] [ ] [ ] [0] [0] [0]
59 * [1] [1] [ ] [ ] [ ] [1] [1]
60 * [2] [2] [2] [ ] [ ] [ ] [2]
61 * [3] [3] [3] [3] [ ] [ ] [ ]
62 * [ ] [4] [4] [4] [4] [ ] [ ]
63 * [ ] [ ] [5] [5] [5] [5] [ ]
64 * [ ] [ ] [ ] [6] [6] [6] [6]
65 * Step 2: send message to (rank - 2^2), receive message from (rank + 2^2).
66 * message size is "all remaining blocks"
68 * [0] [0] [0] [0] [0] [0] [0]
69 * [1] [1] [1] [1] [1] [1] [1]
70 * [2] [2] [2] [2] [2] [2] [2]
71 * [3] [3] [3] [3] [3] [3] [3]
72 * [4] [4] [4] [4] [4] [4] [4]
73 * [5] [5] [5] [5] [5] [5] [5]
74 * [6] [6] [6] [6] [6] [6] [6]
76 int smpi_coll_tuned_allgatherv_ompi_bruck(void *sbuf, int scount,
78 void *rbuf, int *rcounts,
84 int sendto, recvfrom, distance, blockcount, i;
85 int *new_rcounts = NULL, *new_rdispls = NULL;
86 int *new_scounts = NULL, *new_sdispls = NULL;
87 ptrdiff_t slb, rlb, sext, rext;
88 char *tmpsend = NULL, *tmprecv = NULL;
89 MPI_Datatype new_rdtype, new_sdtype;
91 size = smpi_comm_size(comm);
92 rank = smpi_comm_rank(comm);
95 "coll:tuned:allgather_ompi_bruck rank %d", rank);
97 smpi_datatype_extent (sdtype, &slb, &sext);
99 smpi_datatype_extent (rdtype, &rlb, &rext);
101 /* Initialization step:
102 - if send buffer is not MPI_IN_PLACE, copy send buffer to block rank of
105 tmprecv = (char*) rbuf + rdispls[rank] * rext;
106 if (MPI_IN_PLACE != sbuf) {
107 tmpsend = (char*) sbuf;
108 smpi_datatype_copy(tmpsend, scount, sdtype,
109 tmprecv, rcounts[rank], rdtype);
112 /* Communication step:
113 At every step i, rank r:
114 - doubles the distance
115 - sends message with blockcount blocks, (rbuf[rank] .. rbuf[rank + 2^i])
116 to rank (r - distance)
117 - receives message of blockcount blocks,
118 (rbuf[r + distance] ... rbuf[(r+distance) + 2^i]) from
120 - blockcount doubles until the last step when only the remaining data is
124 tmpsend = (char*) rbuf;
126 new_rcounts = (int*) calloc(4*size, sizeof(int));
127 new_rdispls = new_rcounts + size;
128 new_scounts = new_rdispls + size;
129 new_sdispls = new_scounts + size;
131 for (distance = 1; distance < size; distance<<=1) {
133 recvfrom = (rank + distance) % size;
134 sendto = (rank - distance + size) % size;
136 if (distance <= (size >> 1)) {
137 blockcount = distance;
139 blockcount = size - distance;
142 /* create send and receive datatypes */
143 for (i = 0; i < blockcount; i++) {
144 const int tmp_srank = (rank + i) % size;
145 const int tmp_rrank = (recvfrom + i) % size;
146 new_scounts[i] = rcounts[tmp_srank];
147 new_sdispls[i] = rdispls[tmp_srank];
148 new_rcounts[i] = rcounts[tmp_rrank];
149 new_rdispls[i] = rdispls[tmp_rrank];
151 smpi_datatype_indexed(blockcount, new_scounts, new_sdispls,
152 rdtype, &new_sdtype);
153 smpi_datatype_indexed(blockcount, new_rcounts, new_rdispls,
154 rdtype, &new_rdtype);
156 smpi_datatype_commit(&new_sdtype);
157 smpi_datatype_commit(&new_rdtype);
160 smpi_mpi_sendrecv(rbuf, 1, new_sdtype, sendto,
162 rbuf, 1, new_rdtype, recvfrom,
164 comm, MPI_STATUS_IGNORE);
165 smpi_datatype_free(&new_sdtype);
166 smpi_datatype_free(&new_rdtype);