2 * ompi_coll_tuned_allgather_intra_neighborexchange
4 * Function: allgather using N/2 steps (O(N))
5 * Accepts: Same arguments as MPI_Allgather
6 * Returns: MPI_SUCCESS or error code
8 * Description: Neighbor Exchange algorithm for allgather.
9 * Described by Chen et al. in
10 * "Performance Evaluation of Allgather Algorithms on
11 * Terascale Linux Cluster with Fast Ethernet",
12 * Proceedings of the Eighth International Conference on
13 * High-Performance Computing in Asia-Pacific Region
16 * Rank r exchanges message with one of its neighbors and
17 * forwards the data further in the next step.
19 * No additional memory requirements.
21 * Limitations: Algorithm works only on an even number of processes.
22 * For an odd number of processes we switch to the ring algorithm.
27 * [0] [ ] [ ] [ ] [ ] [ ]
28 * [ ] [1] [ ] [ ] [ ] [ ]
29 * [ ] [ ] [2] [ ] [ ] [ ]
30 * [ ] [ ] [ ] [3] [ ] [ ]
31 * [ ] [ ] [ ] [ ] [4] [ ]
32 * [ ] [ ] [ ] [ ] [ ] [5]
35 * [0] [0] [ ] [ ] [ ] [ ]
36 * [1] [1] [ ] [ ] [ ] [ ]
37 * [ ] [ ] [2] [2] [ ] [ ]
38 * [ ] [ ] [3] [3] [ ] [ ]
39 * [ ] [ ] [ ] [ ] [4] [4]
40 * [ ] [ ] [ ] [ ] [5] [5]
43 * [0] [0] [0] [ ] [ ] [0]
44 * [1] [1] [1] [ ] [ ] [1]
45 * [ ] [2] [2] [2] [2] [ ]
46 * [ ] [3] [3] [3] [3] [ ]
47 * [4] [ ] [ ] [4] [4] [4]
48 * [5] [ ] [ ] [5] [5] [5]
51 * [0] [0] [0] [0] [0] [0]
52 * [1] [1] [1] [1] [1] [1]
53 * [2] [2] [2] [2] [2] [2]
54 * [3] [3] [3] [3] [3] [3]
55 * [4] [4] [4] [4] [4] [4]
56 * [5] [5] [5] [5] [5] [5]
59 #include "colls_private.h"
61 smpi_coll_tuned_allgather_ompi_neighborexchange(void *sbuf, int scount,
63 void* rbuf, int rcount,
70 int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
73 ptrdiff_t slb, rlb, sext, rext;
74 char *tmpsend = NULL, *tmprecv = NULL;
76 size = smpi_comm_size(comm);
77 rank = smpi_comm_rank(comm);
81 "coll:tuned:allgather_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm",
83 return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
89 "coll:tuned:allgather_intra_neighborexchange rank %d", rank);
91 err = smpi_datatype_extent (sdtype, &slb, &sext);
92 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
94 err = smpi_datatype_extent (rdtype, &rlb, &rext);
95 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
97 /* Initialization step:
98 - if send buffer is not MPI_IN_PLACE, copy send buffer to appropriate block
101 tmprecv = (char*) rbuf + rank * rcount * rext;
102 if (MPI_IN_PLACE != sbuf) {
103 tmpsend = (char*) sbuf;
104 smpi_datatype_copy (tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
107 /* Determine neighbors, order in which blocks will arrive, etc. */
108 even_rank = !(rank % 2);
110 neighbor[0] = (rank + 1) % size;
111 neighbor[1] = (rank - 1 + size) % size;
112 recv_data_from[0] = rank;
113 recv_data_from[1] = rank;
114 offset_at_step[0] = (+2);
115 offset_at_step[1] = (-2);
117 neighbor[0] = (rank - 1 + size) % size;
118 neighbor[1] = (rank + 1) % size;
119 recv_data_from[0] = neighbor[0];
120 recv_data_from[1] = neighbor[0];
121 offset_at_step[0] = (-2);
122 offset_at_step[1] = (+2);
125 /* Communication loop:
126 - First step is special: exchange a single block with neighbor[0].
128 update recv_data_from according to offset, and
129 exchange two blocks with appropriate neighbor.
130 the send location becomes previous receive location.
132 tmprecv = (char*)rbuf + neighbor[0] * rcount * rext;
133 tmpsend = (char*)rbuf + rank * rcount * rext;
135 smpi_mpi_sendrecv(tmpsend, rcount, rdtype, neighbor[0],
137 tmprecv, rcount, rdtype, neighbor[0],
139 comm, MPI_STATUS_IGNORE);
141 /* Determine initial sending location */
143 send_data_from = rank;
145 send_data_from = recv_data_from[0];
148 for (i = 1; i < (size / 2); i++) {
149 const int i_parity = i % 2;
150 recv_data_from[i_parity] =
151 (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;
153 tmprecv = (char*)rbuf + recv_data_from[i_parity] * rcount * rext;
154 tmpsend = (char*)rbuf + send_data_from * rcount * rext;
157 smpi_mpi_sendrecv(tmpsend, 2 * rcount, rdtype,
160 tmprecv, 2 * rcount, rdtype,
163 comm, MPI_STATUS_IGNORE);
165 send_data_from = recv_data_from[i_parity];
171 XBT_DEBUG( "%s:%4d\tError occurred %d, rank %2d",
172 __FILE__, line, err, rank);