2 /* IMPLEMENTED BY PITCH PATARASUK
3 Non-topology-specific (however, the number of cores per node needs to be changed)
4 all-reduce operation designed for smp clusters
5 It uses 2-layer communication: binomial trees for both the intra-node and inter-node phases
8 /* change the number of cores per SMP node;
9 we assume that the number of cores per node will be the same for all implementations */
15 Use -DMPICH2 if this code does not compile.
16 MPICH1 code also works on MPICH2 on our cluster and the performance is similar.
17 This code assumes a commutative and associative reduction operator (MPI_SUM, MPI_MAX, etc.).
20 //#include <star-reduction.c>
23 This function performs the all-reduce operation as follows.
24 1) binomial_tree reduce inside each SMP node
25 2) binomial_tree reduce inter-communication between the roots of the SMP nodes
26 3) binomial_tree bcast inter-communication between the roots of the SMP nodes
27 4) binomial_tree bcast inside each SMP node
29 int smpi_coll_tuned_allreduce_smp_binomial(void *send_buf, void *recv_buf,
30 int count, MPI_Datatype dtype,
31 MPI_Op op, MPI_Comm comm)
/* Two-level (SMP-aware) all-reduce:
   1) binomial-tree reduce inside each SMP node,
   2) binomial-tree reduce between the node roots,
   3) binomial-tree broadcast between the node roots,
   4) binomial-tree broadcast inside each SMP node.
   NOTE(review): this chunk is elided — the opening brace, the declarations of
   comm_size, rank, tag, mask, status, tmp_buf, extent and lb, the `mask <<= 1`
   updates, the else branches, most closing braces, and the function tail
   (freeing tmp_buf, return value) are not visible here. */
/* fixed number of cores per SMP node (compile-time constant, see file header) */
37 int num_core = NUM_CORE;
/* Resolve the user reduction function for `op`.
   NOTE(review): `op % 16 - 1` indexes MPICH2's internal builtin-op table; this
   only works for builtin commutative ops (MPI_SUM, MPI_MAX, ...) — confirm. */
40 #ifdef MPICH2_REDUCTION
41 MPI_User_function * uop = MPIR_Op_table[op % 16 - 1];
/* (elided #else) MPICH1 path: resolve the op through its handle pointer */
43 MPI_User_function *uop;
44 struct MPIR_OP *op_ptr;
45 op_ptr = MPIR_ToPointer(op);
50 comm_size=smpi_comm_size(comm);
51 rank=smpi_comm_rank(comm);
53 smpi_datatype_extent(dtype, &lb, &extent);
/* scratch buffer for incoming partial results.
   NOTE(review): malloc result is unchecked, and `lb` is ignored in the size
   computation (assumes lower bound 0 for the datatype). */
54 tmp_buf = (void *) malloc(count * extent);
56 /* compute intra and inter ranking */
57 int intra_rank, inter_rank;
/* position of this process inside its SMP node */
58 intra_rank = rank % num_core;
/* index of the SMP node this process belongs to */
59 inter_rank = rank / num_core;
61 /* size of processes participate in intra communications =>
62 should be equal to number of machines */
/* ceiling division: the last node may hold fewer than num_core processes */
63 int inter_comm_size = (comm_size + num_core - 1) / num_core;
65 /* copy input buffer to output buffer */
/* self send/recv used as a local copy so the reduce can work in recv_buf */
66 smpi_mpi_sendrecv(send_buf, count, dtype, rank, tag,
67 recv_buf, count, dtype, rank, tag, comm, &status);
69 /* start binomial reduce intra communication inside each SMP node */
/* Phase 1: at each step, ranks whose `mask` bit is 0 receive the partner's
   partial result and fold it in; the others send theirs and drop out.
   (mask initialization / shifting and the else branch are elided.) */
71 while (mask < num_core) {
72 if ((mask & intra_rank) == 0) {
73 src = (inter_rank * num_core) + (intra_rank | mask);
/* guard: the partner may not exist on a partially filled last node */
74 if (src < comm_size) {
75 smpi_mpi_recv(tmp_buf, count, dtype, src, tag, comm, &status);
76 star_reduction(op, tmp_buf, recv_buf, &count, &dtype);
/* (elided else) send own partial result up the tree and leave the loop */
79 dst = (inter_rank * num_core) + (intra_rank & (~mask));
80 smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
86 /* start binomial reduce inter-communication between each SMP nodes:
87 each node only have one process that can communicate to other nodes */
/* Phase 2: same binomial pattern, but only the node roots (intra_rank == 0)
   participate; partner ranks are node indices scaled by num_core. */
88 if (intra_rank == 0) {
90 while (mask < inter_comm_size) {
91 if ((mask & inter_rank) == 0) {
92 src = (inter_rank | mask) * num_core;
93 if (src < comm_size) {
94 smpi_mpi_recv(tmp_buf, count, dtype, src, tag, comm, &status);
95 star_reduction(op, tmp_buf, recv_buf, &count, &dtype);
/* (elided else) forward partial result toward node 0's root */
98 dst = (inter_rank & (~mask)) * num_core;
99 smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
106 /* start binomial broadcast inter-communication between each SMP nodes:
107 each node only have one process that can communicate to other nodes */
/* Phase 3: binomial broadcast of the full result among node roots.
   A root first receives from (inter_rank - mask); the loop structure between
   the receive half and the send half is elided — presumably the mask is then
   walked back down while sending to (inter_rank + mask). TODO confirm. */
108 if (intra_rank == 0) {
110 while (mask < inter_comm_size) {
111 if (inter_rank & mask) {
112 src = (inter_rank - mask) * num_core;
113 smpi_mpi_recv(recv_buf, count, dtype, src, tag, comm, &status);
121 if (inter_rank < inter_comm_size) {
122 dst = (inter_rank + mask) * num_core;
/* guard: do not send to ranks beyond the communicator */
123 if (dst < comm_size) {
124 smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
131 /* start binomial broadcast intra-communication inside each SMP nodes */
/* Phase 4: broadcast from each node root to the node's other cores.
   The last node may be partially filled, so cap the participant count. */
132 int num_core_in_current_smp = num_core;
133 if (inter_rank == (inter_comm_size - 1)) {
134 num_core_in_current_smp = comm_size - (inter_rank * num_core);
137 while (mask < num_core_in_current_smp) {
138 if (intra_rank & mask) {
139 src = (inter_rank * num_core) + (intra_rank - mask);
140 smpi_mpi_recv(recv_buf, count, dtype, src, tag, comm, &status);
/* (elided) send half of the intra-node broadcast */
148 dst = (inter_rank * num_core) + (intra_rank + mask);
149 if (dst < comm_size) {
150 smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);