src/smpi/colls/barrier/barrier-ompi.cpp

   1 /* Copyright (c) 2013-2022. The SimGrid Team.
   2  * All rights reserved.                                                     */
   3
   4 /* This program is free software; you can redistribute it and/or modify it
   5  * under the terms of the license (GNU LGPL) which comes with this package. */
   6
   7 /*
   8  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
   9  *                         University Research and Technology
  10  *                         Corporation.  All rights reserved.
  11  * Copyright (c) 2004-2006 The University of Tennessee and The University
  12  *                         of Tennessee Research Foundation.  All rights
  13  *                         reserved.
  14  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  15  *                         University of Stuttgart.  All rights reserved.
  16  * Copyright (c) 2004-2005 The Regents of the University of California.
  17  *                         All rights reserved.
  18  * Copyright (c) 2008      Sun Microsystems, Inc.  All rights reserved.
  19  *
  20  * Additional copyrights may follow
  21  */
  22
  #include "../coll_tuned_topo.hpp"
  #include "../colls_private.hpp"
  #include "smpi_actor.hpp"

  #include <vector>
  26
  27 /*
  28  * Barrier is meant to be a synchronous operation. Because some BTLs can mark
  29  * a request done before it is passed to the NIC, and progress might not be made
  30  * elsewhere, we cannot allow a process to exit the barrier until its last
  31  * [round of] sends is completed.
  32  *
  33  * It is the last round of sends rather than the 'last' individual send, as each
  34  * pair of peers can use different channels/devices/BTLs, and the receiver of one
  35  * of these sends might be forced to wait when the sender leaves the collective
  36  * and does not make progress until its next MPI call.
  37  *
  38  */
  39
  40 /*
  41  * Simple double ring version of barrier
  42  *
  43  * The synchronous guarantee comes from the last ring of sends being synchronous.
  44  *
  45  */
  46 namespace simgrid {
  47 namespace smpi {
  48 int barrier__ompi_doublering(MPI_Comm comm)
  49 {
  50     int rank, size;
  51     int left, right;
  52
  53
  54     rank = comm->rank();
  55     size = comm->size();
  56     int tag = smpi_process()->finalizing() ? COLL_TAG_BARRIER-1: COLL_TAG_BARRIER;
  57     XBT_DEBUG("ompi_coll_tuned_barrier_ompi_doublering rank %d", rank);
  58
  59     left = ((rank-1+size)%size);
  60     right = ((rank+1)%size);
  61
  62     if (rank > 0) { /* receive message from the left */
  63       Request::recv(nullptr, 0, MPI_BYTE, left, tag, comm, MPI_STATUS_IGNORE);
  64     }
  65
  66     /* Send message to the right */
  67     Request::send(nullptr, 0, MPI_BYTE, right, tag, comm);
  68
  69     /* root needs to receive from the last node */
  70     if (rank == 0) {
  71       Request::recv(nullptr, 0, MPI_BYTE, left, tag, comm, MPI_STATUS_IGNORE);
  72     }
  73
  74     /* Allow nodes to exit */
  75     if (rank > 0) { /* post Receive from left */
  76       Request::recv(nullptr, 0, MPI_BYTE, left, tag, comm, MPI_STATUS_IGNORE);
  77     }
  78
  79     /* send message to the right one */
  80     Request::send(nullptr, 0, MPI_BYTE, right, tag, comm);
  81
  82     /* rank 0 post receive from the last node */
  83     if (rank == 0) {
  84       Request::recv(nullptr, 0, MPI_BYTE, left, tag, comm, MPI_STATUS_IGNORE);
  85     }
  86
  87     return MPI_SUCCESS;
  88
  89 }
  90
  91
  92 /*
  93  * To make synchronous, uses sync sends and sync sendrecvs
  94  */
  95
  96 int barrier__ompi_recursivedoubling(MPI_Comm comm)
  97 {
  98     int rank, size, adjsize;
  99     int mask, remote;
 100
 101     rank = comm->rank();
 102     size = comm->size();
 103     int tag = smpi_process()->finalizing() ? COLL_TAG_BARRIER-1: COLL_TAG_BARRIER;
 104     XBT_DEBUG(
 105                  "ompi_coll_tuned_barrier_ompi_recursivedoubling rank %d",
 106                  rank);
 107
 108     /* do nearest power of 2 less than size calc */
 109     for( adjsize = 1; adjsize <= size; adjsize <<= 1 );
 110     adjsize >>= 1;
 111
 112     /* if size is not exact power of two, perform an extra step */
 113     if (adjsize != size) {
 114         if (rank >= adjsize) {
 115             /* send message to lower ranked node */
 116             remote = rank - adjsize;
 117             Request::sendrecv(nullptr, 0, MPI_BYTE, remote, tag, nullptr, 0, MPI_BYTE, remote,
 118                               tag, comm, MPI_STATUS_IGNORE);
 119
 120         } else if (rank < (size - adjsize)) {
 121
 122             /* receive message from high level rank */
 123             Request::recv(nullptr, 0, MPI_BYTE, rank + adjsize, tag, comm, MPI_STATUS_IGNORE);
 124         }
 125     }
 126
 127     /* exchange messages */
 128     if ( rank < adjsize ) {
 129         mask = 0x1;
 130         while ( mask < adjsize ) {
 131             remote = rank ^ mask;
 132             mask <<= 1;
 133             if (remote >= adjsize) continue;
 134
 135             /* post receive from the remote node */
 136             Request::sendrecv(nullptr, 0, MPI_BYTE, remote, tag, nullptr, 0, MPI_BYTE, remote,
 137                               tag, comm, MPI_STATUS_IGNORE);
 138         }
 139     }
 140
 141     /* non-power of 2 case */
 142     if (adjsize != size) {
 143         if (rank < (size - adjsize)) {
 144             /* send enter message to higher ranked node */
 145             remote = rank + adjsize;
 146             Request::send(nullptr, 0, MPI_BYTE, remote, tag, comm);
 147         }
 148     }
 149
 150     return MPI_SUCCESS;
 151
 152 }
 153
 154
 155 /*
 156  * To make synchronous, uses sync sends and sync sendrecvs
 157  */
 158
 159 int barrier__ompi_bruck(MPI_Comm comm)
 160 {
 161     int rank, size;
 162     int distance, to, from;
 163
 164     rank = comm->rank();
 165     size = comm->size();
 166     int tag = smpi_process()->finalizing() ? COLL_TAG_BARRIER-1: COLL_TAG_BARRIER;
 167     XBT_DEBUG(
 168                  "ompi_coll_tuned_barrier_ompi_bruck rank %d", rank);
 169
 170     /* exchange data with rank-2^k and rank+2^k */
 171     for (distance = 1; distance < size; distance <<= 1) {
 172         from = (rank + size - distance) % size;
 173         to   = (rank + distance) % size;
 174
 175         /* send message to lower ranked node */
 176         Request::sendrecv(nullptr, 0, MPI_BYTE, to, tag, nullptr, 0, MPI_BYTE, from, tag,
 177                           comm, MPI_STATUS_IGNORE);
 178     }
 179
 180     return MPI_SUCCESS;
 181
 182 }
 183
 184
 185 /*
 186  * To make synchronous, uses sync sends and sync sendrecvs
 187  */
 188 /* special case for two processes */
 189 int barrier__ompi_two_procs(MPI_Comm comm)
 190 {
 191     int remote;
 192
 193     remote = comm->rank();
 194     int tag = smpi_process()->finalizing() ? COLL_TAG_BARRIER-1: COLL_TAG_BARRIER;
 195     XBT_DEBUG(
 196                  "ompi_coll_tuned_barrier_ompi_two_procs rank %d", remote);
 197     remote = (remote + 1) & 0x1;
 198
 199     Request::sendrecv(nullptr, 0, MPI_BYTE, remote, tag, nullptr, 0, MPI_BYTE, remote, tag,
 200                       comm, MPI_STATUS_IGNORE);
 201     return (MPI_SUCCESS);
 202 }
 203
 204
 205 /*
 206  * Linear functions are copied from the BASIC coll module
 207  * they do not segment the message and are simple implementations
 208  * but for some small number of nodes and/or small data sizes they
 209  * are just as fast as tuned/tree based segmenting operations
 210  * and as such may be selected by the decision functions
 211  * These are copied into this module due to the way we select modules
 212  * in V1. i.e. in V2 we will handle this differently and so will not
 213  * have to duplicate code.
 214  * GEF Oct05 after asking Jeff.
 215  */
 216
 217 /* copied function (with appropriate renaming) starts here */
 218
 219 int barrier__ompi_basic_linear(MPI_Comm comm)
 220 {
 221     int i;
 222     int size = comm->size();
 223     int rank = comm->rank();
 224
 225     int tag = smpi_process()->finalizing() ? COLL_TAG_BARRIER-1: COLL_TAG_BARRIER;
 226     /* All non-root send & receive zero-length message. */
 227
 228     if (rank > 0) {
 229       Request::send(nullptr, 0, MPI_BYTE, 0, tag, comm);
 230
 231       Request::recv(nullptr, 0, MPI_BYTE, 0, tag, comm, MPI_STATUS_IGNORE);
 232     }
 233
 234     /* The root collects and broadcasts the messages. */
 235
 236     else {
 237         MPI_Request* requests;
 238
 239         requests = new MPI_Request[size];
 240         for (i = 1; i < size; ++i) {
 241           requests[i] = Request::irecv(nullptr, 0, MPI_BYTE, i, tag, comm);
 242         }
 243         Request::waitall( size-1, requests+1, MPI_STATUSES_IGNORE );
 244
 245         for (i = 1; i < size; ++i) {
 246           requests[i] = Request::isend(nullptr, 0, MPI_BYTE, i, tag, comm);
 247         }
 248         Request::waitall( size-1, requests+1, MPI_STATUSES_IGNORE );
 249         delete[] requests;
 250     }
 251
 252     /* All done */
 253
 254     return MPI_SUCCESS;
 255
 256 }
 257 /* copied function (with appropriate renaming) ends here */
 258
 259 /*
 260  * Another recursive doubling type algorithm, but in this case
 261  * we go up the tree and back down the tree.
 262  */
 263 int barrier__ompi_tree(MPI_Comm comm)
 264 {
 265     int rank, size, depth;
 266     int jump, partner;
 267
 268     rank = comm->rank();
 269     size = comm->size();
 270     int tag = smpi_process()->finalizing() ? COLL_TAG_BARRIER-1: COLL_TAG_BARRIER;
 271     XBT_DEBUG(
 272                  "ompi_coll_tuned_barrier_ompi_tree %d",
 273                  rank);
 274
 275     /* Find the nearest power of 2 of the communicator size. */
 276     for(depth = 1; depth < size; depth <<= 1 );
 277
 278     for (jump=1; jump<depth; jump<<=1) {
 279         partner = rank ^ jump;
 280         if (!(partner & (jump-1)) && partner < size) {
 281             if (partner > rank) {
 282               Request::recv(nullptr, 0, MPI_BYTE, partner, tag, comm, MPI_STATUS_IGNORE);
 283             } else if (partner < rank) {
 284               Request::send(nullptr, 0, MPI_BYTE, partner, tag, comm);
 285             }
 286         }
 287     }
 288
 289     depth>>=1;
 290     for (jump = depth; jump>0; jump>>=1) {
 291         partner = rank ^ jump;
 292         if (!(partner & (jump-1)) && partner < size) {
 293             if (partner > rank) {
 294               Request::send(nullptr, 0, MPI_BYTE, partner, tag, comm);
 295             } else if (partner < rank) {
 296               Request::recv(nullptr, 0, MPI_BYTE, partner, tag, comm, MPI_STATUS_IGNORE);
 297             }
 298         }
 299     }
 300
 301     return MPI_SUCCESS;
 302 }
 303
 304 }
 305 }