2 /* IMPLEMENTED BY PITCH PATARASUK
3 Non-topology-specific (however, the number of cores per node needs to be changed)
4 all-reduce operation designed for smp clusters
5 It uses 2-layer communication: binomial trees for both the intra-node and inter-node phases
8 /* change the number of cores per SMP node;
9 we assume that the number of cores per node will be the same for all implementations */
15 Use -DMPICH2 if this code does not compile.
16 MPICH1 code also works on MPICH2 on our cluster and the performance is similar.
17 This code assumes a commutative and associative reduction operator (MPI_SUM, MPI_MAX, etc.).
20 //#include <star-reduction.c>
23 This function performs the all-reduce operation as follows.
24 1) binomial_tree reduce inside each SMP node
25 2) binomial_tree reduce inter-communication between the roots of the SMP nodes
26 3) binomial_tree bcast inter-communication between the roots of the SMP nodes
27 4) binomial_tree bcast inside each SMP node
29 int smpi_coll_tuned_allreduce_smp_binomial(void *send_buf, void *recv_buf,
30 int count, MPI_Datatype dtype,
31 MPI_Op op, MPI_Comm comm)
/* Two-level (SMP-aware) all-reduce:
   1) binomial-tree reduce inside each SMP node,
   2) binomial-tree reduce between the node roots,
   3) binomial-tree broadcast between the node roots,
   4) binomial-tree broadcast inside each SMP node.
   NOTE(review): this chunk is elided — the opening brace, the declarations of
   comm_size, rank, tag, mask, status, tmp_buf, extent and lb, the `mask <<= 1`
   updates, the else branches, most closing braces, and the function tail
   (freeing tmp_buf, return value) are not visible here. */
/* fixed number of cores per SMP node (compile-time constant, see file header) */
37 int num_core = NUM_CORE;
/* Resolve the user reduction function for `op`.
   NOTE(review): `op % 16 - 1` indexes MPICH2's internal builtin-op table; this
   only works for builtin commutative ops (MPI_SUM, MPI_MAX, ...) — confirm. */
40 #ifdef MPICH2_REDUCTION
41 MPI_User_function * uop = MPIR_Op_table[op % 16 - 1];
/* (elided #else) MPICH1 path: resolve the op through its handle pointer */
43 MPI_User_function *uop;
44 struct MPIR_OP *op_ptr;
45 op_ptr = MPIR_ToPointer(op);
50 comm_size=smpi_comm_size(comm);
51 rank=smpi_comm_rank(comm);
53 smpi_datatype_extent(dtype, &lb, &extent);
/* scratch buffer for incoming partial results.
   NOTE(review): malloc result is unchecked, and `lb` is ignored in the size
   computation (assumes lower bound 0 for the datatype). */
54 tmp_buf = (void *) malloc(count * extent);
56 /* compute intra and inter ranking */
57 int intra_rank, inter_rank;
/* position of this process inside its SMP node */
58 intra_rank = rank % num_core;
/* index of the SMP node this process belongs to */
59 inter_rank = rank / num_core;
61 /* size of processes participate in intra communications =>
62 should be equal to number of machines */
/* ceiling division: the last node may hold fewer than num_core processes */
63 int inter_comm_size = (comm_size + num_core - 1) / num_core;
65 /* copy input buffer to output buffer */
/* self send/recv used as a local copy so the reduce can work in recv_buf */
66 smpi_mpi_sendrecv(send_buf, count, dtype, rank, tag,
67 recv_buf, count, dtype, rank, tag, comm, &status);
69 /* start binomial reduce intra communication inside each SMP node */
/* Phase 1: at each step, ranks whose `mask` bit is 0 receive the partner's
   partial result and fold it in; the others send theirs and drop out.
   (mask initialization / shifting and the else branch are elided.) */
71 while (mask < num_core) {
72 if ((mask & intra_rank) == 0) {
73 src = (inter_rank * num_core) + (intra_rank | mask);
/* guard: the partner may not exist on a partially filled last node */
74 if (src < comm_size) {
75 smpi_mpi_recv(tmp_buf, count, dtype, src, tag, comm, &status);
76 star_reduction(op, tmp_buf, recv_buf, &count, &dtype);
/* (elided else) send own partial result up the tree and leave the loop */
79 dst = (inter_rank * num_core) + (intra_rank & (~mask));
80 smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
86 /* start binomial reduce inter-communication between each SMP nodes:
87 each node only have one process that can communicate to other nodes */
/* Phase 2: same binomial pattern, but only the node roots (intra_rank == 0)
   participate; partner ranks are node indices scaled by num_core. */
88 if (intra_rank == 0) {
90 while (mask < inter_comm_size) {
91 if ((mask & inter_rank) == 0) {
92 src = (inter_rank | mask) * num_core;
93 if (src < comm_size) {
94 smpi_mpi_recv(tmp_buf, count, dtype, src, tag, comm, &status);
95 star_reduction(op, tmp_buf, recv_buf, &count, &dtype);
/* (elided else) forward partial result toward node 0's root */
98 dst = (inter_rank & (~mask)) * num_core;
99 smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
106 /* start binomial broadcast inter-communication between each SMP nodes:
107 each node only have one process that can communicate to other nodes */
/* Phase 3: binomial broadcast of the full result among node roots.
   A root first receives from (inter_rank - mask); the loop structure between
   the receive half and the send half is elided — presumably the mask is then
   walked back down while sending to (inter_rank + mask). TODO confirm. */
108 if (intra_rank == 0) {
110 while (mask < inter_comm_size) {
111 if (inter_rank & mask) {
112 src = (inter_rank - mask) * num_core;
113 smpi_mpi_recv(recv_buf, count, dtype, src, tag, comm, &status);
121 if (inter_rank < inter_comm_size) {
122 dst = (inter_rank + mask) * num_core;
/* guard: do not send to ranks beyond the communicator */
123 if (dst < comm_size) {
124 smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
131 /* start binomial broadcast intra-communication inside each SMP nodes */
/* Phase 4: broadcast from each node root to the node's other cores.
   The last node may be partially filled, so cap the participant count. */
132 int num_core_in_current_smp = num_core;
133 if (inter_rank == (inter_comm_size - 1)) {
134 num_core_in_current_smp = comm_size - (inter_rank * num_core);
137 while (mask < num_core_in_current_smp) {
138 if (intra_rank & mask) {
139 src = (inter_rank * num_core) + (intra_rank - mask);
140 smpi_mpi_recv(recv_buf, count, dtype, src, tag, comm, &status);
/* (elided) send half of the intra-node broadcast */
148 dst = (inter_rank * num_core) + (intra_rank + mask);
149 if (dst < comm_size) {
150 smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);