src/smpi/colls/reduce/reduce-ompi.cpp
/* Copyright (c) 2013-2022. The SimGrid Team.
 * All rights reserved.                                                     */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2009 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 *
 * Additional copyrights may follow
 */

#include "../coll_tuned_topo.hpp"
#include "../colls_private.hpp"
namespace simgrid::smpi {

int smpi_coll_tuned_ompi_reduce_generic(const void* sendbuf, void* recvbuf, int original_count,
                                        MPI_Datatype datatype, MPI_Op op,
                                        int root, MPI_Comm comm,
                                        ompi_coll_tree_t* tree, int count_by_segment,
                                        int max_outstanding_reqs);
/**
 * This is a generic implementation of the reduce protocol. It uses the tree
 * provided as an argument and executes all operations on segments of
 * count_by_segment elements of the given datatype. For the last communication
 * it reduces the count so that the total number of elements processed matches
 * original_count.
 *
 * Note that for non-commutative operations we cannot avoid the memory copy
 * for the first block: we must copy sendbuf to accumbuf on intermediate
 * nodes to keep the optimized loop happy.
 */
int smpi_coll_tuned_ompi_reduce_generic(const void* sendbuf, void* recvbuf, int original_count,
                                        MPI_Datatype datatype, MPI_Op op,
                                        int root, MPI_Comm comm,
                                        ompi_coll_tree_t* tree, int count_by_segment,
                                        int max_outstanding_reqs)
{
    unsigned char *inbuf[2] = {nullptr, nullptr}, *inbuf_free[2] = {nullptr, nullptr};
    unsigned char *accumbuf = nullptr, *accumbuf_free = nullptr;
    const unsigned char *local_op_buffer = nullptr, *sendtmpbuf = nullptr;
    ptrdiff_t extent, lower_bound, segment_increment;
    MPI_Request reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
    int num_segments, line, ret, segindex, i, rank;
    int recvcount, prevcount, inbi;

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    datatype->extent(&lower_bound, &extent);
    num_segments      = (original_count + count_by_segment - 1) / count_by_segment;
    segment_increment = count_by_segment * extent;
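    /* Illustrative example: original_count = 1000, count_by_segment = 256 and
     * extent = 8 bytes give num_segments = ceil(1000/256) = 4 and
     * segment_increment = 2048 bytes; the last segment carries only
     * 1000 - 3*256 = 232 elements. */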

    sendtmpbuf = static_cast<const unsigned char*>(sendbuf);
    if (sendbuf == MPI_IN_PLACE) {
        sendtmpbuf = static_cast<const unsigned char*>(recvbuf);
    }

    XBT_DEBUG("coll:tuned:reduce_generic count %d, msg size %lu, segsize %lu, max_requests %d", original_count,
              (unsigned long)(num_segments * segment_increment), (unsigned long)segment_increment,
              max_outstanding_reqs);

    rank = comm->rank();

    /* non-leaf nodes - wait for children to send me data & forward up
       (if needed) */
    if (tree->tree_nextsize > 0) {
        ptrdiff_t true_extent, real_segment_size;
        true_extent = datatype->get_extent();

        /* handle a non-existent receive buffer (i.e. it is NULL) and
           protect the receive buffer on non-root nodes */
        accumbuf = static_cast<unsigned char*>(recvbuf);
        if (nullptr == accumbuf || root != rank) {
            /* Allocate temporary accumulator buffer. */
            accumbuf_free = smpi_get_tmp_sendbuffer(true_extent + (original_count - 1) * extent);
            if (accumbuf_free == nullptr) {
                line = __LINE__;
                ret  = -1;
                goto error_hndl;
            }
            accumbuf = accumbuf_free - lower_bound;
        }
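        /* Re-basing note: subtracting lower_bound shifts the pointer so that
         * accumbuf + displacement addresses the allocation correctly even for
         * datatypes with a non-zero lower bound (standard MPI datatype
         * arithmetic). */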

        /* If this is a non-commutative operation we must copy
           sendbuf to the accumbuf, in order to simplify the loops */
        if (op != MPI_OP_NULL && not op->is_commutative()) {
            Datatype::copy(sendtmpbuf, original_count, datatype, accumbuf, original_count, datatype);
        }
        /* Allocate two buffers for incoming segments */
        real_segment_size = true_extent + (count_by_segment - 1) * extent;
        inbuf_free[0]     = smpi_get_tmp_recvbuffer(real_segment_size);
        if (inbuf_free[0] == nullptr) {
            line = __LINE__;
            ret  = -1;
            goto error_hndl;
        }
        inbuf[0] = inbuf_free[0] - lower_bound;
        /* if there is a chance to overlap communication -
           allocate a second buffer */
        if ((num_segments > 1) || (tree->tree_nextsize > 1)) {
            inbuf_free[1] = smpi_get_tmp_recvbuffer(real_segment_size);
            if (inbuf_free[1] == nullptr) {
                line = __LINE__;
                ret  = -1;
                goto error_hndl;
            }
            inbuf[1] = inbuf_free[1] - lower_bound;
        }

        /* reset input buffer index and receive count */
        inbi      = 0;
        recvcount = 0;
        /* for each segment */
        for (segindex = 0; segindex <= num_segments; segindex++) {
            prevcount = recvcount;
            /* recvcount - number of elements in current segment */
            recvcount = count_by_segment;
            if (segindex == (num_segments - 1))
                recvcount = original_count - count_by_segment * segindex;

            /* for each child */
            for (i = 0; i < tree->tree_nextsize; i++) {
                /**
                 * We try to overlap communication:
                 * either with the next segment or with the next child
                 */
                /* post irecv for current segindex on current child */
                if (segindex < num_segments) {
                    void* local_recvbuf = inbuf[inbi];
                    if (0 == i) {
                        /* for the first step (1st child per segment) and
                         * commutative operations we might be able to irecv
                         * directly into the accumulate buffer so that we can
                         * reduce(op) this with our sendbuf in one step as
                         * ompi_op_reduce only has two buffer pointers;
                         * this avoids an extra memory copy.
                         *
                         * BUT if the operation is non-commutative or
                         * we are root and are using MPI_IN_PLACE this is wrong!
                         */
                        if ((op == MPI_OP_NULL || op->is_commutative()) &&
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root))) {
                            local_recvbuf = accumbuf + segindex * segment_increment;
                        }
                    }

                    reqs[inbi] = Request::irecv(local_recvbuf, recvcount, datatype,
                                                tree->tree_next[i],
                                                COLL_TAG_REDUCE, comm);
                }
                /* wait for the previous request to complete, if any.
                   If there are no requests, reqs[inbi ^ 1] will be
                   MPI_REQUEST_NULL. */
                /* wait on data from the last child for the previous segment */
                Request::waitall(1, &reqs[inbi ^ 1], MPI_STATUSES_IGNORE);
                local_op_buffer = inbuf[inbi ^ 1];
                if (i > 0) {
                    /* our first operation is to combine our own [sendbuf] data
                     * with the data we received from downstream (but only if
                     * the operation is commutative and if we are not root and
                     * not using MPI_IN_PLACE)
                     */
                    if (1 == i) {
                        if ((op == MPI_OP_NULL || op->is_commutative()) &&
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root))) {
                            local_op_buffer = sendtmpbuf + segindex * segment_increment;
                        }
                    }
                    /* apply operation */
                    if (op != MPI_OP_NULL)
                        op->apply(local_op_buffer, accumbuf + segindex * segment_increment,
                                  &recvcount, datatype);
                } else if (segindex > 0) {
                    void* accumulator = accumbuf + (segindex - 1) * segment_increment;
                    if (tree->tree_nextsize <= 1) {
                        if ((op == MPI_OP_NULL || op->is_commutative()) &&
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root))) {
                            local_op_buffer = sendtmpbuf + (segindex - 1) * segment_increment;
                        }
                    }
                    if (op != MPI_OP_NULL)
                        op->apply(local_op_buffer, accumulator, &prevcount, datatype);

                    /* all reductions on the data available at this step (i)
                     * are complete; pass the result to the next process
                     * unless you are the root.
                     */
                    if (rank != tree->tree_root) {
                        /* send combined/accumulated data to parent */
                        Request::send(accumulator, prevcount, datatype, tree->tree_prev,
                                      COLL_TAG_REDUCE, comm);
                    }

                    /* we stop when segindex == num_segments
                       (i.e. we do num_segments + 1 steps for pipelining) */
                    if (segindex == num_segments)
                        break;
                }

                /* update input buffer index */
                inbi = inbi ^ 1;
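                /* inbi alternates between 0 and 1, so the buffer the pending
                 * irecv is filling is never the one being reduced in the
                 * current iteration (classic double buffering). */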
            } /* end of for each child */
        } /* end of for each segment */

        /* clean up */
        smpi_free_tmp_buffer(inbuf_free[0]);
        smpi_free_tmp_buffer(inbuf_free[1]);
        smpi_free_tmp_buffer(accumbuf_free);
    }

    /* leaf nodes
       Depending on the value of max_outstanding_reqs and
       the number of segments we have two options:
       - send all segments using blocking sends to the parent, or
       - avoid flooding the parent nodes by limiting the number of
         outstanding requests to max_outstanding_reqs.
       TODO/POSSIBLE IMPROVEMENT: If there is a way to determine the eager size
       for the current communication, synchronization should be used only
       when the message/segment size is smaller than the eager size.
    */
    else {

        /* If the number of segments is less than the maximum number of
           outstanding requests, or there is no limit on the maximum number of
           outstanding requests, we send data to the parent using blocking sends */
        if ((0 == max_outstanding_reqs) ||
            (num_segments <= max_outstanding_reqs)) {

            segindex = 0;
            while (original_count > 0) {
                if (original_count < count_by_segment) {
                    count_by_segment = original_count;
                }
                Request::send((char*)sendbuf + segindex * segment_increment,
                              count_by_segment, datatype,
                              tree->tree_prev,
                              COLL_TAG_REDUCE,
                              comm);
                segindex++;
                original_count -= count_by_segment;
            }
        }

        /* Otherwise, introduce flow control:
           - post max_outstanding_reqs non-blocking sends,
           - for the remaining segments:
             - wait for a send to complete, and post the next one,
           - wait for all outstanding sends to complete.
        */
        else {

            int creq = 0;
            auto* sreq = new (std::nothrow) MPI_Request[max_outstanding_reqs];
            if (nullptr == sreq) {
                line = __LINE__;
                ret  = -1;
                goto error_hndl;
            }

            /* post the first group of requests */
            for (segindex = 0; segindex < max_outstanding_reqs; segindex++) {
                sreq[segindex] = Request::isend((char*)sendbuf + segindex * segment_increment,
                                                count_by_segment, datatype,
                                                tree->tree_prev,
                                                COLL_TAG_REDUCE,
                                                comm);
                original_count -= count_by_segment;
            }

            creq = 0;
            while (original_count > 0) {
                /* wait on a posted request to complete */
                Request::wait(&sreq[creq], MPI_STATUS_IGNORE);
                sreq[creq] = MPI_REQUEST_NULL;

                if (original_count < count_by_segment) {
                    count_by_segment = original_count;
                }
                sreq[creq] = Request::isend((char*)sendbuf + segindex * segment_increment,
                                            count_by_segment, datatype,
                                            tree->tree_prev,
                                            COLL_TAG_REDUCE,
                                            comm);
                creq = (creq + 1) % max_outstanding_reqs;
                segindex++;
                original_count -= count_by_segment;
            }
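            /* Steady state: at most max_outstanding_reqs isends are in
             * flight; each completed slot of the circular sreq array is
             * immediately reused for the next segment. */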

            /* Wait for the remaining requests to complete */
            Request::waitall(max_outstanding_reqs, sreq, MPI_STATUSES_IGNORE);

            /* free requests */
            delete[] sreq;
        }
    }
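    /* The tree was built by the caller (one of the reduce__ompi_* wrappers
       below); it is torn down here once the reduction is finished. */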
    ompi_coll_tuned_topo_destroy_tree(&tree);
    return MPI_SUCCESS;

 error_hndl:  /* error handler */
    XBT_DEBUG("ERROR_HNDL: node %d file %s line %d error %d\n",
              rank, __FILE__, line, ret);
    smpi_free_tmp_buffer(inbuf_free[0]);
    smpi_free_tmp_buffer(inbuf_free[1]);
    /* free the temporary accumulator allocation, not accumbuf itself:
       accumbuf may alias the caller's recvbuf or be offset by the datatype
       lower bound */
    smpi_free_tmp_buffer(accumbuf_free);
    return ret;
}

/* Attention: this version of the reduce operation does not work for:
   - non-commutative operations
   - segment sizes which are not multiples of the extent of the datatype,
     meaning that at least one datatype must fit in the segment!
*/

int reduce__ompi_chain(const void *sendbuf, void *recvbuf, int count,
                       MPI_Datatype datatype,
                       MPI_Op op, int root,
                       MPI_Comm comm)
{
    uint32_t segsize = 64 * 1024;
    int segcount     = count;
    size_t typelng;
    int fanout = comm->size() / 2;

    XBT_DEBUG("coll:tuned:reduce_intra_chain rank %d fo %d ss %5u", comm->rank(), fanout, segsize);

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    typelng = datatype->size();

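    /* COLL_TUNED_COMPUTED_SEGCOUNT (defined with the other collective
     * helpers) converts the byte-oriented segsize into an element count:
     * roughly segcount = segsize / typelng, left at the full count when
     * segsize is 0 or at least as large as the whole message. */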
    COLL_TUNED_COMPUTED_SEGCOUNT(segsize, typelng, segcount);

    return smpi_coll_tuned_ompi_reduce_generic(sendbuf, recvbuf, count, datatype,
                                               op, root, comm,
                                               ompi_coll_tuned_topo_build_chain(fanout, comm, root),
                                               segcount, 0);
}


int reduce__ompi_pipeline(const void *sendbuf, void *recvbuf,
                          int count, MPI_Datatype datatype,
                          MPI_Op op, int root,
                          MPI_Comm comm)
{
    uint32_t segsize;
    int segcount = count;
    size_t typelng;
//    COLL_TUNED_UPDATE_PIPELINE( comm, tuned_module, root );

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    const double a2 = 0.0410 / 1024.0; /* [1/B] */
    const double b2 = 9.7128;
    const double a4 = 0.0033 / 1024.0; /* [1/B] */
    const double b4 = 1.6761;
    typelng               = datatype->size();
    int communicator_size = comm->size();
    size_t message_size   = typelng * count;

    if (communicator_size > (a2 * message_size + b2)) {
        // Pipeline_1K
        segsize = 1024;
    } else if (communicator_size > (a4 * message_size + b4)) {
        // Pipeline_32K
        segsize = 32 * 1024;
    } else {
        // Pipeline_64K
        segsize = 64 * 1024;
    }
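    /* Worked example: 64 ranks reducing an 8 KiB message give
     * a2 * 8192 + b2 ≈ 0.33 + 9.71 ≈ 10.0 < 64, so the first branch fires
     * and the 1 KiB segment size is selected. */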

    XBT_DEBUG("coll:tuned:reduce_intra_pipeline rank %d ss %5u", comm->rank(), segsize);

    COLL_TUNED_COMPUTED_SEGCOUNT(segsize, typelng, segcount);

    return smpi_coll_tuned_ompi_reduce_generic(sendbuf, recvbuf, count, datatype,
                                               op, root, comm,
                                               ompi_coll_tuned_topo_build_chain(1, comm, root),
                                               segcount, 0);
}

int reduce__ompi_binary(const void *sendbuf, void *recvbuf,
                        int count, MPI_Datatype datatype,
                        MPI_Op op, int root,
                        MPI_Comm comm)
{
    uint32_t segsize;
    int segcount = count;
    size_t typelng;

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    typelng = datatype->size();

    // Binary_32K
    segsize = 32 * 1024;

    XBT_DEBUG("coll:tuned:reduce_intra_binary rank %d ss %5u", comm->rank(), segsize);

    COLL_TUNED_COMPUTED_SEGCOUNT(segsize, typelng, segcount);

    return smpi_coll_tuned_ompi_reduce_generic(sendbuf, recvbuf, count, datatype,
                                               op, root, comm,
                                               ompi_coll_tuned_topo_build_tree(2, comm, root),
                                               segcount, 0);
}

int reduce__ompi_binomial(const void *sendbuf, void *recvbuf,
                          int count, MPI_Datatype datatype,
                          MPI_Op op, int root,
                          MPI_Comm comm)
{
    uint32_t segsize = 0;
    int segcount     = count;
    size_t typelng;

    const double a1 = 0.6016 / 1024.0; /* [1/B] */
    const double b1 = 1.3496;

//    COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root );

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    typelng               = datatype->size();
    int communicator_size = comm->size();
    size_t message_size   = typelng * count;
    if (((communicator_size < 8) && (message_size < 20480)) ||
        (message_size < 2048) || (count <= 1)) {
        /* Binomial_0K */
        segsize = 0;
    } else if (communicator_size > (a1 * message_size + b1)) {
        // Binomial_1K
        segsize = 1024;
    }
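    /* Examples: 4 ranks with a 1 KiB message fall into the first branch
     * (no segmentation, segsize = 0); 128 ranks with a 100 KiB message give
     * a1 * 102400 + b1 ≈ 60.2 + 1.3 < 128, so 1 KiB segments are used.
     * Any other case keeps the initial segsize of 0. */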

    XBT_DEBUG("coll:tuned:reduce_intra_binomial rank %d ss %5u", comm->rank(), segsize);
    COLL_TUNED_COMPUTED_SEGCOUNT(segsize, typelng, segcount);

    return smpi_coll_tuned_ompi_reduce_generic(sendbuf, recvbuf, count, datatype,
                                               op, root, comm,
                                               ompi_coll_tuned_topo_build_in_order_bmtree(comm, root),
                                               segcount, 0);
}

/*
 * reduce_intra_in_order_binary
 *
 * Function:      Logarithmic reduce operation for non-commutative operations.
 * Accepts:       same as MPI_Reduce()
 * Returns:       MPI_SUCCESS or error code
 */
int reduce__ompi_in_order_binary(const void *sendbuf, void *recvbuf,
                                 int count,
                                 MPI_Datatype datatype,
                                 MPI_Op op, int root,
                                 MPI_Comm comm)
{
    uint32_t segsize = 0;
    int ret;
    int rank, size, io_root;
    int segcount = count;
    size_t typelng;

    rank = comm->rank();
    size = comm->size();
    XBT_DEBUG("coll:tuned:reduce_intra_in_order_binary rank %d ss %5u", rank, segsize);

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    typelng = datatype->size();
    COLL_TUNED_COMPUTED_SEGCOUNT(segsize, typelng, segcount);

    /* An in-order binary tree must use root (size-1) to preserve the order of
       operations.  Thus, if root is not rank (size - 1), then we must handle
       1. the MPI_IN_PLACE option on the real root, and
       2. the allocation of a temporary recvbuf on rank (size - 1).
       Note that the generic function must be careful not to switch the order
       of operations for non-commutative ops.
    */
    io_root = size - 1;
    const void* use_this_sendbuf = sendbuf;
    void* use_this_recvbuf       = recvbuf;
    unsigned char* tmp_sendbuf   = nullptr;
    unsigned char* tmp_recvbuf   = nullptr;
    if (io_root != root) {
        /* extent and true extent are both taken from get_extent() here; this
           code path does not track a separate true extent */
        ptrdiff_t text, ext;

        ext  = datatype->get_extent();
        text = datatype->get_extent();

        if ((root == rank) && (MPI_IN_PLACE == sendbuf)) {
            tmp_sendbuf = smpi_get_tmp_sendbuffer(text + (count - 1) * ext);
            if (nullptr == tmp_sendbuf) {
                return MPI_ERR_INTERN;
            }
            Datatype::copy(recvbuf, count, datatype, tmp_sendbuf, count, datatype);
            use_this_sendbuf = tmp_sendbuf;
        } else if (io_root == rank) {
            tmp_recvbuf = smpi_get_tmp_recvbuffer(text + (count - 1) * ext);
            if (nullptr == tmp_recvbuf) {
                return MPI_ERR_INTERN;
            }
            use_this_recvbuf = tmp_recvbuf;
        }
    }
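    /* Example: size = 4 and root = 0 give io_root = 3; rank 3 reduces into
     * tmp_recvbuf and ships the result back to rank 0 below, while rank 0
     * only has to materialize its send buffer for the MPI_IN_PLACE case. */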

    /* Use generic reduce with in-order binary tree topology and io_root */
    ret = smpi_coll_tuned_ompi_reduce_generic(use_this_sendbuf, use_this_recvbuf, count, datatype,
                                              op, io_root, comm,
                                              ompi_coll_tuned_topo_build_in_order_bintree(comm),
                                              segcount, 0);
    if (MPI_SUCCESS != ret) {
        return ret;
    }

    /* Clean up */
    if (io_root != root) {
        if (root == rank) {
            /* Receive result from rank io_root to recvbuf */
            Request::recv(recvbuf, count, datatype, io_root,
                          COLL_TAG_REDUCE, comm,
                          MPI_STATUS_IGNORE);
            if (MPI_IN_PLACE == sendbuf) {
                smpi_free_tmp_buffer(tmp_sendbuf);
            }

        } else if (io_root == rank) {
            /* Send result from use_this_recvbuf to root */
            Request::send(use_this_recvbuf, count, datatype, root,
                          COLL_TAG_REDUCE,
                          comm);
            smpi_free_tmp_buffer(tmp_recvbuf);
        }
    }

    return MPI_SUCCESS;
}

/*
 * Linear functions are copied from the BASIC coll module.
 * They do not segment the message and are simple implementations,
 * but for some small numbers of nodes and/or small data sizes they
 * are just as fast as tuned/tree-based segmenting operations,
 * and as such may be selected by the decision functions.
 * These are copied into this module due to the way we select modules
 * in V1, i.e. in V2 we will handle this differently and so will not
 * have to duplicate code.
 * GEF Oct05 after asking Jeff.
 */

/* copied function (with appropriate renaming) starts here */

/*
 *  reduce_lin_intra
 *
 *  Function:   - reduction using O(N) algorithm
 *  Accepts:    - same as MPI_Reduce()
 *  Returns:    - MPI_SUCCESS or error code
 */

int reduce__ompi_basic_linear(const void *sbuf, void *rbuf, int count,
                              MPI_Datatype dtype,
                              MPI_Op op,
                              int root,
                              MPI_Comm comm)
{
    int i, rank, size;
    ptrdiff_t true_extent, lb, extent;
    unsigned char* free_buffer  = nullptr;
    unsigned char* pml_buffer   = nullptr;
    unsigned char* inplace_temp = nullptr;
    const unsigned char* inbuf;

    /* Initialize */

    rank = comm->rank();
    size = comm->size();

    XBT_DEBUG("coll:tuned:reduce_intra_basic_linear rank %d", rank);

    /* If not root, send data to the root. */

    if (rank != root) {
        Request::send(sbuf, count, dtype, root,
                      COLL_TAG_REDUCE,
                      comm);
        return MPI_SUCCESS;
    }

    /* see the discussion in ompi_coll_basic_reduce_lin_intra about
       extent and true extent */
    /* for reducing buffer allocation lengths.... */

    dtype->extent(&lb, &extent);
    true_extent = dtype->get_extent();

    if (MPI_IN_PLACE == sbuf) {
        sbuf         = rbuf;
        inplace_temp = smpi_get_tmp_recvbuffer(true_extent + (count - 1) * extent);
        if (nullptr == inplace_temp) {
            return -1;
        }
        rbuf = inplace_temp - lb;
    }

    if (size > 1) {
        free_buffer = smpi_get_tmp_recvbuffer(true_extent + (count - 1) * extent);
        pml_buffer  = free_buffer - lb;
    }

    /* Initialize the receive buffer. */

    if (rank == (size - 1)) {
        Datatype::copy((char*)sbuf, count, dtype, (char*)rbuf, count, dtype);
    } else {
        Request::recv(rbuf, count, dtype, size - 1,
                      COLL_TAG_REDUCE, comm,
                      MPI_STATUS_IGNORE);
    }
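    /* Receiving from rank size-1 down to rank 0 applies the operation in a
     * fixed rank order, which is what makes this linear algorithm safe for
     * non-commutative operations as well. */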

    /* Loop receiving and calling reduction function (C or Fortran). */

    for (i = size - 2; i >= 0; --i) {
        if (rank == i) {
            inbuf = static_cast<const unsigned char*>(sbuf);
        } else {
            Request::recv(pml_buffer, count, dtype, i,
                          COLL_TAG_REDUCE, comm,
                          MPI_STATUS_IGNORE);
            inbuf = pml_buffer;
        }

        /* Perform the reduction */
        if (op != MPI_OP_NULL)
            op->apply(inbuf, rbuf, &count, dtype);
    }

    if (nullptr != inplace_temp) {
        Datatype::copy(inplace_temp, count, dtype, (char*)sbuf, count, dtype);
        smpi_free_tmp_buffer(inplace_temp);
    }
    if (nullptr != free_buffer) {
        smpi_free_tmp_buffer(free_buffer);
    }

    /* All done */
    return MPI_SUCCESS;
}

/* copied function (with appropriate renaming) ends here */

} // namespace simgrid::smpi