1 /* Copyright (c) 2013-2017. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
7 #include "../colls_private.h"
8 #include "../coll_tuned_topo.h"
10 #define MAXTREEFANOUT 32
/* Pipelined broadcast ported from Open MPI's "tuned" component:
 * the communicator is arranged as a chain (fanout 1, see
 * ompi_coll_tuned_topo_build_chain below) and the message is cut into
 * segments so that receive-from-parent and send-to-child overlap.
 * NOTE(review): this excerpt is non-contiguous — several declarations and
 * braces fall outside the visible lines; comments below only describe what
 * the visible code establishes. */
14 int Coll_bcast_ompi_pipeline::bcast( void* buffer,
16 MPI_Datatype datatype,
/* Start by assuming a single segment holding the whole payload; the
 * decision logic below shrinks it via COLL_TUNED_COMPUTED_SEGCOUNT. */
20 int count_by_segment = original_count;
/* Default segment size: 1024 << 7 = 128 KiB. The decision branches below
 * presumably lower it to 64 KiB / 16 KiB — the assignments are not visible
 * in this excerpt; TODO confirm against the full file. */
22 size_t segsize =1024 << 7;
23 //mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
24 //mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
26 // return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm, module,
27 // count_by_segment, data->cached_pipeline );
/* Fanout-1 chain rooted at `root`: each rank has one parent (tree_prev)
 * and at most one child (tree_next).
 * NOTE(review): no matching topo-destroy call is visible in this excerpt —
 * verify the tree is freed in the elided code, else it leaks. */
28 ompi_coll_tree_t * tree = ompi_coll_tuned_topo_build_chain( 1, comm, root );
32 int num_segments; /* Number of segments */
33 int sendcount; /* number of elements sent in this segment */
/* Two receive requests, used as a double buffer (req_index toggles with
 * ^ 0x1) so the next segment's irecv is posted while the previous one
 * is still being forwarded. */
37 MPI_Request recv_reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
38 MPI_Request *send_reqs = NULL;
42 * Determine number of elements sent per operation.
44 type_size = datatype->size();
/* Single-rank communicator: nothing to broadcast. */
48 if(size==1)return MPI_SUCCESS;
/* Linear-model coefficients (size < a * message_size + b) from the
 * original Open MPI tuned decision function, selecting among 16/64/128 KiB
 * segment sizes. */
51 const double a_p16 = 3.2118e-6; /* [1 / byte] */
52 const double b_p16 = 8.7936;
53 const double a_p64 = 2.3679e-6; /* [1 / byte] */
54 const double b_p64 = 1.1787;
55 const double a_p128 = 1.6134e-6; /* [1 / byte] */
56 const double b_p128 = 2.1102;
59 /* else we need data size for decision function */
60 message_size = type_size * (unsigned long)original_count; /* needed for decision */
62 if (size < (a_p128 * message_size + b_p128)) {
63 //Pipeline with 128KB segments
65 }else if (size < (a_p64 * message_size + b_p64)) {
66 // Pipeline with 64KB segments
68 }else if (size < (a_p16 * message_size + b_p16)) {
69 //Pipeline with 16KB segments
/* Derive count_by_segment (elements per segment) from the chosen byte
 * segsize and the datatype size. */
73 COLL_TUNED_COMPUTED_SEGCOUNT( segsize, type_size, count_by_segment );
75 XBT_DEBUG("coll:tuned:bcast_intra_pipeline rank %d ss %5zu type_size %lu count_by_segment %d",
76 comm->rank(), segsize, (unsigned long)type_size, count_by_segment);
80 extent = datatype->get_extent();
/* Ceiling division: the last segment may be short. */
81 num_segments = (original_count + count_by_segment - 1) / count_by_segment;
/* Byte stride between consecutive segments in the user buffer. */
82 realsegsize = count_by_segment * extent;
84 /* Set the buffer pointers */
85 tmpbuf = (char *) buffer;
/* Only ranks that actually have children need send requests. */
87 if( tree->tree_nextsize != 0 ) {
/* NOTE(review): allocated with xbt_new but released with plain free()
 * at the end of the function — works only if xbt_new is malloc-based;
 * prefer xbt_free for symmetry. TODO confirm. */
88 send_reqs = xbt_new(MPI_Request, tree->tree_nextsize );
95 - send segment to all children.
96 The last segment may have less elements than other segments.
/* Root code: push each segment to all children, waiting for the
 * previous round of isends to complete before advancing. */
98 sendcount = count_by_segment;
99 for( segindex = 0; segindex < num_segments; segindex++ ) {
100 if( segindex == (num_segments - 1) ) {
/* Trailing segment holds whatever elements remain. */
101 sendcount = original_count - segindex * count_by_segment;
103 for( i = 0; i < tree->tree_nextsize; i++ ) {
104 send_reqs[i] = Request::isend(tmpbuf, sendcount, datatype,
106 COLL_TAG_BCAST, comm);
109 /* complete the sends before starting the next sends */
110 Request::waitall( tree->tree_nextsize, send_reqs,
111 MPI_STATUSES_IGNORE );
113 /* update tmp buffer */
114 tmpbuf += realsegsize;
119 /* Intermediate nodes code */
120 else if( tree->tree_nextsize > 0 ) {
123 1) Post the first receive
124 2) For segments 1 .. num_segments
126 - wait on the previous receive to complete
127 - send this data to children
128 3) Wait on the last segment
129 4) Compute number of elements in last segment.
130 5) Send the last segment to children
/* Prime the pipeline: post the receive for segment 0 from the parent. */
133 recv_reqs[req_index]=Request::irecv(tmpbuf, count_by_segment, datatype,
134 tree->tree_prev, COLL_TAG_BCAST,
137 for( segindex = 1; segindex < num_segments; segindex++ ) {
/* Flip the double-buffer slot for the next incoming segment. */
139 req_index = req_index ^ 0x1;
/* Post the receive for segment `segindex` one stride ahead of the
 * segment currently being forwarded (tmpbuf advances below). */
142 recv_reqs[req_index]= Request::irecv( tmpbuf + realsegsize, count_by_segment,
143 datatype, tree->tree_prev,
147 /* wait for and forward the previous segment to children */
/* NOTE(review): Request::wait takes a single request — the status
 * argument should conventionally be MPI_STATUS_IGNORE (singular), as
 * used at the end of this function. Confirm the two constants are
 * interchangeable in this codebase before changing. */
148 Request::wait( &recv_reqs[req_index ^ 0x1],
149 MPI_STATUSES_IGNORE );
151 for( i = 0; i < tree->tree_nextsize; i++ ) {
152 send_reqs[i]=Request::isend(tmpbuf, count_by_segment, datatype,
154 COLL_TAG_BCAST, comm );
157 /* complete the sends before starting the next iteration */
158 Request::waitall( tree->tree_nextsize, send_reqs,
159 MPI_STATUSES_IGNORE );
161 /* Update the receive buffer */
162 tmpbuf += realsegsize;
165 /* Process the last segment */
/* NOTE(review): MPI_STATUSES_IGNORE with a single-request wait — see
 * note above; the leaf code below uses MPI_STATUS_IGNORE. */
166 Request::wait( &recv_reqs[req_index], MPI_STATUSES_IGNORE );
/* Forward the (possibly short) final segment to all children. */
167 sendcount = original_count - (num_segments - 1) * count_by_segment;
168 for( i = 0; i < tree->tree_nextsize; i++ ) {
169 send_reqs[i] = Request::isend(tmpbuf, sendcount, datatype,
171 COLL_TAG_BCAST, comm);
174 Request::waitall( tree->tree_nextsize, send_reqs,
175 MPI_STATUSES_IGNORE );
181 Receive all segments from parent in a loop:
182 1) post irecv for the first segment
183 2) for segments 1 .. num_segments
184 - post irecv for the next segment
185 - wait on the previous segment to arrive
186 3) wait for the last segment
/* Leaf code: pure receiver, same double-buffered pipeline but with no
 * forwarding to children. */
189 recv_reqs[req_index] = Request::irecv(tmpbuf, count_by_segment, datatype,
190 tree->tree_prev, COLL_TAG_BCAST,
193 for( segindex = 1; segindex < num_segments; segindex++ ) {
194 req_index = req_index ^ 0x1;
195 tmpbuf += realsegsize;
196 /* post receive for the next segment */
197 recv_reqs[req_index] = Request::irecv(tmpbuf, count_by_segment, datatype,
198 tree->tree_prev, COLL_TAG_BCAST,
200 /* wait on the previous segment */
201 Request::wait( &recv_reqs[req_index ^ 0x1],
/* Drain the last outstanding receive. */
205 Request::wait( &recv_reqs[req_index], MPI_STATUS_IGNORE );
/* NOTE(review): see allocation above — xbt_new'd memory freed with
 * free(); prefer xbt_free. */
208 if( NULL != send_reqs ) free(send_reqs);
211 return (MPI_SUCCESS);