2 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
3 * University Research and Technology
4 * Corporation. All rights reserved.
5 * Copyright (c) 2004-2005 The University of Tennessee and The University
6 * of Tennessee Research Foundation. All rights
8 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9 * University of Stuttgart. All rights reserved.
10 * Copyright (c) 2004-2005 The Regents of the University of California.
11 * All rights reserved.
14 * Additional copyrights may follow
19 #include "colls_private.h"
20 #include "coll_tuned_topo.h"
22 * Some static helpers.
24 static int pown( int fanout, int num )
27 if( num < 0 ) return 0;
28 if (1==num) return fanout;
33 for( j = 0; j < num; j++ ) { p*= fanout; }
38 static int calculate_level( int fanout, int rank )
41 if( rank < 0 ) return -1;
42 for( level = 0, num = 0; num <= rank; level++ ) {
43 num += pown(fanout, level);
48 static int calculate_num_nodes_up_to_level( int fanout, int level )
50 /* just use geometric progression formula for sum:
51 a^0+a^1+...a^(n-1) = (a^n-1)/(a-1) */
52 return ((pown(fanout,level) - 1)/(fanout - 1));
56 * And now the building functions.
58 * An example for fanout = 2, comm_size = 7
60 * 0 <-- delta = 1 (fanout^0)
62 * 1 2 <-- delta = 2 (fanout^1)
64 * 3 5 4 6 <-- delta = 4 (fanout^2)
68 ompi_coll_tuned_topo_build_tree( int fanout,
74 int level; /* location of my rank in the tree structure of size */
75 int delta; /* number of nodes on my level */
76 int slimit; /* total number of nodes on levels above me */
79 ompi_coll_tree_t* tree;
81 XBT_DEBUG("coll:tuned:topo_build_tree Building fo %d rt %d", fanout, root);
84 XBT_DEBUG("coll:tuned:topo_build_tree invalid fanout %d", fanout);
87 if (fanout>MAXTREEFANOUT) {
88 XBT_DEBUG("coll:tuned:topo_build_tree invalid fanout %d bigger than max %d", fanout, MAXTREEFANOUT);
93 * Get size and rank of the process in this communicator
95 size = smpi_comm_size(comm);
96 rank = smpi_comm_rank(comm);
98 tree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
100 XBT_DEBUG("coll:tuned:topo_build_tree PANIC::out of memory");
104 tree->tree_root = MPI_UNDEFINED;
105 tree->tree_nextsize = MPI_UNDEFINED;
110 tree->tree_root = root;
115 tree->tree_fanout = fanout;
116 tree->tree_bmtree = 0;
117 tree->tree_root = root;
118 tree->tree_prev = -1;
119 tree->tree_nextsize = 0;
120 for( i = 0; i < fanout; i++ ) {
121 tree->tree_next[i] = -1;
124 /* return if we have less than 2 processes */
130 * Shift all ranks by root, so that the algorithm can be
131 * designed as if root would be always 0
132 * shiftedrank should be used in calculating distances
133 * and position in tree
135 shiftedrank = rank - root;
136 if( shiftedrank < 0 ) {
140 /* calculate my level */
141 level = calculate_level( fanout, shiftedrank );
142 delta = pown( fanout, level );
144 /* find my children */
145 for( i = 0; i < fanout; i++ ) {
146 schild = shiftedrank + delta * (i+1);
147 if( schild < size ) {
148 tree->tree_next[i] = (schild+root)%size;
149 tree->tree_nextsize = tree->tree_nextsize + 1;
156 slimit = calculate_num_nodes_up_to_level( fanout, level );
157 sparent = shiftedrank;
158 if( sparent < fanout ) {
161 while( sparent >= slimit ) {
162 sparent -= delta/fanout;
165 tree->tree_prev = (sparent+root)%size;
171 * Constructs in-order binary tree which can be used for non-commutative reduce
173 * Root of this tree is always rank (size-1) and fanout is 2.
174 * Here are some of the examples of this tree:
175 * size == 2 size == 3 size == 4 size == 9
185 ompi_coll_tuned_topo_build_in_order_bintree( MPI_Comm comm )
188 int myrank, rightsize, delta;
189 int parent, lchild, rchild;
190 ompi_coll_tree_t* tree;
193 * Get size and rank of the process in this communicator
195 size = smpi_comm_size(comm);
196 rank = smpi_comm_rank(comm);
198 tree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
201 "coll:tuned:topo_build_tree PANIC::out of memory");
205 tree->tree_root = MPI_UNDEFINED;
206 tree->tree_nextsize = MPI_UNDEFINED;
211 tree->tree_fanout = 2;
212 tree->tree_bmtree = 0;
213 tree->tree_root = size - 1;
214 tree->tree_prev = -1;
215 tree->tree_nextsize = 0;
216 tree->tree_next[0] = -1;
217 tree->tree_next[1] = -1;
219 "coll:tuned:topo_build_in_order_tree Building fo %d rt %d",
220 tree->tree_fanout, tree->tree_root);
230 /* Compute the size of the right subtree */
231 rightsize = size >> 1;
233 /* Determine the left and right child of this parent */
239 rchild = rightsize - 1;
243 /* The following cases are possible: myrank can be
245 - belong to the left subtree, or
246 - belong to the right subtee
247 Each of the cases need to be handled differently.
250 if (myrank == parent) {
252 - compute real ranks of my children, and exit the loop. */
253 if (lchild >= 0) tree->tree_next[0] = lchild + delta;
254 if (rchild >= 0) tree->tree_next[1] = rchild + delta;
257 if (myrank > rchild) {
258 /* I belong to the left subtree:
259 - If I am the left child, compute real rank of my parent
260 - Iterate down through tree:
261 compute new size, shift ranks down, and update delta.
263 if (myrank == lchild) {
264 tree->tree_prev = parent + delta;
266 size = size - rightsize - 1;
267 delta = delta + rightsize;
268 myrank = myrank - rightsize;
272 /* I belong to the right subtree:
273 - If I am the right child, compute real rank of my parent
274 - Iterate down through tree:
275 compute new size and parent,
276 but the delta and rank do not need to change.
278 if (myrank == rchild) {
279 tree->tree_prev = parent + delta;
286 if (tree->tree_next[0] >= 0) { tree->tree_nextsize = 1; }
287 if (tree->tree_next[1] >= 0) { tree->tree_nextsize += 1; }
292 int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree )
294 ompi_coll_tree_t *ptr;
296 if ((!tree)||(!*tree)) {
303 *tree = NULL; /* mark tree as gone */
310 * Here are some of the examples of this tree:
311 * size == 2 size = 4 size = 8
321 ompi_coll_tuned_topo_build_bmtree( MPI_Comm comm,
330 ompi_coll_tree_t *bmtree;
333 XBT_DEBUG("coll:tuned:topo:build_bmtree rt %d", root);
336 * Get size and rank of the process in this communicator
338 size = smpi_comm_size(comm);
339 rank = smpi_comm_rank(comm);
343 bmtree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
345 XBT_DEBUG("coll:tuned:topo:build_bmtree PANIC out of memory");
349 bmtree->tree_bmtree = 1;
351 bmtree->tree_root = MPI_UNDEFINED;
352 bmtree->tree_nextsize = MPI_UNDEFINED;
353 for(i=0;i<MAXTREEFANOUT;i++) {
354 bmtree->tree_next[i] = -1;
357 if( index < 0 ) index += size;
359 while( mask <= index ) mask <<= 1;
361 /* Now I can compute my father rank */
363 bmtree->tree_prev = root;
365 remote = (index ^ (mask >> 1)) + root;
366 if( remote >= size ) remote -= size;
367 bmtree->tree_prev = remote;
369 /* And now let's fill my childs */
370 while( mask < size ) {
371 remote = (index ^ mask);
372 if( remote >= size ) break;
374 if( remote >= size ) remote -= size;
375 if (childs==MAXTREEFANOUT) {
376 XBT_DEBUG("coll:tuned:topo:build_bmtree max fanout incorrect %d needed %d", MAXTREEFANOUT, childs);
379 bmtree->tree_next[childs] = remote;
383 bmtree->tree_nextsize = childs;
384 bmtree->tree_root = root;
389 * Constructs in-order binomial tree which can be used for gather/scatter
392 * Here are some of the examples of this tree:
393 * size == 2 size = 4 size = 8
403 ompi_coll_tuned_topo_build_in_order_bmtree( MPI_Comm comm,
411 ompi_coll_tree_t *bmtree;
414 XBT_DEBUG("coll:tuned:topo:build_in_order_bmtree rt %d", root);
417 * Get size and rank of the process in this communicator
419 size = smpi_comm_size(comm);
420 rank = smpi_comm_rank(comm);
422 vrank = (rank - root + size) % size;
424 bmtree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
426 XBT_DEBUG("coll:tuned:topo:build_bmtree PANIC out of memory");
430 bmtree->tree_bmtree = 1;
431 bmtree->tree_root = MPI_UNDEFINED;
432 bmtree->tree_nextsize = MPI_UNDEFINED;
433 for(i=0;i<MAXTREEFANOUT;i++) {
434 bmtree->tree_next[i] = -1;
438 bmtree->tree_prev = root;
441 while (mask < size) {
442 remote = vrank ^ mask;
443 if (remote < vrank) {
444 bmtree->tree_prev = (remote + root) % size;
446 } else if (remote < size) {
447 bmtree->tree_next[childs] = (remote + root) % size;
449 if (childs==MAXTREEFANOUT) {
451 "coll:tuned:topo:build_bmtree max fanout incorrect %d needed %d",
452 MAXTREEFANOUT, childs);
458 bmtree->tree_nextsize = childs;
459 bmtree->tree_root = root;
466 ompi_coll_tuned_topo_build_chain( int fanout,
471 int srank; /* shifted rank */
474 ompi_coll_tree_t *chain;
476 XBT_DEBUG("coll:tuned:topo:build_chain fo %d rt %d", fanout, root);
479 * Get size and rank of the process in this communicator
481 size = smpi_comm_size(comm);
482 rank = smpi_comm_rank(comm);
485 XBT_DEBUG("coll:tuned:topo:build_chain WARNING invalid fanout of ZERO, forcing to 1 (pipeline)!");
488 if (fanout>MAXTREEFANOUT) {
489 XBT_DEBUG("coll:tuned:topo:build_chain WARNING invalid fanout %d bigger than max %d, forcing to max!", fanout, MAXTREEFANOUT);
490 fanout = MAXTREEFANOUT;
494 * Allocate space for topology arrays if needed
496 chain = (ompi_coll_tree_t*)malloc( sizeof(ompi_coll_tree_t) );
498 XBT_DEBUG("coll:tuned:topo:build_chain PANIC out of memory");
502 chain->tree_root = MPI_UNDEFINED;
503 chain->tree_nextsize = -1;
504 for(i=0;i<fanout;i++) chain->tree_next[i] = -1;
507 * Set root & numchain
509 chain->tree_root = root;
510 if( (size - 1) < fanout ) {
511 chain->tree_nextsize = size-1;
514 chain->tree_nextsize = fanout;
521 if (srank < 0) srank += size;
524 * Special case - fanout == 1
527 if( srank == 0 ) chain->tree_prev = -1;
528 else chain->tree_prev = (srank-1+root)%size;
530 if( (srank + 1) >= size) {
531 chain->tree_next[0] = -1;
532 chain->tree_nextsize = 0;
534 chain->tree_next[0] = (srank+1+root)%size;
535 chain->tree_nextsize = 1;
540 /* Let's handle the case where there is just one node in the communicator */
542 chain->tree_next[0] = -1;
543 chain->tree_nextsize = 0;
544 chain->tree_prev = -1;
548 * Calculate maximum chain length
550 maxchainlen = (size-1) / fanout;
551 if( (size-1) % fanout != 0 ) {
553 mark = (size-1)%fanout;
559 * Find your own place in the list of shifted ranks
563 if( srank-1 < (mark * maxchainlen) ) {
564 column = (srank-1)/maxchainlen;
565 head = 1+column*maxchainlen;
568 column = mark + (srank-1-mark*maxchainlen)/(maxchainlen-1);
569 head = mark*maxchainlen+1+(column-mark)*(maxchainlen-1);
573 if( srank == head ) {
574 chain->tree_prev = 0; /*root*/
576 chain->tree_prev = srank-1; /* rank -1 */
578 if( srank == (head + len - 1) ) {
579 chain->tree_next[0] = -1;
580 chain->tree_nextsize = 0;
582 if( (srank + 1) < size ) {
583 chain->tree_next[0] = srank+1;
584 chain->tree_nextsize = 1;
586 chain->tree_next[0] = -1;
587 chain->tree_nextsize = 0;
596 chain->tree_prev = -1;
597 chain->tree_next[0] = (root+1)%size;
598 for( i = 1; i < fanout; i++ ) {
599 chain->tree_next[i] = chain->tree_next[i-1] + maxchainlen;
601 chain->tree_next[i]--;
603 chain->tree_next[i] %= size;
605 chain->tree_nextsize = fanout;
607 chain->tree_prev = (chain->tree_prev+root)%size;
608 if( chain->tree_next[0] != -1 ) {
609 chain->tree_next[0] = (chain->tree_next[0]+root)%size;
616 int ompi_coll_tuned_topo_dump_tree (ompi_coll_tree_t* tree, int rank)
620 XBT_DEBUG("coll:tuned:topo:topo_dump_tree %1d tree root %d"
621 " fanout %d BM %1d nextsize %d prev %d",
622 rank, tree->tree_root, tree->tree_bmtree, tree->tree_fanout,
623 tree->tree_nextsize, tree->tree_prev);
624 if( tree->tree_nextsize ) {
625 for( i = 0; i < tree->tree_nextsize; i++ )
626 XBT_DEBUG("[%1d] %d", i, tree->tree_next[i]);