1 /* Copyright (c) 2013-2020. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
8 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
9 * University Research and Technology
10 * Corporation. All rights reserved.
11 * Copyright (c) 2004-2005 The University of Tennessee and The University
12 * of Tennessee Research Foundation. All rights
14 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
15 * University of Stuttgart. All rights reserved.
16 * Copyright (c) 2004-2005 The Regents of the University of California.
17 * All rights reserved.
19 * Additional copyrights may follow
22 #include "coll_tuned_topo.hpp"
23 #include "colls_private.hpp"
25 * Some static helpers.
27 static int pown( int fanout, int num )
30 if( num < 0 ) return 0;
31 if (1==num) return fanout;
36 for (int j = 0; j < num; j++) {
43 static int calculate_level( int fanout, int rank )
46 if( rank < 0 ) return -1;
47 for( level = 0, num = 0; num <= rank; level++ ) {
48 num += pown(fanout, level);
53 static int calculate_num_nodes_up_to_level( int fanout, int level )
55 /* just use geometric progression formula for sum:
56 a^0+a^1+...a^(n-1) = (a^n-1)/(a-1) */
58 return ((pown(fanout, level) - 1) / (fanout - 1));
60 return 0; // is this right ?
64 * And now the building functions.
66 * An example for fanout = 2, comm_size = 7
68 * 0 <-- delta = 1 (fanout^0)
70 * 1 2 <-- delta = 2 (fanout^1)
72 * 3 5 4 6 <-- delta = 4 (fanout^2)
76 ompi_coll_tuned_topo_build_tree( int fanout,
82 int level; /* location of my rank in the tree structure of size */
83 int delta; /* number of nodes on my level */
84 int slimit; /* total number of nodes on levels above me */
86 ompi_coll_tree_t* tree;
88 XBT_DEBUG("coll:tuned:topo_build_tree Building fo %d rt %d", fanout, root);
91 XBT_DEBUG("coll:tuned:topo_build_tree invalid fanout %d", fanout);
94 if (fanout>MAXTREEFANOUT) {
95 XBT_DEBUG("coll:tuned:topo_build_tree invalid fanout %d bigger than max %d", fanout, MAXTREEFANOUT);
100 * Get size and rank of the process in this communicator
105 tree = new ompi_coll_tree_t;
107 XBT_DEBUG("coll:tuned:topo_build_tree PANIC::out of memory");
114 tree->tree_fanout = fanout;
115 tree->tree_bmtree = 0;
116 tree->tree_root = root;
117 tree->tree_prev = -1;
118 tree->tree_nextsize = 0;
119 for (int i = 0; i < fanout; i++) {
120 tree->tree_next[i] = -1;
123 /* return if we have less than 2 processes */
129 * Shift all ranks by root, so that the algorithm can be
130 * designed as if root would be always 0
131 * shiftedrank should be used in calculating distances
132 * and position in tree
134 shiftedrank = rank - root;
135 if( shiftedrank < 0 ) {
139 /* calculate my level */
140 level = calculate_level( fanout, shiftedrank );
141 delta = pown( fanout, level );
143 /* find my children */
144 for (int i = 0; i < fanout; i++) {
145 int schild = shiftedrank + delta * (i+1);
146 if( schild < size ) {
147 tree->tree_next[i] = (schild+root)%size;
148 tree->tree_nextsize = tree->tree_nextsize + 1;
155 slimit = calculate_num_nodes_up_to_level( fanout, level );
156 sparent = shiftedrank;
157 if( sparent < fanout ) {
160 while( sparent >= slimit ) {
161 sparent -= delta/fanout;
164 tree->tree_prev = (sparent+root)%size;
170 * Constructs in-order binary tree which can be used for non-commutative reduce
172 * Root of this tree is always rank (size-1) and fanout is 2.
173 * Here are some of the examples of this tree:
174 * size == 2 size == 3 size == 4 size == 9
184 ompi_coll_tuned_topo_build_in_order_bintree( MPI_Comm comm )
189 ompi_coll_tree_t* tree;
192 * Get size and rank of the process in this communicator
197 tree = new ompi_coll_tree_t;
199 XBT_DEBUG("coll:tuned:topo_build_tree PANIC::out of memory");
206 tree->tree_fanout = 2;
207 tree->tree_bmtree = 0;
208 tree->tree_root = size - 1;
209 tree->tree_prev = -1;
210 tree->tree_nextsize = 0;
211 tree->tree_next[0] = -1;
212 tree->tree_next[1] = -1;
214 "coll:tuned:topo_build_in_order_tree Building fo %d rt %d",
215 tree->tree_fanout, tree->tree_root);
225 /* Compute the size of the right subtree */
226 int rightsize = size >> 1;
228 /* Determine the left and right child of this parent */
234 rchild = rightsize - 1;
238 /* The following cases are possible: myrank can be
240 - belong to the left subtree, or
241 - belong to the right subtee
242 Each of the cases need to be handled differently.
245 if (myrank == parent) {
247 - compute real ranks of my children, and exit the loop. */
249 tree->tree_next[0] = lchild + delta;
251 tree->tree_next[1] = rchild + delta;
254 if (myrank > rchild) {
255 /* I belong to the left subtree:
256 - If I am the left child, compute real rank of my parent
257 - Iterate down through tree:
258 compute new size, shift ranks down, and update delta.
260 if (myrank == lchild) {
261 tree->tree_prev = parent + delta;
263 size = size - rightsize - 1;
264 delta = delta + rightsize;
265 myrank = myrank - rightsize;
269 /* I belong to the right subtree:
270 - If I am the right child, compute real rank of my parent
271 - Iterate down through tree:
272 compute new size and parent,
273 but the delta and rank do not need to change.
275 if (myrank == rchild) {
276 tree->tree_prev = parent + delta;
283 if (tree->tree_next[0] >= 0) { tree->tree_nextsize = 1; }
284 if (tree->tree_next[1] >= 0) { tree->tree_nextsize += 1; }
289 int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree )
291 ompi_coll_tree_t *ptr;
293 if ((tree == nullptr) || (*tree == nullptr)) {
300 *tree = nullptr; /* mark tree as gone */
307 * Here are some of the examples of this tree:
308 * size == 2 size = 4 size = 8
318 ompi_coll_tuned_topo_build_bmtree( MPI_Comm comm,
327 ompi_coll_tree_t *bmtree;
330 XBT_DEBUG("coll:tuned:topo:build_bmtree rt %d", root);
333 * Get size and rank of the process in this communicator
340 bmtree = new ompi_coll_tree_t;
342 XBT_DEBUG("coll:tuned:topo:build_bmtree PANIC out of memory");
346 bmtree->tree_bmtree = 1;
348 bmtree->tree_root = MPI_UNDEFINED;
349 bmtree->tree_nextsize = MPI_UNDEFINED;
350 for(i=0;i<MAXTREEFANOUT;i++) {
351 bmtree->tree_next[i] = -1;
354 if( index < 0 ) index += size;
356 while( mask <= index ) mask <<= 1;
358 /* Now I can compute my father rank */
360 bmtree->tree_prev = root;
362 remote = (index ^ (mask >> 1)) + root;
363 if( remote >= size ) remote -= size;
364 bmtree->tree_prev = remote;
366 /* And now let's fill my children */
367 while( mask < size ) {
368 remote = (index ^ mask);
369 if( remote >= size ) break;
371 if( remote >= size ) remote -= size;
372 if (children==MAXTREEFANOUT) {
373 XBT_DEBUG("coll:tuned:topo:build_bmtree max fanout incorrect %d needed %d", MAXTREEFANOUT, children);
377 bmtree->tree_next[children] = remote;
381 bmtree->tree_nextsize = children;
382 bmtree->tree_root = root;
387 * Constructs in-order binomial tree which can be used for gather/scatter
390 * Here are some of the examples of this tree:
391 * size == 2 size = 4 size = 8
400 ompi_coll_tree_t* ompi_coll_tuned_topo_build_in_order_bmtree(MPI_Comm comm, int root)
406 ompi_coll_tree_t *bmtree;
409 XBT_DEBUG("coll:tuned:topo:build_in_order_bmtree rt %d", root);
412 * Get size and rank of the process in this communicator
417 vrank = (rank - root + size) % size;
419 bmtree = new ompi_coll_tree_t;
421 XBT_DEBUG("coll:tuned:topo:build_bmtree PANIC out of memory");
426 bmtree->tree_bmtree = 1;
427 bmtree->tree_root = MPI_UNDEFINED;
428 bmtree->tree_nextsize = MPI_UNDEFINED;
429 for(i=0;i<MAXTREEFANOUT;i++) {
430 bmtree->tree_next[i] = -1;
434 bmtree->tree_prev = root;
437 while (mask < size) {
438 int remote = vrank ^ mask;
439 if (remote < vrank) {
440 bmtree->tree_prev = (remote + root) % size;
442 } else if (remote < size) {
443 bmtree->tree_next[children] = (remote + root) % size;
445 if (children == MAXTREEFANOUT) {
446 XBT_DEBUG("coll:tuned:topo:build_bmtree max fanout incorrect %d needed %d", MAXTREEFANOUT, children);
453 bmtree->tree_nextsize = children;
454 bmtree->tree_root = root;
461 ompi_coll_tuned_topo_build_chain( int fanout,
466 int srank; /* shifted rank */
469 ompi_coll_tree_t *chain;
471 XBT_DEBUG("coll:tuned:topo:build_chain fo %d rt %d", fanout, root);
474 * Get size and rank of the process in this communicator
480 XBT_DEBUG("coll:tuned:topo:build_chain WARNING invalid fanout of ZERO, forcing to 1 (pipeline)!");
483 if (fanout>MAXTREEFANOUT) {
484 XBT_DEBUG("coll:tuned:topo:build_chain WARNING invalid fanout %d bigger than max %d, forcing to max!", fanout, MAXTREEFANOUT);
485 fanout = MAXTREEFANOUT;
489 * Allocate space for topology arrays if needed
491 chain = new ompi_coll_tree_t;
493 XBT_DEBUG("coll:tuned:topo:build_chain PANIC out of memory");
497 for(i=0;i<fanout;i++) chain->tree_next[i] = -1;
500 * Set root & numchain
502 chain->tree_root = root;
503 if( (size - 1) < fanout ) {
504 chain->tree_nextsize = size-1;
507 chain->tree_nextsize = fanout;
514 if (srank < 0) srank += size;
517 * Special case - fanout == 1
520 if( srank == 0 ) chain->tree_prev = -1;
521 else chain->tree_prev = (srank-1+root)%size;
523 if( (srank + 1) >= size) {
524 chain->tree_next[0] = -1;
525 chain->tree_nextsize = 0;
527 chain->tree_next[0] = (srank+1+root)%size;
528 chain->tree_nextsize = 1;
533 /* Let's handle the case where there is just one node in the communicator */
535 chain->tree_next[0] = -1;
536 chain->tree_nextsize = 0;
537 chain->tree_prev = -1;
541 * Calculate maximum chain length
543 maxchainlen = (size-1) / fanout;
544 if( (size-1) % fanout != 0 ) {
546 mark = (size-1)%fanout;
552 * Find your own place in the list of shifted ranks
558 if( srank-1 < (mark * maxchainlen) ) {
559 column = (srank-1)/maxchainlen;
560 head = 1+column*maxchainlen;
563 column = mark + (srank-1-mark*maxchainlen)/(maxchainlen-1);
564 head = mark*maxchainlen+1+(column-mark)*(maxchainlen-1);
568 if( srank == head ) {
569 chain->tree_prev = 0; /*root*/
571 chain->tree_prev = srank-1; /* rank -1 */
573 if( srank == (head + len - 1) ) {
574 chain->tree_next[0] = -1;
575 chain->tree_nextsize = 0;
577 if( (srank + 1) < size ) {
578 chain->tree_next[0] = srank+1;
579 chain->tree_nextsize = 1;
581 chain->tree_next[0] = -1;
582 chain->tree_nextsize = 0;
591 chain->tree_prev = -1;
592 chain->tree_next[0] = (root+1)%size;
593 for (int i = 1; i < fanout; i++) {
594 chain->tree_next[i] = chain->tree_next[i-1] + maxchainlen;
596 chain->tree_next[i]--;
598 chain->tree_next[i] %= size;
600 chain->tree_nextsize = fanout;
602 chain->tree_prev = (chain->tree_prev+root)%size;
603 if( chain->tree_next[0] != -1 ) {
604 chain->tree_next[0] = (chain->tree_next[0]+root)%size;
611 int ompi_coll_tuned_topo_dump_tree (ompi_coll_tree_t* tree, int rank)
613 XBT_DEBUG("coll:tuned:topo:topo_dump_tree %1d tree root %d"
614 " fanout %d BM %1d nextsize %d prev %d",
615 rank, tree->tree_root, tree->tree_bmtree, tree->tree_fanout,
616 tree->tree_nextsize, tree->tree_prev);
617 if( tree->tree_nextsize ) {
618 for (int i = 0; i < tree->tree_nextsize; i++)
619 XBT_DEBUG("[%1d] %d", i, tree->tree_next[i]);