From: Bruno Donassolo Date: Tue, 27 Apr 2021 17:10:27 +0000 (+0200) Subject: New example: Fat Tree cluster of multi-cpu hosts(Star Zones) X-Git-Tag: v3.28~403 X-Git-Url: http://bilbo.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/commitdiff_plain/88ac1cdcafcdabfc2519cca322ce0fac9c2fd67f New example: Fat Tree cluster of multi-cpu hosts(Star Zones) Adapt existing Torus example for different platforms. Same idea: each leaf is a new StarZone netzone, emulating a multi-cpu node. --- diff --git a/MANIFEST.in b/MANIFEST.in index ecad1680e8..13ef11d8a0 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -169,6 +169,8 @@ include examples/cpp/cloud-migration/s4u-cloud-migration.cpp include examples/cpp/cloud-migration/s4u-cloud-migration.tesh include examples/cpp/cloud-simple/s4u-cloud-simple.cpp include examples/cpp/cloud-simple/s4u-cloud-simple.tesh +include examples/cpp/clusters-multicpu/s4u-clusters-multicpu.cpp +include examples/cpp/clusters-multicpu/s4u-clusters-multicpu.tesh include examples/cpp/comm-dependent/s4u-comm-dependent.cpp include examples/cpp/comm-dependent/s4u-comm-dependent.tesh include examples/cpp/comm-host2host/s4u-comm-host2host.cpp @@ -320,8 +322,6 @@ include examples/cpp/synchro-mutex/s4u-synchro-mutex.cpp include examples/cpp/synchro-mutex/s4u-synchro-mutex.tesh include examples/cpp/synchro-semaphore/s4u-synchro-semaphore.cpp include examples/cpp/synchro-semaphore/s4u-synchro-semaphore.tesh -include examples/cpp/torus-multicpu/s4u-torus-multicpu.cpp -include examples/cpp/torus-multicpu/s4u-torus-multicpu.tesh include examples/cpp/trace-categories/s4u-trace-categories.cpp include examples/cpp/trace-categories/s4u-trace-categories.tesh include examples/cpp/trace-host-user-variables/s4u-trace-host-user-variables.cpp diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index 87ee48d180..ad691e103f 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -80,7 +80,7 @@ foreach (example actor-create actor-daemon 
actor-exiting actor-join actor-kill replay-comm replay-io routing-get-clusters synchro-barrier synchro-condition-variable synchro-condition-variable-waituntil synchro-mutex synchro-semaphore - torus-multicpu) + clusters-multicpu) # Use default source file unless specified otherwise if(NOT DEFINED _${example}_sources) diff --git a/examples/cpp/torus-multicpu/s4u-torus-multicpu.cpp b/examples/cpp/clusters-multicpu/s4u-clusters-multicpu.cpp similarity index 64% rename from examples/cpp/torus-multicpu/s4u-torus-multicpu.cpp rename to examples/cpp/clusters-multicpu/s4u-clusters-multicpu.cpp index 65d192ee5a..27686f8aed 100644 --- a/examples/cpp/torus-multicpu/s4u-torus-multicpu.cpp +++ b/examples/cpp/clusters-multicpu/s4u-clusters-multicpu.cpp @@ -64,8 +64,9 @@ public: } }; +/*************************************************************************************************/ /** - * @brief Callback to set a Torus leaf/element + * @brief Callback to set a cluster leaf/element * * In our example, each leaf if a StarZone, composed of 8 CPUs. * Each CPU is modeled as a host, connected to the outer world through a high-speed PCI link. @@ -82,7 +83,7 @@ public: * / / \ \ * CPU1 ... CPU8 * - * @param zone Torus netzone being created (usefull to create the hosts/links inside it) + * @param zone Cluster netzone being created (usefull to create the hosts/links inside it) * @param coord Coordinates in the torus (e.g. 
"0,0,0", "0,1,0") * @param id Internal identifier in the torus (for information) * @return netpoint, gateway: the netpoint to the StarZone and CPU0 as gateway @@ -104,8 +105,8 @@ create_hostzone(const sg4::NetZone* zone, const std::vector& /*coo const sg4::Host* gateway = nullptr; /* create CPUs */ for (int i = 0; i < num_cpus; i++) { - std::string cpu_name = hostname + "-cpu" + std::to_string(i); - sg4::Host* host = host_zone->create_host(cpu_name, speed)->seal(); + std::string cpu_name = hostname + "-cpu" + std::to_string(i); + const sg4::Host* host = host_zone->create_host(cpu_name, speed)->seal(); /* the first CPU is the gateway */ if (i == 0) gateway = host; @@ -120,10 +121,23 @@ create_hostzone(const sg4::NetZone* zone, const std::vector& /*coo return std::make_pair(host_zone->get_netpoint(), gateway->get_netpoint()); } +/*************************************************************************************************/ +/** + * @brief Callback to create limiter link (1Gbs) for each netpoint + * @param zone Torus netzone being created (usefull to create the hosts/links inside it) + * @param coord Coordinates in the torus (e.g. "0,0,0", "0,1,0") + * @param id Internal identifier in the torus (for information) + * @return Limiter link + */ +static sg4::Link* create_limiter(sg4::NetZone* zone, const std::vector& /*coord*/, int id) +{ + return zone->create_link("limiter-" + std::to_string(id), 1e9)->seal(); +} + /** * @brief Creates a TORUS cluster * - * Creates a TORUS clustes with dimensions 2x2x2 + * Creates a TORUS cluster with dimensions 2x2x2 * * The cluster has 8 elements/leaves in total. Each element is a StarZone containing 8 Hosts. 
* Each pair in the torus is connected through 2 links: @@ -133,11 +147,11 @@ create_hostzone(const sg4::NetZone* zone, const std::vector& /*coo * (Y-axis=2) * A * | - * | X (Z-axis=2) - * | / 10 Gbs + * | D (Z-axis=2) + * + / 10 Gbs * | + * |/ limiter=1Gps - * B----------C (X-axis=2) + * B-----+----C (X-axis=2) * * For example, a communication from A to C goes through: * A->limiter(A)->link(A-B)->limiter(B)->link(B-C)->C @@ -146,7 +160,7 @@ create_hostzone(const sg4::NetZone* zone, const std::vector& /*coo * communication from A-CPU-3 to C-CPU-7 goes through: * 1) StarZone A: A-CPU-3 -> link-up-A-CPU-3 -> A-CPU-0 * 2) A-CPU-0->limiter(A)->link(A-B)->limiter(B)->link(B-C)->C-CPU-0 - * 3) C-CPU-0-> link-down-C-CPU-7 -> C-CPU-7 + * 3) StarZone C: C-CPU-0-> link-down-C-CPU-7 -> C-CPU-7 * * Note that we don't have limiter links inside the StarZones(A, B, C), * but we have limiters in the Torus that are added to the links in the path (as we can see in "2)"") @@ -156,23 +170,76 @@ create_hostzone(const sg4::NetZone* zone, const std::vector& /*coo */ static void create_torus_cluster() { - // Callback to create limiter link (1Gbs) for each host - auto create_limiter = [](sg4::NetZone* zone, const std::vector& /*coord*/, int id) -> sg4::Link* { - return zone->create_link("limiter-" + std::to_string(id), 1e9)->seal(); - }; - /* create the torus cluster, 10Gbs link between elements in the cluster */ sg4::create_torus_zone("cluster", nullptr, {2, 2, 2}, 10e9, 10e-6, sg4::Link::SharingPolicy::SPLITDUPLEX, create_hostzone, {}, create_limiter) ->seal(); } +/** + * @brief Creates a Fat Tree cluster + * + * Creates a Fat Tree cluster with 2 levels and 6 nodes + * The following parameters are used to create this cluster: + * - Levels: 2 - two-level cluster + * - Down links: 2, 3 - L1 routers are connected to 2 elements, L2 routers to 3 elements + * - Up links: 1, 2 - Each node (A-F) is connected to 1 L2 router, L2 routers are connected to 2 L1 + * - Link count: 1, 1 - Use 1 link in 
each level + * + * The first parameter describes how many levels we have. + * The following ones describe the connection between the elements and must have exactly n_levels components. + * + * + * S3 S4 <-- Level 1 routers + * / \ / \ + * / /\ \ + * link: 10GBps --> | / \ | + * (full-duplex) | / \ | + * S1 S2 <-- Level 2 routers + * link:10GBps --> / | \ / | \ + * + + + + + + + * link:limiter -> / | \ / | \ + * A B C D E F <-- Nodes + * + * Each element (A to F) is a StarZone containing 8 Hosts. + * The connection uses 2 links: + * 1) limiter: a 1Gbs limiter link (set by user through the set_limiter callback) + * 2) link: 10Gbs link connecting the components (created automatically) + * + * For example, a communication from A to C goes through: + * A->limiter(A)->link(A-S1)->link(S1-C)->limiter(C)->C + * + * More precisely, considering that A and C are StarZones, a + * communication from A-CPU-3 to C-CPU-7 goes through: + * 1) StarZone A: A-CPU-3 -> link-up-A-CPU-3 -> A-CPU-0 + * 2) A-CPU-0->limiter(A)->link(A-S1)->link(S1-C)->limiter(C)->C-CPU-0 + * 3) StarZone C: C-CPU-0-> link-down-C-CPU-7 -> C-CPU-7 + * + * Note that limiters are only valid for leaves, not routers. 
+ * + * More details in: Fat Tree + * Cluster + */ +static void create_fatTree_cluster() +{ + /* create the fat tree cluster, 10Gbs link between elements in the cluster */ + sg4::create_fatTree_zone("cluster", nullptr, {2, {2, 3}, {1, 2}, {1, 1}}, 10e9, 10e-6, + sg4::Link::SharingPolicy::SPLITDUPLEX, create_hostzone, {}, create_limiter) + ->seal(); +} + +/*************************************************************************************************/ + int main(int argc, char* argv[]) { sg4::Engine e(&argc, argv); + std::string platform = argv[1]; /* create platform */ - create_torus_cluster(); + if (platform == "torus") + create_torus_cluster(); + else if (platform == "fatTree") + create_fatTree_cluster(); std::vector host_list = e.get_all_hosts(); /* create the sender actor running on first host */ diff --git a/examples/cpp/torus-multicpu/s4u-torus-multicpu.tesh b/examples/cpp/clusters-multicpu/s4u-clusters-multicpu.tesh similarity index 56% rename from examples/cpp/torus-multicpu/s4u-torus-multicpu.tesh rename to examples/cpp/clusters-multicpu/s4u-clusters-multicpu.tesh index 74c57fcd8b..8f4cf82a5d 100644 --- a/examples/cpp/torus-multicpu/s4u-torus-multicpu.tesh +++ b/examples/cpp/clusters-multicpu/s4u-clusters-multicpu.tesh @@ -1,6 +1,6 @@ #!/usr/bin/env tesh -$ ${bindir:=.}/s4u-torus-multicpu +$ ${bindir:=.}/s4u-clusters-multicpu torus > [host0-cpu0:sender:(1) 0.000000] [s4u_torus_multicpu/INFO] Done dispatching all messages > [host0-cpu7:receiver-host0-cpu7:(9) 0.000083] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. > [host0-cpu6:receiver-host0-cpu6:(8) 0.000083] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. @@ -67,3 +67,55 @@ $ ${bindir:=.}/s4u-torus-multicpu > [host7-cpu2:receiver-host7-cpu2:(60) 0.057862] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. 
> [host7-cpu1:receiver-host7-cpu1:(59) 0.057862] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. > [host0-cpu0:sender:(1) 0.057862] [s4u_torus_multicpu/INFO] Goodbye now! + +$ ${bindir:=.}/s4u-clusters-multicpu fatTree +> [host0-cpu0:sender:(1) 0.000000] [s4u_torus_multicpu/INFO] Done dispatching all messages +> [host0-cpu7:receiver-host0-cpu7:(9) 0.000083] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host0-cpu6:receiver-host0-cpu6:(8) 0.000083] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host0-cpu5:receiver-host0-cpu5:(7) 0.000083] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host0-cpu4:receiver-host0-cpu4:(6) 0.000083] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host0-cpu3:receiver-host0-cpu3:(5) 0.000083] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host0-cpu2:receiver-host0-cpu2:(4) 0.000083] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host0-cpu1:receiver-host0-cpu1:(3) 0.000083] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host0-cpu0:receiver-host0-cpu0:(2) 0.000083] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host1-cpu0:receiver-host1-cpu0:(10) 0.033296] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host1-cpu7:receiver-host1-cpu7:(17) 0.033504] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host1-cpu6:receiver-host1-cpu6:(16) 0.033504] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host1-cpu5:receiver-host1-cpu5:(15) 0.033504] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. 
+> [host1-cpu4:receiver-host1-cpu4:(14) 0.033504] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host1-cpu3:receiver-host1-cpu3:(13) 0.033504] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host1-cpu2:receiver-host1-cpu2:(12) 0.033504] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host1-cpu1:receiver-host1-cpu1:(11) 0.033504] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host5-cpu0:receiver-host5-cpu0:(42) 0.043420] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host4-cpu0:receiver-host4-cpu0:(34) 0.043420] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host3-cpu0:receiver-host3-cpu0:(26) 0.043420] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host2-cpu0:receiver-host2-cpu0:(18) 0.043420] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host5-cpu7:receiver-host5-cpu7:(49) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host5-cpu6:receiver-host5-cpu6:(48) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host5-cpu5:receiver-host5-cpu5:(47) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host5-cpu4:receiver-host5-cpu4:(46) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host5-cpu3:receiver-host5-cpu3:(45) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host5-cpu2:receiver-host5-cpu2:(44) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host5-cpu1:receiver-host5-cpu1:(43) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. 
+> [host4-cpu7:receiver-host4-cpu7:(41) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host4-cpu6:receiver-host4-cpu6:(40) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host4-cpu5:receiver-host4-cpu5:(39) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host4-cpu4:receiver-host4-cpu4:(38) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host4-cpu3:receiver-host4-cpu3:(37) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host4-cpu2:receiver-host4-cpu2:(36) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host4-cpu1:receiver-host4-cpu1:(35) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host3-cpu7:receiver-host3-cpu7:(33) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host3-cpu6:receiver-host3-cpu6:(32) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host3-cpu5:receiver-host3-cpu5:(31) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host3-cpu4:receiver-host3-cpu4:(30) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host3-cpu3:receiver-host3-cpu3:(29) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host3-cpu2:receiver-host3-cpu2:(28) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host3-cpu1:receiver-host3-cpu1:(27) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host2-cpu7:receiver-host2-cpu7:(25) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. 
+> [host2-cpu6:receiver-host2-cpu6:(24) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host2-cpu5:receiver-host2-cpu5:(23) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host2-cpu4:receiver-host2-cpu4:(22) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host2-cpu3:receiver-host2-cpu3:(21) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host2-cpu2:receiver-host2-cpu2:(20) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host2-cpu1:receiver-host2-cpu1:(19) 0.043559] [s4u_torus_multicpu/INFO] I got a 'Hello, I'm alive and running on host0-cpu0'. +> [host0-cpu0:sender:(1) 0.043559] [s4u_torus_multicpu/INFO] Goodbye now!