From 72b259695914c93a27739cad3c186aefbe78a01b Mon Sep 17 00:00:00 2001 From: Arnaud Giersch Date: Tue, 7 Feb 2012 22:50:03 +0100 Subject: [PATCH 01/16] Don't try to be clever with exceptions, let them flow and abort immediately. There is no need to make debugging more difficult than it already is. --- main.cpp | 215 +++++++++++++++++++++++++------------------------------ 1 file changed, 96 insertions(+), 119 deletions(-) diff --git a/main.cpp b/main.cpp index 011b258..8a1d5c8 100644 --- a/main.cpp +++ b/main.cpp @@ -71,53 +71,47 @@ static int simulation_main(int argc, char* argv[]) { int result; process* proc; - try { - proc = opt::loba_algorithms.new_instance(opt::loba_algo, argc, argv); - - proc_mutex->acquire(); - ++proc_counter; - proc_mutex->release(); - - result = proc->run(); - - proc_mutex->acquire(); - loads.push(proc->get_real_load()); - comps.push(proc->get_comp_amount()); - comp_iterations.push(proc->get_comp_iter()); - all_comp_iterations.push(proc->get_all_comp_iter()); - iter_deviation.push(proc->get_iter_deviation()); - data_send_amount.push(proc->get_data_send_amount()); - data_recv_amount.push(proc->get_data_recv_amount()); - data_send_count.push(proc->get_data_send_count()); - data_recv_count.push(proc->get_data_recv_count()); - ctrl_send_amount.push(proc->get_ctrl_send_amount()); - ctrl_recv_amount.push(proc->get_ctrl_recv_amount()); - ctrl_send_count.push(proc->get_ctrl_send_count()); - ctrl_recv_count.push(proc->get_ctrl_recv_count()); - idle_duration.push(proc->get_idle_duration()); - double c = proc->get_convergence(); - if (c != -1.0) - convergence.push(c); - - // Synchronization barrier... - // The goal is to circumvent a limitation in SimGrid (at least - // in version 3.5): a process must be alive when another one - // destroys a communication they had together. 
- - --proc_counter; - proc_cond->broadcast(); - while (proc_counter > 0) - proc_cond->wait(*proc_mutex); - proc_mutex->release(); - - delete proc; - } - catch (const std::invalid_argument& e) { - THROWF(arg_error, 0, "%s", e.what()); - } - catch (const std::exception& e) { - THROWF(0, 0, "%s", e.what()); - } + + proc = opt::loba_algorithms.new_instance(opt::loba_algo, argc, argv); + + proc_mutex->acquire(); + ++proc_counter; + proc_mutex->release(); + + result = proc->run(); + + proc_mutex->acquire(); + loads.push(proc->get_real_load()); + comps.push(proc->get_comp_amount()); + comp_iterations.push(proc->get_comp_iter()); + all_comp_iterations.push(proc->get_all_comp_iter()); + iter_deviation.push(proc->get_iter_deviation()); + data_send_amount.push(proc->get_data_send_amount()); + data_recv_amount.push(proc->get_data_recv_amount()); + data_send_count.push(proc->get_data_send_count()); + data_recv_count.push(proc->get_data_recv_count()); + ctrl_send_amount.push(proc->get_ctrl_send_amount()); + ctrl_recv_amount.push(proc->get_ctrl_recv_amount()); + ctrl_send_count.push(proc->get_ctrl_send_count()); + ctrl_recv_count.push(proc->get_ctrl_recv_count()); + idle_duration.push(proc->get_idle_duration()); + double c = proc->get_convergence(); + if (c != -1.0) + convergence.push(c); + + // Synchronization barrier... + // The goal is to circumvent a limitation in SimGrid (at least + // in version 3.5): a process must be alive when another one + // destroys a communication they had together. + + --proc_counter; + proc_cond->broadcast(); + while (proc_counter > 0) + proc_cond->wait(*proc_mutex); + proc_mutex->release(); + + delete proc; + return result; } @@ -194,13 +188,10 @@ static void install_signal_handler() int main(int argc, char* argv[]) { - // Note: variables modified between TRY (setjmp) and THROW (longjmp) - // must be declared as static or volatile. 
- static int exit_status = 0; // global exit status - static double simulated_time = -1.0; + int exit_status = 0; // global exit status + double simulated_time = -1.0; timestamp elapsed_time(timestamp::wallclock_time); timestamp simulation_time(timestamp::cpu_time); - xbt_ex_t ex; MSG_error_t res; elapsed_time.start(); @@ -218,8 +209,6 @@ int main(int argc, char* argv[]) } // Initialize some MSG internal data. - // Note: MSG_global_init() may throw an exception, but it seems - // impossible to catch it correctly :-( MSG_global_init(&argc, argv); install_signal_handler(); @@ -241,77 +230,65 @@ int main(int argc, char* argv[]) version::date.c_str()); opt::print(); - TRY { - exit_status = EXIT_FAILURE_INIT; // ===== - - // Register the default function of an agent - // MSG_function_register("simulation_main", simulation_main); - MSG_function_register_default(simulation_main); - - // Create the platform and the application. - XBT_DEBUG("Loading platform file..."); - MSG_create_environment(opt::platform_file.c_str()); - XBT_DEBUG("Creating hostdata..."); - hostdata::create(); - XBT_INFO("Loaded description of %zd hosts.", hostdata::size()); - XBT_DEBUG("Deploying processes..."); - if (opt::auto_depl::enabled) { - if (!opt::auto_depl::nhosts) - opt::auto_depl::nhosts = hostdata::size(); - if (opt::auto_depl::nhosts > hostdata::size()) { - XBT_WARN("%u hosts is too much: limiting to %zu", - opt::auto_depl::nhosts, hostdata::size()); - opt::auto_depl::nhosts = hostdata::size(); - } - if (opt::auto_depl::load == 0.0) { - XBT_WARN("Initial load is zero! " - "Falling back on old behaviour (load = nhosts)."); - opt::auto_depl::load = opt::auto_depl::nhosts; - } else if (opt::auto_depl::load < 0.0) - opt::auto_depl::load = - -opt::auto_depl::load * opt::auto_depl::nhosts; - double iload = std::trunc(opt::auto_depl::load); - if (opt::integer_transfer && opt::auto_depl::load != iload) { - XBT_WARN("Total load %g is not an integer. 
Truncate it.", - opt::auto_depl::load); - opt::auto_depl::load = iload; - } - MY_launch_application(); // it is already opt::* aware... - } else { - MSG_launch_application(opt::deployment_file.c_str()); + // Register the default function of an agent + // MSG_function_register("simulation_main", simulation_main); + MSG_function_register_default(simulation_main); + + // Create the platform and the application. + XBT_DEBUG("Loading platform file..."); + MSG_create_environment(opt::platform_file.c_str()); + XBT_DEBUG("Creating hostdata..."); + hostdata::create(); + XBT_INFO("Loaded description of %zd hosts.", hostdata::size()); + XBT_DEBUG("Deploying processes..."); + if (opt::auto_depl::enabled) { + if (!opt::auto_depl::nhosts) + opt::auto_depl::nhosts = hostdata::size(); + if (opt::auto_depl::nhosts > hostdata::size()) { + XBT_WARN("%u hosts is too much: limiting to %zu", + opt::auto_depl::nhosts, hostdata::size()); + opt::auto_depl::nhosts = hostdata::size(); } + if (opt::auto_depl::load == 0.0) { + XBT_WARN("Initial load is zero! " + "Falling back on old behaviour (load = nhosts)."); + opt::auto_depl::load = opt::auto_depl::nhosts; + } else if (opt::auto_depl::load < 0.0) + opt::auto_depl::load = + -opt::auto_depl::load * opt::auto_depl::nhosts; + double iload = std::trunc(opt::auto_depl::load); + if (opt::integer_transfer && opt::auto_depl::load != iload) { + XBT_WARN("Total load %g is not an integer. Truncate it.", + opt::auto_depl::load); + opt::auto_depl::load = iload; + } + MY_launch_application(); // it is already opt::* aware... 
+ } else { + MSG_launch_application(opt::deployment_file.c_str()); + } - // Register tracing categories - TRACE_category_with_color(TRACE_CAT_COMP, TRACE_COLOR_COMP); - TRACE_category_with_color(TRACE_CAT_CTRL, TRACE_COLOR_CTRL); - TRACE_category_with_color(TRACE_CAT_DATA, TRACE_COLOR_DATA); - - exit_status = EXIT_FAILURE_SIMU; // ===== - - proc_mutex = new mutex_t(); - proc_cond = new condition_t(); + // Register tracing categories + TRACE_category_with_color(TRACE_CAT_COMP, TRACE_COLOR_COMP); + TRACE_category_with_color(TRACE_CAT_CTRL, TRACE_COLOR_CTRL); + TRACE_category_with_color(TRACE_CAT_DATA, TRACE_COLOR_DATA); - // Launch the MSG simulation. - XBT_INFO("Starting simulation at %f...", MSG_get_clock()); - res = MSG_main(); - simulated_time = MSG_get_clock(); - XBT_INFO("Simulation ended at %f.", simulated_time); + proc_mutex = new mutex_t(); + proc_cond = new condition_t(); - delete proc_cond; - delete proc_mutex; + // Launch the MSG simulation. + XBT_INFO("Starting simulation at %f...", MSG_get_clock()); + res = MSG_main(); + simulated_time = MSG_get_clock(); + XBT_INFO("Simulation ended at %f.", simulated_time); - if (res != MSG_OK) - THROWF(0, 0, "MSG_main() failed with status %#x", res); + delete proc_cond; + delete proc_mutex; - exit_status = EXIT_NO_FAILURE; // ===== - } - CATCH (ex) { - int len = strlen(ex.msg); - if (len > 0 && ex.msg[len - 1] == '\n') - ex.msg[len - 1] = '\0'; // strip the ending '\n' - XBT_ERROR("%s", ex.msg); - XBT_DEBUG("Error from %s() in %s:%d", ex.func, ex.file, ex.line); - xbt_ex_free(ex); + if (res == MSG_OK) { + exit_status = EXIT_NO_FAILURE; + } else { + XBT_ERROR("MSG_main() failed with status %#x", res); + exit_status = EXIT_FAILURE_SIMU; } // Clean the MSG simulation. -- 2.39.5 From 9075d25e1e7cffd10842fba8da121a6df7cac8e8 Mon Sep 17 00:00:00 2001 From: Arnaud Giersch Date: Thu, 16 Feb 2012 16:49:03 +0100 Subject: [PATCH 02/16] contexts/factory:raw is now the default. 
--- Experimentations/run-all | 1 - 1 file changed, 1 deletion(-) diff --git a/Experimentations/run-all b/Experimentations/run-all index 581ee1f..95d70b1 100755 --- a/Experimentations/run-all +++ b/Experimentations/run-all @@ -70,7 +70,6 @@ variable_check DEADLINE : ${LOBA:=$PWD/loba} COMMON_OPTS=( - --cfg=contexts/factory:raw "${MORE_ARGS[@]}" ) -- 2.39.5 From 62b90f31f04373d8d2beb6afd797f48ff6fc398b Mon Sep 17 00:00:00 2001 From: Arnaud Giersch Date: Mon, 27 Feb 2012 13:26:08 +0100 Subject: [PATCH 03/16] Document a bug with parallel executions. --- BUGS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/BUGS b/BUGS index 040cfc3..d313473 100644 --- a/BUGS +++ b/BUGS @@ -1,4 +1,8 @@ ======================================================================== +Les variables globales process::total_load_* ne sont pas protégées +contre les accès concurrents. Il n'est donc pas possible actuellement +d'exécuter les simulations en parallèle (--cfg=contexts/nthreads). + ======================================================================== ##### RESOLVED BUGS COME AFTER THIS #################################### ======================================================================== -- 2.39.5 From 4d62e191e30069e0d496608250aca37bc20e4c9e Mon Sep 17 00:00:00 2001 From: Arnaud Giersch Date: Tue, 28 Feb 2012 08:20:22 +0100 Subject: [PATCH 04/16] ag_parameters: update for last experimentations. 
--- Experimentations/ag_parameters | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Experimentations/ag_parameters b/Experimentations/ag_parameters index 1e53983..293d9b4 100644 --- a/Experimentations/ag_parameters +++ b/Experimentations/ag_parameters @@ -2,20 +2,20 @@ # define the topologies TOPOLOGIES=( - btree - clique +# btree +# clique hcube line - ring - star +# ring +# star torus ) # define the algorithms ALGORITHMS=( - simple +# simple besteffort - 2besteffort +# 2besteffort makhoul ) @@ -23,10 +23,10 @@ ALGORITHMS=( PLATFORMS=( cluster16.xml cluster64.xml -# cluster256.xml + cluster256.xml grid16.xml grid64.xml -# grid256.xml + grid256.xml ) # number of hosts @@ -39,7 +39,7 @@ LOAD=-1000 DEADLINE=10000 # optional: additional arguments for loba (default: empty) -MORE_ARGS=( -l100 -x4 -m1e-4 -M10 ) +MORE_ARGS=( -l100 -m1e-4 -M10 -D5 -%.1) # optional: path to binary (default: ./loba) #LOBA=./loba -- 2.39.5 From b4391fa1cfad2af65474a95234401be106295507 Mon Sep 17 00:00:00 2001 From: Arnaud Giersch Date: Wed, 29 Feb 2012 15:31:59 +0100 Subject: [PATCH 05/16] Fix ispell dictionary in dir-locals.el. --- .dir-locals.el | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.dir-locals.el b/.dir-locals.el index 6a93d44..7f2c468 100644 --- a/.dir-locals.el +++ b/.dir-locals.el @@ -1,5 +1,6 @@ ;;; Emacs per-directory local variables ;;; Set preferred style (k&r style, no tabs, basic offset of 4) -((nil . ((indent-tabs-mode . nil) +((nil . ((ispell-local-dictionary . "american") + (indent-tabs-mode . nil) (c-basic-offset . 4) (c-file-style . "k&r")))) -- 2.39.5 From 526e1f382ddc9f0010c41a526f3029af769162c9 Mon Sep 17 00:00:00 2001 From: Arnaud Giersch Date: Wed, 29 Feb 2012 15:54:31 +0100 Subject: [PATCH 06/16] Protect concurrent accesses to shared variables in process. 
--- BUGS | 3 +++ main.cpp | 2 ++ process.cpp | 26 ++++++++++++++++---------- process.h | 7 +++++++ 4 files changed, 28 insertions(+), 10 deletions(-) diff --git a/BUGS b/BUGS index d313473..632bdde 100644 --- a/BUGS +++ b/BUGS @@ -3,6 +3,9 @@ Les variables globales process::total_load_* ne sont pas protégées contre les accès concurrents. Il n'est donc pas possible actuellement d'exécuter les simulations en parallèle (--cfg=contexts/nthreads). +Corrigé en partie. Il reste quelques "fixme: get locked?" à régler +(ou pas). + ======================================================================== ##### RESOLVED BUGS COME AFTER THIS #################################### ======================================================================== diff --git a/main.cpp b/main.cpp index 8a1d5c8..f31e847 100644 --- a/main.cpp +++ b/main.cpp @@ -274,6 +274,7 @@ int main(int argc, char* argv[]) proc_mutex = new mutex_t(); proc_cond = new condition_t(); + process::set_proc_mutex(proc_mutex); // Launch the MSG simulation. XBT_INFO("Starting simulation at %f...", MSG_get_clock()); @@ -281,6 +282,7 @@ int main(int argc, char* argv[]) simulated_time = MSG_get_clock(); XBT_INFO("Simulation ended at %f.", simulated_time); + process::set_proc_mutex(NULL); delete proc_cond; delete proc_mutex; diff --git a/process.cpp b/process.cpp index c85b605..36bdff0 100644 --- a/process.cpp +++ b/process.cpp @@ -16,6 +16,8 @@ XBT_LOG_EXTERNAL_DEFAULT_CATEGORY(proc); #include "process.h" +mutex_t *process::proc_mutex; + double process::total_load_init = 0.0; double process::total_load_running = 0.0; double process::total_load_exit = 0.0; @@ -58,22 +60,21 @@ process::process(int argc, char* argv[]) rev_neigh.insert(std::make_pair(host, ptr)); } - // Note: there should not be race condition with the current - // version of Simgrid, when updating the global variables. 
- prev_load_broadcast = -1; // force sending of load on first send_all() expected_load = real_load; - total_load_running += real_load; - total_load_init += real_load; received_load = 0.0; idle_duration = 0.0; convergence = -1.0; + proc_mutex->acquire(); process_counter++; + total_load_init += real_load; + total_load_running += real_load; total_load_average = total_load_running / process_counter; load_diff_threshold = (opt::load_ratio_threshold + opt::avg_load_ratio * total_load_average) / 100.0; + proc_mutex->release(); ctrl_close_pending = data_close_pending = neigh.size(); close_received = false; @@ -103,7 +104,9 @@ process::process(int argc, char* argv[]) process::~process() { delete lb_thread; + proc_mutex->acquire(); total_load_exit += real_load; + proc_mutex->release(); xbt_assert(received_load == 0.0, "received_load is %g, but should be 0.0 !", received_load); if (opt::log_rate < 0) @@ -123,7 +126,7 @@ process::~process() double process::get_iter_deviation() const { - double average_cost = opt::comp_cost(total_load_average); + double average_cost = opt::comp_cost(total_load_average); // fixme: get locked? // Do not count idle periods double comp_iter_opt = acc.comp_amount / average_cost; /* @@ -270,26 +273,29 @@ void process::compute_loop() } real_load += received_load; received_load = 0.0; + proc_mutex->acquire(); total_load_running -= real_load; + proc_mutex->release(); convergence_check(); comm.data_flush(true); } void process::convergence_check() { - double load_diff = std::fabs(real_load - total_load_average); + double average = total_load_average; // fixme: get locked? 
+ double load_diff = std::fabs(real_load - average); bool converged = load_diff <= load_diff_threshold; if (convergence >= 0.0) { if (!converged) { XBT_VERB("current load has diverged: %g (%.4g%%)", - real_load, 100.0 * load_diff / total_load_average); + real_load, 100.0 * load_diff / average); convergence = -1.0; } } else { if (converged) { XBT_VERB("current load has converged: %g (%.4g%%)", - real_load, 100.0 * load_diff / total_load_average); + real_load, 100.0 * load_diff / average); convergence = MSG_get_clock(); } } @@ -327,7 +333,7 @@ bool process::still_running() last_status = false; } else if (100.0 * total_load_running / total_load_init <= - opt::load_ratio_threshold) { + opt::load_ratio_threshold) { // fixme: get locked? // fixme: this check should be implemented with a distributed // algorithm, and not a shared global variable! XBT_VERB("No more load to balance in system."); diff --git a/process.h b/process.h index c31d213..901d381 100644 --- a/process.h +++ b/process.h @@ -25,6 +25,9 @@ class process { public: + static void set_proc_mutex(mutex_t* m) { proc_mutex = m; } + + // Note: normally used with proc_mutex locked. static double get_total_load_init() { return total_load_init; } static double get_total_load_running() { return total_load_running; } static double get_total_load_exit() { return total_load_exit; } @@ -83,6 +86,10 @@ protected: xbt_log_category_t cat = _XBT_LOGV(default)) const; private: + static mutex_t *proc_mutex; // protect access to global variables + // (must be set before constructing + // the first object!) 
+ static double total_load_init; // sum of process loads at init static double total_load_running; // sum of loads while running static double total_load_exit; // sum of process loads at exit -- 2.39.5 From e2986b2661919a35ed6cab7627ee83e63495ed3b Mon Sep 17 00:00:00 2001 From: Arnaud Giersch Date: Wed, 29 Feb 2012 16:16:45 +0100 Subject: [PATCH 07/16] One more bug :( --- BUGS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/BUGS b/BUGS index 632bdde..8f62e4d 100644 --- a/BUGS +++ b/BUGS @@ -1,3 +1,7 @@ +======================================================================== +Les exécutions parallèles donnent des résultats différents. +Pourquoi ? + ======================================================================== Les variables globales process::total_load_* ne sont pas protégées contre les accès concurrents. Il n'est donc pas possible actuellement -- 2.39.5 From 5012e07790450b078e83e0494af6de8ba60c8c92 Mon Sep 17 00:00:00 2001 From: Arnaud Giersch Date: Wed, 29 Feb 2012 16:19:28 +0100 Subject: [PATCH 08/16] Remove option -x as an alternative for -k. --- TODO | 3 --- options.cpp | 7 ------- 2 files changed, 10 deletions(-) diff --git a/TODO b/TODO index a337bc4..77dbb42 100644 --- a/TODO +++ b/TODO @@ -4,9 +4,6 @@ (i) send the tasks that have the most count of iterations; (ii) send the tasks that have the least count of iterations. -* Remove usage of opt::option_x in loba_besteffort.cpp and loba_2besteffort.cpp - Use -k instead (done). Remove hack marked "FIXME" in opt::parse_args(). - * Support heterogeneous platforms? Not sure yet. Should be doable if each process also sends its speed to its neighbors. diff --git a/options.cpp b/options.cpp index 01ae447..7c5a649 100644 --- a/options.cpp +++ b/options.cpp @@ -355,13 +355,6 @@ bool opt::parse_args(int* argc, char* argv[]) #undef PARSE_ARG - if (opt::option_x) { // FIXME: remove this one day... 
- opt::loba_best_divisor = opt::option_x; - XBT_WARN("divisor for algorithms *best* set from option -x (%d => %u)," - " use -k instead", - opt::option_x, opt::loba_best_divisor); - } - if (opt::version_requested || opt::help_requested) return 1; -- 2.39.5 From 5b9ddb42c7ce6f65f3a0a242209dc4ae1312d54c Mon Sep 17 00:00:00 2001 From: Arnaud Giersch Date: Wed, 29 Feb 2012 16:31:56 +0100 Subject: [PATCH 09/16] Document changing API with latest version of SG. --- BUGS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/BUGS b/BUGS index 8f62e4d..464e620 100644 --- a/BUGS +++ b/BUGS @@ -1,3 +1,7 @@ +======================================================================== +Les fonctions MSG_get_host{number,table} n'existent plus dans les +dernières versions de SimGrid. Utiliser MSG_hosts_as_dynar à la place. + ======================================================================== Les exécutions parallèles donnent des résultats différents. Pourquoi ? -- 2.39.5 From 8f5e351ba6584886c256d88d2112bd1efab5f1bf Mon Sep 17 00:00:00 2001 From: Arnaud Giersch Date: Wed, 29 Feb 2012 16:41:03 +0100 Subject: [PATCH 10/16] Move loading of atomic vs. cstdatomic in atomic_compat.h. 
--- atomic_compat.h | 20 ++++++++++++++++++++ sync_queue.h | 12 +----------- 2 files changed, 21 insertions(+), 11 deletions(-) create mode 100644 atomic_compat.h diff --git a/atomic_compat.h b/atomic_compat.h new file mode 100644 index 0000000..1fec370 --- /dev/null +++ b/atomic_compat.h @@ -0,0 +1,20 @@ +#ifndef ATOMIC_COMPAT_H +#define ATOMIC_COMPAT_H + +#if __GNUC__ == 4 && __GNUC_MINOR__ == 4 +# include <cstdatomic> // <atomic> is named <cstdatomic> in gcc 4.4 + +template<typename _Tp> // fix missing definition in gcc 4.4 +void +atomic<_Tp*>::store(_Tp* __v, memory_order __m) volatile +{ atomic_address::store(__v, __m); } + +#else +# include <atomic> +#endif + +#endif // !ATOMIC_COMPAT_H + +// Local variables: +// mode: c++ +// End: diff --git a/sync_queue.h b/sync_queue.h index c3f29e5..fc35339 100644 --- a/sync_queue.h +++ b/sync_queue.h @@ -1,17 +1,7 @@ #ifndef SYNC_QUEUE_H #define SYNC_QUEUE_H -#if __GNUC__ == 4 && __GNUC_MINOR__ == 4 -# include <cstdatomic> // <atomic> is named <cstdatomic> in gcc 4.4 - -template<typename _Tp> // fix missing definition in gcc 4.4 -void -atomic<_Tp*>::store(_Tp* __v, memory_order __m) volatile -{ atomic_address::store(__v, __m); } - -#else -# include <atomic> -#endif +#include "atomic_compat.h" #define SYNC_QUEUE_BUFSIZE 16 -- 2.39.5 From 245565c060611ddfed5067ab4000e91e440c402c Mon Sep 17 00:00:00 2001 From: Arnaud Giersch Date: Wed, 29 Feb 2012 16:59:19 +0100 Subject: [PATCH 11/16] Add option -G, to exit on detection of global convergence. 
--- options.cpp | 9 ++++++++- options.h | 1 + process.cpp | 9 +++++++++ process.h | 3 +++ 4 files changed, 21 insertions(+), 1 deletion(-) diff --git a/options.cpp b/options.cpp index 7c5a649..2db9aeb 100644 --- a/options.cpp +++ b/options.cpp @@ -80,6 +80,7 @@ namespace opt { unsigned lb_maxiter = 0; unsigned comp_maxiter = 0; double time_limit = 0; + bool exit_on_convergence = false; bool exit_on_close = true; // Named parameters lists @@ -233,7 +234,7 @@ bool opt::parse_args(int* argc, char* argv[]) opterr = 0; while ((c = getopt(*argc, argv, "%:" - "a:bc:C:d:D:eEhi:I:k:l:L:m:M:N:r:Rs:S:t:T:vVx:X:Z")) + "a:bc:C:d:D:eEGhi:I:k:l:L:m:M:N:r:Rs:S:t:T:vVx:X:Z")) != -1) { switch (c) { case '%': @@ -279,6 +280,9 @@ bool opt::parse_args(int* argc, char* argv[]) case 'E': opt::egocentric = !opt::egocentric; break; + case 'G': + opt::exit_on_convergence = !opt::exit_on_convergence; + break; case 'h': opt::help_requested++; break; @@ -443,6 +447,7 @@ void opt::print() h.val_or_string(comp_maxiter, "no limit")); DESCR("convergence is assumed within (\%)", "%g", opt::avg_load_ratio); DESCR("time limit", "%s", h.val_or_string(time_limit, "no limit")); + DESCR("exit on convergence", "%s", h.on_off(exit_on_convergence)); DESCR("exit on close", "%s", h.on_off(exit_on_close)); XBT_INFO("`----"); @@ -564,6 +569,8 @@ void opt::usage() std::clog << o("-% value") << "percent of the load average to assume the convergence" << " [" << opt::avg_load_ratio << "]\n"; + std::clog << o("-G") << "toggle exit on detection of convergence" + << " [" << opt_helper::on_off(opt::exit_on_convergence) << "]\n"; std::clog << o("-t value") << "time limit (simulated time), 0 for no limit" << " [" << opt::time_limit << "]\n"; diff --git a/options.h b/options.h index 264a566..ccb56bd 100644 --- a/options.h +++ b/options.h @@ -63,6 +63,7 @@ namespace opt { extern unsigned lb_maxiter; extern unsigned comp_maxiter; extern double time_limit; + extern bool exit_on_convergence; extern bool exit_on_close; // 
Named parameters lists diff --git a/process.cpp b/process.cpp index 36bdff0..dfbf123 100644 --- a/process.cpp +++ b/process.cpp @@ -26,6 +26,8 @@ int process::process_counter = 0; double process::total_load_average; double process::load_diff_threshold; +std::atomic process::convergence_counter(0); + namespace { void sleep_until_date(double& date, double duration) @@ -69,6 +71,7 @@ process::process(int argc, char* argv[]) proc_mutex->acquire(); process_counter++; + convergence_counter++; total_load_init += real_load; total_load_running += real_load; total_load_average = total_load_running / process_counter; @@ -291,12 +294,14 @@ void process::convergence_check() XBT_VERB("current load has diverged: %g (%.4g%%)", real_load, 100.0 * load_diff / average); convergence = -1.0; + convergence_counter++; } } else { if (converged) { XBT_VERB("current load has converged: %g (%.4g%%)", real_load, 100.0 * load_diff / average); convergence = MSG_get_clock(); + convergence_counter--; } } } @@ -324,6 +329,10 @@ bool process::still_running() XBT_VERB("Reached comp_maxiter: %d/%d", comp_iter, opt::comp_maxiter); last_status = false; + } else if (opt::exit_on_convergence && convergence_counter == 0) { + XBT_VERB("Global convergence detected"); + last_status = false; + } else if (opt::exit_on_close && close_received) { XBT_VERB("Close received"); last_status = false; diff --git a/process.h b/process.h index 901d381..5e1faa6 100644 --- a/process.h +++ b/process.h @@ -16,6 +16,7 @@ #include #include #include +#include "atomic_compat.h" #include "communicator.h" #include "misc.h" #include "msg_thread.h" @@ -98,6 +99,8 @@ private: static double total_load_average; static double load_diff_threshold; + static std::atomic convergence_counter; + typedef MAP_TEMPLATE rev_neigh_type; neigh_type neigh; // list of neighbors (do not alter // after construction!) 
-- 2.39.5 From 6327e2032096180078429253034afecb48208abd Mon Sep 17 00:00:00 2001 From: Arnaud Giersch Date: Wed, 29 Feb 2012 17:37:13 +0100 Subject: [PATCH 12/16] Add possibility to set the convergence threshold automatically. --- options.cpp | 3 +++ process.cpp | 12 +++++++++--- process.h | 1 + 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/options.cpp b/options.cpp index 2db9aeb..21f9b9b 100644 --- a/options.cpp +++ b/options.cpp @@ -569,6 +569,9 @@ void opt::usage() std::clog << o("-% value") << "percent of the load average to assume the convergence" << " [" << opt::avg_load_ratio << "]\n"; + if (opt::help_requested > 1) + std::clog << o("") + << "or, if negative, 100 * (nhosts / -value) / avg_load\n"; std::clog << o("-G") << "toggle exit on detection of convergence" << " [" << opt_helper::on_off(opt::exit_on_convergence) << "]\n"; std::clog << o("-t value") diff --git a/process.cpp b/process.cpp index dfbf123..01b5e0f 100644 --- a/process.cpp +++ b/process.cpp @@ -24,6 +24,7 @@ double process::total_load_exit = 0.0; int process::process_counter = 0; double process::total_load_average; +double process::average_load_ratio; double process::load_diff_threshold; std::atomic process::convergence_counter(0); @@ -75,8 +76,13 @@ process::process(int argc, char* argv[]) total_load_init += real_load; total_load_running += real_load; total_load_average = total_load_running / process_counter; + if (opt::avg_load_ratio >= 0.0) + average_load_ratio = opt::avg_load_ratio; + else + average_load_ratio = 100.0 * + (process_counter / -opt::avg_load_ratio) / total_load_average; load_diff_threshold = (opt::load_ratio_threshold + - opt::avg_load_ratio * total_load_average) / 100.0; + average_load_ratio * total_load_average) / 100.0; proc_mutex->release(); ctrl_close_pending = data_close_pending = neigh.size(); @@ -118,10 +124,10 @@ process::~process() lb_iter, comp_iter, all_comp_iter, real_load); if (convergence >= 0.0) XBT_INFO("Convergence within %g%% was achieved 
at time %g", - opt::avg_load_ratio, convergence); + average_load_ratio, convergence); else XBT_INFO("Convergence within %g%% was not achieved", - opt::avg_load_ratio); + average_load_ratio); XBT_VERB("Expected load was: %g", expected_load); XBT_VERB("Total computation for this process: %g", get_comp_amount()); print_loads(true, xbt_log_priority_debug); diff --git a/process.h b/process.h index 5e1faa6..aa9ebcb 100644 --- a/process.h +++ b/process.h @@ -97,6 +97,7 @@ private: static int process_counter; static double total_load_average; + static double average_load_ratio; static double load_diff_threshold; static std::atomic convergence_counter; -- 2.39.5 From e2ceb7b82ec3988dc7519e75a410b312f389c917 Mon Sep 17 00:00:00 2001 From: Arnaud Giersch Date: Wed, 29 Feb 2012 17:39:40 +0100 Subject: [PATCH 13/16] Experimental: use TRY_NO_BACKTRACE if available. No functional change. --- messages.cpp | 2 +- misc.h | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/messages.cpp b/messages.cpp index 2d5de94..e9fbced 100644 --- a/messages.cpp +++ b/messages.cpp @@ -72,7 +72,7 @@ bool message_queue::pop(message*& msg, m_host_t& from, double timeout) if (!queue.try_pop(task)) { xbt_ex_t e; XBT_DEBUG("waiting for a message to come"); - TRY { + TRY_FAST { if (timeout > 0) cond.timedwait(mutex, timeout); else diff --git a/misc.h b/misc.h index 557bed5..e58ac61 100644 --- a/misc.h +++ b/misc.h @@ -22,6 +22,14 @@ namespace misc { */ #define XBT_XCLOG(c, p, ...) XBT_CLOG_((*(c)), p, __VA_ARGS__) +/* Use a faster version of TRY if available. + */ +#if defined(TRY_NO_BACKTRACE) +# define TRY_FAST TRY_NO_BACKTRACE +#else +# define TRY_FAST TRY +#endif + #endif // !MISC_H // Local variables: -- 2.39.5 From dd6e93b0d35c26c2dd9dacdf3fb964a2da2c1cb2 Mon Sep 17 00:00:00 2001 From: Arnaud Giersch Date: Wed, 29 Feb 2012 18:20:12 +0100 Subject: [PATCH 14/16] Cosmetics: improve help message. 
--- options.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/options.cpp b/options.cpp index 21f9b9b..9b427bc 100644 --- a/options.cpp +++ b/options.cpp @@ -571,7 +571,7 @@ void opt::usage() << " [" << opt::avg_load_ratio << "]\n"; if (opt::help_requested > 1) std::clog << o("") - << "or, if negative, 100 * (nhosts / -value) / avg_load\n"; + << "- if negative: use 100 * (nhosts / -value) / avg_load\n"; std::clog << o("-G") << "toggle exit on detection of convergence" << " [" << opt_helper::on_off(opt::exit_on_convergence) << "]\n"; std::clog << o("-t value") -- 2.39.5 From b4d90a50ce65a5efb75c0a28b269120ea51d57cf Mon Sep 17 00:00:00 2001 From: Arnaud Giersch Date: Wed, 29 Feb 2012 18:21:14 +0100 Subject: [PATCH 15/16] Parameterize the exit on convergence. The parameter is the number of iterations to wait while converged before deciding to exit. --- options.cpp | 19 ++++++++++++------- options.h | 2 +- process.cpp | 23 +++++++++++++---------- process.h | 1 + 4 files changed, 27 insertions(+), 18 deletions(-) diff --git a/options.cpp b/options.cpp index 9b427bc..25d7642 100644 --- a/options.cpp +++ b/options.cpp @@ -80,7 +80,7 @@ namespace opt { unsigned lb_maxiter = 0; unsigned comp_maxiter = 0; double time_limit = 0; - bool exit_on_convergence = false; + int exit_on_convergence = 0; bool exit_on_close = true; // Named parameters lists @@ -234,7 +234,7 @@ bool opt::parse_args(int* argc, char* argv[]) opterr = 0; while ((c = getopt(*argc, argv, "%:" - "a:bc:C:d:D:eEGhi:I:k:l:L:m:M:N:r:Rs:S:t:T:vVx:X:Z")) + "a:bc:C:d:D:eEg:hi:I:k:l:L:m:M:N:r:Rs:S:t:T:vVx:X:Z")) != -1) { switch (c) { case '%': @@ -280,8 +280,8 @@ bool opt::parse_args(int* argc, char* argv[]) case 'E': opt::egocentric = !opt::egocentric; break; - case 'G': - opt::exit_on_convergence = !opt::exit_on_convergence; + case 'g': + PARSE_ARG(opt::exit_on_convergence); break; case 'h': opt::help_requested++; @@ -447,7 +447,8 @@ void opt::print() h.val_or_string(comp_maxiter, "no limit")); 
DESCR("convergence is assumed within (\%)", "%g", opt::avg_load_ratio); DESCR("time limit", "%s", h.val_or_string(time_limit, "no limit")); - DESCR("exit on convergence", "%s", h.on_off(exit_on_convergence)); + DESCR("exit on convergence", "%s", + h.val_or_string(exit_on_convergence, "disabled")); DESCR("exit on close", "%s", h.on_off(exit_on_close)); XBT_INFO("`----"); @@ -572,8 +573,12 @@ void opt::usage() if (opt::help_requested > 1) std::clog << o("") << "- if negative: use 100 * (nhosts / -value) / avg_load\n"; - std::clog << o("-G") << "toggle exit on detection of convergence" - << " [" << opt_helper::on_off(opt::exit_on_convergence) << "]\n"; + std::clog << o("-g value") + << "number of consecutive detections of convergence to exit" + << " [" << opt::exit_on_convergence << "]\n"; + if (opt::help_requested > 1) + std::clog << o("") + << "- use 0 to disable\n"; std::clog << o("-t value") << "time limit (simulated time), 0 for no limit" << " [" << opt::time_limit << "]\n"; diff --git a/options.h b/options.h index ccb56bd..5b47022 100644 --- a/options.h +++ b/options.h @@ -63,7 +63,7 @@ namespace opt { extern unsigned lb_maxiter; extern unsigned comp_maxiter; extern double time_limit; - extern bool exit_on_convergence; + extern int exit_on_convergence; extern bool exit_on_close; // Named parameters lists diff --git a/process.cpp b/process.cpp index 01b5e0f..97f2a69 100644 --- a/process.cpp +++ b/process.cpp @@ -295,19 +295,22 @@ void process::convergence_check() double load_diff = std::fabs(real_load - average); bool converged = load_diff <= load_diff_threshold; - if (convergence >= 0.0) { - if (!converged) { - XBT_VERB("current load has diverged: %g (%.4g%%)", - real_load, 100.0 * load_diff / average); - convergence = -1.0; - convergence_counter++; - } - } else { - if (converged) { + if (converged) { + if (convergence < 0) { XBT_VERB("current load has converged: %g (%.4g%%)", real_load, 100.0 * load_diff / average); convergence = MSG_get_clock(); - 
convergence_counter--; + local_convergence_counter = opt::exit_on_convergence; + } + if (local_convergence_counter > 0 && --local_convergence_counter == 0) + --convergence_counter; + } else { + if (convergence >= 0.0) { + XBT_VERB("current load has diverged: %g (%.4g%%)", + real_load, 100.0 * load_diff / average); + convergence = -1.0; + if (local_convergence_counter == 0) + ++convergence_counter; } } } diff --git a/process.h b/process.h index aa9ebcb..14d514f 100644 --- a/process.h +++ b/process.h @@ -128,6 +128,7 @@ private: double idle_duration; // how long we had nothing to compute double convergence; // date when convergence was achieved, or -1.0 + int local_convergence_counter; // converged iterations left before counting as converged mutex_t mutex; // synchronization between threads condition_t cond; -- 2.39.5 From c1cbfced478eee0bb6cc64eeb409de67e2366bf8 Mon Sep 17 00:00:00 2001 From: Arnaud Giersch Date: Thu, 1 Mar 2012 08:07:00 +0100 Subject: [PATCH 16/16] ag_parameters: update. --- Experimentations/ag_parameters | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Experimentations/ag_parameters b/Experimentations/ag_parameters index 293d9b4..0e1c625 100644 --- a/Experimentations/ag_parameters +++ b/Experimentations/ag_parameters @@ -36,10 +36,10 @@ NHOSTS=0 LOAD=-1000 # time limit for the simulation -DEADLINE=10000 +DEADLINE=0 # optional: additional arguments for loba (default: empty) -MORE_ARGS=( -l100 -m1e-4 -M10 -D5 -%-2 -g10 ) # optional: path to binary (default: ./loba) #LOBA=./loba -- 2.39.5