From 28a30c9d25c9555dd21935653d2386653b0f3fe8 Mon Sep 17 00:00:00 2001 From: Arnaud Giersch Date: Mon, 3 Oct 2011 17:35:25 +0200 Subject: [PATCH 1/1] Add statistics about convergence (see parameter -%). --- main.cpp | 8 ++++++++ options.cpp | 9 +++++++++ options.h | 1 + process.cpp | 33 +++++++++++++++++++++++++++++++++ process.h | 6 ++++++ 5 files changed, 57 insertions(+) diff --git a/main.cpp b/main.cpp index 9ad4b53..12f1ae2 100644 --- a/main.cpp +++ b/main.cpp @@ -61,6 +61,7 @@ namespace { statistics ctrl_recv_amount; statistics ctrl_send_count; statistics ctrl_recv_count; + statistics convergence; } @@ -90,6 +91,9 @@ static int simulation_main(int argc, char* argv[]) ctrl_recv_amount.push(proc->get_ctrl_recv_amount()); ctrl_send_count.push(proc->get_ctrl_send_count()); ctrl_recv_count.push(proc->get_ctrl_recv_count()); + double c = proc->get_convergence(); + if (c != -1) + convergence.push(c); // Synchronization barrier... // The goal is to circumvent a limitation in SimGrid (at least @@ -347,6 +351,10 @@ int main(int argc, char* argv[]) "percent of the load average"); PR_VALUE("Data transfer amount", "%g %s", transfer_amount, "times the total amount of data"); + PR_VALUE("Number of hosts that converged", "%u / %u", + convergence.get_count(), loads.get_count()); + PR_VALUE("Date of first host convergence", "%g", convergence.get_min()); + PR_VALUE("Date of last host convergence", "%g", convergence.get_max()); XBT_INFO("`----"); } diff --git a/options.cpp b/options.cpp index 8027463..01ae447 100644 --- a/options.cpp +++ b/options.cpp @@ -76,6 +76,7 @@ namespace opt { double comp_time_delay = 0.0; // fixme: find better defaults // Parameters for the end of the simulation + double avg_load_ratio = 0.0; unsigned lb_maxiter = 0; unsigned comp_maxiter = 0; double time_limit = 0; @@ -231,9 +232,13 @@ bool opt::parse_args(int* argc, char* argv[]) int c; opterr = 0; while ((c = getopt(*argc, argv, + "%:" "a:bc:C:d:D:eEhi:I:k:l:L:m:M:N:r:Rs:S:t:T:vVx:X:Z")) != -1) { switch (c) { + case '%': + PARSE_ARG(opt::avg_load_ratio); + break; case 'a': opt::loba_algo = optarg; result = opt_helper::nol_find_prefix(opt::loba_algorithms, @@ -443,6 +448,7 @@ void opt::print() h.val_or_string(lb_maxiter, "no limit")); DESCR("maximum number of comp. iterations", "%s", h.val_or_string(comp_maxiter, "no limit")); + DESCR("convergence is assumed within (\%)", "%g", opt::avg_load_ratio); DESCR("time limit", "%s", h.val_or_string(time_limit, "no limit")); DESCR("exit on close", "%s", h.on_off(exit_on_close)); XBT_INFO("`----"); @@ -562,6 +568,9 @@ void opt::usage() std::clog << o("-I value") << "maximum number of comp. iterations, 0 for no limit" << " [" << opt::comp_maxiter << "]\n"; + std::clog << o("-% value") + << "percent of the load average to assume the convergence" + << " [" << opt::avg_load_ratio << "]\n"; std::clog << o("-t value") << "time limit (simulated time), 0 for no limit" << " [" << opt::time_limit << "]\n"; diff --git a/options.h b/options.h index ef7a73e..264a566 100644 --- a/options.h +++ b/options.h @@ -59,6 +59,7 @@ namespace opt { extern double comp_time_delay; // Parameters for the end of the simulation + extern double avg_load_ratio; extern unsigned lb_maxiter; extern unsigned comp_maxiter; extern double time_limit; diff --git a/process.cpp b/process.cpp index 07372dc..76524fd 100644 --- a/process.cpp +++ b/process.cpp @@ -20,6 +20,9 @@ double process::total_load_init = 0.0; double process::total_load_running = 0.0; double process::total_load_exit = 0.0; +int process::process_counter = 0; +double process::total_load_average; + namespace { void sleep_until_date(double& date, double duration) @@ -54,12 +57,20 @@ process::process(int argc, char* argv[]) rev_neigh.insert(std::make_pair(host, ptr)); } + // Note: there should not be race condition with the current + // version of Simgrid, when updating the global variables. + prev_load_broadcast = -1; // force sending of load on first send_all() expected_load = real_load; total_load_running += real_load; total_load_init += real_load; received_load = 0.0; + convergence = -1.0; + + process_counter++; + total_load_average = total_load_running / process_counter; + ctrl_close_pending = data_close_pending = neigh.size(); close_received = false; finalizing = false; @@ -95,6 +106,12 @@ process::~process() return; XBT_INFO("Final load after %d:%d:%d iterations: %g", lb_iter, comp_iter, all_comp_iter, real_load); + if (convergence >= 0.0) + XBT_INFO("Convergence within %g%% was achieved at time %g", + opt::avg_load_ratio, convergence); + else + XBT_INFO("Convergence within %g%% was not achieved", + opt::avg_load_ratio); XBT_VERB("Expected load was: %g", expected_load); XBT_VERB("Total computation for this process: %g", get_comp_amount()); print_loads(true, xbt_log_priority_debug); @@ -199,6 +216,22 @@ void process::compute_loop() if (real_load == 0.0) continue; + double load_ratio = + 100.0 * std::fabs(real_load / total_load_average - 1.0); + if (convergence >= 0.0) { + if (load_ratio > opt::avg_load_ratio) { + XBT_VERB("current load has diverged: %g (%.4g%%)", + real_load, load_ratio); + convergence = -1.0; + } + } else { + if (load_ratio <= opt::avg_load_ratio) { + XBT_VERB("current load has converged: %g (%.4g%%)", + real_load, load_ratio); + convergence = MSG_get_clock(); + } + } + // compute ++comp_iter; double flops = opt::comp_cost(real_load); diff --git a/process.h b/process.h index 9ea0521..ff8b751 100644 --- a/process.h +++ b/process.h @@ -44,6 +44,7 @@ public: double get_ctrl_recv_amount() const { return acc.ctrl_recv.amount; } unsigned get_ctrl_send_count() const { return acc.ctrl_send.count; } unsigned get_ctrl_recv_count() const { return acc.ctrl_recv.count; } + double get_convergence() const { return convergence; } int run(); @@ -84,6 +85,9 @@ private: static double total_load_running; // sum of loads while running static double total_load_exit; // sum of process loads at exit + static int process_counter; + static double total_load_average; + typedef MAP_TEMPLATE rev_neigh_type; neigh_type neigh; // list of neighbors (do not alter // after construction!) @@ -108,6 +112,8 @@ private: double expected_load; // expected load in bookkeeping mode double received_load; // load received from neighbors + double convergence; // date when convergence was achieved, or -1.0 + mutex_t mutex; // synchronization between threads condition_t cond; -- 2.39.5