From bc691c290cd062a5c929b94d32b92f18ef5d235b Mon Sep 17 00:00:00 2001 From: Arnaud Giersch Date: Tue, 18 Jan 2011 23:26:24 +0100 Subject: [PATCH] Bug fixed: use a timeout on receive. --- TODO | 13 +------------ communicator.cpp | 21 +++++++++++++++++---- communicator.h | 5 +++-- process.cpp | 20 +++++++++++++------- process.h | 6 ++++-- 5 files changed, 38 insertions(+), 27 deletions(-) diff --git a/TODO b/TODO index 3152ad9..2083dab 100644 --- a/TODO +++ b/TODO @@ -1,15 +1,4 @@ -* segfault with ./loba cluster1000.xml -N64 -a fairstrategy - - this is a deadlock occuring when: - - a process is in the finalize stage; - - all processes but one are blocked on receive; - - the process that is still running owns all the remaining load, - and sends it all to the finalizing process, and then goes in - blocking receive. - The finalizing process receives the load, and blocks again, - waiting for a close message. - All processes are then blocked, and non-one is able to see that - there is no more load in the system! +* review receive with timeout. * verify bookkeeping version. diff --git a/communicator.cpp b/communicator.cpp index 8adb258..90cf990 100644 --- a/communicator.cpp +++ b/communicator.cpp @@ -83,13 +83,26 @@ void communicator::send(const char* dest, message* msg) sent_comm.push_back(comm); } -bool communicator::recv(message*& msg, m_host_t& from, bool wait) +bool communicator::recv(message*& msg, m_host_t& from, double timeout) { - if (wait) { + if (timeout != 0) { + volatile double deadline = + timeout > 0 ? MSG_get_clock() + timeout : 0.0; xbt_mutex_acquire(mutex); - while (received.empty()) { + while (received.empty() && (!deadline || deadline > MSG_get_clock())) { + xbt_ex_t e; DEBUG0("waiting for a message to come"); - xbt_cond_wait(cond, mutex); + TRY { + if (deadline) + xbt_cond_timedwait(cond, mutex, deadline - MSG_get_clock()); + else + xbt_cond_wait(cond, mutex); + } + CATCH (e) { + if (e.category != timeout_error) + RETHROW; + xbt_ex_free(e); + } } xbt_mutex_release(mutex); } diff --git a/communicator.h b/communicator.h index f00e063..66dc800 100644 --- a/communicator.h +++ b/communicator.h @@ -34,8 +34,9 @@ public: void send(const char* dest, message* msg); // Try to get a message. Returns true on success. - // If "wait" is true, blocks until success. - bool recv(message*& msg, m_host_t& from, bool wait); + // Parameter "timeout" may be 0 for non-blocking operation, -1 for + // infinite waiting, or any positive timeout. + bool recv(message*& msg, m_host_t& from, double timeout); // Try to flush pending sending communications. // If "wait" is true, blocks until success. diff --git a/process.cpp b/process.cpp index 0544b21..0f53576 100644 --- a/process.cpp +++ b/process.cpp @@ -123,8 +123,14 @@ int process::run() // block on receiving unless there is something to compute or // to send - bool wait = (load == 0 && lb_load() == prev_load_broadcast); - receive(wait); + double timeout; + if (load != 0 || lb_load() != prev_load_broadcast) + timeout = 0.0; + else if (opt::min_iter_duration) + timeout = opt::min_iter_duration; + else + timeout = 1.0; + receive(timeout); // one of our neighbor is finalizing if (opt::exit_on_close && close_received) { @@ -261,13 +267,13 @@ void process::send() comm.flush(false); } -void process::receive(bool wait) +void process::receive(double timeout) { message* msg; m_host_t from; - DEBUG1("%sblocking receive", "\0non-" + !wait); - while (may_receive() && comm.recv(msg, from, wait)) { + DEBUG2("%sblocking receive (%g)", "\0non-" + !timeout, timeout); + while (may_receive() && comm.recv(msg, from, timeout)) { switch (msg->get_type()) { case message::INFO: { neighbor* n = rev_neigh[from]; @@ -294,7 +300,7 @@ void process::receive(bool wait) break; } delete msg; - wait = false; // only wait on first recv + timeout = 0.0; // only wait on first recv } comm.flush(false); } @@ -322,7 +328,7 @@ void process::finalize() (unsigned long )neigh.size(), ESSE(neigh.size())); while (may_receive()) { comm.flush(false); - receive(true); + receive(-1.0); } comm.flush(true); diff --git a/process.h b/process.h index b124593..fb1fe3f 100644 --- a/process.h +++ b/process.h @@ -99,8 +99,10 @@ private: // Returns true if there remains neighbors to listen for bool may_receive() { return ctrl_close_pending || data_close_pending; } - // Receive procedure: wait (or not) for a message to come - void receive(bool wait); + // Receive procedure + // Parameter "timeout" may be 0 for non-blocking operation, -1 for + // infinite waiting, or any positive timeout. + void receive(double timeout); // Finalize sends a "close" message to each neighbor and wait for // all of them to answer. -- 2.39.5