From: Martin Quinson Date: Sun, 25 Aug 2019 19:41:40 +0000 (+0200) Subject: tesh kill timeouted processes with KILL also X-Git-Tag: v3.24~143 X-Git-Url: http://bilbo.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/commitdiff_plain/b18431ee82f850d5f58e684b4aa97f30e37a6552 tesh kill timeouted processes with KILL also It seems that the sanitizers processes are surviving their timeouts on the server, so this may help fixing the build robots. On the way, I switched to save the PID of the forked process instead of its PGID. This helps when I try to see if the process is still alive, later on: I can always do PID->PGID later (as I do now) while it seems impossible to do PGID->PID to see whether the process is still alive. Fun fact: I think that PID=PGID on POSIX systems, but let's play safe :) --- diff --git a/tools/tesh/tesh.py b/tools/tesh/tesh.py index d62cd7f921..f9ef1d4775 100755 --- a/tools/tesh/tesh.py +++ b/tools/tesh/tesh.py @@ -33,6 +33,7 @@ import re import difflib import signal import argparse +import time if sys.version_info[0] == 3: import subprocess @@ -115,27 +116,47 @@ except NameError: # # Global variable. Stores which process group should be killed (or None otherwise) -running_pgids = list() +running_pids = list() -def kill_process_group(pgid): - if pgid is None: # Nobody to kill. We don't know who to kill on windows, or we don't have anyone to kill on signal handler +# Tests whether the process is dead already +def process_is_dead(pid): + try: + os.kill(pid, 0) + except ProcessLookupError: + return True + except OSError as err: + if err.errno == errno.ESRCH: # ESRCH == No such process. The process is now dead + return True + return False + +# This function send TERM signal + KILL signal after 0.2s to the group of the specified process +def kill_process_group(pid): + if pid is None: # Nobody to kill. We don't know who to kill on windows, or we don't have anyone to kill on signal handler return - # print("Kill process group {}".format(pgid)) + try: + pgid = os.getpgid(pid) + except: + # os.getpgid failed. Ok, don't cleanup. + return + try: os.killpg(pgid, signal.SIGTERM) + if process_is_dead(pid): + return + time.sleep(0.2) + os.killpg(pgid, signal.SIGKILL) except OSError: # os.killpg failed. OK. Some subprocesses may still be running. pass - def signal_handler(signal, frame): print("Caught signal {}".format(SIGNALS_TO_NAMES_DICT[signal])) - global running_pgids - running_pgids_copy = running_pgids # Just in case of interthread conflicts. - for pgid in running_pgids_copy: - kill_process_group(pgid) - running_pgids.clear() + global running_pids + running_pids_copy = running_pids # Just in case of interthread conflicts. + for pid in running_pids_copy: + kill_process_group(pid) + running_pids.clear() tesh_exit(5) @@ -328,8 +349,8 @@ class Cmd(object): args = shlex.split(self.args) - global running_pgids - local_pgid = None + global running_pids + local_pid = None global return_code try: @@ -344,13 +365,9 @@ class Cmd(object): stderr=subprocess.STDOUT, universal_newlines=True, preexec_fn=preexec_function) - try: - if not isWindows(): - local_pgid = os.getpgid(proc.pid) - running_pgids.append(local_pgid) - except OSError: - # os.getpgid failed. OK. No cleanup. - pass + if not isWindows(): + local_pid = proc.pid + running_pids.append(local_pid) except PermissionError: logs.append("[{file}:{number}] Cannot start '{cmd}': The binary is not executable.".format( file=FileReader().filename, number=self.linenumber, cmd=args[0])) @@ -381,14 +398,14 @@ class Cmd(object): cmdName = FileReader().filename + ":" + str(self.linenumber) try: (stdout_data, stderr_data) = proc.communicate("\n".join(self.input_pipe), self.timeout) - local_pgid = None + local_pid = None timeout_reached = False except subprocess.TimeoutExpired: timeout_reached = True logs.append("Test suite `{file}': NOK (<{cmd}> timeout after {timeout} sec)".format( file=FileReader().filename, cmd=cmdName, timeout=self.timeout)) - running_pgids.remove(local_pgid) - kill_process_group(local_pgid) + running_pids.remove(local_pid) + kill_process_group(local_pid) # Try to get the output of the timeout process, to help in debugging. try: (stdout_data, stderr_data) = proc.communicate(timeout=1)