drain runner queue and undo cancel_join_thread

Sometimes the runners can hang. Initially debugging lead to the queue join thread, so I thought we could cancel all the join threads and everything would be okay. But it turns out canceling the queue join threads can lead to corruption of the queues, so when we go to drain the queues the task hangs. But it also turns out that we were not properly draining the queues in the task process. We were waiting for all the runners to exit, then draining the queues. This is bad and will cause the queues to fill up and hang and/or drop data or corrupt the queues. The proper fix seems to be to draining the queues in a loop before calling join with a timeout. Also modified the queue drain loops to no block on queue.get() Revert "cancel all queue join threads" This reverts commit 75c0e3a54b8f6e8fd77c7d9d95decab830159929. Revert "duration runner: add teardown and cancel all queue join threads" This reverts commit 7eb6abb6931b24e085b139cc3500f4497cdde57d. Change-Id: Ic4f8e814cf23615621c1250535967716b425ac18 Signed-off-by: Ross Brattain <ross.b.brattain@intel.com>
author: Ross Brattain <ross.b.brattain@intel.com> 2017-10-01 22:58:45 -0700
committer: Ross Brattain <ross.b.brattain@intel.com> 2017-10-02 02:24:04 -0700
commit: 7826320fa41ad213ac36d369b1e4d71e4cabf176 (patch)
tree: 63bfd4ff4a78d8a3cb3e99eb1ae9a1ad3d48f751 /yardstick/benchmark
parent: a99e33c3d866da0de7be6f3c5ecdf9a33aedd12e (diff)
8 files changed, 96 insertions, 47 deletions
diff --git a/yardstick/benchmark/core/task.py b/yardstick/benchmark/core/task.py
index c175a950b..53298d8d3 100644
--- a/yardstick/benchmark/core/task.py
+++ b/yardstick/benchmark/core/task.py
@@ -260,25 +260,23 @@ class Task(object):     # pragma: no cover
 
             # Wait for runners to finish
             for runner in runners:
-                status = runner_join(runner)
+                status = runner_join(runner, self.outputs, result)
                 if status != 0:
-                    raise RuntimeError
-                self.outputs.update(runner.get_output())
-                result.extend(runner.get_result())
+                    raise RuntimeError(
+                        "{0} runner status {1}".format(runner.__execution_type__, status))
                 LOG.info("Runner ended, output in %s", output_file)
         else:
             # run serially
             for scenario in scenarios:
                 if not _is_background_scenario(scenario):
                     runner = self.run_one_scenario(scenario, output_file)
-                    status = runner_join(runner)
+                    status = runner_join(runner, self.outputs, result)
                     if status != 0:
                         LOG.error('Scenario NO.%s: "%s" ERROR!',
                                   scenarios.index(scenario) + 1,
                                   scenario.get('type'))
-                        raise RuntimeError
-                    self.outputs.update(runner.get_output())
-                    result.extend(runner.get_result())
+                        raise RuntimeError(
+                            "{0} runner status {1}".format(runner.__execution_type__, status))
                     LOG.info("Runner ended, output in %s", output_file)
 
         # Abort background runners
@@ -287,15 +285,13 @@ class Task(object):     # pragma: no cover
 
         # Wait for background runners to finish
         for runner in background_runners:
-            status = runner.join(JOIN_TIMEOUT)
+            status = runner.join(self.outputs, result, JOIN_TIMEOUT)
             if status is None:
                 # Nuke if it did not stop nicely
                 base_runner.Runner.terminate(runner)
-                runner.join(JOIN_TIMEOUT)
+                runner.join(self.outputs, result, JOIN_TIMEOUT)
             base_runner.Runner.release(runner)
 
-            self.outputs.update(runner.get_output())
-            result.extend(runner.get_result())
             print("Background task ended")
         return result
 
@@ -645,9 +641,14 @@ def get_networks_from_nodes(nodes):
     return networks
 
 
-def runner_join(runner):
-    """join (wait for) a runner, exit process at runner failure"""
-    status = runner.join()
+def runner_join(runner, outputs, result):
+    """join (wait for) a runner, exit process at runner failure
+    :param outputs:
+    :type outputs: dict
+    :param result:
+    :type result: list
+    """
+    status = runner.join(outputs, result)
     base_runner.Runner.release(runner)
     return status
 
diff --git a/yardstick/benchmark/runners/arithmetic.py b/yardstick/benchmark/runners/arithmetic.py
index 974fb21b3..3ff064ae1 100755
--- a/yardstick/benchmark/runners/arithmetic.py
+++ b/yardstick/benchmark/runners/arithmetic.py
@@ -46,11 +46,6 @@ def _worker_process(queue, cls, method_name, scenario_cfg,
 
     sequence = 1
 
-    # if we don't do this we can hang waiting for the queue to drain
-    # have to do this in the subprocess
-    queue.cancel_join_thread()
-    output_queue.cancel_join_thread()
-
     runner_cfg = scenario_cfg['runner']
 
     interval = runner_cfg.get("interval", 1)
@@ -143,8 +138,18 @@ def _worker_process(queue, cls, method_name, scenario_cfg,
         if errors and sla_action is None:
             break
 
-    benchmark.teardown()
+    try:
+        benchmark.teardown()
+    except Exception:
+        # catch any exception in teardown and convert to simple exception
+        # never pass exceptions back to multiprocessing, because some exceptions can
+        # be unpicklable
+        # https://bugs.python.org/issue9400
+        LOG.exception("")
+        raise SystemExit(1)
     LOG.info("worker END")
+    LOG.debug("queue.qsize() = %s", queue.qsize())
+    LOG.debug("output_queue.qsize() = %s", output_queue.qsize())
 
 
 class ArithmeticRunner(base.Runner):
diff --git a/yardstick/benchmark/runners/base.py b/yardstick/benchmark/runners/base.py
index 57903ebb9..3ecf67736 100755
--- a/yardstick/benchmark/runners/base.py
+++ b/yardstick/benchmark/runners/base.py
@@ -24,6 +24,9 @@ import subprocess
 import time
 import traceback
 
+
+from six.moves.queue import Empty
+
 import yardstick.common.utils as utils
 from yardstick.benchmark.scenarios import base as base_scenario
 
@@ -47,7 +50,6 @@ def _execute_shell_command(command):
 
 def _single_action(seconds, command, queue):
     """entrypoint for the single action process"""
-    queue.cancel_join_thread()
     log.debug("single action, fires after %d seconds (from now)", seconds)
     time.sleep(seconds)
     log.debug("single action: executing command: '%s'", command)
@@ -62,7 +64,6 @@ def _single_action(seconds, command, queue):
 
 def _periodic_action(interval, command, queue):
     """entrypoint for the periodic action process"""
-    queue.cancel_join_thread()
     log.debug("periodic action, fires every: %d seconds", interval)
     time_spent = 0
     while True:
@@ -118,7 +119,7 @@ class Runner(object):
     @staticmethod
     def terminate_all():
         """Terminate all runners (subprocesses)"""
-        log.debug("Terminating all runners")
+        log.debug("Terminating all runners", exc_info=True)
 
         # release dumper process as some errors before any runner is created
         if not Runner.runners:
@@ -139,9 +140,7 @@ class Runner(object):
         self.config = config
         self.periodic_action_process = None
         self.output_queue = multiprocessing.Queue()
-        self.output_queue.cancel_join_thread()
         self.result_queue = multiprocessing.Queue()
-        self.result_queue.cancel_join_thread()
         self.process = None
         self.aborted = multiprocessing.Event()
         Runner.runners.append(self)
@@ -209,9 +208,21 @@ class Runner(object):
         """Abort the execution of a scenario"""
         self.aborted.set()
 
-    def join(self, timeout=None):
-        self.process.join(timeout)
+    QUEUE_JOIN_INTERVAL = 5
+
+    def join(self, outputs, result, interval=QUEUE_JOIN_INTERVAL):
+        while self.process.exitcode is None:
+            # drain the queue while we are running otherwise we won't terminate
+            outputs.update(self.get_output())
+            result.extend(self.get_result())
+            self.process.join(interval)
+        # drain after the process has exited
+        outputs.update(self.get_output())
+        result.extend(self.get_result())
+
+        self.process.terminate()
         if self.periodic_action_process:
+            self.periodic_action_process.join(1)
             self.periodic_action_process.terminate()
             self.periodic_action_process = None
 
@@ -221,11 +232,19 @@ class Runner(object):
     def get_output(self):
         result = {}
         while not self.output_queue.empty():
-            result.update(self.output_queue.get())
+            log.debug("output_queue size %s", self.output_queue.qsize())
+            try:
+                result.update(self.output_queue.get(True, 1))
+            except Empty:
+                pass
         return result
 
     def get_result(self):
         result = []
         while not self.result_queue.empty():
-            result.append(self.result_queue.get())
+            log.debug("result_queue size %s", self.result_queue.qsize())
+            try:
+                result.append(self.result_queue.get(True, 1))
+            except Empty:
+                pass
         return result
diff --git a/yardstick/benchmark/runners/duration.py b/yardstick/benchmark/runners/duration.py
index 6a09131e1..75942766d 100644
--- a/yardstick/benchmark/runners/duration.py
+++ b/yardstick/benchmark/runners/duration.py
@@ -36,11 +36,6 @@ def _worker_process(queue, cls, method_name, scenario_cfg,
 
     sequence = 1
 
-    # if we don't do this we can hang waiting for the queue to drain
-    # have to do this in the subprocess
-    queue.cancel_join_thread()
-    output_queue.cancel_join_thread()
-
     runner_cfg = scenario_cfg['runner']
 
     interval = runner_cfg.get("interval", 1)
@@ -116,6 +111,9 @@ def _worker_process(queue, cls, method_name, scenario_cfg,
         LOG.exception("")
         raise SystemExit(1)
 
+    LOG.debug("queue.qsize() = %s", queue.qsize())
+    LOG.debug("output_queue.qsize() = %s", output_queue.qsize())
+
 
 class DurationRunner(base.Runner):
     """Run a scenario for a certain amount of time
diff --git a/yardstick/benchmark/runners/dynamictp.py b/yardstick/benchmark/runners/dynamictp.py
index 2f5f7e4f4..01e76c6f4 100755
--- a/yardstick/benchmark/runners/dynamictp.py
+++ b/yardstick/benchmark/runners/dynamictp.py
@@ -33,7 +33,6 @@ LOG = logging.getLogger(__name__)
 def _worker_process(queue, cls, method_name, scenario_cfg,
                     context_cfg, aborted):  # pragma: no cover
 
-    queue.cancel_join_thread()
     runner_cfg = scenario_cfg['runner']
     iterations = runner_cfg.get("iterations", 1)
     interval = runner_cfg.get("interval", 1)
@@ -142,7 +141,17 @@ def _worker_process(queue, cls, method_name, scenario_cfg,
             LOG.debug("iterator: %s iterations: %s", iterator, iterations)
 
     if "teardown" in run_step:
-        benchmark.teardown()
+        try:
+            benchmark.teardown()
+        except Exception:
+            # catch any exception in teardown and convert to simple exception
+            # never pass exceptions back to multiprocessing, because some exceptions can
+            # be unpicklable
+            # https://bugs.python.org/issue9400
+            LOG.exception("")
+            raise SystemExit(1)
+
+    LOG.debug("queue.qsize() = %s", queue.qsize())
 
 
 class IterationRunner(base.Runner):
diff --git a/yardstick/benchmark/runners/iteration.py b/yardstick/benchmark/runners/iteration.py
index 822e67723..4a7439588 100644
--- a/yardstick/benchmark/runners/iteration.py
+++ b/yardstick/benchmark/runners/iteration.py
@@ -36,11 +36,6 @@ def _worker_process(queue, cls, method_name, scenario_cfg,
 
     sequence = 1
 
-    # if we don't do this we can hang waiting for the queue to drain
-    # have to do this in the subprocess
-    queue.cancel_join_thread()
-    output_queue.cancel_join_thread()
-
     runner_cfg = scenario_cfg['runner']
 
     interval = runner_cfg.get("interval", 1)
@@ -95,7 +90,8 @@ def _worker_process(queue, cls, method_name, scenario_cfg,
                 LOG.exception(e)
             else:
                 if result:
-                    output_queue.put(result)
+                    LOG.debug("output_queue.put %s", result)
+                    output_queue.put(result, True, 1)
 
             time.sleep(interval)
 
@@ -106,7 +102,8 @@ def _worker_process(queue, cls, method_name, scenario_cfg,
                 'errors': errors
             }
 
-            queue.put(benchmark_output)
+            LOG.debug("queue.put, %s", benchmark_output)
+            queue.put(benchmark_output, True, 1)
 
             LOG.debug("runner=%(runner)s seq=%(sequence)s END",
                       {"runner": runner_cfg["runner_id"],
@@ -119,7 +116,18 @@ def _worker_process(queue, cls, method_name, scenario_cfg,
                 LOG.info("worker END")
                 break
     if "teardown" in run_step:
-        benchmark.teardown()
+        try:
+            benchmark.teardown()
+        except Exception:
+            # catch any exception in teardown and convert to simple exception
+            # never pass exceptions back to multiprocessing, because some exceptions can
+            # be unpicklable
+            # https://bugs.python.org/issue9400
+            LOG.exception("")
+            raise SystemExit(1)
+
+    LOG.debug("queue.qsize() = %s", queue.qsize())
+    LOG.debug("output_queue.qsize() = %s", output_queue.qsize())
 
 
 class IterationRunner(base.Runner):
diff --git a/yardstick/benchmark/runners/sequence.py b/yardstick/benchmark/runners/sequence.py
index 68e272c57..f08ca5dde 100644
--- a/yardstick/benchmark/runners/sequence.py
+++ b/yardstick/benchmark/runners/sequence.py
@@ -105,8 +105,18 @@ def _worker_process(queue, cls, method_name, scenario_cfg,
         if (errors and sla_action is None) or aborted.is_set():
             break
 
-    benchmark.teardown()
+    try:
+        benchmark.teardown()
+    except Exception:
+        # catch any exception in teardown and convert to simple exception
+        # never pass exceptions back to multiprocessing, because some exceptions can
+        # be unpicklable
+        # https://bugs.python.org/issue9400
+        LOG.exception("")
+        raise SystemExit(1)
     LOG.info("worker END")
+    LOG.debug("queue.qsize() = %s", queue.qsize())
+    LOG.debug("output_queue.qsize() = %s", output_queue.qsize())
 
 
 class SequenceRunner(base.Runner):
diff --git a/yardstick/benchmark/scenarios/availability/monitor/basemonitor.py b/yardstick/benchmark/scenarios/availability/monitor/basemonitor.py
index 871f13f88..a6c1a28bd 100644
--- a/yardstick/benchmark/scenarios/availability/monitor/basemonitor.py
+++ b/yardstick/benchmark/scenarios/availability/monitor/basemonitor.py
@@ -90,7 +90,6 @@ class BaseMonitor(multiprocessing.Process):
         self._config = config
         self._context = context
         self._queue = multiprocessing.Queue()
-        self._queue.cancel_join_thread()
         self._event = multiprocessing.Event()
         self.monitor_data = data
         self.setup_done = False
author	Ross Brattain <ross.b.brattain@intel.com>	2017-10-01 22:58:45 -0700
committer	Ross Brattain <ross.b.brattain@intel.com>	2017-10-02 02:24:04 -0700
commit	7826320fa41ad213ac36d369b1e4d71e4cabf176 (patch)
tree	63bfd4ff4a78d8a3cb3e99eb1ae9a1ad3d48f751 /yardstick/benchmark
parent	a99e33c3d866da0de7be6f3c5ecdf9a33aedd12e (diff)