From 824d9c5d537916ebb1aebf1cfb6de9ab64484246 Mon Sep 17 00:00:00 2001 From: Martin Klozik Date: Fri, 29 Apr 2016 14:23:36 +0100 Subject: bugfix: Graceful shutdown of VM - improvement Cleanup phase of PVVP scenario sometimes causes server reboot. Following updates were made to prevent reboots: * better generic process termination procedure * ovsdb is terminated after vswitchd termination * vswitchd is terminated directly instead of parent sudo process * already running VNFs are terminated in case of failure during VNF start() Change-Id: Ic09d60d7bfdea01c84a2685ede3d0316f0d09be7 JIRA: VSPERF-271 Signed-off-by: Martin Klozik Reviewed-by: Maryam Tahhan --- conf/02_vswitch.conf | 2 +- core/vnf_controller.py | 9 ++++++-- src/ovs/daemon.py | 17 ++++++++++---- tools/systeminfo.py | 8 +++++++ tools/tasks.py | 63 +++++++++++++++++++++++++++++++++++++++++++------- vnfs/qemu/qemu.py | 28 ++++++++++++++-------- vnfs/vnf/vnf.py | 9 ++++---- vswitches/ovs.py | 2 +- 8 files changed, 108 insertions(+), 30 deletions(-) diff --git a/conf/02_vswitch.conf b/conf/02_vswitch.conf index d36d1786..67f96991 100644 --- a/conf/02_vswitch.conf +++ b/conf/02_vswitch.conf @@ -71,7 +71,7 @@ VHOST_USER_SOCKS = ['/tmp/dpdkvhostuser0', '/tmp/dpdkvhostuser1', # hardware configuration, like cpu numbering and NUMA. VSWITCHD_DPDK_ARGS = ['-c', '0x4', '-n', '4', '--socket-mem 1024,0'] -VSWITCHD_VANILLA_ARGS = ['--pidfile'] +VSWITCHD_VANILLA_ARGS = [] # use full module path to load module matching OVS version built from the source VSWITCH_VANILLA_KERNEL_MODULES = ['libcrc32c', 'ip_tunnel', 'vxlan', 'gre', 'nf_conntrack', 'nf_defrag_ipv4', 'nf_defrag_ipv6', os.path.join(OVS_DIR_VANILLA, 'datapath/linux/openvswitch.ko')] diff --git a/core/vnf_controller.py b/core/vnf_controller.py index 39a63044..8800ccaf 100644 --- a/core/vnf_controller.py +++ b/core/vnf_controller.py @@ -15,6 +15,7 @@ """ import logging +import pexpect from vnfs.vnf.vnf import IVnf class VnfController(object): @@ -68,8 +69,12 @@ class VnfController(object): """ self._logger.debug('start ' + str(len(self._vnfs)) + ' VNF[s] with ' + ' '.join(map(str, self._vnfs))) - for vnf in self._vnfs: - vnf.start() + try: + for vnf in self._vnfs: + vnf.start() + except pexpect.TIMEOUT: + self.stop() + raise def stop(self): """Stops all VNFs set-up by __init__. diff --git a/src/ovs/daemon.py b/src/ovs/daemon.py index 089bc7a4..f9b037b2 100644 --- a/src/ovs/daemon.py +++ b/src/ovs/daemon.py @@ -38,6 +38,7 @@ class VSwitchd(tasks.Process): _ovsdb_pid = None _logfile = _LOG_FILE_VSWITCHD _ovsdb_pidfile_path = os.path.join(settings.getValue('LOG_DIR'), "ovsdb_pidfile.pid") + _vswitchd_pidfile_path = os.path.join(settings.getValue('LOG_DIR'), "vswitchd_pidfile.pid") _proc_name = 'ovs-vswitchd' def __init__(self, timeout=30, vswitchd_args=None, expected_cmd=None): @@ -55,7 +56,10 @@ class VSwitchd(tasks.Process): vswitchd_args = vswitchd_args or [] ovs_vswitchd_bin = os.path.join( settings.getValue('OVS_DIR'), 'vswitchd', 'ovs-vswitchd') - self._cmd = ['sudo', '-E', ovs_vswitchd_bin] + vswitchd_args + sep = ['--'] if '--dpdk' in vswitchd_args else [] + self._cmd = ['sudo', '-E', ovs_vswitchd_bin] + vswitchd_args + sep + \ + ['--pidfile=' + self._vswitchd_pidfile_path, '--overwrite-pidfile', + '--log-file=' + self._logfile] # startup/shutdown @@ -77,15 +81,19 @@ class VSwitchd(tasks.Process): self._kill_ovsdb() raise exc - def kill(self, signal='-15', sleep=2): + def kill(self, signal='-15', sleep=10): """Kill ``ovs-vswitchd`` instance if it is alive. :returns: None """ self._logger.info('Killing ovs-vswitchd...') + with open(self._vswitchd_pidfile_path, "r") as pidfile: + vswitchd_pid = pidfile.read().strip() + tasks.terminate_task(vswitchd_pid, logger=self._logger) - self._kill_ovsdb() + self._kill_ovsdb() # ovsdb must be killed after vswitchd + # just for case, that sudo envelope has not terminated super(VSwitchd, self).kill(signal, sleep) # helper functions @@ -144,8 +152,7 @@ class VSwitchd(tasks.Process): self._logger.info("Killing ovsdb with pid: " + ovsdb_pid) if ovsdb_pid: - tasks.run_task(['sudo', 'kill', '-15', str(ovsdb_pid)], - self._logger, 'Killing ovsdb-server...') + tasks.terminate_task(ovsdb_pid, logger=self._logger) @staticmethod def get_db_sock_path(): diff --git a/tools/systeminfo.py b/tools/systeminfo.py index ba490946..9d8eb5cb 100644 --- a/tools/systeminfo.py +++ b/tools/systeminfo.py @@ -168,6 +168,14 @@ def get_pid(proc_name_str): """ return get_pids([proc_name_str]) +def pid_isalive(pid): + """ Checks if given PID is alive + + :param pid: PID of the process + :returns: True if given process is running, False otherwise + """ + return os.path.isdir('/proc/' + str(pid)) + # This function uses long switch per purpose, so let us suppress pylint warning too-many-branches # pylint: disable=R0912 def get_version(app_name): diff --git a/tools/tasks.py b/tools/tasks.py index 90b7e553..dda5217d 100644 --- a/tools/tasks.py +++ b/tools/tasks.py @@ -26,6 +26,7 @@ import locale import time from conf import settings +from tools import systeminfo CMD_PREFIX = 'cmd : ' @@ -150,6 +151,55 @@ def run_interactive_task(cmd, logger, msg): return child +def terminate_task_subtree(pid, signal='-15', sleep=10, logger=None): + """Terminate given process and all its children + + Function will sent given signal to the process. In case + that process will not terminate within given sleep interval + and signal was not SIGKILL, then process will be killed by SIGKILL. + After that function will check if all children of the process + are terminated and if not the same terminating procedure is applied + on any living child (only one level of children is considered). + + :param pid: Process ID to terminate + :param signal: Signal to be sent to the process + :param sleep: Maximum delay in seconds after signal is sent + :param logger: Logger to write details to + """ + try: + output = subprocess.check_output("pgrep -P " + str(pid), shell=True).decode().rstrip('\n') + except subprocess.CalledProcessError: + output = "" + + terminate_task(pid, signal, sleep, logger) + + # just for case children were kept alive + children = output.split('\n') + for child in children: + terminate_task(child, signal, sleep, logger) + +def terminate_task(pid, signal='-15', sleep=10, logger=None): + """Terminate process with given pid + + Function will sent given signal to the process. In case + that process will not terminate within given sleep interval + and signal was not SIGKILL, then process will be killed by SIGKILL. + + :param pid: Process ID to terminate + :param signal: Signal to be sent to the process + :param sleep: Maximum delay in seconds after signal is sent + :param logger: Logger to write details to + """ + if systeminfo.pid_isalive(pid): + run_task(['sudo', 'kill', signal, str(pid)], logger) + logger.debug('Wait for process %s to terminate after signal %s', pid, signal) + for dummy in range(sleep): + time.sleep(1) + if not systeminfo.pid_isalive(pid): + break + + if signal.lstrip('-').upper() not in ('9', 'KILL', 'SIGKILL') and systeminfo.pid_isalive(pid): + terminate_task(pid, '-9', sleep, logger) class Process(object): """Control an instance of a long-running process. @@ -242,17 +292,14 @@ class Process(object): self.kill() raise exc - def kill(self, signal='-15', sleep=2): + def kill(self, signal='-15', sleep=10): """Kill process instance if it is alive. :param signal: signal to be sent to the process :param sleep: delay in seconds after signal is sent """ - if self._child and self._child.isalive(): - run_task(['sudo', 'kill', signal, str(self._child.pid)], - self._logger) - self._logger.debug('Wait for process to terminate') - time.sleep(sleep) + if self.is_running(): + terminate_task_subtree(self._child.pid, signal, sleep, self._logger) if self.is_relinquished(): self._relinquish_thread.join() @@ -275,7 +322,7 @@ class Process(object): :returns: True if process is running, else False """ - return self._child is not None + return self._child and self._child.isalive() def _affinitize_pid(self, core, pid): """Affinitize a process with ``pid`` to ``core``. @@ -298,7 +345,7 @@ class Process(object): """ self._logger.info('Affinitizing process') - if self._child and self._child.isalive(): + if self.is_running(): self._affinitize_pid(core, self._child.pid) class ContinueReadPrintLoop(threading.Thread): diff --git a/vnfs/qemu/qemu.py b/vnfs/qemu/qemu.py index c735062f..d108dc9a 100644 --- a/vnfs/qemu/qemu.py +++ b/vnfs/qemu/qemu.py @@ -21,6 +21,7 @@ import locale import re import subprocess import time +import pexpect from conf import settings as S from conf import get_test_param @@ -133,15 +134,24 @@ class IVnfQemu(IVnf): """ Stops VNF instance gracefully first. """ - # exit testpmd if needed - if self._guest_loopback == 'testpmd': - self.execute_and_wait('stop', 120, "Done") - self.execute_and_wait('quit', 120, "bye") - - # turn off VM - self.execute_and_wait('poweroff', 120, "Power down") - # VM OS is off, but wait until qemu shutdowns - time.sleep(2) + try: + # exit testpmd if needed + if self._guest_loopback == 'testpmd': + self.execute_and_wait('stop', 120, "Done") + self.execute_and_wait('quit', 120, "bye") + + # turn off VM + self.execute_and_wait('poweroff', 120, "Power down") + + except pexpect.TIMEOUT: + self.kill() + + # wait until qemu shutdowns + self._logger.debug('Wait for QEMU to terminate') + for dummy in range(30): + time.sleep(1) + if not self.is_running(): + break # just for case that graceful shutdown failed super(IVnfQemu, self).stop() diff --git a/vnfs/vnf/vnf.py b/vnfs/vnf/vnf.py index 3dae2733..1410a0c4 100644 --- a/vnfs/vnf/vnf.py +++ b/vnfs/vnf/vnf.py @@ -51,11 +51,12 @@ class IVnf(tasks.Process): """ Stops VNF instance. """ - self._logger.info('Killing VNF...') + if self.is_running(): + self._logger.info('Killing VNF...') - # force termination of VNF and wait for it to terminate; It will avoid - # sporadic reboot of host. (caused by hugepages or DPDK ports) - super(IVnf, self).kill(signal='-9', sleep=10) + # force termination of VNF and wait for it to terminate; It will avoid + # sporadic reboot of host. (caused by hugepages or DPDK ports) + super(IVnf, self).kill(signal='-9', sleep=10) def execute(self, cmd, delay=0): """ diff --git a/vswitches/ovs.py b/vswitches/ovs.py index 06dc7a1a..dd49a1fc 100644 --- a/vswitches/ovs.py +++ b/vswitches/ovs.py @@ -21,7 +21,7 @@ from conf import settings from vswitches.vswitch import IVSwitch from src.ovs import OFBridge, flow_key, flow_match -_VSWITCHD_CONST_ARGS = ['--', '--pidfile', '--log-file'] +_VSWITCHD_CONST_ARGS = [] class IVSwitchOvs(IVSwitch): """Open vSwitch base class implementation -- cgit 1.2.3-korg