From 631c3f99f0e499edd1adcbc7550dd3d0de393e31 Mon Sep 17 00:00:00 2001 From: dongwenjuan Date: Tue, 20 Mar 2018 15:13:14 +0800 Subject: optimize the fault notification test Change-Id: I47572b4515049c49f9aa04bf7984757779a605f3 Signed-off-by: dongwenjuan --- doctor_tests/consumer/base.py | 9 ++ doctor_tests/consumer/sample.py | 4 +- doctor_tests/inspector/base.py | 18 +++ doctor_tests/inspector/sample.py | 9 +- doctor_tests/main.py | 148 ++------------------ doctor_tests/monitor/base.py | 9 ++ doctor_tests/monitor/sample.py | 4 +- doctor_tests/scenario/common.py | 26 ---- doctor_tests/scenario/fault_management.py | 219 ++++++++++++++++++++++++++++++ doctor_tests/scenario/network_failure.py | 84 ------------ 10 files changed, 277 insertions(+), 253 deletions(-) delete mode 100644 doctor_tests/scenario/common.py create mode 100644 doctor_tests/scenario/fault_management.py delete mode 100644 doctor_tests/scenario/network_failure.py diff --git a/doctor_tests/consumer/base.py b/doctor_tests/consumer/base.py index b6c4b34e..e21b0802 100644 --- a/doctor_tests/consumer/base.py +++ b/doctor_tests/consumer/base.py @@ -16,6 +16,15 @@ class BaseConsumer(object): def __init__(self, conf, log): self.conf = conf self.log = log + self._notified_time = None + + @property + def notified_time(self): + return self._notified_time + + @notified_time.setter + def notified_time(self, notified_time): + self._notified_time = notified_time @abc.abstractmethod def start(self): diff --git a/doctor_tests/consumer/sample.py b/doctor_tests/consumer/sample.py index eaf5fabd..c7dcf4f9 100644 --- a/doctor_tests/consumer/sample.py +++ b/doctor_tests/consumer/sample.py @@ -54,7 +54,9 @@ class ConsumerApp(Thread): @app.route('/failure', methods=['POST']) def event_posted(): - self.log.info('doctor consumer notified at %s' % time.time()) + notified_time = time.time() + self.log.info('doctor consumer notified at %s' % notified_time) + self.consumer.notified_time = notified_time data = json.loads(request.data.decode('utf8')) self.log.info('sample consumer received data = %s' % data) return 'OK' diff --git a/doctor_tests/inspector/base.py b/doctor_tests/inspector/base.py index a6eae451..967dc9ba 100644 --- a/doctor_tests/inspector/base.py +++ b/doctor_tests/inspector/base.py @@ -16,6 +16,24 @@ class BaseInspector(object): def __init__(self, conf, log): self.conf = conf self.log = log + self._host_down_time = None + self._vm_down_time = None + + @property + def host_down_time(self): + return self._host_down_time + + @host_down_time.setter + def host_down_time(self, host_down_time): + self._host_down_time = host_down_time + + @property + def vm_down_time(self): + return self._vm_down_time + + @vm_down_time.setter + def vm_down_time(self, vm_down_time): + self._vm_down_time = vm_down_time @abc.abstractmethod def get_inspector_url(self): diff --git a/doctor_tests/inspector/sample.py b/doctor_tests/inspector/sample.py index abc4fc6d..7742373d 100644 --- a/doctor_tests/inspector/sample.py +++ b/doctor_tests/inspector/sample.py @@ -108,8 +108,11 @@ class SampleInspector(BaseInspector): @utils.run_async def _disable_compute_host(self, hostname): self.nova.services.force_down(hostname, 'nova-compute', True) + + hostdown_time = time.time() + self.host_down_time = hostdown_time self.log.info('doctor mark host(%s) down at %s' - % (hostname, time.time())) + % (hostname, hostdown_time)) @utils.run_async def _vms_reset_state(self, state, hostname): @@ -117,8 +120,10 @@ class SampleInspector(BaseInspector): @utils.run_async def _vm_reset_state(nova, server, state): nova.servers.reset_state(server, state) + vmdown_time = time.time() + self.vm_down_time = vmdown_time self.log.info('doctor mark vm(%s) error at %s' - % (server, time.time())) + % (server, vmdown_time)) thrs = [] for nova, server in zip(self.novaclients, self.servers[hostname]): diff --git a/doctor_tests/main.py b/doctor_tests/main.py index 3f54fc32..61facb61 100644 --- a/doctor_tests/main.py +++ b/doctor_tests/main.py @@ -8,28 +8,17 @@ ############################################################################## import os from os.path import isfile, join -import random import sys import time -from doctor_tests.alarm import Alarm -from doctor_tests.common.constants import Host -from doctor_tests.common.utils import match_rep_in_file from doctor_tests import config -from doctor_tests.consumer import get_consumer from doctor_tests.identity_auth import get_identity_auth from doctor_tests.identity_auth import get_session from doctor_tests.image import Image -from doctor_tests.instance import Instance -from doctor_tests.inspector import get_inspector from doctor_tests.installer import get_installer import doctor_tests.logger as doctor_log -from doctor_tests.network import Network -from doctor_tests.monitor import get_monitor from doctor_tests.os_clients import nova_client -from doctor_tests.profiler_poc import main as profiler_main -from doctor_tests.scenario.common import calculate_notification_time -from doctor_tests.scenario.network_failure import NetworkFault +from doctor_tests.scenario.fault_management import FaultManagement from doctor_tests.user import User @@ -44,20 +33,10 @@ class DoctorTest(object): self.conf = conf self.image = Image(self.conf, LOG) self.user = User(self.conf, LOG) - self.network = Network(self.conf, LOG) - self.instance = Instance(self.conf, LOG) - self.alarm = Alarm(self.conf, LOG) self.installer = get_installer(self.conf, LOG) - self.inspector = get_inspector(self.conf, LOG) - self.monitor = get_monitor(self.conf, - self.inspector.get_inspector_url(), - LOG) - self.consumer = get_consumer(self.conf, LOG) - self.fault = NetworkFault(self.conf, self.installer, LOG) auth = get_identity_auth(project=self.conf.doctor_project) self.nova = nova_client(self.conf.nova_version, get_session(auth=auth)) - self.down_host = None def setup(self): # prepare the cloud env @@ -69,34 +48,15 @@ class DoctorTest(object): # creating test user... self.user.create() - def setup_fault_management(self): - # user settings... - self.user.update_quota() - - # creating VM... - self.network.create() - self.instance.create() - self.instance.wait_for_vm_launch() - - # creating alarm... - self.alarm.create() - - # starting doctor sample components... - # tbd tojuvone: move inspector and consumer to common setup - # when they support updating VMs via instance.create and - # instance.delete alarm - - self.inspector.start() - self.consumer.start() - self.down_host = self.get_host_info_for_random_vm() - self.monitor.start(self.down_host) - def test_fault_management(self): try: LOG.info('doctor fault management test starting.......') + self.fault_management = \ + FaultManagement(self.conf, self.installer, self.user, LOG) + # prepare test env - self.setup_fault_management() + self.fault_management.setup() # wait for aodh alarms are updated in caches for event evaluator, # sleep time should be larger than event_alarm_cache_ttl @@ -106,34 +66,20 @@ class DoctorTest(object): # injecting host failure... # NOTE (umar) add INTERFACE_NAME logic to host injection - - self.fault.start(self.down_host) + self.fault_management.start() time.sleep(10) # verify the test results # NOTE (umar) copy remote monitor.log file when monitor=collectd - self.check_host_status(self.down_host.name, 'down') - - notification_time = calculate_notification_time(LogFile) - if notification_time < 1 and notification_time > 0: - LOG.info('doctor fault management test successfully, ' - 'notification_time=%s' % notification_time) - else: - LOG.error('doctor fault management test failed, ' - 'notification_time=%s' % notification_time) - sys.exit(1) + self.fault_management.check_host_status('down') + self.fault_management.check_notification_time() - if self.conf.profiler_type: - LOG.info('doctor fault management test begin to run ' - 'profile.......') - self.collect_logs() - self.run_profiler() except Exception as e: LOG.error('doctor fault management test failed, ' 'Exception=%s' % e) sys.exit(1) finally: - self.cleanup_fault_management() + self.fault_management.cleanup() def _amount_compute_nodes(self): services = self.nova.services.list(binary='nova-compute') @@ -177,82 +123,6 @@ class DoctorTest(object): finally: self.cleanup() - def get_host_info_for_random_vm(self): - num = random.randint(0, self.conf.instance_count - 1) - vm_name = "%s%d" % (self.conf.instance_basename, num) - - servers = {getattr(server, 'name'): server - for server in self.nova.servers.list()} - server = servers.get(vm_name) - if not server: - raise Exception('Can not find instance: vm_name(%s)' % vm_name) - host_name = server.__dict__.get('OS-EXT-SRV-ATTR:hypervisor_hostname') - host_ip = self.installer.get_host_ip_from_hostname(host_name) - - LOG.info('Get host info(name:%s, ip:%s) which vm(%s) launched at' - % (host_name, host_ip, vm_name)) - return Host(host_name, host_ip) - - def check_host_status(self, hostname, state): - service = self.nova.services.list(host=hostname, - binary='nova-compute') - host_state = service[0].__dict__.get('state') - assert host_state == state - - def unset_forced_down_hosts(self): - if self.down_host: - self.nova.services.force_down(self.down_host.name, - 'nova-compute', False) - time.sleep(2) - self.check_host_status(self.down_host.name, 'up') - - def collect_logs(self): - self.fault.get_disable_network_log() - - def run_profiler(self): - - net_down_log_file = self.fault.get_disable_network_log() - reg = '(?<=doctor set link down at )\d+.\d+' - linkdown = float(match_rep_in_file(reg, net_down_log_file).group(0)) - - reg = '(.* doctor mark vm.* error at )(\d+.\d+)' - vmdown = float(match_rep_in_file(reg, LogFile).group(2)) - - reg = '(.* doctor mark host.* down at )(\d+.\d+)' - hostdown = float(match_rep_in_file(reg, LogFile).group(2)) - - reg = '(?<=doctor monitor detected at )\d+.\d+' - detected = float(match_rep_in_file(reg, LogFile).group(0)) - - reg = '(?<=doctor consumer notified at )\d+.\d+' - notified = float(match_rep_in_file(reg, LogFile).group(0)) - - # TODO(yujunz) check the actual delay to verify time sync status - # expected ~1s delay from $trigger to $linkdown - relative_start = linkdown - os.environ['DOCTOR_PROFILER_T00'] = ( - str(int((linkdown - relative_start) * 1000))) - os.environ['DOCTOR_PROFILER_T01'] = ( - str(int((detected - relative_start) * 1000))) - os.environ['DOCTOR_PROFILER_T03'] = ( - str(int((vmdown - relative_start) * 1000))) - os.environ['DOCTOR_PROFILER_T04'] = ( - str(int((hostdown - relative_start) * 1000))) - os.environ['DOCTOR_PROFILER_T09'] = ( - str(int((notified - relative_start) * 1000))) - - profiler_main(log=LOG) - - def cleanup_fault_management(self): - self.unset_forced_down_hosts() - self.inspector.stop() - self.monitor.stop() - self.consumer.stop() - self.alarm.delete() - self.instance.delete() - self.network.delete() - self.fault.cleanup() - def cleanup(self): self.installer.cleanup() self.image.delete() diff --git a/doctor_tests/monitor/base.py b/doctor_tests/monitor/base.py index 119c8a1c..c2341225 100644 --- a/doctor_tests/monitor/base.py +++ b/doctor_tests/monitor/base.py @@ -17,6 +17,15 @@ class BaseMonitor(object): self.conf = conf self.log = log self.inspector_url = inspector_url + self._detected_time = None + + @property + def detected_time(self): + return self._detected_time + + @detected_time.setter + def detected_time(self, detected_time): + self._detected_time = detected_time @abc.abstractmethod def start(self, host): diff --git a/doctor_tests/monitor/sample.py b/doctor_tests/monitor/sample.py index c207cd9f..9b21750b 100644 --- a/doctor_tests/monitor/sample.py +++ b/doctor_tests/monitor/sample.py @@ -94,7 +94,9 @@ class Pinger(Thread): sock.sendto(self.ICMP_ECHO_MESSAGE, (self.ip_addr, 0)) sock.recv(4096) except socket.timeout: - self.log.info("doctor monitor detected at %s" % time.time()) + detected_time = time.time() + self.log.info("doctor monitor detected at %s" % detected_time) + self.monitor.detected_time = detected_time self.monitor.report_error(self.hostname) self.log.info("ping timeout, quit monitoring...") self._stopped = True diff --git a/doctor_tests/scenario/common.py b/doctor_tests/scenario/common.py deleted file mode 100644 index a7240c00..00000000 --- a/doctor_tests/scenario/common.py +++ /dev/null @@ -1,26 +0,0 @@ -############################################################################## -# Copyright (c) 2017 ZTE Corporation and others. -# -# All rights reserved. This program and the accompanying materials -# are made available under the terms of the Apache License, Version 2.0 -# which accompanies this distribution, and is available at -# http://www.apache.org/licenses/LICENSE-2.0 -############################################################################## -from doctor_tests.common.utils import match_rep_in_file - - -def calculate_notification_time(log_file): - - reg = '(?<=doctor monitor detected at )\d+.\d+' - result = match_rep_in_file(reg, log_file) - if not result: - raise Exception('Can not match detected time') - detected = result.group(0) - - reg = '(?<=doctor consumer notified at )\d+.\d+' - result = match_rep_in_file(reg, log_file) - if not result: - raise Exception('Can not match notified time') - notified = result.group(0) - - return float(notified) - float(detected) diff --git a/doctor_tests/scenario/fault_management.py b/doctor_tests/scenario/fault_management.py new file mode 100644 index 00000000..b1fe8099 --- /dev/null +++ b/doctor_tests/scenario/fault_management.py @@ -0,0 +1,219 @@ +############################################################################## +# Copyright (c) 2017 ZTE Corporation and others. +# +# All rights reserved. This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +############################################################################## +import os +import random +import time + +from doctor_tests.alarm import Alarm +from doctor_tests.common.constants import Host +from doctor_tests.common.utils import get_doctor_test_root_dir +from doctor_tests.common.utils import match_rep_in_file +from doctor_tests.common.utils import SSHClient +from doctor_tests.consumer import get_consumer +from doctor_tests.identity_auth import get_identity_auth +from doctor_tests.identity_auth import get_session +from doctor_tests.instance import Instance +from doctor_tests.inspector import get_inspector +from doctor_tests.monitor import get_monitor +from doctor_tests.network import Network +from doctor_tests.profiler_poc import main as profiler_main +from doctor_tests.os_clients import nova_client + + +LINK_DOWN_SCRIPT = """ +#!/bin/bash -x +dev=$(sudo ip a | awk '/ {compute_ip}\//{{print $NF}}') +sleep 1 +sudo ip link set $dev down +echo "doctor set link down at" $(date "+%s.%N") +sleep 10 +sudo ip link set $dev up +sleep 1 +""" + + +class FaultManagement(object): + + def __init__(self, conf, installer, user, log): + self.conf = conf + self.log = log + self.user = user + self.installer = installer + auth = get_identity_auth(project=self.conf.doctor_project) + self.nova = nova_client(self.conf.nova_version, + get_session(auth=auth)) + self.test_dir = get_doctor_test_root_dir() + self.down_host = None + self.GetLog = False + self.disable_network_log = None + self.network = Network(self.conf, log) + self.instance = Instance(self.conf, log) + self.alarm = Alarm(self.conf, log) + self.inspector = get_inspector(self.conf, log) + self.monitor = get_monitor(self.conf, + self.inspector.get_inspector_url(), + log) + self.consumer = get_consumer(self.conf, log) + + def setup(self): + self.log.info('fault management setup......') + + # user settings... + self.user.update_quota() + + # creating VM... + self.network.create() + self.instance.create() + self.instance.wait_for_vm_launch() + + # creating alarm... + self.alarm.create() + + # starting doctor sample components... + # tbd tojuvone: move inspector and consumer to common setup + # when they support updating VMs via instance.create and + # instance.delete alarm + + self.inspector.start() + self.consumer.start() + self.down_host = self.get_host_info_for_random_vm() + self.monitor.start(self.down_host) + + def start(self): + self.log.info('fault management start......') + self._set_link_down(self.down_host.ip) + self.log.info('fault management end......') + + def cleanup(self): + self.log.info('fault management cleanup......') + + self.get_disable_network_log() + self.unset_forced_down_hosts() + self.inspector.stop() + self.monitor.stop() + self.consumer.stop() + self.alarm.delete() + self.instance.delete() + self.network.delete() + + def get_host_info_for_random_vm(self): + num = random.randint(0, self.conf.instance_count - 1) + vm_name = "%s%d" % (self.conf.instance_basename, num) + + servers = {getattr(server, 'name'): server + for server in self.nova.servers.list()} + server = servers.get(vm_name) + if not server: + raise Exception('Can not find instance: vm_name(%s)' % vm_name) + host_name = server.__dict__.get('OS-EXT-SRV-ATTR:hypervisor_hostname') + host_ip = self.installer.get_host_ip_from_hostname(host_name) + + self.log.info('Get host info(name:%s, ip:%s) which vm(%s) launched at' + % (host_name, host_ip, vm_name)) + return Host(host_name, host_ip) + + def unset_forced_down_hosts(self): + if self.down_host: + self.nova.services.force_down(self.down_host.name, + 'nova-compute', False) + time.sleep(2) + self.check_host_status('up') + + def check_host_status(self, state): + service = self.nova.services.list(host=self.down_host.name, + binary='nova-compute') + host_state = service[0].__dict__.get('state') + assert host_state == state + + def get_disable_network_log(self): + if self.GetLog: + self.log.info('Already get the disable_netork.log ' + 'from down_host......') + return self.disable_network_log + if self.down_host is not None: + client = SSHClient( + self.down_host.ip, + self.installer.node_user_name, + key_filename=self.installer.get_ssh_key_from_installer(), + look_for_keys=True, + log=self.log) + + self.disable_network_log = \ + '{0}/{1}'.format(self.test_dir, + 'disable_network.log') + client.scp('disable_network.log', + self.disable_network_log, + method='get') + self.log.info('Get the disable_netork.log from' + 'down_host(host_name:%s, host_ip:%s)' + % (self.down_host.name, self.down_host.ip)) + self.GetLog = True + return self.disable_network_log + + def _set_link_down(self, compute_ip): + file_name = '{0}/{1}'.format(self.test_dir, 'disable_network.sh') + with open(file_name, 'w') as file: + file.write(LINK_DOWN_SCRIPT.format(compute_ip=compute_ip)) + client = SSHClient( + compute_ip, + self.installer.node_user_name, + key_filename=self.installer.get_ssh_key_from_installer(), + look_for_keys=True, + log=self.log) + client.scp(file_name, 'disable_network.sh') + command = 'bash disable_network.sh > disable_network.log 2>&1 &' + client.ssh(command) + + def check_notification_time(self): + if self.consumer.notified_time is None \ + or self.monitor.detected_time is None: + raise Exception('doctor fault management test failed, ' + 'detected_time=%s, notified_time=%s' + % (self.monitor.detected_time, + self.consumer.notified_time)) + notification_time = \ + self.consumer.notified_time - \ + self.monitor.detected_time + if notification_time < 1 and notification_time > 0: + self.log.info('doctor fault management test successfully,' + 'notification_time=%s' % notification_time) + else: + raise Exception('doctor fault management test failed, ' + 'notification_time=%s' % notification_time) + + if self.conf.profiler_type: + self.log.info('run doctor fault management profile.......') + self.run_profiler() + + def run_profiler(self): + + net_down_log_file = self.get_disable_network_log() + reg = '(?<=doctor set link down at )\d+.\d+' + linkdown = float(match_rep_in_file(reg, net_down_log_file).group(0)) + + vmdown = self.inspector.vm_down_time + hostdown = self.inspector.host_down_time + detected = self.monitor.detected_time + notified = self.consumer.notified_time + + # TODO(yujunz) check the actual delay to verify time sync status + # expected ~1s delay from $trigger to $linkdown + relative_start = linkdown + os.environ['DOCTOR_PROFILER_T00'] = ( + str(int((linkdown - relative_start) * 1000))) + os.environ['DOCTOR_PROFILER_T01'] = ( + str(int((detected - relative_start) * 1000))) + os.environ['DOCTOR_PROFILER_T03'] = ( + str(int((vmdown - relative_start) * 1000))) + os.environ['DOCTOR_PROFILER_T04'] = ( + str(int((hostdown - relative_start) * 1000))) + os.environ['DOCTOR_PROFILER_T09'] = ( + str(int((notified - relative_start) * 1000))) + + profiler_main(log=self.log) diff --git a/doctor_tests/scenario/network_failure.py b/doctor_tests/scenario/network_failure.py deleted file mode 100644 index b55440ed..00000000 --- a/doctor_tests/scenario/network_failure.py +++ /dev/null @@ -1,84 +0,0 @@ -############################################################################## -# Copyright (c) 2017 ZTE Corporation and others. -# -# All rights reserved. This program and the accompanying materials -# are made available under the terms of the Apache License, Version 2.0 -# which accompanies this distribution, and is available at -# http://www.apache.org/licenses/LICENSE-2.0 -############################################################################## -from doctor_tests.identity_auth import get_session -from doctor_tests.os_clients import nova_client -from doctor_tests.common.utils import get_doctor_test_root_dir -from doctor_tests.common.utils import SSHClient - -LINK_DOWN_SCRIPT = """ -#!/bin/bash -x -dev=$(sudo ip a | awk '/ {compute_ip}\//{{print $NF}}') -sleep 1 -sudo ip link set $dev down -echo "doctor set link down at" $(date "+%s.%N") -sleep 10 -sudo ip link set $dev up -sleep 1 -""" - - -class NetworkFault(object): - - def __init__(self, conf, installer, log): - self.conf = conf - self.log = log - self.installer = installer - self.nova = nova_client(self.conf.nova_version, get_session()) - self.test_dir = get_doctor_test_root_dir() - self.host = None - self.GetLog = False - self.disable_network_log = None - - def start(self, host): - self.log.info('fault inject start......') - self._set_link_down(host.ip) - self.host = host - self.log.info('fault inject end......') - - def cleanup(self): - self.log.info('fault inject cleanup......') - self.get_disable_network_log() - - def get_disable_network_log(self): - if self.GetLog: - self.log.info('Already get the disable_netork.log ' - 'from down_host......') - return self.disable_network_log - if self.host is not None: - client = SSHClient( - self.host.ip, - self.installer.node_user_name, - key_filename=self.installer.get_ssh_key_from_installer(), - look_for_keys=True, - log=self.log) - - self.disable_network_log = '{0}/{1}'.format(self.test_dir, - 'disable_network.log') - client.scp('disable_network.log', - self.disable_network_log, - method='get') - self.log.info('Get the disable_netork.log from' - 'down_host(host_name:%s, host_ip:%s)' - % (self.host.name, self.host.ip)) - self.GetLog = True - return self.disable_network_log - - def _set_link_down(self, compute_ip): - file_name = '{0}/{1}'.format(self.test_dir, 'disable_network.sh') - with open(file_name, 'w') as file: - file.write(LINK_DOWN_SCRIPT.format(compute_ip=compute_ip)) - client = SSHClient( - compute_ip, - self.installer.node_user_name, - key_filename=self.installer.get_ssh_key_from_installer(), - look_for_keys=True, - log=self.log) - client.scp(file_name, 'disable_network.sh') - command = 'bash disable_network.sh > disable_network.log 2>&1 &' - client.ssh(command) -- cgit 1.2.3-korg