Diffstat (limited to 'doctor_tests/main.py')
-rw-r--r--  doctor_tests/main.py | 246
1 file changed, 80 insertions(+), 166 deletions(-)
diff --git a/doctor_tests/main.py b/doctor_tests/main.py
index f54b6403..7573faec 100644
--- a/doctor_tests/main.py
+++ b/doctor_tests/main.py
@@ -1,5 +1,5 @@
##############################################################################
-# Copyright (c) 2017 ZTE Corporation and others.
+# Copyright (c) 2019 ZTE Corporation and others.
#
# All rights reserved. This program and the accompanying materials
# are made available under the terms of the Apache License, Version 2.0
@@ -8,28 +8,19 @@
##############################################################################
import os
from os.path import isfile, join
-import random
import sys
import time
+from traceback import format_exc
-from doctor_tests.alarm import Alarm
-from doctor_tests.common.constants import Host
-from doctor_tests.common.utils import match_rep_in_file
from doctor_tests import config
-from doctor_tests.consumer import get_consumer
from doctor_tests.identity_auth import get_identity_auth
from doctor_tests.identity_auth import get_session
from doctor_tests.image import Image
-from doctor_tests.instance import Instance
-from doctor_tests.inspector import get_inspector
from doctor_tests.installer import get_installer
import doctor_tests.logger as doctor_log
-from doctor_tests.network import Network
-from doctor_tests.monitor import get_monitor
+from doctor_tests.scenario.fault_management import FaultManagement
from doctor_tests.os_clients import nova_client
-from doctor_tests.profiler_poc import main as profiler_main
-from doctor_tests.scenario.common import calculate_notification_time
-from doctor_tests.scenario.network_failure import NetworkFault
+from doctor_tests.scenario.maintenance import Maintenance
from doctor_tests.user import User
@@ -44,95 +35,65 @@ class DoctorTest(object):
self.conf = conf
self.image = Image(self.conf, LOG)
self.user = User(self.conf, LOG)
- self.network = Network(self.conf, LOG)
- self.instance = Instance(self.conf, LOG)
- self.alarm = Alarm(self.conf, LOG)
self.installer = get_installer(self.conf, LOG)
- self.inspector = get_inspector(self.conf, LOG)
- self.monitor = get_monitor(self.conf,
- self.inspector.get_inspector_url(),
- LOG)
- self.consumer = get_consumer(self.conf, LOG)
- self.fault = NetworkFault(self.conf, self.installer, LOG)
auth = get_identity_auth(project=self.conf.doctor_project)
self.nova = nova_client(self.conf.nova_version,
get_session(auth=auth))
- self.down_host = None
def setup(self):
# prepare the cloud env
self.installer.setup()
-
# preparing VM image...
self.image.create()
# creating test user...
self.user.create()
- def setup_fault_management(self):
- # user settings...
- self.user.update_quota()
-
- # creating VM...
- self.network.create()
- self.instance.create()
- self.instance.wait_for_vm_launch()
-
- # creating alarm...
- self.alarm.create()
-
- # starting doctor sample components...
- # tbd tojuvone: move inspector and consumer to common setup
- # when they support updating VMs via instance.create and
- # instance.delete alarm
-
- self.inspector.start()
- self.consumer.start()
- self.down_host = self.get_host_info_for_random_vm()
- self.monitor.start(self.down_host)
-
def test_fault_management(self):
- try:
- LOG.info('doctor fault management test starting.......')
-
- # prepare test env
- self.setup_fault_management()
-
- # wait for aodh alarms are updated in caches for event evaluator,
- # sleep time should be larger than event_alarm_cache_ttl
- # (default 60)
- time.sleep(60)
-
- # injecting host failure...
- # NOTE (umar) add INTERFACE_NAME logic to host injection
-
- self.fault.start(self.down_host)
- time.sleep(10)
-
- # verify the test results
- # NOTE (umar) copy remote monitor.log file when monitor=collectd
- self.check_host_status(self.down_host.name, 'down')
-
- notification_time = calculate_notification_time(LogFile)
- if notification_time < 1 and notification_time > 0:
- LOG.info('doctor fault management test successfully, '
- 'notification_time=%s' % notification_time)
- else:
+ retry = 2
+ # Retry once if notified_time is None
+ while retry > 0:
+ try:
+ self.fault_management = None
+ LOG.info('doctor fault management test starting.......')
+ transport_url = self.installer.get_transport_url()
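+                # the alarm, instance, network, inspector, monitor and
+                # consumer objects formerly managed directly in this
+                # class are now encapsulated in the FaultManagement
+                # scenario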
+ self.fault_management = \
+ FaultManagement(self.conf, self.installer, self.user, LOG,
+ transport_url)
+
+ # prepare test env
+ self.fault_management.setup()
+
+                # wait until aodh alarms are updated in the caches of
+                # the event evaluator; sleep time should be larger than
+                # event_alarm_cache_ttl (default 60)
+ # (tojuvone) Fraser currently needs 120
+ time.sleep(120)
+
+ # injecting host failure...
+ # NOTE (umar) add INTERFACE_NAME logic to host injection
+ self.fault_management.start()
+ time.sleep(30)
+
+ # verify the test results
+ # NOTE (umar) copy remote monitor.log file when
+ # monitor=collectd
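+                # the failed host must be reported as 'down' by Nova and
+                # the consumer must have been notified within the allowed
+                # notification time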
+ self.fault_management.check_host_status('down')
+ self.fault_management.check_notification_time()
+ retry = 0
+
+ except Exception as e:
LOG.error('doctor fault management test failed, '
- 'notification_time=%s' % notification_time)
+ 'Exception=%s' % e)
+                if retry > 1 and 'notified_time=None' in str(e):
+ retry -= 1
+ LOG.info('doctor fault management retry')
+ continue
+ LOG.error(format_exc())
sys.exit(1)
-
- if self.conf.profiler_type:
- LOG.info('doctor fault management test begin to run '
- 'profile.......')
- self.collect_logs()
- self.run_profiler()
- except Exception as e:
- LOG.error('doctor fault management test failed, '
- 'Exception=%s' % e)
- sys.exit(1)
- finally:
- self.cleanup_fault_management()
+ finally:
+ if self.fault_management is not None:
+ self.fault_management.cleanup()
def _amount_compute_nodes(self):
services = self.nova.services.list(binary='nova-compute')
@@ -145,109 +106,62 @@ class DoctorTest(object):
LOG.info('not enough compute nodes, skipping doctor '
'maintenance test')
return
+ elif self.conf.installer.type not in ['apex', 'fuel', 'devstack']:
+            LOG.info('unsupported installer, skipping doctor '
+ 'maintenance test')
+ return
try:
+ maintenance = None
LOG.info('doctor maintenance test starting.......')
- # TODO (tojuvone) test setup and actual test
+            transport_url = self.installer.get_transport_url()
+            maintenance = Maintenance(transport_url, self.conf, LOG)
+ maintenance.setup_maintenance(self.user)
+
+            # wait until aodh alarms are updated in the caches of the
+            # event evaluator; sleep time should be larger than
+            # event_alarm_cache_ttl (default 60)
+            LOG.info('waiting 120s for aodh.......')
+ time.sleep(120)
+
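+            # open a maintenance session and block until the whole
+            # maintenance workflow has completed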
+ session_id = maintenance.start_maintenance()
+ maintenance.wait_maintenance_complete(session_id)
+
+ LOG.info('doctor maintenance complete.......')
+
except Exception as e:
LOG.error('doctor maintenance test failed, Exception=%s' % e)
+ LOG.error(format_exc())
sys.exit(1)
- # TODO (tojuvone) finally: test case specific cleanup
+ finally:
+ if maintenance is not None:
+ maintenance.cleanup_maintenance()
def run(self):
"""run doctor tests"""
try:
LOG.info('doctor test starting.......')
+
# prepare common test env
self.setup()
+
if self.conf.test_case == 'all':
self.test_fault_management()
self.test_maintenance()
else:
- getattr(self, self.conf.test_case)()
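+                # map the configured test case name to the matching
+                # test_* method, e.g. 'fault_management' runs
+                # test_fault_management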
+ function = 'test_%s' % self.conf.test_case
+ if hasattr(self, function):
+ getattr(self, function)()
+ else:
+                    raise Exception('Cannot find function <%s> in '
+                                    'DoctorTest, see the config manual'
+                                    % function)
except Exception as e:
LOG.error('doctor test failed, Exception=%s' % e)
+ LOG.error(format_exc())
sys.exit(1)
finally:
self.cleanup()
- def get_host_info_for_random_vm(self):
- num = random.randint(0, self.conf.instance_count - 1)
- vm_name = "%s%d" % (self.conf.instance_basename, num)
-
- servers = \
- {getattr(server, 'name'): server
- for server in self.nova.servers.list()}
- server = servers.get(vm_name)
- if not server:
- raise \
- Exception('Can not find instance: vm_name(%s)' % vm_name)
- host_name = server.__dict__.get('OS-EXT-SRV-ATTR:hypervisor_hostname')
- host_ip = self.installer.get_host_ip_from_hostname(host_name)
-
- LOG.info('Get host info(name:%s, ip:%s) which vm(%s) launched at'
- % (host_name, host_ip, vm_name))
- return Host(host_name, host_ip)
-
- def check_host_status(self, hostname, state):
- service = self.nova.services.list(host=hostname,
- binary='nova-compute')
- host_state = service[0].__dict__.get('state')
- assert host_state == state
-
- def unset_forced_down_hosts(self):
- if self.down_host:
- self.nova.services.force_down(self.down_host.name,
- 'nova-compute', False)
- time.sleep(2)
- self.check_host_status(self.down_host.name, 'up')
-
- def collect_logs(self):
- self.fault.get_disable_network_log()
-
- def run_profiler(self):
-
- net_down_log_file = self.fault.get_disable_network_log()
- reg = '(?<=doctor set link down at )\d+.\d+'
- linkdown = float(match_rep_in_file(reg, net_down_log_file).group(0))
-
- reg = '(.* doctor mark vm.* error at )(\d+.\d+)'
- vmdown = float(match_rep_in_file(reg, LogFile).group(2))
-
- reg = '(.* doctor mark host.* down at )(\d+.\d+)'
- hostdown = float(match_rep_in_file(reg, LogFile).group(2))
-
- reg = '(?<=doctor monitor detected at )\d+.\d+'
- detected = float(match_rep_in_file(reg, LogFile).group(0))
-
- reg = '(?<=doctor consumer notified at )\d+.\d+'
- notified = float(match_rep_in_file(reg, LogFile).group(0))
-
- # TODO(yujunz) check the actual delay to verify time sync status
- # expected ~1s delay from $trigger to $linkdown
- relative_start = linkdown
- os.environ['DOCTOR_PROFILER_T00'] = \
- str(int((linkdown - relative_start) * 1000))
- os.environ['DOCTOR_PROFILER_T01'] = \
- str(int((detected - relative_start) * 1000))
- os.environ['DOCTOR_PROFILER_T03'] = \
- str(int((vmdown - relative_start) * 1000))
- os.environ['DOCTOR_PROFILER_T04'] = \
- str(int((hostdown - relative_start) * 1000))
- os.environ['DOCTOR_PROFILER_T09'] = \
- str(int((notified - relative_start) * 1000))
-
- profiler_main(log=LOG)
-
- def cleanup_fault_management(self):
- self.unset_forced_down_hosts()
- self.inspector.stop()
- self.monitor.stop()
- self.consumer.stop()
- self.alarm.delete()
- self.instance.delete()
- self.network.delete()
- self.fault.cleanup()
-
def cleanup(self):
self.installer.cleanup()
self.image.delete()