summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.gitignore3
-rw-r--r--docs/development/design/index.rst1
-rw-r--r--docs/development/design/maintenance-design-guideline.rst155
-rw-r--r--docs/development/manuals/monitors.rst36
-rw-r--r--etc/doctor.sample.conf50
-rw-r--r--tests/alarm.py4
-rw-r--r--tests/common/__init__.py8
-rw-r--r--tests/common/constants.py12
-rw-r--r--tests/common/utils.py105
-rw-r--r--tests/config.py10
-rw-r--r--tests/consumer/__init__.py11
-rw-r--r--tests/consumer/base.py26
-rw-r--r--tests/consumer/sample.py71
-rw-r--r--tests/inspector.py3
-rw-r--r--tests/inspector/__init__.py5
-rw-r--r--tests/inspector/congress.py94
-rw-r--r--tests/inspector/sample.py70
-rw-r--r--tests/installer/__init__.py38
-rw-r--r--tests/installer/apex.py127
-rw-r--r--tests/installer/base.py36
-rw-r--r--tests/installer/common/congress.py47
-rw-r--r--tests/installer/common/restore_ceilometer.py27
-rw-r--r--tests/installer/common/set_ceilometer.py44
-rw-r--r--tests/installer/local.py109
-rw-r--r--tests/lib/installers/apex13
-rw-r--r--tests/lib/monitor31
-rw-r--r--tests/lib/monitors/collectd/collectd101
-rw-r--r--tests/lib/monitors/collectd/collectd_plugin.py167
-rw-r--r--tests/lib/monitors/sample/monitor.py (renamed from tests/monitor.py)0
-rw-r--r--tests/lib/monitors/sample/sample18
-rw-r--r--tests/logger.py5
-rw-r--r--tests/main.py125
-rw-r--r--tests/monitor/__init__.py29
-rw-r--r--tests/monitor/base.py27
-rw-r--r--tests/monitor/collectd.py138
-rw-r--r--tests/monitor/sample.py106
-rw-r--r--tests/os_clients.py6
-rw-r--r--tests/profiler_poc.py (renamed from tests/profiler-poc.py)23
-rwxr-xr-xtests/run.sh44
-rw-r--r--tests/scenario/__init__.py8
-rw-r--r--tests/scenario/common.py28
-rw-r--r--tests/scenario/network_failure.py71
-rw-r--r--tox.ini3
43 files changed, 1955 insertions, 80 deletions
diff --git a/.gitignore b/.gitignore
index 83868b54..7e893869 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,5 +10,8 @@
/releng/
/tests/*.img
+#IntelJ Idea
+.idea/
+
#Build results
.tox
diff --git a/docs/development/design/index.rst b/docs/development/design/index.rst
index 87d14d42..e50c1704 100644
--- a/docs/development/design/index.rst
+++ b/docs/development/design/index.rst
@@ -26,3 +26,4 @@ See also https://wiki.opnfv.org/requirements_projects .
port-data-plane-status.rst
inspector-design-guideline.rst
performance-profiler.rst
+ maintenance-design-guideline.rst
diff --git a/docs/development/design/maintenance-design-guideline.rst b/docs/development/design/maintenance-design-guideline.rst
new file mode 100644
index 00000000..93c3cf4e
--- /dev/null
+++ b/docs/development/design/maintenance-design-guideline.rst
@@ -0,0 +1,155 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+====================================
+Planned Maintenance Design Guideline
+====================================
+
+.. NOTE::
+ This is spec draft of design guideline for planned maintenance.
+ JIRA ticket to track the update and collect comments: `DOCTOR-52`_.
+
+This document describes how one can implement planned maintenance by utilizing
+the `OPNFV Doctor project`_. framework and to meet the set requirements.
+
+Problem Description
+===================
+
+Telco application need to know when planned maintenance is going to happen in
+order to guarantee zero down time in its operation. It needs to be possible to
+make own actions to have application running on not affected resource or give
+guidance to admin actions like migration. More details are defined in
+requirement documentation: `use cases`_, `architecture`_ and `implementation`_.
+Also discussion in the OPNFV summit about `planned maintenance session`_.
+
+Guidelines
+==========
+
+Cloud admin needs to make a notification about planned maintenance including
+all details that application needs in order to make decisions upon his affected
+service. This notification payload can be consumed by application by subscribing
+to corresponding event alarm trough alarming service like OpenStack AODH.
+
+Before maintenance starts application needs to be able to make switch over for
+his ACT-STBY service affected, do operation to move service to not effected part
+of infra or give a hint for admin operation like migration that can be
+automatically issued by admin tool according to agreed policy.
+
+Flow diagram::
+
+ admin alarming project controller inspector
+ | service app manager | |
+ | 1. | | | |
+ +------------------------->+ |
+ +<-------------------------+ |
+ | 2. | | | |
+ +------>+ 3. | | |
+ | +-------->+ 4. | |
+ | | +------->+ |
+ | | 5. +<-------+ |
+ +<----------------+ | |
+ | | 6. | |
+ +------------------------->+ |
+ +<-------------------------+ 7. |
+ +------------------------------------->+
+ | 8. | | | |
+ +------>+ 9. | | |
+ | +-------->+ | |
+ +--------------------------------------+
+ | 10. |
+ +--------------------------------------+
+ | 11. | | | |
+ +------------------------->+ |
+ +<-------------------------+ |
+ | 12. | | | |
+ +------>+-------->+ | 13. |
+ +------------------------------------->+
+ +-------+---------+--------+-----------+
+
+Concepts used below:
+
+- `full maintenance`: This means maintenance will take a longer time and
+ resource should be emptied, meaning container or VM need to be moved or
+ deleted. Admin might need to test resource to work after maintenance.
+
+- `reboot`: Only a reboot is needed and admin does not need separate testing
+ after that. Container or VM can be left in place if so wanted.
+
+- `notification`: Notification to rabbitmq.
+
+Admin makes a planned maintenance session where he sets
+a `maintenance_session_id` that is a unique ID for all the hardware resources he
+is going to have the maintenance at the same time. Mostly maintenance should be
+done node by node, meaning a single compute node at a time would be in single
+planned maintenance session having unique `maintenance_session_id`. This ID will
+be carried trough the whole session in all places and can be used to query
+maintenance in admin tool API. Project running a Telco application should set
+a specific role for admin tool to know it cannot do planned maintenance unless
+project has agreed actions to be done for its VMs or containers. This means the
+project has configured itself to get alarms upon planned maintenance and it is
+capable of agreeing needed actions. Admin is supposed to use an admin tool to
+automate maintenance process partially or entirely.
+
+The flow of a successful planned maintenance session as in OpenStack example
+case:
+
+1. Admin disables nova-compute in order to do planned maintenance on a compute
+ host and gets ACK from the API call. This action needs to be done to ensure
+ no thing will be placed in this compute host by any user. Action is always
+ done regardless the whole compute will be affected or not.
+2. Admin sends a project specific maintenance notification with state
+ `planned maintenance`. This includes detailed information about maintenance,
+ like when it is going to start, is it `reboot` or `full maintenance`
+ including the information about project containers or VMs running on host or
+ the part of it that will need maintenance. Also default action like
+ migration will be mentioned that will be issued by admin before maintenance
+ starts if no other action is set by project. In case project has a specific
+ role set, planned maintenance cannot start unless project has agreed the
+ admin action. Available admin actions are also listed in notification.
+3. Application manager of the project receives AODH alarm about the same.
+4. Application manager can do switch over to his ACT-STBY service, delete and
+ re-instantiate his service on not affected resource if so wanted.
+5. Application manager may call admin tool API to give preferred instructions
+ for leaving VMs and containers in place or do admin action to migrate them.
+ In case admin does not receive this instruction before maintenance is to
+ start it will do the pre-configured default action like migration to
+ projects without a specific role to say project need to agree the action.
+ VMs or Containers can be left on host if type of maintenance is just `reboot`.
+6. Admin does possible actions to VMs and containers and receives an ACK.
+7. In case everything went ok, Admin sends admin type of maintenance
+ notification with state `in maintenance`. This notification can be consumed
+ by Inspector and other cloud services to know there is ongoing maintenance
+ which means things like automatic fault management actions for the hardware
+ resources should be disabled.
+8. If maintenance type is `reboot` and project is still having containers or
+ VMs running on affected hardware resource, Admin sends project specific
+ maintenance notification with state updated to `in maintenance`. If project
+ do not have anything left running on affected hardware resource, state will
+ be `maintenance over` instead. If maintenance can not be performed for some
+ reason state should be `maintenance cancelled`. In this case last operation
+ remaining for admin is to re-enable nova-compute service, ensure
+ everything is running and not to proceed any further steps.
+9. Application manager of the project receives AODH alarm about the same.
+10. Admin will do the maintenance. This is out of Doctor scope.
+11. Admin enables nova-compute service when maintenance is over and host can be
+ put back to production. An ACK is received from API call.
+12. In case project had left containers or VMs on hardware resource over
+ maintenance, Admin sends project specific maintenance notification with
+ state updated to `maintenance over`.
+13. Admin sends admin type of maintenance notification with state updated to
+ `maintenance over`. Inspector and other
+ cloud services can consume this to know hardware resource is back in use.
+
+POC
+---
+
+There was a `Maintenance POC`_ for planned maintenance in the OPNFV Beijing
+summit to show the basic concept of using framework defined by the project.
+
+.. _DOCTOR-52: https://jira.opnfv.org/browse/DOCTOR-52
+.. _OPNFV Doctor project: https://wiki.opnfv.org/doctor
+.. _use cases: http://artifacts.opnfv.org/doctor/docs/requirements/02-use_cases.html#nvfi-maintenance
+.. _architecture: http://artifacts.opnfv.org/doctor/docs/requirements/03-architecture.html#nfvi-maintenance
+.. _implementation: http://artifacts.opnfv.org/doctor/docs/requirements/05-implementation.html#nfvi-maintenance
+.. _planned maintenance session: https://lists.opnfv.org/pipermail/opnfv-tech-discuss/2017-June/016677.html
+.. _Maintenance POC: https://wiki.opnfv.org/download/attachments/5046291/Doctor%20Maintenance%20PoC%202017.pptx?version=1&modificationDate=1498182869000&api=v2
diff --git a/docs/development/manuals/monitors.rst b/docs/development/manuals/monitors.rst
new file mode 100644
index 00000000..0d22b1de
--- /dev/null
+++ b/docs/development/manuals/monitors.rst
@@ -0,0 +1,36 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+Monitor Types and Limitations
+=============================
+
+Currently there are two monitor types supported: sample and collectd
+
+Sample Monitor
+--------------
+
+Sample monitor type pings the compute host from the control host and calculates the
+notification time after the ping timeout.
+Also if inspector type is sample, the compute node needs to communicate with the control
+node on port 12345. This port needs to be opened for incomming traffic on control node.
+
+Collectd Monitor
+----------------
+
+Collectd monitor type uses collectd daemon running ovs_events plugin. Collectd runs on
+compute to send instant notification to the control node. The notification time is
+calculated by using the difference of time at which compute node sends notification to
+control node and the time at which consumer is notified. The time on control and compute
+node has to be synchronized for this reason. For further details on setting up collectd
+on the compute node, use the following link:
+http://docs.opnfv.org/en/stable-danube/submodules/barometer/docs/release/userguide/feature.userguide.html#id18
+
+Collectd monitors an interface managed by OVS. If the interface is not be assigned
+an IP, the user has to provide the name of interface to be monitored. The command to
+launch the doctor test in that case is:
+MONITOR_TYPE=collectd INSPECTOR_TYPE=sample INTERFACE_NAME=example_iface ./run.sh
+
+If the interface name or IP is not provided, the collectd monitor type will monitor the
+default management interface. This may result in the failure of doctor run.sh test case.
+The test case sets the monitored interface down and if the inspector (sample or congress)
+is running on the same subnet, collectd monitor will not be able to communicate with it.
diff --git a/etc/doctor.sample.conf b/etc/doctor.sample.conf
index 8a1ddc3e..f81d3b2d 100644
--- a/etc/doctor.sample.conf
+++ b/etc/doctor.sample.conf
@@ -7,20 +7,46 @@
# http://www.apache.org/licenses/LICENSE-2.0
##############################################################################
[DEFAULT]
-image_name = cirros
-image_format = qcow2
-image_filename = cirros.img
-image_download_url = https://launchpad.net/cirros/trunk/0.3.0/+download/cirros-0.3.0-x86_64-disk.img
+#image_name = cirros
+#image_format = qcow2
+#image_filename = cirros.img
+#image_download_url = https://launchpad.net/cirros/trunk/0.3.0/+download/cirros-0.3.0-x86_64-disk.img
-glance_version = 2
-nova_version = 2.34
+#glance_version = 2
+#nova_version = 2.34
+#aodh_version = 2
-doctor_user = doctor
-doctor_passwd = doctor
-doctor_project = doctor
-doctor_role = _member_
-quota_instances = 1
-quota_cores = 1
+#doctor_user = doctor
+#doctor_passwd = doctor
+#doctor_project = doctor
+#doctor_role = _member_
+#quota_instances = 1
+#quota_cores = 1
+#net_name = doctor_net
+#net_cidr = 192.168.168.0/24
+#flavor = m1.tiny
+#instance_count = 1
+#instance_basename = doctor_vm
+#alarm_basename = doctor_alarm
+#profiler_type = poc
+
+[installer]
+#type = local
+#ip = 127.0.0.1
+#username = root
+
+[monitor]
+#type = sample
+
+[inspector]
+#type = sample
+#ip = 127.0.0.1
+#port = 12345
+
+[consumer]
+#type = sample
+#ip = 127.0.0.1
+#port = 12346
diff --git a/tests/alarm.py b/tests/alarm.py
index 0582085e..916f4405 100644
--- a/tests/alarm.py
+++ b/tests/alarm.py
@@ -26,9 +26,7 @@ class Alarm(object):
def __init__(self, conf, log):
self.conf = conf
self.log = log
- self.auth = get_identity_auth(username=self.conf.doctor_user,
- password=self.conf.doctor_passwd,
- project=self.conf.doctor_project)
+ self.auth = get_identity_auth(project=self.conf.doctor_project)
self.aodh = \
aodh_client(conf.aodh_version,
get_session(auth=self.auth))
diff --git a/tests/common/__init__.py b/tests/common/__init__.py
new file mode 100644
index 00000000..e68a3070
--- /dev/null
+++ b/tests/common/__init__.py
@@ -0,0 +1,8 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+############################################################################## \ No newline at end of file
diff --git a/tests/common/constants.py b/tests/common/constants.py
new file mode 100644
index 00000000..72d037af
--- /dev/null
+++ b/tests/common/constants.py
@@ -0,0 +1,12 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+from collections import namedtuple
+
+
+Host = namedtuple('Host', ['name', 'ip'])
diff --git a/tests/common/utils.py b/tests/common/utils.py
new file mode 100644
index 00000000..2e823acb
--- /dev/null
+++ b/tests/common/utils.py
@@ -0,0 +1,105 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import json
+import os
+import paramiko
+import re
+
+
+def load_json_file(full_path):
+ """Loads JSON from file
+ :param target_filename:
+ :return:
+ """
+ if not os.path.isfile(full_path):
+ raise Exception('File(%s) does not exist' % full_path)
+
+ with open(full_path, 'r') as file:
+ return json.load(file)
+
+
+def write_json_file(full_path, data):
+ """write JSON from file
+ :param target_filename:
+ :return:
+ """
+
+ with open(full_path, 'w+') as file:
+ file.write(json.dumps(data))
+
+
+def match_rep_in_file(regex, full_path):
+ if not os.path.isfile(full_path):
+ raise Exception('File(%s) does not exist' % full_path)
+
+ with open(full_path, 'r') as file:
+ for line in file:
+ result = re.search(regex, line)
+ if result:
+ return result
+
+ return None
+
+
+class SSHClient(object):
+ def __init__(self, ip, username, password=None, pkey=None,
+ key_filename=None, log=None, look_for_keys=False,
+ allow_agent=False):
+ self.client = paramiko.SSHClient()
+ self.client.load_system_host_keys()
+ self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+ self.client.connect(ip, username=username, password=password,
+ pkey=pkey, key_filename=key_filename,
+ look_for_keys=look_for_keys,
+ allow_agent=allow_agent)
+ self.log = log
+
+ def __del__(self):
+ self.client.close()
+
+ def ssh(self, command):
+ if self.log:
+ self.log.info("Executing: %s" % command)
+ stdin, stdout, stderr = self.client.exec_command(command)
+ ret = stdout.channel.recv_exit_status()
+ output = list()
+ for line in stdout.read().splitlines():
+ output.append(line.decode('utf-8'))
+ if ret:
+ if self.log:
+ self.log.info("*** FAILED to run command %s (%s)" % (command, ret))
+ raise Exception(
+ "Unable to run \ncommand: %s\nret: %s"
+ % (command, ret))
+ if self.log:
+ self.log.info("*** SUCCESSFULLY run command %s" % command)
+ return ret, output
+
+ def scp(self, source, dest, method='put'):
+ if self.log:
+ self.log.info("Copy %s -> %s" % (source, dest))
+ ftp = self.client.open_sftp()
+ if method == 'put':
+ ftp.put(source, dest)
+ elif method == 'get':
+ ftp.get(source, dest)
+ ftp.close()
+
+
+def run_async(func):
+ from threading import Thread
+ from functools import wraps
+
+ @wraps(func)
+ def async_func(*args, **kwargs):
+ thread = Thread(target=func, args=args, kwargs=kwargs)
+ thread.start()
+ return thread
+
+ return async_func
diff --git a/tests/config.py b/tests/config.py
index 969d829f..c71d5ad7 100644
--- a/tests/config.py
+++ b/tests/config.py
@@ -14,23 +14,29 @@ import alarm
import consumer
import image
import instance
+import installer
import network
import inspector
+import monitor
import os_clients
+import profiler_poc
import user
def list_opts():
return [
- ('consumer', consumer.OPTS),
+ ('installer', installer.OPTS),
+ ('monitor', monitor.OPTS),
('inspector', inspector.OPTS),
+ ('consumer', consumer.OPTS),
('DEFAULT', itertools.chain(
os_clients.OPTS,
image.OPTS,
user.OPTS,
network.OPTS,
instance.OPTS,
- alarm.OPTS))
+ alarm.OPTS,
+ profiler_poc.OPTS))
]
diff --git a/tests/consumer/__init__.py b/tests/consumer/__init__.py
index 68cc5dc9..ccec8644 100644
--- a/tests/consumer/__init__.py
+++ b/tests/consumer/__init__.py
@@ -7,6 +7,7 @@
# http://www.apache.org/licenses/LICENSE-2.0
##############################################################################
from oslo_config import cfg
+from oslo_utils import importutils
OPTS = [
@@ -24,3 +25,13 @@ OPTS = [
help='the port of doctor consumer',
required=True),
]
+
+
+_consumer_name_class_mapping = {
+ 'sample': 'consumer.sample.SampleConsumer'
+}
+
+
+def get_consumer(conf, log):
+ consumer_class = _consumer_name_class_mapping.get(conf.consumer.type)
+ return importutils.import_object(consumer_class, conf, log) \ No newline at end of file
diff --git a/tests/consumer/base.py b/tests/consumer/base.py
new file mode 100644
index 00000000..35170748
--- /dev/null
+++ b/tests/consumer/base.py
@@ -0,0 +1,26 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import abc
+import six
+
+
+@six.add_metaclass(abc.ABCMeta)
+class BaseConsumer(object):
+
+ def __init__(self, conf, log):
+ self.conf = conf
+ self.log = log
+
+ @abc.abstractmethod
+ def start(self):
+ pass
+
+ @abc.abstractmethod
+ def stop(self):
+ pass \ No newline at end of file
diff --git a/tests/consumer/sample.py b/tests/consumer/sample.py
new file mode 100644
index 00000000..20ad9d57
--- /dev/null
+++ b/tests/consumer/sample.py
@@ -0,0 +1,71 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+from flask import Flask
+from flask import request
+import json
+import time
+from threading import Thread
+import requests
+
+from consumer.base import BaseConsumer
+
+
+class SampleConsumer(BaseConsumer):
+
+ def __init__(self, conf, log):
+ super(SampleConsumer, self).__init__(conf, log)
+ self.app = None
+
+ def start(self):
+ self.log.info('sample consumer start......')
+ self.app = ConsumerApp(self.conf.consumer.port, self, self.log)
+ self.app.start()
+
+ def stop(self):
+ self.log.info('sample consumer stop......')
+ if not self.app:
+ return
+ headers = {
+ 'Content-Type': 'application/json',
+ 'Accept': 'application/json',
+ }
+ url = 'http://%s:%d/shutdown'\
+ % (self.conf.consumer.ip,
+ self.conf.consumer.port)
+ requests.post(url, data='', headers=headers)
+
+
+class ConsumerApp(Thread):
+
+ def __init__(self, port, consumer, log):
+ Thread.__init__(self)
+ self.port = port
+ self.consumer = consumer
+ self.log = log
+
+ def run(self):
+ app = Flask('consumer')
+
+ @app.route('/failure', methods=['POST'])
+ def event_posted():
+ self.log.info('doctor consumer notified at %s' % time.time())
+ self.log.info('sample consumer received data = %s' % request.data)
+ data = json.loads(request.data.decode('utf8'))
+ return 'OK'
+
+ @app.route('/shutdown', methods=['POST'])
+ def shutdown():
+ self.log.info('shutdown consumer app server at %s' % time.time())
+ func = request.environ.get('werkzeug.server.shutdown')
+ if func is None:
+ raise RuntimeError('Not running with the Werkzeug Server')
+ func()
+ return 'consumer app shutting down...'
+
+ app.run(host="0.0.0.0", port=self.port) \ No newline at end of file
diff --git a/tests/inspector.py b/tests/inspector.py
index a61051f1..82ffc338 100644
--- a/tests/inspector.py
+++ b/tests/inspector.py
@@ -116,8 +116,7 @@ def get_args():
def main():
args = get_args()
- app.run(port=args.port)
-
+ app.run(host='0.0.0.0', port=args.port)
if __name__ == '__main__':
main()
diff --git a/tests/inspector/__init__.py b/tests/inspector/__init__.py
index 35bdb5b9..afba4800 100644
--- a/tests/inspector/__init__.py
+++ b/tests/inspector/__init__.py
@@ -11,6 +11,7 @@ import os
from oslo_config import cfg
from oslo_utils import importutils
+
OPTS = [
cfg.StrOpt('type',
default=os.environ.get('INSPECTOR_TYPE', 'sample'),
@@ -19,7 +20,7 @@ OPTS = [
required=True),
cfg.StrOpt('ip',
default='127.0.0.1',
- help='the ip of default inspector',
+ help='the host ip of inspector',
required=False),
cfg.StrOpt('port',
default='12345',
@@ -30,8 +31,10 @@ OPTS = [
_inspector_name_class_mapping = {
'sample': 'inspector.sample.SampleInspector',
+ 'congress': 'inspector.congress.CongressInspector',
}
+
def get_inspector(conf, log):
inspector_class = _inspector_name_class_mapping[conf.inspector.type]
return importutils.import_object(inspector_class, conf, log)
diff --git a/tests/inspector/congress.py b/tests/inspector/congress.py
new file mode 100644
index 00000000..ae295852
--- /dev/null
+++ b/tests/inspector/congress.py
@@ -0,0 +1,94 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+from identity_auth import get_identity_auth
+from identity_auth import get_session
+from os_clients import congress_client
+
+from inspector.base import BaseInspector
+
+
+class CongressInspector(BaseInspector):
+ nova_api_min_version = '2.11'
+ doctor_driver = 'doctor'
+ doctor_datasource = 'doctor'
+ policy = 'classification'
+ rules = {
+ 'host_down':
+ 'host_down(host) :- doctor:events(hostname=host, type="compute.host.down", status="down")',
+ 'active_instance_in_host':
+ 'active_instance_in_host(vmid, host) :- nova:servers(id=vmid, host_name=host, status="ACTIVE")',
+ 'host_force_down':
+ 'execute[nova:services.force_down(host, "nova-compute", "True")] :- host_down(host)',
+ 'error_vm_states':
+ 'execute[nova:servers.reset_state(vmid, "error")] :- host_down(host), active_instance_in_host(vmid, host)'
+ }
+
+ def __init__(self, conf, log):
+ super(CongressInspector, self).__init__(conf, log)
+ self.auth = get_identity_auth()
+ self.congress = congress_client(get_session(auth=self.auth))
+ self._init_driver_and_ds()
+ self.inspector_url = self.get_inspector_url()
+
+ def _init_driver_and_ds(self):
+ datasources = \
+ {ds['name']: ds for ds in self.congress.list_datasources()['results']}
+
+ # check nova_api version
+ nova_api_version = datasources['nova']['config'].get('api_version')
+ if nova_api_version and nova_api_version < self.nova_api_min_version:
+ raise Exception('Congress Nova datasource API version < nova_api_min_version(%s)'
+ % self.nova_api_min_version)
+
+ # create doctor datasource if it's not exist
+ if self.doctor_datasource not in datasources:
+ self.congress.create_datasource(
+ body={'driver': self.doctor_driver,
+ 'name': self.doctor_datasource})
+
+ # check whether doctor driver exist
+ drivers = \
+ {driver['id']: driver for driver in self.congress.list_drivers()['results']}
+ if self.doctor_driver not in drivers:
+ raise Exception('Do not support doctor driver in congress')
+
+ self.policy_rules = \
+ {rule['name']: rule for rule in
+ self.congress.list_policy_rules(self.policy)['results']}
+
+ def get_inspector_url(self):
+ ds = self.congress.list_datasources()['results']
+ doctor_ds = next((item for item in ds if item['driver'] == 'doctor'),
+ None)
+ congress_endpoint = self.congress.httpclient.get_endpoint(auth=self.auth)
+ return ('%s/v1/data-sources/%s/tables/events/rows' %
+ (congress_endpoint, doctor_ds['id']))
+
+ def start(self):
+ self.log.info('congress inspector start......')
+
+ for rule_name, rule in self.rules.items():
+ self._add_rule(rule_name, rule)
+
+ def stop(self):
+ self.log.info('congress inspector stop......')
+
+ for rule_name in self.rules.keys():
+ self._del_rule(rule_name)
+
+ def _add_rule(self, rule_name, rule):
+ if rule_name not in self.policy_rules:
+ self.congress.create_policy_rule(self.policy,
+ body={'name': rule_name,
+ 'rule': rule})
+
+ def _del_rule(self, rule_name):
+ if rule_name in self.policy_rules:
+ rule_id = self.policy_rules[rule_name]['id']
+ self.congress.delete_policy_rule(self.policy, rule_id)
diff --git a/tests/inspector/sample.py b/tests/inspector/sample.py
index db477de9..1c05cede 100644
--- a/tests/inspector/sample.py
+++ b/tests/inspector/sample.py
@@ -14,9 +14,11 @@ import time
from threading import Thread
import requests
+from common import utils
from identity_auth import get_identity_auth
from identity_auth import get_session
from os_clients import nova_client
+from os_clients import neutron_client
from inspector.base import BaseInspector
@@ -31,6 +33,10 @@ class SampleInspector(BaseInspector):
# Normally we use this client for non redundant API calls
self.nova = self.novaclients[0]
+ auth = get_identity_auth(project=self.conf.doctor_project)
+ session = get_session(auth=auth)
+ self.neutron = neutron_client(session)
+
self.servers = collections.defaultdict(list)
self.hostnames = list()
self.app = None
@@ -86,37 +92,49 @@ class SampleInspector(BaseInspector):
event_type = event['type']
if event_type == self.event_type:
self.hostnames.append(hostname)
- self.disable_compute_host(hostname)
-
- def disable_compute_host(self, hostname):
- threads = []
- if len(self.servers[hostname]) > self.NUMBER_OF_CLIENTS:
- # TODO(tojuvone): This could be enhanced in future with dynamic
- # reuse of self.novaclients when all threads in use
- self.log.error('%d servers in %s. Can handle only %d'%(
- self.servers[hostname], hostname, self.NUMBER_OF_CLIENTS))
- for nova, server in zip(self.novaclients, self.servers[hostname]):
- t = ThreadedResetState(nova, "error", server, self.log)
- t.start()
- threads.append(t)
- for t in threads:
- t.join()
+ thr1 = self._disable_compute_host(hostname)
+ thr2 = self._vms_reset_state('error', hostname)
+ thr3 = self._set_ports_data_plane_status('DOWN', hostname)
+ thr1.join()
+ thr2.join()
+ thr3.join()
+
+ @utils.run_async
+ def _disable_compute_host(self, hostname):
self.nova.services.force_down(hostname, 'nova-compute', True)
self.log.info('doctor mark host(%s) down at %s' % (hostname, time.time()))
+ @utils.run_async
+ def _vms_reset_state(self, state, hostname):
-class ThreadedResetState(Thread):
+ @utils.run_async
+ def _vm_reset_state(nova, server, state):
+ nova.servers.reset_state(server, state)
+ self.log.info('doctor mark vm(%s) error at %s' % (server, time.time()))
- def __init__(self, nova, state, server, log):
- Thread.__init__(self)
- self.nova = nova
- self.state = state
- self.server = server
- self.log = log
+ thrs = []
+ for nova, server in zip(self.novaclients, self.servers[hostname]):
+ t = _vm_reset_state(nova, server, state)
+ thrs.append(t)
+ for t in thrs:
+ t.join()
- def run(self):
- self.nova.servers.reset_state(self.server, self.state)
- self.log.info('doctor mark vm(%s) error at %s' % (self.server, time.time()))
+ @utils.run_async
+ def _set_ports_data_plane_status(self, status, hostname):
+ body = {'data_plane_status': status}
+
+ @utils.run_async
+ def _set_port_data_plane_status(port_id):
+ self.neutron.update_port(port_id, body)
+ self.log.info('doctor set data plane status %s on port %s' % (status, port_id))
+
+ thrs = []
+ params = {'binding:host_id': hostname}
+ for port_id in self.neutron.list_ports(**params):
+ t = _set_port_data_plane_status(port_id)
+ thrs.append(t)
+ for t in thrs:
+ t.join()
class InspectorApp(Thread):
@@ -135,7 +153,7 @@ class InspectorApp(Thread):
self.log.info('event posted in sample inspector at %s' % time.time())
self.log.info('sample inspector = %s' % self.inspector)
self.log.info('sample inspector received data = %s' % request.data)
- events = json.loads(request.data)
+ events = json.loads(request.data.decode('utf8'))
self.inspector.handle_events(events)
return "OK"
diff --git a/tests/installer/__init__.py b/tests/installer/__init__.py
new file mode 100644
index 00000000..bb0e452d
--- /dev/null
+++ b/tests/installer/__init__.py
@@ -0,0 +1,38 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+
+from oslo_config import cfg
+from oslo_utils import importutils
+
+OPTS = [
+ cfg.StrOpt('type',
+ default=os.environ.get('INSTALLER_TYPE', 'local'),
+ choices=['local', 'apex'],
+ help='the type of installer',
+ required=True),
+ cfg.StrOpt('ip',
+ default=os.environ.get('INSTALLER_IP', '127.0.0.1'),
+ help='the ip of installer'),
+ cfg.StrOpt('username',
+ default='root',
+ help='the user name for login installer server',
+ required=True),
+]
+
+
+_installer_name_class_mapping = {
+ 'local': 'installer.local.LocalInstaller',
+ 'apex': 'installer.apex.ApexInstaller'
+}
+
+
+def get_installer(conf, log):
+ installer_class = _installer_name_class_mapping[conf.installer.type]
+ return importutils.import_object(installer_class, conf, log) \ No newline at end of file
diff --git a/tests/installer/apex.py b/tests/installer/apex.py
new file mode 100644
index 00000000..98eb6c9c
--- /dev/null
+++ b/tests/installer/apex.py
@@ -0,0 +1,127 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import getpass
+import grp
+import os
+import pwd
+import stat
+import subprocess
+import sys
+
+from common.utils import SSHClient
+from installer.base import BaseInstaller
+
+
+class ApexInstaller(BaseInstaller):
+ node_user_name = 'heat-admin'
+ cm_set_script = 'set_ceilometer.py'
+ cm_restore_script = 'restore_ceilometer.py'
+
+ def __init__(self, conf, log):
+ super(ApexInstaller, self).__init__(conf, log)
+ self.client = SSHClient(self.conf.installer.ip,
+ self.conf.installer.username,
+ look_for_keys=True)
+ self.key_file = None
+ self.controllers = list()
+ self.controller_clients = list()
+ self.servers = list()
+
+ def setup(self):
+ self.log.info('Setup Apex installer start......')
+
+ self.get_ssh_key_from_installer()
+ self.get_controller_ips()
+ self.set_apply_patches()
+ self.setup_stunnel()
+
+ def cleanup(self):
+ self.restore_apply_patches()
+ for server in self.servers:
+ server.terminate()
+
+ def get_ssh_key_from_installer(self):
+ self.log.info('Get SSH keys from Apex installer......')
+
+ if self.key_file is not None:
+ self.log.info('Already have SSH keys from Apex installer......')
+ return self.key_file
+
+ self.client.scp('/home/stack/.ssh/id_rsa', './instack_key', method='get')
+ user = getpass.getuser()
+ uid = pwd.getpwnam(user).pw_uid
+ gid = grp.getgrnam(user).gr_gid
+ os.chown('./instack_key', uid, gid)
+ os.chmod('./instack_key', stat.S_IREAD)
+ current_dir = sys.path[0]
+ self.key_file = '{0}/{1}'.format(current_dir, 'instack_key')
+ return self.key_file
+
+ def get_controller_ips(self):
+ self.log.info('Get controller ips from Apex installer......')
+
+ command = "source stackrc; " \
+ "nova list | grep ' overcloud-controller-[0-9] ' " \
+ "| sed -e 's/^.*ctlplane=//' |awk '{print $1}'"
+ ret, controllers = self.client.ssh(command)
+ if ret:
+ raise Exception('Exec command to get controller ips in Apex installer failed'
+ 'ret=%s, output=%s' % (ret, controllers))
+ self.log.info('Get controller_ips:%s from Apex installer' % controllers)
+ self.controllers = controllers
+
+ def get_host_ip_from_hostname(self, hostname):
+ self.log.info('Get host ip from host name in Apex installer......')
+
+ hostname_in_undercloud = hostname.split('.')[0]
+
+ command = "source stackrc; nova show %s | awk '/ ctlplane network /{print $5}'" % (hostname_in_undercloud)
+ ret, host_ip = self.client.ssh(command)
+ if ret:
+ raise Exception('Exec command to get host ip from hostname(%s) in Apex installer failed'
+ 'ret=%s, output=%s' % (hostname, ret, host_ip))
+ self.log.info('Get host_ip:%s from host_name:%s in Apex installer' % (host_ip, hostname))
+ return host_ip[0]
+
+ def setup_stunnel(self):
+ self.log.info('Setup ssh stunnel in controller nodes in Apex installer......')
+ for node_ip in self.controllers:
+ cmd = "sudo ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -i %s %s@%s -R %s:localhost:%s sleep 600 > ssh_tunnel.%s.log 2>&1 < /dev/null &" \
+ % (self.key_file, self.node_user_name, node_ip,
+ self.conf.consumer.port, self.conf.consumer.port, node_ip)
+ server = subprocess.Popen(cmd, shell=True)
+ self.servers.append(server)
+ server.communicate()
+
+ def set_apply_patches(self):
+ self.log.info('Set apply patches start......')
+
+ for node_ip in self.controllers:
+ client = SSHClient(node_ip, self.node_user_name, key_filename=self.key_file)
+ self.controller_clients.append(client)
+ self._ceilometer_apply_patches(client, self.cm_set_script)
+
+ def restore_apply_patches(self):
+ self.log.info('restore apply patches start......')
+
+ for client in self.controller_clients:
+ self._ceilometer_apply_patches(client, self.cm_restore_script)
+
+ def _ceilometer_apply_patches(self, ssh_client, script_name):
+ installer_dir = os.path.dirname(os.path.realpath(__file__))
+ script_abs_path = '{0}/{1}/{2}'.format(installer_dir, 'common', script_name)
+
+ ssh_client.scp(script_abs_path, script_name)
+ cmd = 'sudo python %s' % script_name
+ ret, output = ssh_client.ssh(cmd)
+ if ret:
+ raise Exception('Do the ceilometer command in controller node failed....'
+ 'ret=%s, cmd=%s, output=%s' % (ret, cmd, output))
+ ssh_client.ssh('sudo systemctl restart openstack-ceilometer-notification.service')
+
diff --git a/tests/installer/base.py b/tests/installer/base.py
new file mode 100644
index 00000000..fa39816a
--- /dev/null
+++ b/tests/installer/base.py
@@ -0,0 +1,36 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import abc
+import six
+
+@six.add_metaclass(abc.ABCMeta)
+class BaseInstaller(object):
+ def __init__(self, conf, log):
+ self.conf = conf
+ self.log = log
+
+ @abc.abstractproperty
+ def node_user_name(self):
+ """user name for login to cloud node"""
+
+ @abc.abstractmethod
+ def get_ssh_key_from_installer(self):
+ pass
+
+ @abc.abstractmethod
+ def get_host_ip_from_hostname(self, hostname):
+ pass
+
+ @abc.abstractmethod
+ def setup(self):
+ pass
+
+ @abc.abstractmethod
+ def cleanup(self):
+ pass
diff --git a/tests/installer/common/congress.py b/tests/installer/common/congress.py
new file mode 100644
index 00000000..db882de2
--- /dev/null
+++ b/tests/installer/common/congress.py
@@ -0,0 +1,47 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+def set_doctor_driver_conf(ssh_client, restart_cmd):
+ cg_set_cmd = '''#!/bin/bash
+co_conf=/etc/congress/congress.conf
+co_conf_bak=/etc/congress/congress.conf.bak
+co_entry="congress.datasources.doctor_driver.DoctorDriver"
+if sudo grep -e "^drivers.*$co_entry" $co_conf; then
+ echo "NOTE: congress is configured as we needed"
+else
+ echo "modify the congress config"
+ sudo cp $co_conf $co_conf_bak
+ sudo sed -i -e "/^drivers/s/$/,$co_entry/" $co_conf
+ %s
+fi
+ ''' % (restart_cmd)
+
+ ret, output = ssh_client.ssh(cg_set_cmd)
+ if ret:
+ raise Exception('Do the congress command in controller node failed....'
+ 'ret=%s, cmd=%s, output=%s' % (ret, cg_set_cmd, output))
+
+
+def restore_doctor_driver_conf(ssh_client, restart_cmd):
+ cg_restore_cmd = '''#!/bin/bash
+co_conf=/etc/congress/congress.conf
+co_conf_bak=/etc/congress/congress.conf.bak
+if [ -e $co_conf_bak ]; then
+ echo "restore the congress config"
+ sudo cp $co_conf_bak $co_conf
+ sudo rm $co_conf_bak
+ %s
+else
+ echo "Do not need to restore the congress config"
+fi
+ ''' % (restart_cmd)
+
+ ret, output = ssh_client.ssh(cg_restore_cmd)
+ if ret:
+ raise Exception('Do the congress command in controller node failed....'
+ 'ret=%s, cmd=%s, output=%s' % (ret, cg_restore_cmd, output))
diff --git a/tests/installer/common/restore_ceilometer.py b/tests/installer/common/restore_ceilometer.py
new file mode 100644
index 00000000..d25b9ede
--- /dev/null
+++ b/tests/installer/common/restore_ceilometer.py
@@ -0,0 +1,27 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import shutil
+
+ep_file = '/etc/ceilometer/event_pipeline.yaml'
+ep_file_bak = '/etc/ceilometer/event_pipeline.yaml.bak'
+
+
+def restore_ep_config():
+
+ if not os.path.isfile(ep_file_bak):
+ print('Bak_file:%s does not exist.' % ep_file_bak)
+ else:
+ print('restore')
+ shutil.copyfile(ep_file_bak, ep_file)
+ os.remove(ep_file_bak)
+ return
+
+
+restore_ep_config()
diff --git a/tests/installer/common/set_ceilometer.py b/tests/installer/common/set_ceilometer.py
new file mode 100644
index 00000000..f5946cb2
--- /dev/null
+++ b/tests/installer/common/set_ceilometer.py
@@ -0,0 +1,44 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import shutil
+import yaml
+
+ep_file = '/etc/ceilometer/event_pipeline.yaml'
+ep_file_bak = '/etc/ceilometer/event_pipeline.yaml.bak'
+event_notifier_topic = 'notifier://?topic=alarm.all'
+
+
+def set_notifier_topic():
+ config_modified = False
+
+ if not os.path.isfile(ep_file):
+ raise Exception("File doesn't exist: %s." % ep_file)
+
+ with open(ep_file, 'r') as file:
+ config = yaml.safe_load(file)
+
+ sinks = config['sinks']
+ for sink in sinks:
+ if sink['name'] == 'event_sink':
+ publishers = sink['publishers']
+ if event_notifier_topic not in publishers:
+ print('Add event notifier in ceilometer')
+ publishers.append(event_notifier_topic)
+ config_modified = True
+ else:
+ print('NOTE: event notifier is configured in ceilometer as we needed')
+
+ if config_modified:
+ shutil.copyfile(ep_file, ep_file_bak)
+ with open(ep_file, 'w+') as file:
+ file.write(yaml.safe_dump(config))
+
+
+set_notifier_topic()
diff --git a/tests/installer/local.py b/tests/installer/local.py
new file mode 100644
index 00000000..dcdf41e3
--- /dev/null
+++ b/tests/installer/local.py
@@ -0,0 +1,109 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import shutil
+import subprocess
+
+from installer.base import BaseInstaller
+from common.utils import load_json_file
+from common.utils import write_json_file
+
+
+class LocalInstaller(BaseInstaller):
+ node_user_name = 'root'
+
+ nova_policy_file = '/etc/nova/policy.json'
+ nova_policy_file_backup = '%s%s' % (nova_policy_file, '.bak')
+
+ def __init__(self, conf, log):
+ super(LocalInstaller, self).__init__(conf, log)
+ self.policy_modified = False
+ self.add_policy_file = False
+
+ def setup(self):
+ self.get_ssh_key_from_installer()
+ self.set_apply_patches()
+
+ def cleanup(self):
+ self.restore_apply_patches()
+
+ def get_ssh_key_from_installer(self):
+ self.log.info('Assuming SSH keys already exchanged with computer for local installer type')
+ return None
+
+ def get_host_ip_from_hostname(self, hostname):
+ self.log.info('Get host ip from host name in local installer......')
+
+ cmd = "getent hosts %s | awk '{ print $1 }'" % (hostname)
+ server = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
+ stdout, stderr = server.communicate()
+ host_ip = stdout.strip()
+
+ self.log.info('Get host_ip:%s from host_name:%s in local installer' % (host_ip, hostname))
+ return host_ip
+
+ def set_apply_patches(self):
+ self._set_nova_policy()
+
+ def restore_apply_patches(self):
+ self._restore_nova_policy()
+
+ def _set_nova_policy(self):
+ host_status_policy = 'os_compute_api:servers:show:host_status'
+ host_status_rule = 'rule:admin_or_owner'
+ policy_data = {
+ 'context_is_admin': 'role:admin',
+ 'owner': 'user_id:%(user_id)s',
+ 'admin_or_owner': 'rule:context_is_admin or rule:owner',
+ host_status_policy: host_status_rule
+ }
+
+ if os.path.isfile(self.nova_policy_file):
+ data = load_json_file(self.nova_policy_file)
+ if host_status_policy in data:
+ rule_origion = data[host_status_policy]
+ if host_status_rule == rule_origion:
+ self.log.info('Do not need to modify nova policy.')
+ self.policy_modified = False
+ else:
+ # update the host_status_policy
+ data[host_status_policy] = host_status_rule
+ self.policy_modified = True
+ else:
+ # add the host_status_policy, if the admin_or_owner is not
+ # defined, add it also
+ for policy, rule in policy_data.items():
+ if policy not in data:
+ data[policy] = rule
+ self.policy_modified = True
+ if self.policy_modified:
+ self.log.info('Nova policy is Modified.')
+ shutil.copyfile(self.nova_policy_file,
+ self.nova_policy_file_backup)
+ else:
+ # file does not exit, create a new one and add the policy
+ self.log.info('Nova policy file not exist. Creating a new one')
+ data = policy_data
+ self.add_policy_file = True
+
+ if self.policy_modified or self.add_policy_file:
+ write_json_file(self.nova_policy_file, data)
+ os.system('screen -S stack -p n-api -X stuff "^C^M^[[A^M"')
+
+ def _restore_nova_policy(self):
+ if self.policy_modified:
+ shutil.copyfile(self.nova_policy_file_backup, self.nova_policy_file)
+ os.remove(self.nova_policy_file_backup)
+ elif self.add_policy_file:
+ os.remove(self.nova_policy_file)
+
+ if self.add_policy_file or self.policy_modified:
+ os.system('screen -S stack -p n-api -X stuff "^C^M^[[A^M"')
+ self.add_policy_file = False
+ self.policy_modified = False
diff --git a/tests/lib/installers/apex b/tests/lib/installers/apex
index 3d94e1c4..f7b9624e 100644
--- a/tests/lib/installers/apex
+++ b/tests/lib/installers/apex
@@ -44,13 +44,14 @@ function installer_apply_patches {
fi
co_conf=/etc/congress/congress.conf
+ co_conf_bak=/etc/congress/congress.conf.bak
co_entry="congress.datasources.doctor_driver.DoctorDriver"
if sudo grep -e "^drivers.*$co_entry" $co_conf; then
echo "NOTE: congress is configured as we needed"
else
echo "modify the congress config"
- sudo sed -i -e "/^drivers/s/$/,$co_entry # added by doctor script/" \
- $co_conf
+ sudo cp $co_conf $co_conf_bak
+ sudo sed -i -e "/^drivers/s/$/,$co_entry/" $co_conf
sudo systemctl restart openstack-congress-server.service
fi
' > installer_apply_patches_$node.log 2>&1
@@ -90,10 +91,10 @@ function installer_revert_patches {
date
co_conf=/etc/congress/congress.conf
- co_entry="congress.datasources.doctor_driver.DoctorDriver"
- if sudo grep -q -e "# added by doctor script" $co_conf; then
- echo "modify the congress config"
- sudo sed -i -e "/^drivers/s/^\(.*\),$co_entry # added by doctor script/\1/" $co_conf
+ co_conf_bak=/etc/congress/congress.conf.bak
+ if [ -e $co_conf_bak ]; then
+ echo "restore the congress config"
+ sudo mv $co_conf_bak $co_conf
sudo systemctl restart openstack-congress-server.service
fi
diff --git a/tests/lib/monitor b/tests/lib/monitor
new file mode 100644
index 00000000..6b804ec2
--- /dev/null
+++ b/tests/lib/monitor
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+MONITOR_TYPE=${MONITOR_TYPE:-sample}
+
+function is_monitor_supported {
+ local monitor="$1"
+ [[ -f $TOP_DIR/lib/monitors/$monitor/$monitor ]]
+}
+
+function is_monitor {
+ local monitor="$1"
+ [[ $monitor == $MONITOR_TYPE ]]
+}
+
+function start_monitor {
+ start_monitor_$MONITOR_TYPE
+}
+
+function stop_monitor {
+ stop_monitor_$MONITOR_TYPE
+}
+
+function cleanup_monitor {
+ cleanup_monitor_$MONITOR_TYPE
+}
+
+if ! is_monitor_supported $MONITOR_TYPE; then
+ die $LINENO "MONITOR_TYPE=$MONITOR_TYPE is not supported."
+fi
+
+source $TOP_DIR/lib/monitors/$MONITOR_TYPE/$MONITOR_TYPE
diff --git a/tests/lib/monitors/collectd/collectd b/tests/lib/monitors/collectd/collectd
new file mode 100644
index 00000000..f5096658
--- /dev/null
+++ b/tests/lib/monitors/collectd/collectd
@@ -0,0 +1,101 @@
+#!/bin/bash
+
+function start_monitor_collectd {
+ ## CONTROL_IP is the IP of primary interface of control node i.e.
+ ## eth0, eno1. It is used by collectd monitor to communicate with
+ ## sample inspector.
+ ## @TODO (umar) see if mgmt IP of control is a better option. Also
+ ## primary interface may not be the right option
+ CONTROL_IP="$(ip a | sed -En 's/127.0.0.1//;s/.*inet (addr:)?(([0-9]*\.){3}[0-9]*).*/\2/p' | sed -n 1p)"
+ #CONTROL_IP=192.168.98.6
+
+ echo "
+Hostname \"$COMPUTE_HOST\"
+FQDNLookup false
+Interval 1
+MaxReadInterval 2
+
+<LoadPlugin python>
+ Globals true
+</LoadPlugin>
+LoadPlugin ovs_events
+LoadPlugin logfile
+
+<Plugin logfile>
+ File \"/var/log/collectd.log\"
+ Timestamp true
+ LogLevel \"info\"
+</Plugin>
+
+<Plugin python>
+ ModulePath \"/home/$COMPUTE_USER\"
+ LogTraces true
+ Interactive false
+ Import \"collectd_plugin\"
+ <Module \"collectd_plugin\">
+ control_ip \"$CONTROL_IP\"
+ compute_ip \"$COMPUTE_IP\"
+ compute_host \"$COMPUTE_HOST\"
+ compute_user \"$COMPUTE_USER\"
+ inspector_type \"$INSPECTOR_TYPE\"
+ os_auth_url \"$OS_AUTH_URL\"
+ os_username \"$OS_USERNAME\"
+ os_password \"$OS_PASSWORD\"
+ os_project_name \"$OS_PROJECT_NAME\"
+ os_user_domain_name \"$OS_USER_DOMAIN_NAME\"
+ os_user_domain_id \"$OS_USER_DOMAIN_ID\"
+ os_project_domain_name \"$OS_PROJECT_DOMAIN_NAME\"
+ os_project_domain_id \"$OS_PROJECT_DOMAIN_ID\"
+ </Module>
+</Plugin>
+
+<Plugin ovs_events>
+ Port 6640
+ Socket \"/var/run/openvswitch/db.sock\"
+ Interfaces \"@INTERFACE_NAME@\"
+ SendNotification true
+ DispatchValues false
+</Plugin>
+
+" > $TOP_DIR/lib/monitors/collectd.conf
+
+ scp $ssh_opts_cpu $TOP_DIR/lib/monitors/collectd.conf $COMPUTE_USER@$COMPUTE_IP:
+ ## @TODO (umar) Always assuming that the interface is assigned an IP if
+ ## interface name is not provided. See if there is a better approach
+ ssh $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP" "
+ if [ -n \"$INTERFACE_NAME\" ]; then
+ dev=$INTERFACE_NAME
+ else
+ dev=\$(sudo ip a | awk '/ $COMPUTE_IP\//{print \$NF}')
+ fi
+ sed -i -e \"s/@INTERFACE_NAME@/\$dev/\" collectd.conf
+ collectd_conf=/opt/collectd/etc/collectd.conf
+ if [ -e \$collectd_conf ]; then
+ sudo cp \$collectd_conf \${collectd_conf}-doctor-saved
+ else
+ sudo touch \${collectd_conf}-doctor-created
+ fi
+ sudo mv collectd.conf /opt/collectd/etc/collectd.conf"
+
+ scp $ssh_opts_cpu $TOP_DIR/lib/monitors/collectd/collectd_plugin.py $COMPUTE_USER@$COMPUTE_IP:collectd_plugin.py
+ ssh $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP" "sudo pkill collectd
+ sudo /opt/collectd/sbin/collectd"
+}
+
+function stop_monitor_collectd {
+ ssh $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP" 'sudo pkill collectd'
+}
+
+function cleanup_monitor_collectd {
+ ssh $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP" "
+ collectd_conf=/opt/collectd/etc/collectd.conf
+ if [ -e \"\${collectd_conf}-doctor-created\" ]; then
+ sudo rm \"\${collectd_conf}-doctor-created\"
+ sudo rm \$collectd_conf
+ elif [ -e \"\${collectd_conf}-doctor-saved\" ]; then
+ sudo cp -f \"\${collectd_conf}-doctor-saved\" \$collectd_conf
+ sudo rm \"\${collectd_conf}-doctor-saved\"
+ fi"
+
+ rm $TOP_DIR/lib/monitors/collectd.conf
+}
diff --git a/tests/lib/monitors/collectd/collectd_plugin.py b/tests/lib/monitors/collectd/collectd_plugin.py
new file mode 100644
index 00000000..70fcf26e
--- /dev/null
+++ b/tests/lib/monitors/collectd/collectd_plugin.py
@@ -0,0 +1,167 @@
+##############################################################################
+# Copyright (c) 2017 NEC Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+
+import collectd
+import sys
+from netifaces import interfaces, ifaddresses, AF_INET
+from datetime import datetime
+import json
+import requests
+import time
+from requests.exceptions import ConnectionError
+
+from keystoneauth1 import loading
+from keystoneauth1 import session
+from congressclient.v1 import client
+
+
+def write_debug(str_write, write_type, compute_user):
+ file_name = ('/home/%s/monitor.log' % compute_user)
+ file_tmp = open(file_name, write_type)
+ file_tmp.write( "%s" % str_write)
+ file_tmp.close()
+
+
+class DoctorMonitorCollectd(object):
+ def __init__(self):
+ self.control_ip = ''
+ self.compute_user = ''
+ self.compute_ip = ''
+ self.host_name = ''
+ self.inspector_type = ''
+ self.inspector_url = ''
+ self.os_auth_url = ''
+ self.os_username = ''
+ self.os_password = ''
+ self.os_project_name = ''
+ self.os_user_domain_name = ''
+ self.os_user_domain_id = ''
+ self.os_project_domain_name = ''
+ self.os_project_domain_id = ''
+ self.sess = ''
+ self.auth = ''
+ self.inspector_notified = 0
+ self.start_notifications = 0
+ self.monitor_type = 'sample'
+
+ def config_func(self, config):
+ for node in config.children:
+ key = node.key.lower()
+ val = node.values[0]
+
+ if key == 'compute_host':
+ self.host_name = val
+ elif key == 'control_ip':
+ self.control_ip = val
+ elif key == 'compute_ip':
+ self.compute_ip = val
+ elif key == 'compute_user':
+ self.compute_user = val
+ elif key == 'inspector_type':
+ self.inspector_type = val
+ elif key == 'os_auth_url':
+ self.os_auth_url = val
+ elif key == 'os_username':
+ self.os_username = val
+ elif key == 'os_password':
+ self.os_password = val
+ elif key == 'os_project_name':
+ self.os_project_name = val
+ elif key == 'os_user_domain_name':
+ self.os_user_domain_name = val
+ elif key == 'os_user_domain_id':
+ self.os_user_domain_id = val
+ elif key == 'os_project_domain_name':
+ self.os_project_domain_name = val
+ elif key == 'os_project_domain_id':
+ self.os_project_domain_id = val
+ else:
+ collectd.info('Unknown config key "%s"' % key)
+
+ def init_collectd(self):
+ write_debug("Compute node collectd monitor start at %s\n\n" % datetime.now().isoformat(), "w", self.compute_user)
+
+ if self.inspector_type == 'sample':
+ self.inspector_url = ('http://%s:12345/events' % self.control_ip)
+ elif self.inspector_type == 'congress':
+ loader = loading.get_plugin_loader('password')
+ self.auth = loader.load_from_options(auth_url=self.os_auth_url,
+ username=self.os_username,
+ password=self.os_password,
+ project_name=self.os_project_name,
+ user_domain_name=self.os_user_domain_name,
+ user_domain_id=self.os_user_domain_id,
+ project_domain_name=self.os_project_domain_name,
+ project_domain_id=self.os_project_domain_id)
+ self.sess=session.Session(auth=self.auth)
+ congress = client.Client(session=self.sess, service_type='policy')
+ ds = congress.list_datasources()['results']
+ doctor_ds = next((item for item in ds if item['driver'] == 'doctor'),
+ None)
+
+ congress_endpoint = congress.httpclient.get_endpoint(auth=self.auth)
+ self.inspector_url = ('%s/v1/data-sources/%s/tables/events/rows' %
+ (congress_endpoint, doctor_ds['id']))
+ else:
+ sys.exit()
+ self.start_notifications = 1
+
+
+ def notify_inspector(self):
+ event_type = "compute.host.down"
+ payload = [
+ {
+ 'id': ("monitor_%s_id1" % self.monitor_type),
+ 'time': datetime.now().isoformat(),
+ 'type': event_type,
+ 'details': {
+ 'hostname': self.host_name,
+ 'status': 'down',
+ 'monitor': ("monitor_%s" % self.monitor_type),
+ 'monitor_event_id': ("monitor_%s_event1" % self.monitor_type)
+ },
+ },
+ ]
+ data = json.dumps(payload)
+ self.inspector_notified = 1
+
+ if self.inspector_type == 'sample':
+ headers = {'content-type': 'application/json'}
+ try:
+ requests.post(self.inspector_url, data=data, headers=headers)
+ except ConnectionError as err:
+ print err
+ elif self.inspector_type == 'congress':
+ # TODO(umar) enhance for token expiry case
+ headers = {
+ 'Content-Type': 'application/json',
+ 'Accept': 'application/json',
+ 'X-Auth-Token': self.sess.get_token()
+ }
+ requests.put(self.inspector_url, data=data, headers=headers)
+
+
+ def handle_notif(self, notification, data=None):
+ if (notification.severity == collectd.NOTIF_FAILURE or
+ notification.severity == collectd.NOTIF_WARNING):
+ if (self.start_notifications == 1 and self.inspector_notified == 0):
+ write_debug("Received down notification: doctor monitor detected at %s\n" % time.time(), "a", self.compute_user)
+ self.notify_inspector()
+
+ elif notification.severity == collectd.NOTIF_OKAY:
+ collectd.info("Interface status: UP again %s\n" % time.time())
+ else:
+ collectd.info("Unknown notification severity %s\n" % notification.severity)
+
+
+monitor = DoctorMonitorCollectd()
+
+collectd.register_config(monitor.config_func)
+collectd.register_init(monitor.init_collectd)
+collectd.register_notification(monitor.handle_notif)
diff --git a/tests/monitor.py b/tests/lib/monitors/sample/monitor.py
index 7450c534..7450c534 100644
--- a/tests/monitor.py
+++ b/tests/lib/monitors/sample/monitor.py
diff --git a/tests/lib/monitors/sample/sample b/tests/lib/monitors/sample/sample
new file mode 100644
index 00000000..1d310333
--- /dev/null
+++ b/tests/lib/monitors/sample/sample
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+function start_monitor_sample {
+ cp $TOP_DIR/lib/monitors/sample/monitor.py $TOP_DIR/monitor.py
+ pgrep -f "python monitor.py" && return 0
+ sudo -E python monitor.py "$COMPUTE_HOST" "$COMPUTE_IP" "$INSPECTOR_TYPE" \
+ > monitor.log 2>&1 &
+}
+
+function stop_monitor_sample {
+ pgrep -f "python monitor.py" || return 0
+ sudo kill $(pgrep -f "python monitor.py")
+}
+
+function cleanup_monitor_sample {
+ rm monitor.py
+ return
+}
diff --git a/tests/logger.py b/tests/logger.py
index 72043ab3..b7a49fdb 100644
--- a/tests/logger.py
+++ b/tests/logger.py
@@ -36,12 +36,11 @@ class Logger(object):
ch.setLevel(logging.INFO)
self.logger.addHandler(ch)
- file_handler = logging.FileHandler('%s.log' % logger_name)
+ filename = '%s.log' % logger_name
+ file_handler = logging.FileHandler(filename, mode='w')
file_handler.setFormatter(formatter)
file_handler.setLevel(logging.DEBUG)
self.logger.addHandler(file_handler)
-
def getLogger(self):
return self.logger
-
diff --git a/tests/main.py b/tests/main.py
index 7714d7da..edff16f2 100644
--- a/tests/main.py
+++ b/tests/main.py
@@ -8,16 +8,30 @@
##############################################################################
import os
from os.path import isfile, join
+import random
import sys
+import time
from alarm import Alarm
+from common.constants import Host
+from common.utils import match_rep_in_file
import config
+from consumer import get_consumer
+from identity_auth import get_identity_auth
+from identity_auth import get_session
from image import Image
from instance import Instance
from inspector import get_inspector
+from installer import get_installer
import logger as doctor_log
-from user import User
from network import Network
+from monitor import get_monitor
+from os_clients import nova_client
+from profiler_poc import main as profiler_main
+from scenario.common import calculate_notification_time
+from scenario.network_failure import NetworkFault
+from user import User
+
LOG = doctor_log.Logger('doctor').getLogger()
@@ -31,10 +45,21 @@ class DoctorTest(object):
self.network = Network(self.conf, LOG)
self.instance = Instance(self.conf, LOG)
self.alarm = Alarm(self.conf, LOG)
+ self.installer = get_installer(self.conf, LOG)
self.inspector = get_inspector(self.conf, LOG)
+ self.monitor = get_monitor(self.conf,
+ self.inspector.get_inspector_url(),
+ LOG)
+ self.consumer = get_consumer(self.conf, LOG)
+ self.fault = NetworkFault(self.conf, self.installer, LOG)
+ auth = get_identity_auth(project=self.conf.doctor_project)
+ self.nova = nova_client(self.conf.nova_version,
+ get_session(auth=auth))
+ self.down_host = None
def setup(self):
# prepare the cloud env
+ self.installer.setup()
# preparing VM image...
self.image.create()
@@ -54,30 +79,124 @@ class DoctorTest(object):
# starting doctor sample components...
self.inspector.start()
+ self.down_host = self.get_host_info_for_random_vm()
+ self.monitor.start(self.down_host)
+
+ self.consumer.start()
+
def run(self):
"""run doctor test"""
try:
LOG.info('doctor test starting.......')
+ # prepare test env
self.setup()
+ # wait for aodh alarms are updated in caches for event evaluator,
+ # sleep time should be larger than event_alarm_cache_ttl(default 60)
+ time.sleep(60)
+
# injecting host failure...
+ # NOTE (umar) add INTERFACE_NAME logic to host injection
- # verify the test results
+ self.fault.start(self.down_host)
+ time.sleep(10)
+ # verify the test results
+ # NOTE (umar) copy remote monitor.log file when monitor=collectd
+ self.check_host_status(self.down_host.name, 'down')
+
+ notification_time = calculate_notification_time()
+ if notification_time < 1 and notification_time > 0:
+ LOG.info('doctor test successfully, notification_time=%s' % notification_time)
+ else:
+ LOG.error('doctor test failed, notification_time=%s' % notification_time)
+ sys.exit(1)
+
+ if self.conf.profiler_type:
+ LOG.info('doctor test begin to run profile.......')
+ self.collect_logs()
+ self.run_profiler()
except Exception as e:
LOG.error('doctor test failed, Exception=%s' % e)
sys.exit(1)
finally:
self.cleanup()
+ def get_host_info_for_random_vm(self):
+ num = random.randint(0, self.conf.instance_count - 1)
+ vm_name = "%s%d" % (self.conf.instance_basename, num)
+
+ servers = \
+ {getattr(server, 'name'): server
+ for server in self.nova.servers.list()}
+ server = servers.get(vm_name)
+ if not server:
+ raise \
+ Exception('Can not find instance: vm_name(%s)' % vm_name)
+ host_name = server.__dict__.get('OS-EXT-SRV-ATTR:hypervisor_hostname')
+ host_ip = self.installer.get_host_ip_from_hostname(host_name)
+
+ LOG.info('Get host info(name:%s, ip:%s) which vm(%s) launched at'
+ % (host_name, host_ip, vm_name))
+ return Host(host_name, host_ip)
+
+ def check_host_status(self, hostname, state):
+ service = self.nova.services.list(host=hostname, binary='nova-compute')
+ host_state = service[0].__dict__.get('state')
+ assert host_state == state
+
+ def unset_forced_down_hosts(self):
+ if self.down_host:
+ self.nova.services.force_down(self.down_host.name, 'nova-compute', False)
+ time.sleep(2)
+ self.check_host_status(self.down_host.name, 'up')
+
+ def collect_logs(self):
+ self.fault.get_disable_network_log()
+
+ def run_profiler(self):
+
+ log_file = '{0}/{1}'.format(sys.path[0], 'disable_network.log')
+ reg = '(?<=doctor set link down at )\d+.\d+'
+ linkdown = float(match_rep_in_file(reg, log_file).group(0))
+
+ log_file = '{0}/{1}'.format(sys.path[0], 'doctor.log')
+ reg = '(.* doctor mark vm.* error at )(\d+.\d+)'
+ vmdown = float(match_rep_in_file(reg, log_file).group(2))
+
+ reg = '(?<=doctor mark host.* down at )\d+.\d+'
+ hostdown = float(match_rep_in_file(reg, log_file).group(2))
+
+ reg = '(?<=doctor monitor detected at )\d+.\d+'
+ detected = float(match_rep_in_file(reg, log_file).group(0))
+
+ reg = '(?<=doctor consumer notified at )\d+.\d+'
+ notified = float(match_rep_in_file(reg, log_file).group(0))
+
+ # TODO(yujunz) check the actual delay to verify time sync status
+ # expected ~1s delay from $trigger to $linkdown
+ relative_start = linkdown
+ os.environ['DOCTOR_PROFILER_T00'] = str(int((linkdown - relative_start)*1000))
+ os.environ['DOCTOR_PROFILER_T01'] = str(int((detected - relative_start) * 1000))
+ os.environ['DOCTOR_PROFILER_T03'] = str(int((vmdown - relative_start) * 1000))
+ os.environ['DOCTOR_PROFILER_T04'] = str(int((hostdown - relative_start) * 1000))
+ os.environ['DOCTOR_PROFILER_T09'] = str(int((notified - relative_start) * 1000))
+
+ profiler_main(log=LOG)
+
def cleanup(self):
+ self.unset_forced_down_hosts()
+ self.inspector.stop()
+ self.monitor.stop()
+ self.consumer.stop()
+ self.installer.cleanup()
self.alarm.delete()
self.instance.delete()
self.network.delete()
self.image.delete()
+ self.fault.cleanup()
self.user.delete()
- self.inspector.stop()
def main():
diff --git a/tests/monitor/__init__.py b/tests/monitor/__init__.py
new file mode 100644
index 00000000..e268907f
--- /dev/null
+++ b/tests/monitor/__init__.py
@@ -0,0 +1,29 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+from oslo_config import cfg
+from oslo_utils import importutils
+
+OPTS = [
+ cfg.StrOpt('type',
+ default='sample',
+ choices=['sample', 'collectd'],
+ help='the type of doctor monitor component',
+ required=True),
+]
+
+
+_monitor_name_class_mapping = {
+ 'sample': 'monitor.sample.SampleMonitor',
+ 'collectd': 'monitor.collectd.CollectdMonitor'
+}
+
+def get_monitor(conf, inspector_url, log):
+ monitor_class = _monitor_name_class_mapping.get(conf.monitor.type)
+ return importutils.import_object(monitor_class, conf,
+ inspector_url, log)
diff --git a/tests/monitor/base.py b/tests/monitor/base.py
new file mode 100644
index 00000000..119c8a1c
--- /dev/null
+++ b/tests/monitor/base.py
@@ -0,0 +1,27 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import abc
+import six
+
+
+@six.add_metaclass(abc.ABCMeta)
+class BaseMonitor(object):
+ """Monitor computer fault and report error to the inspector"""
+ def __init__(self, conf, inspector_url, log):
+ self.conf = conf
+ self.log = log
+ self.inspector_url = inspector_url
+
+ @abc.abstractmethod
+ def start(self, host):
+ pass
+
+ @abc.abstractmethod
+ def stop(self):
+ pass
diff --git a/tests/monitor/collectd.py b/tests/monitor/collectd.py
new file mode 100644
index 00000000..e2a800ea
--- /dev/null
+++ b/tests/monitor/collectd.py
@@ -0,0 +1,138 @@
+##############################################################################
+# Copyright (c) 2017 NEC Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+
+import os
+import socket
+import getpass
+import sys
+
+from monitor.base import BaseMonitor
+
+
+class CollectdMonitor(BaseMonitor):
+ def __init__(self, conf, inspector_url, log):
+ super(CollectdMonitor, self).__init__(conf, inspector_url, log)
+ self.top_dir = os.path.dirname(sys.path[0])
+ tmp_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+ tmp_sock.connect(("8.8.8.8", 80))
+
+ ## control_ip is the IP of primary interface of control node i.e.
+ ## eth0, eno1. It is used by collectd monitor to communicate with
+ ## sample inspector.
+ ## TODO (umar) see if mgmt IP of control is a better option. Also
+ ## primary interface may not be the right option
+ self.control_ip = tmp_sock.getsockname()[0]
+ self.compute_user = getpass.getuser()
+ self.interface_name = os.environ.get('INTERFACE_NAME') or ''
+ self.inspector_type = os.environ.get('INSPECTOR_TYPE', 'sample')
+ self.auth_url = os.environ.get('OS_AUTH_URL')
+ self.username = os.environ.get('OS_USERNAME')
+ self.password = os.environ.get('OS_PASSWORD')
+ self.project_name = os.environ.get('OS_PROJECT_NAME')
+ self.user_domain_name = os.environ.get('OS_USER_DOMAIN_NAME') or 'default'
+ self.user_domain_id = os.environ.get('OS_USER_DOMAIN_ID')
+ self.project_domain_name = os.environ.get('OS_PROJECT_DOMAIN_NAME') or 'default'
+ self.project_domain_id = os.environ.get('OS_PROJECT_DOMAIN_ID')
+ self.ssh_opts_cpu = '-o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no'
+
+ def start(self, host):
+ self.log.info("Collectd monitor start.........")
+ self.compute_host = host.name
+ self.compute_ip = host.ip
+ f = open("%s/tests/collectd.conf" % self.top_dir, 'w')
+ collectd_conf_file = """
+Hostname %s
+FQDNLookup false
+Interval 1
+MaxReadInterval 2
+
+<LoadPlugin python>
+Globals true
+</LoadPlugin>
+LoadPlugin ovs_events
+LoadPlugin logfile
+
+<Plugin logfile>
+ File \"/var/log/collectd.log\"
+ Timestamp true
+ LogLevel \"info\"
+</Plugin>
+
+<Plugin python>
+ ModulePath \"/home/%s\"
+ LogTraces true
+ Interactive false
+ Import \"collectd_plugin\"
+ <Module \"collectd_plugin\">
+ control_ip \"%s\"
+ compute_ip \"%s\"
+ compute_host \"%s\"
+ compute_user \"%s\"
+ inspector_type \"%s\"
+ os_auth_url \"%s\"
+ os_username \"%s\"
+ os_password \"%s\"
+ os_project_name \"%s\"
+ os_user_domain_name \"%s\"
+ os_user_domain_id \"%s\"
+ os_project_domain_name \"%s\"
+ os_project_domain_id \"%s\"
+ </Module>
+</Plugin>
+
+<Plugin ovs_events>
+ Port 6640
+ Socket \"/var/run/openvswitch/db.sock\"
+ Interfaces \"@INTERFACE_NAME@\"
+ SendNotification true
+ DispatchValues false
+</Plugin>
+ """ % (self.compute_host, self.compute_user, self.control_ip, self.compute_ip, self.compute_host, self.compute_user,
+ self.inspector_type, self.auth_url, self.username, self.password, self.project_name, self.user_domain_name,
+ self.user_domain_id, self.project_domain_name, self.project_domain_id)
+ f.write(collectd_conf_file)
+ f.close()
+
+ os.system(" scp %s %s/tests/collectd.conf %s@%s: " % (self.ssh_opts_cpu, self.top_dir, self.compute_user, self.compute_ip))
+ self.log.info("after first scp")
+ ## @TODO (umar) Always assuming that the interface is assigned an IP if
+ ## interface name is not provided. See if there is a better approach
+ os.system(""" ssh %s %s@%s \"if [ -n \"%s\" ]; then
+ dev=%s
+ else
+ dev=\$(sudo ip a | awk '/ %s\//{print \$NF}')
+ fi
+ sed -i -e \"s/@INTERFACE_NAME@/\$dev/\" collectd.conf
+ collectd_conf=/opt/collectd/etc/collectd.conf
+ if [ -e \$collectd_conf ]; then
+ sudo cp \$collectd_conf \${collectd_conf}-doctor-saved
+ else
+ sudo touch \${collectd_conf}-doctor-created
+ fi
+ sudo mv collectd.conf /opt/collectd/etc/collectd.conf\" """ % (self.ssh_opts_cpu, self.compute_user, self.compute_ip, self.interface_name, self.interface_name, self.compute_ip))
+ self.log.info("after first ssh")
+ os.system(" scp %s %s/tests/lib/monitors/collectd/collectd_plugin.py %s@%s:collectd_plugin.py " % (self.ssh_opts_cpu, self.top_dir, self.compute_user, self.compute_ip))
+ self.log.info("after sec scp")
+ os.system(" ssh %s %s@%s \"sudo pkill collectd; sudo /opt/collectd/sbin/collectd\" " % (self.ssh_opts_cpu, self.compute_user, self.compute_ip))
+ self.log.info("after sec ssh")
+
+ def stop(self):
+ os.system(" ssh %s %s@%s \"sudo pkill collectd\" " % (self.ssh_opts_cpu, self.compute_user, self.compute_ip))
+
+ def cleanup(self):
+ os.system(""" ssh %s %s@%s \"
+ collectd_conf=/opt/collectd/etc/collectd.conf
+ if [ -e \"\${collectd_conf}-doctor-created\" ]; then
+ sudo rm \"\${collectd_conf}-doctor-created\"
+ sudo rm \$collectd_conf
+ elif [ -e \"\${collectd_conf}-doctor-saved\" ]; then
+ sudo cp -f \"\${collectd_conf}-doctor-saved\" \$collectd_conf
+ sudo rm \"\${collectd_conf}-doctor-saved\"
+ fi\" """ % (self.ssh_opts_cpu, self.compute_user, self.compute_ip))
+ os.remove("%s/tests/collectd.conf" % self.top_dir)
diff --git a/tests/monitor/sample.py b/tests/monitor/sample.py
new file mode 100644
index 00000000..9ac1bccf
--- /dev/null
+++ b/tests/monitor/sample.py
@@ -0,0 +1,106 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+from datetime import datetime
+import json
+import requests
+import socket
+from threading import Thread
+import time
+
+from identity_auth import get_session
+from monitor.base import BaseMonitor
+
+
+class SampleMonitor(BaseMonitor):
+ event_type = "compute.host.down"
+
+ def __init__(self, conf, inspector_url, log):
+ super(SampleMonitor, self).__init__(conf, inspector_url, log)
+ self.session = get_session()
+ self.pinger = None
+
+ def start(self, host):
+ self.log.info('sample monitor start......')
+ self.pinger = Pinger(host.name, host.ip, self, self.log)
+ self.pinger.start()
+
+ def stop(self):
+ self.log.info('sample monitor stop......')
+ if self.pinger is not None:
+ self.pinger.stop()
+ self.pinger.join()
+
+ def report_error(self, hostname):
+ self.log.info('sample monitor report error......')
+ data = [
+ {
+ 'id': 'monitor_sample_id1',
+ 'time': datetime.now().isoformat(),
+ 'type': self.event_type,
+ 'details': {
+ 'hostname': hostname,
+ 'status': 'down',
+ 'monitor': 'monitor_sample',
+ 'monitor_event_id': 'monitor_sample_event1'
+ },
+ },
+ ]
+
+ auth_token = self.session.get_token() if \
+ self.conf.inspector.type != 'sample' else None
+ headers = {
+ 'Content-Type': 'application/json',
+ 'Accept': 'application/json',
+ 'X-Auth-Token': auth_token,
+ }
+
+ url = '%s%s' % (self.inspector_url, 'events') \
+ if self.inspector_url.endswith('/') else \
+ '%s%s' % (self.inspector_url, '/events')
+ requests.put(url, data=json.dumps(data), headers=headers)
+
+
+class Pinger(Thread):
+ interval = 0.1 # second
+ timeout = 0.1 # second
+ ICMP_ECHO_MESSAGE = bytes([0x08, 0x00, 0xf7, 0xff, 0x00, 0x00, 0x00, 0x00])
+
+ def __init__(self, host_name, host_ip, monitor, log):
+ Thread.__init__(self)
+ self.monitor = monitor
+ self.hostname = host_name
+ self.ip_addr = host_ip or socket.gethostbyname(self.hostname)
+ self.log = log
+ self._stopped = False
+
+ def run(self):
+ self.log.info("Starting Pinger host_name(%s), host_ip(%s)"
+ % (self.hostname, self.ip_addr))
+
+ sock = socket.socket(socket.AF_INET, socket.SOCK_RAW,
+ socket.IPPROTO_ICMP)
+ sock.settimeout(self.timeout)
+ while True:
+ if self._stopped:
+ return
+ try:
+ sock.sendto(self.ICMP_ECHO_MESSAGE, (self.ip_addr, 0))
+ sock.recv(4096)
+ except socket.timeout:
+ self.log.info("doctor monitor detected at %s" % time.time())
+ self.monitor.report_error(self.hostname)
+ self.log.info("ping timeout, quit monitoring...")
+ self._stopped = True
+ return
+ time.sleep(self.interval)
+
+ def stop(self):
+ self.log.info("Stopping Pinger host_name(%s), host_ip(%s)"
+ % (self.hostname, self.ip_addr))
+ self._stopped = True
diff --git a/tests/os_clients.py b/tests/os_clients.py
index 2699930f..44fa3aad 100644
--- a/tests/os_clients.py
+++ b/tests/os_clients.py
@@ -9,6 +9,7 @@
from oslo_config import cfg
import aodhclient.client as aodhclient
+from congressclient.v1 import client as congressclient
import glanceclient.client as glanceclient
from keystoneclient.v2_0 import client as ks_client
from neutronclient.v2_0 import client as neutronclient
@@ -42,3 +43,8 @@ def neutron_client(session):
def aodh_client(version, session):
return aodhclient.Client(version, session=session)
+
+
+def congress_client(session):
+ return congressclient.Client(session=session,
+ service_type='policy')
diff --git a/tests/profiler-poc.py b/tests/profiler_poc.py
index 408cb09a..ea36eaeb 100644
--- a/tests/profiler-poc.py
+++ b/tests/profiler_poc.py
@@ -21,6 +21,16 @@ See also: https://goo.gl/98Osig
import json
import os
+from oslo_config import cfg
+
+
+OPTS = [
+ cfg.StrOpt('profiler_type',
+ default=os.environ.get('PROFILER_TYPE', 'poc'),
+ help='the type of installer'),
+]
+
+
OUTPUT = 'doctor_profiling_output'
PREFIX = 'DOCTOR_PROFILER'
TOTAL_CHECK_POINTS = 10
@@ -46,7 +56,7 @@ link down:{T00}| | | | | | | | |
"""
-def main():
+def main(log=None):
check_points = ["T{:02d}".format(i) for i in range(TOTAL_CHECK_POINTS)]
module_map = {"M{:02d}".format(i):
(MODULE_CHECK_POINTS[i], MODULE_CHECK_POINTS[i + 1])
@@ -59,7 +69,7 @@ def main():
def format_tag(tag):
return TAG_FORMAT.format(tag or '?')
- tags = {cp: format_tag(ms) for cp, ms in elapsed_ms.iteritems()}
+ tags = {cp: format_tag(ms) for cp, ms in elapsed_ms.items()}
def time_cost(cp):
if elapsed_ms[cp[0]] and elapsed_ms[cp[1]]:
@@ -69,10 +79,10 @@ def main():
# module time cost tags
modules_cost_ms = {module: time_cost(cp)
- for module, cp in module_map.iteritems()}
+ for module, cp in module_map.items()}
tags.update({module: format_tag(cost)
- for module, cost in modules_cost_ms.iteritems()})
+ for module, cost in modules_cost_ms.items()})
tags.update({'total': time_cost((check_points[0], check_points[-1]))})
@@ -81,7 +91,10 @@ def main():
logfile = open('{}.json'.format(OUTPUT), 'w')
logfile.write(json.dumps(tags))
- print profile
+ print(profile)
+ if log:
+ log.info('%s' % profile)
+
if __name__ == '__main__':
main()
diff --git a/tests/run.sh b/tests/run.sh
index 5c922265..e1875e09 100755
--- a/tests/run.sh
+++ b/tests/run.sh
@@ -29,7 +29,7 @@ DOCTOR_USER=doctor
DOCTOR_PW=doctor
DOCTOR_PROJECT=doctor
DOCTOR_ROLE=_member_
-PROFILER_TYPE=${PROFILER_TYPE:-none}
+PROFILER_TYPE=${PROFILER_TYPE:-poc}
PYTHON_ENABLE=${PYTHON_ENABLE:-false}
TOP_DIR=$(cd $(dirname "$0") && pwd)
@@ -212,17 +212,6 @@ create_alarm() {
done
}
-start_monitor() {
- pgrep -f "python monitor.py" && return 0
- sudo -E python monitor.py "$COMPUTE_HOST" "$COMPUTE_IP" "$INSPECTOR_TYPE" \
- > monitor.log 2>&1 &
-}
-
-stop_monitor() {
- pgrep -f "python monitor.py" || return 0
- sudo kill $(pgrep -f "python monitor.py")
-}
-
start_consumer() {
pgrep -f "python consumer.py" && return 0
python consumer.py "$CONSUMER_PORT" > consumer.log 2>&1 &
@@ -294,8 +283,12 @@ inject_failure() {
echo "disabling network of compute host [$COMPUTE_HOST] for 3 mins..."
cat > disable_network.sh << 'END_TXT'
#!/bin/bash -x
-dev=$(sudo ip a | awk '/ @COMPUTE_IP@\//{print $NF}')
sleep 1
+if [ -n "@INTERFACE_NAME@" ]; then
+ dev=@INTERFACE_NAME@
+else
+ dev=$(sudo ip a | awk '/ @COMPUTE_IP@\//{print $NF}')
+fi
sudo ip link set $dev down
echo "doctor set link down at" $(date "+%s.%N")
sleep 180
@@ -303,6 +296,7 @@ sudo ip link set $dev up
sleep 1
END_TXT
sed -i -e "s/@COMPUTE_IP@/$COMPUTE_IP/" disable_network.sh
+ sed -i -e "s/@INTERFACE_NAME@/$INTERFACE_NAME/" disable_network.sh
chmod +x disable_network.sh
scp $ssh_opts_cpu disable_network.sh "$COMPUTE_USER@$COMPUTE_IP:"
ssh $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP" 'nohup ./disable_network.sh > disable_network.log 2>&1 &'
@@ -327,8 +321,11 @@ calculate_notification_time() {
wait_consumer 60
#keep 'at' as the last keyword just before the value, and
#use regex to get value instead of the fixed column
+ if [ ! -f monitor.log ]; then
+ scp $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP:monitor.log" .
+ fi
detected=$(grep "doctor monitor detected at" monitor.log |\
- sed -e "s/^.* at //")
+ sed -e "s/^.* at //" | tail -1)
notified=$(grep "doctor consumer notified at" consumer.log |\
sed -e "s/^.* at //" | tail -1)
@@ -424,18 +421,18 @@ run_profiler() {
export DOCTOR_PROFILER_T09=$(python -c \
"print(int(($notified-$relative_start)*1000))")
- python profiler-poc.py >doctor_profiler.log 2>&1
+ python profiler_poc.py > doctor_profiler.log 2>&1
fi
}
cleanup() {
set +e
echo "cleanup..."
- stop_monitor
stop_inspector
stop_consumer
unset_forced_down_hosts
+ stop_monitor
collect_logs
vms=$(openstack $as_doctor_user server list)
@@ -467,6 +464,7 @@ cleanup() {
cleanup_installer
cleanup_inspector
+ cleanup_monitor
# NOTE: Temporal log printer.
for f in $(find . -name '*.log')
@@ -478,6 +476,13 @@ cleanup() {
done
}
+setup_python_packages() {
+ sudo pip install flask==0.10.1
+ command -v openstack || sudo pip install python-openstackclient==2.3.0
+ command -v ceilometer || sudo pip install python-ceilometerclient==2.6.2
+ command -v congress || sudo pip install python-congressclient==1.5.0
+}
+
# Main process
if [[ $PYTHON_ENABLE == [Tt]rue ]]; then
@@ -499,9 +504,14 @@ git log --oneline -1 || true # ignore even you don't have git installed
trap cleanup EXIT
+setup_python_packages
+
source $TOP_DIR/functions-common
source $TOP_DIR/lib/installer
source $TOP_DIR/lib/inspector
+source $TOP_DIR/lib/monitor
+
+rm -f *.log
setup_installer
@@ -531,8 +541,8 @@ echo "injecting host failure..."
inject_failure
check_host_status "(DOWN|UNKNOWN)" 60
-calculate_notification_time
unset_forced_down_hosts
+calculate_notification_time
collect_logs
run_profiler
diff --git a/tests/scenario/__init__.py b/tests/scenario/__init__.py
new file mode 100644
index 00000000..48893ae6
--- /dev/null
+++ b/tests/scenario/__init__.py
@@ -0,0 +1,8 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
diff --git a/tests/scenario/common.py b/tests/scenario/common.py
new file mode 100644
index 00000000..a33c50ff
--- /dev/null
+++ b/tests/scenario/common.py
@@ -0,0 +1,28 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import sys
+from common.utils import match_rep_in_file
+
+
+def calculate_notification_time():
+ log_file = '{0}/{1}'.format(sys.path[0], 'doctor.log')
+
+ reg = '(?<=doctor monitor detected at )\d+.\d+'
+ result = match_rep_in_file(reg, log_file)
+ if not result:
+ raise Exception('Can not match detected time')
+ detected = result.group(0)
+
+ reg = '(?<=doctor consumer notified at )\d+.\d+'
+ result = match_rep_in_file(reg, log_file)
+ if not result:
+ raise Exception('Can not match notified time')
+ notified = result.group(0)
+
+ return float(notified) - float(detected) \ No newline at end of file
diff --git a/tests/scenario/network_failure.py b/tests/scenario/network_failure.py
new file mode 100644
index 00000000..e9a239db
--- /dev/null
+++ b/tests/scenario/network_failure.py
@@ -0,0 +1,71 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+from identity_auth import get_session
+from os_clients import nova_client
+from common.utils import SSHClient
+
+LINK_DOWN_SCRIPT = """
+#!/bin/bash -x
+dev=$(sudo ip a | awk '/ {compute_ip}\//{{print $NF}}')
+sleep 1
+sudo ip link set $dev down
+echo "doctor set link down at" $(date "+%s.%N")
+sleep 10
+sudo ip link set $dev up
+sleep 1
+"""
+
+
+class NetworkFault(object):
+
+ def __init__(self, conf, installer, log):
+ self.conf = conf
+ self.log = log
+ self.installer = installer
+ self.nova = nova_client(self.conf.nova_version, get_session())
+ self.host = None
+ self.GetLog = False
+
+ def start(self, host):
+ self.log.info('fault inject start......')
+ self._set_link_down(host.ip)
+ self.host = host
+ self.log.info('fault inject end......')
+
+ def cleanup(self):
+ self.log.info('fault inject cleanup......')
+ self.get_disable_network_log()
+
+ def get_disable_network_log(self):
+ if self.GetLog:
+ self.log.info('Already get the disable_netork.log from down_host......')
+ return
+ if self.host is not None:
+ client = SSHClient(self.host.ip,
+ self.installer.node_user_name,
+ key_filename=self.installer.get_ssh_key_from_installer(),
+ look_for_keys=True,
+ log=self.log)
+ client.scp('disable_network.log', './disable_network.log', method='get')
+ self.log.info('Get the disable_netork.log from down_host(host_name:%s, host_ip:%s)'
+ % (self.host.name, self.host.ip))
+ self.GetLog = True
+
+ def _set_link_down(self, compute_ip):
+ file_name = './disable_network.sh'
+ with open(file_name, 'w') as file:
+ file.write(LINK_DOWN_SCRIPT.format(compute_ip=compute_ip))
+ client = SSHClient(compute_ip,
+ self.installer.node_user_name,
+ key_filename=self.installer.get_ssh_key_from_installer(),
+ look_for_keys=True,
+ log=self.log)
+ client.scp('./disable_network.sh', 'disable_network.sh')
+ command = 'bash disable_network.sh > disable_network.log 2>&1 &'
+ client.ssh(command)
diff --git a/tox.ini b/tox.ini
index c6262ed5..c8cebe8b 100644
--- a/tox.ini
+++ b/tox.ini
@@ -23,5 +23,8 @@ passenv =
PROFILER_TYPE
PYTHON_ENABLE
CI_DEBUG
+ INSTALLER_TYPE
+ INSTALLER_IP
+ PROFILER_TYPE
changedir = {toxinidir}/tests
commands = doctor-test