-rw-r--r--  .gitignore | 2
-rw-r--r--  .yamllint | 6
-rw-r--r--  INFO | 30
-rw-r--r--  INFO.yaml | 86
-rw-r--r--  UPSTREAM | 9
-rw-r--r--  devstack/README.rst | 4
-rw-r--r--  devstack/local.conf.sample | 120
-rw-r--r--  docs/conf.py | 2
-rw-r--r--  docs/conf.yaml | 3
-rw-r--r--  docs/development/design/images/maintenance-workflow.png | bin 0 -> 81286 bytes
-rw-r--r--  docs/development/design/maintenance-design-guideline.rst | 426
-rw-r--r--  docs/development/index.rst | 14
-rwxr-xr-x  docs/development/overview/functest_scenario/images/figure-p1.png | bin 60756 -> 0 bytes
-rw-r--r--  docs/development/overview/index.rst | 7
-rw-r--r--  docs/development/overview/overview.rst | 52
-rw-r--r--  docs/development/overview/testing.rst | 96
-rw-r--r--  docs/development/requirements/index.rst | 6
-rw-r--r--  docs/index.rst | 17
-rw-r--r--  docs/release/configguide/feature.configuration.rst | 116
-rw-r--r--  docs/release/configguide/index.rst | 6
-rw-r--r--  docs/release/index.rst | 12
-rw-r--r--  docs/release/installation/index.rst (renamed from docs/development/manuals/index.rst) | 12
-rw-r--r--  docs/release/installation/installation.rst | 44
-rw-r--r--  docs/release/release-notes/index.rst | 2
-rw-r--r--  docs/release/release-notes/release-notes.rst | 146
-rw-r--r--  docs/release/release-notes/releasenotes_euphrates.rst (renamed from docs/release/release-notes/releasenotes.rst) | 0
-rw-r--r--  docs/release/release-notes/releasenotes_fraser.rst | 100
-rw-r--r--  docs/release/release-notes/releasenotes_gambia.rst | 303
-rw-r--r--  docs/release/release-notes/releasenotes_iruya.rst | 129
-rw-r--r--  docs/release/scenarios/fault_management/fault_management.rst (renamed from docs/development/overview/functest_scenario/doctor-scenario-in-functest.rst) | 78
-rw-r--r--  docs/release/scenarios/maintenance/images/Fault-management-design.png | bin 0 -> 237110 bytes
-rw-r--r--  docs/release/scenarios/maintenance/images/LICENSE (renamed from docs/development/overview/functest_scenario/images/LICENSE) | 0
-rw-r--r--  docs/release/scenarios/maintenance/images/Maintenance-design.png | bin 0 -> 316640 bytes
-rw-r--r--  docs/release/scenarios/maintenance/images/Maintenance-workflow.png | bin 0 -> 81286 bytes
-rw-r--r--  docs/release/scenarios/maintenance/maintenance.rst | 120
-rw-r--r--  docs/release/userguide/feature.userguide.rst | 64
-rw-r--r--  docs/release/userguide/get-valid-server-state.rst (renamed from docs/development/manuals/get-valid-server-state.rst) | 0
-rw-r--r--  docs/release/userguide/index.rst | 3
-rw-r--r--  docs/release/userguide/mark-host-down_manual.rst (renamed from docs/development/manuals/mark-host-down_manual.rst) | 0
-rw-r--r--  docs/release/userguide/monitors.rst (renamed from docs/development/manuals/monitors.rst) | 3
-rw-r--r--  docs/requirements.txt | 2
-rw-r--r--  docs/testing/developer/index.rst | 13
-rw-r--r--  docs/testing/developer/testing.rst | 82
-rw-r--r--  docs/testing/index.rst | 15
-rw-r--r--  docs/testing/user/index.rst | 13
-rw-r--r--  docs/testing/user/testing.rst | 30
-rw-r--r--  doctor_tests/admin_tool/__init__.py | 37
-rw-r--r--  doctor_tests/admin_tool/base.py | 26
-rw-r--r--  doctor_tests/admin_tool/fenix/Dockerfile | 34
-rwxr-xr-x  doctor_tests/admin_tool/fenix/run | 32
-rw-r--r--  doctor_tests/admin_tool/sample.py | 739
-rw-r--r--  doctor_tests/app_manager/__init__.py | 40
-rw-r--r--  doctor_tests/app_manager/base.py | 26
-rw-r--r--  doctor_tests/app_manager/sample.py | 265
-rw-r--r--  doctor_tests/app_manager/vnfm.py | 441
-rw-r--r--  doctor_tests/common/constants.py | 4
-rw-r--r--  doctor_tests/common/utils.py | 26
-rw-r--r--  doctor_tests/config.py | 4
-rw-r--r--  doctor_tests/consumer/__init__.py | 2
-rw-r--r--  doctor_tests/consumer/base.py | 9
-rw-r--r--  doctor_tests/consumer/sample.py | 4
-rw-r--r--  doctor_tests/identity_auth.py | 3
-rw-r--r--  doctor_tests/image.py | 13
-rw-r--r--  doctor_tests/inspector/__init__.py | 8
-rw-r--r--  doctor_tests/inspector/base.py | 18
-rw-r--r--  doctor_tests/inspector/congress.py | 19
-rw-r--r--  doctor_tests/inspector/sample.py | 94
-rw-r--r--  doctor_tests/installer/__init__.py | 17
-rw-r--r--  doctor_tests/installer/apex.py | 252
-rw-r--r--  doctor_tests/installer/base.py | 154
-rw-r--r--  doctor_tests/installer/common/congress.py | 51
-rw-r--r--  doctor_tests/installer/common/restart_aodh.py | 42
-rw-r--r--  doctor_tests/installer/common/restore_aodh.py | 32
-rw-r--r--  doctor_tests/installer/common/restore_ceilometer.py | 27
-rw-r--r--  doctor_tests/installer/common/restore_compute_config.py | 26
-rw-r--r--  doctor_tests/installer/common/restore_config.py | 48
-rw-r--r--  doctor_tests/installer/common/restore_congress.py | 29
-rw-r--r--  doctor_tests/installer/common/set_ceilometer.py | 45
-rw-r--r--  doctor_tests/installer/common/set_compute_config.py | 53
-rw-r--r--  doctor_tests/installer/common/set_config.py | 163
-rw-r--r--  doctor_tests/installer/common/set_congress.py | 39
-rw-r--r--  doctor_tests/installer/common/set_fenix.sh | 106
-rw-r--r--  doctor_tests/installer/common/vitrage.py | 5
-rw-r--r--  doctor_tests/installer/daisy.py | 75
-rw-r--r--  doctor_tests/installer/devstack.py | 151
-rw-r--r--  doctor_tests/installer/local.py | 118
-rw-r--r--  doctor_tests/installer/mcp.py | 207
-rw-r--r--  doctor_tests/main.py | 246
-rw-r--r--  doctor_tests/maintenance_hot_tpl.yaml | 119
-rw-r--r--  doctor_tests/monitor/base.py | 9
-rw-r--r--  doctor_tests/monitor/sample.py | 4
-rw-r--r--  doctor_tests/os_clients.py | 7
-rw-r--r--  doctor_tests/scenario/__init__.py | 12
-rw-r--r--  doctor_tests/scenario/common.py | 26
-rw-r--r--  doctor_tests/scenario/fault_management.py | 233
-rw-r--r--  doctor_tests/scenario/maintenance.py | 250
-rw-r--r--  doctor_tests/scenario/network_failure.py | 84
-rw-r--r--  doctor_tests/stack.py | 118
-rw-r--r--  doctor_tests/user.py | 66
-rw-r--r--  etc/doctor.sample.conf | 3
-rw-r--r--  requirements.txt | 31
-rw-r--r--  tox.ini | 27
102 files changed, 5773 insertions(+), 1119 deletions(-)
diff --git a/.gitignore b/.gitignore
index 6fc25628..b21ec655 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,7 +6,6 @@
/*.egg-info/
.eggs/
/build/
-/docs_build/
/docs_output/
/releng/
/tests/*.img
@@ -16,3 +15,4 @@
#Build results
.tox
+docs/_build/*
diff --git a/.yamllint b/.yamllint
new file mode 100644
index 00000000..3517e7f2
--- /dev/null
+++ b/.yamllint
@@ -0,0 +1,6 @@
+extends: default
+rules:
+  line-length:
+    max: 120
+    allow-non-breakable-words: true
+    allow-non-breakable-inline-mappings: true
diff --git a/INFO b/INFO
deleted file mode 100644
index f3d4e582..00000000
--- a/INFO
+++ /dev/null
@@ -1,30 +0,0 @@
-Project: Fault Management project (doctor)
-Project Creation Date: December 2, 2014
-Project Category: Requirement
-Lifecycle State: Mature
-Primary Contact: Ryota Mibu (r-mibu@cq.jp.nec.com)
-Project Lead: Ryota Mibu (r-mibu@cq.jp.nec.com)
-Jira Project Name: Fault Management project
-Jira Project Prefix: DOCTOR
-Mailing list tag: [doctor]
-IRC: Server:freenode.net Channel:#opnfv-doctor
-Repository: doctor
-
-Committers:
-Ashiq Khan (NTT DOCOMO, khan@nttdocomo.com)
-Bertrand Souville (NTT DOCOMO, souville@docomolab-euro.com)
-Carlos Goncalves (NEC, Carlos.Goncalves@neclab.eu)
-Dong Wenjuan (ZTE, dong.wenjuan@zte.com.cn)
-Gerald Kunzmann (NTT DOCOMO, kunzmann@docomolab-euro.com)
-Mario Cho (hephaex@gmail.com)
-Peter Lee (Corenova Technologies, peter@corenova.com)
-Ryota Mibu (NEC, r-mibu@cq.jp.nec.com)
-Serge Manning (Sprint, Serge.Manning@sprint.com)
-Tomi Juvonen (Nokia, tomi.juvonen@nokia.com)
-
-Link to TSC approval of the project: http://meetbot.opnfv.org/meetings/opnfv-meeting/2014/opnfv-meeting.2014-12-02-14.58.html
-Link(s) to approval of committer update:
-http://lists.opnfv.org/pipermail/opnfv-tsc/2015-June/000905.html
-http://lists.opnfv.org/pipermail/opnfv-tech-discuss/2015-June/003165.html
-http://lists.opnfv.org/pipermail/opnfv-tech-discuss/2016-June/011245.html
-http://lists.opnfv.org/pipermail/opnfv-tech-discuss/2016-July/011771.html
diff --git a/INFO.yaml b/INFO.yaml
new file mode 100644
index 00000000..97acb69f
--- /dev/null
+++ b/INFO.yaml
@@ -0,0 +1,86 @@
+---
+project: 'Fault Management project (doctor)'
+project_creation_date: 'December 2, 2014'
+project_category: 'Requirement'
+lifecycle_state: 'Mature'
+project_lead: &opnfv_doctor_ptl
+    name: 'Tomi Juvonen'
+    email: 'tomi.juvonen@nokia.com'
+    company: 'Nokia'
+    id: 'only1road'
+    timezone: 'Unknown'
+primary_contact: *opnfv_doctor_ptl
+issue_tracking:
+    type: 'jira'
+    url: 'https://jira.opnfv.org/projects/DOCTOR'
+    key: 'DOCTOR'
+mailing_list:
+    type: 'mailman2'
+    url: 'opnfv-tech-discuss@lists.opnfv.org'
+    tag: '[doctor]'
+realtime_discussion:
+    type: irc
+    server: 'freenode.net'
+    channel: '#opnfv-doctor'
+meetings:
+    - type: 'gotomeeting+irc'
+      agenda: https://etherpad.opnfv.org/p/doctor_meetings
+      url: https://global.gotomeeting.com/join/391235029
+      server: 'freenode.net'
+      channel: '#opnfv-doctor'
+      repeats: 'weekly'
+      time: '11:00 UTC'
+repositories:
+    - 'doctor'
+committers:
+    - <<: *opnfv_doctor_ptl
+    - name: 'wenjuan dong'
+      email: 'dong.wenjuan@zte.com.cn'
+      company: 'ZTE'
+      id: 'dongwenjuan'
+tsc:
+    # yamllint disable rule:line-length
+    approval: 'http://meetbot.opnfv.org/meetings/opnfv-meeting/2014/opnfv-meeting.2014-12-02-14.58.html'
+    changes:
+        - type: 'removal'
+          name: 'Dirk Kutscher'
+          link: 'http://lists.opnfv.org/pipermail/opnfv-tsc/2015-June/000905.html'
+        - type: 'removal'
+          name: 'Palani Chinnakannan'
+          link: 'http://lists.opnfv.org/pipermail/opnfv-tsc/2015-June/000905.html'
+        - type: 'removal'
+          name: 'Petri Kemppainen'
+          link: 'http://lists.opnfv.org/pipermail/opnfv-tsc/2015-June/000905.html'
+        - type: 'removal'
+          name: 'Tapio Tallgren'
+          link: 'http://lists.opnfv.org/pipermail/opnfv-tsc/2015-June/000905.html'
+        - type: 'removal'
+          name: 'Zhangyu'
+          link: 'http://lists.opnfv.org/pipermail/opnfv-tsc/2015-June/000905.html'
+        - type: 'promotion'
+          name: 'Gerald Kunzmann'
+          link: 'http://lists.opnfv.org/pipermail/opnfv-tech-discuss/2015-June/003165.html'
+        - type: 'promotion'
+          name: 'Dong Wenjuan'
+          link: 'http://lists.opnfv.org/pipermail/opnfv-tech-discuss/2016-June/011245.html'
+        - type: 'promotion'
+          name: 'Bertrand Souville'
+          link: 'http://lists.opnfv.org/pipermail/opnfv-tech-discuss/2016-July/011771.html'
+        - type: 'removal'
+          name: 'Tommy Lindgren'
+          link: 'https://lists.opnfv.org/pipermail/opnfv-tech-discuss/2016-December/014216.html'
+        - type: 'removal'
+          name: 'Ulrich Kleber'
+          link: 'https://lists.opnfv.org/pipermail/opnfv-tech-discuss/2017-October/018687.html'
+        - type: 'removal'
+          name: 'Carlos Goncalves'
+          link: 'https://lists.opnfv.org/pipermail/opnfv-tsc/2018-January/004069.html'
+        - type: 'removal'
+          name: 'Mario Cho'
+          link: 'https://lists.opnfv.org/pipermail/opnfv-tsc/2018-March/004177.html'
+        - type: 'removal'
+          name: 'Peter Lee'
+          link: 'https://lists.opnfv.org/pipermail/opnfv-tsc/2018-March/004190.html'
+        - type: 'removal'
+          name: 'Bertrand Souville'
+          link: 'https://lists.opnfv.org/g/opnfv-tech-discuss/message/22344'
diff --git a/UPSTREAM b/UPSTREAM
index de01cbfc..04ca5116 100644
--- a/UPSTREAM
+++ b/UPSTREAM
@@ -52,3 +52,12 @@
-
url: https://bugs.launchpad.net/python-openstackclient/+bug/1684989
system: Launchpad-bug
+-
+ url: https://blueprints.launchpad.net/vitrage/+spec/doctor-datasource
+ system: Launchpad
+-
+ url: https://blueprints.launchpad.net/vitrage/+spec/support-inspector-sb-api
+ system: Launchpad
+-
+ url: https://blueprints.launchpad.net/vitrage/+spec/support-mark-down-action-for-instances
+ system: Launchpad
diff --git a/devstack/README.rst b/devstack/README.rst
index 91e8abfe..aaa18a7f 100644
--- a/devstack/README.rst
+++ b/devstack/README.rst
@@ -18,7 +18,9 @@ OPNFV Doctor in DevStack.
enable_plugin osprofiler https://git.openstack.org/openstack/osprofiler
enable_plugin doctor https://git.opnfv.org/doctor
-to the ``[[local|localrc]]`` section.
+to the ``[[local|localrc]]`` section. Alternatively, you can copy ``local.conf.sample``::
+
+ cp /<path-to-doctor>/devstack/local.conf.sample ${DEVSTACK_DIR}/local.conf
.. note:: The order of enabling plugins matters.
diff --git a/devstack/local.conf.sample b/devstack/local.conf.sample
new file mode 100644
index 00000000..2967714a
--- /dev/null
+++ b/devstack/local.conf.sample
@@ -0,0 +1,120 @@
+# Sample ``local.conf`` for user-configurable variables in ``stack.sh``
+
+# NOTE: Copy this file to the root DevStack directory for it to work properly.
+
+# ``local.conf`` is a user-maintained settings file that is sourced from ``stackrc``.
+# This gives it the ability to override any variables set in ``stackrc``.
+# Also, most of the settings in ``stack.sh`` are written to only be set if no
+# value has already been set; this lets ``local.conf`` effectively override the
+# default values.
+
+# This is a collection of some of the settings we have found to be useful
+# in our DevStack development environments. Additional settings are described
+# in https://docs.openstack.org/devstack/latest/configuration.html#local-conf
+# These should be considered as samples and are unsupported DevStack code.
+
+# The ``localrc`` section replaces the old ``localrc`` configuration file.
+# Note that if ``localrc`` is present it will be used in favor of this section.
+[[local|localrc]]
+
+# Minimal Contents
+# ----------------
+
+# While ``stack.sh`` is happy to run without ``localrc``, devlife is better when
+# there are a few minimal variables set:
+
+# If the ``*_PASSWORD`` variables are not set here you will be prompted to enter
+# values for them by ``stack.sh`` and they will be added to ``local.conf``.
+ADMIN_PASSWORD=devstack
+DATABASE_PASSWORD=$ADMIN_PASSWORD
+RABBIT_PASSWORD=$ADMIN_PASSWORD
+SERVICE_PASSWORD=$ADMIN_PASSWORD
+
+# ``HOST_IP`` and ``HOST_IPV6`` should be set manually for best results if
+# the NIC configuration of the host is unusual, i.e. ``eth1`` has the default
+# route but ``eth0`` is the public interface. They are auto-detected in
+# ``stack.sh`` but often is indeterminate on later runs due to the IP moving
+# from an Ethernet interface to a bridge on the host. Setting it here also
+# makes it available for ``openrc`` to include when setting ``OS_AUTH_URL``.
+# Neither is set by default.
+HOST_IP=127.0.0.1
+#HOST_IPV6=2001:db8::7
+
+
+# Logging
+# -------
+
+# By default ``stack.sh`` output only goes to the terminal where it runs. It can
+# be configured to additionally log to a file by setting ``LOGFILE`` to the full
+# path of the destination log file. A timestamp will be appended to the given name.
+LOGFILE=$DEST/logs/stack.sh.log
+
+# Old log files are automatically removed after 7 days to keep things neat. Change
+# the number of days by setting ``LOGDAYS``.
+LOGDAYS=2
+
+# Nova logs will be colorized if ``SYSLOG`` is not set; turn this off by setting
+# ``LOG_COLOR`` false.
+#LOG_COLOR=False
+
+
+# Using milestone-proposed branches
+# ---------------------------------
+
+# Uncomment these to grab the milestone-proposed branches from the
+# repos:
+#CINDER_BRANCH=milestone-proposed
+#GLANCE_BRANCH=milestone-proposed
+#HORIZON_BRANCH=milestone-proposed
+#KEYSTONE_BRANCH=milestone-proposed
+#KEYSTONECLIENT_BRANCH=milestone-proposed
+#NOVA_BRANCH=milestone-proposed
+#NOVACLIENT_BRANCH=milestone-proposed
+#NEUTRON_BRANCH=milestone-proposed
+#SWIFT_BRANCH=milestone-proposed
+
+# Using git versions of clients
+# -----------------------------
+# By default clients are installed from pip. See LIBS_FROM_GIT in
+# stackrc for details on getting clients from specific branches or
+# revisions. e.g.
+# LIBS_FROM_GIT="python-ironicclient"
+# IRONICCLIENT_BRANCH=refs/changes/44/2.../1
+
+# Swift
+# -----
+
+# Swift is now used as the back-end for the S3-like object store. Setting the
+# hash value is required and you will be prompted for it if Swift is enabled
+# so just set it to something already:
+SWIFT_HASH=66a3d6b56c1f479c8b4e70ab5c2000f5
+
+# For development purposes the default of 3 replicas is usually not required.
+# Set this to 1 to save some resources:
+SWIFT_REPLICAS=1
+
+# The data for Swift is stored by default in (``$DEST/data/swift``),
+# or (``$DATA_DIR/swift``) if ``DATA_DIR`` has been set, and can be
+# moved by setting ``SWIFT_DATA_DIR``. The directory will be created
+# if it does not exist.
+SWIFT_DATA_DIR=$DEST/data
+
+# OPNFV Doctor
+# ------------
+
+# Enable the required plugins
+# The order of enabling plugins matters
+enable_plugin aodh http://git.openstack.org/openstack/aodh
+enable_plugin panko https://git.openstack.org/openstack/panko
+enable_plugin ceilometer https://git.openstack.org/openstack/ceilometer
+enable_plugin osprofiler https://git.openstack.org/openstack/osprofiler
+enable_plugin doctor https://git.opnfv.org/doctor
+
+# To enable Python 3
+# USE_PYTHON3=True
+
+# To enable Congress as Doctor Inspector
+# enable_plugin congress https://git.openstack.org/openstack/congress
+
+# To enable Neutron port data plane status
+# Q_ML2_PLUGIN_EXT_DRIVERS=data_plane_status
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 00000000..3c9978bb
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,2 @@
+from docs_conf.conf import * # noqa: F401,F403
+master_doc = 'index'
diff --git a/docs/conf.yaml b/docs/conf.yaml
new file mode 100644
index 00000000..e08a41ab
--- /dev/null
+++ b/docs/conf.yaml
@@ -0,0 +1,3 @@
+---
+project_cfg: opnfv
+project: DOCTOR
diff --git a/docs/development/design/images/maintenance-workflow.png b/docs/development/design/images/maintenance-workflow.png
new file mode 100644
index 00000000..9b65fd59
--- /dev/null
+++ b/docs/development/design/images/maintenance-workflow.png
Binary files differ
diff --git a/docs/development/design/maintenance-design-guideline.rst b/docs/development/design/maintenance-design-guideline.rst
index 93c3cf4e..47002b96 100644
--- a/docs/development/design/maintenance-design-guideline.rst
+++ b/docs/development/design/maintenance-design-guideline.rst
@@ -5,151 +5,329 @@
Planned Maintenance Design Guideline
====================================
-.. NOTE::
- This is spec draft of design guideline for planned maintenance.
- JIRA ticket to track the update and collect comments: `DOCTOR-52`_.
-
-This document describes how one can implement planned maintenance by utilizing
-the `OPNFV Doctor project`_. framework and to meet the set requirements.
+This document describes how one can implement infrastructure maintenance in
+interaction with the VNFM by utilizing the `OPNFV Doctor project`_ framework to
+meet the set requirements. The document concentrates on OpenStack and VMs, while
+the designed concept is generic for any payload or even a different VIM. The
+admin tool should also cover controllers and other cloud hardware, but that is
+not the main focus of OPNFV Doctor and should be defined in more detail in the
+upstream implementation. The same goes for any further detailed work to be done.
Problem Description
===================
-Telco application need to know when planned maintenance is going to happen in
-order to guarantee zero down time in its operation. It needs to be possible to
-make own actions to have application running on not affected resource or give
+A Telco application needs to know when infrastructure maintenance is going to
+happen in order to guarantee zero downtime in its operation. It needs to be
+possible to take its own actions to keep the application running on an
+unaffected resource, or to give
guidance to admin actions like migration. More details are defined in
requirement documentation: `use cases`_, `architecture`_ and `implementation`_.
-Also discussion in the OPNFV summit about `planned maintenance session`_.
Guidelines
==========
-Cloud admin needs to make a notification about planned maintenance including
-all details that application needs in order to make decisions upon his affected
-service. This notification payload can be consumed by application by subscribing
-to corresponding event alarm trough alarming service like OpenStack AODH.
+Concepts used:
+
+- `event`: Notification to RabbitMQ with a particular event type.
+
+- `state event`: Notification to RabbitMQ with a particular event type,
+  including a payload with a variable defining the state.
+
+- `project event`: Notification to RabbitMQ that is meant for a project. A
+  single event type is used with different payload and state information.
+
+- `admin event`: Notification to RabbitMQ that is meant for the admin or for
+  any infrastructure service. A single event type is used with different state
+  information.
+
+- `rolling maintenance`: Node-by-node rolling maintenance and upgrade where a
+  single node at a time is maintained after any application payload has been
+  moved away from the node.
+
+- `project` stands for `application` in the OpenStack context and both are used
+  in this document. `tenant` is often used with the same meaning.
+
+The infrastructure admin needs to send notifications with two different event
+types: one meant for the admin and one for the project. The notification
+payload can be consumed by the application and by the admin by subscribing to
+the corresponding event alarm through an alarming service like OpenStack AODH;
+a subscription sketch is given after the list below.
+
+- The infrastructure admin needs to send a notification about infrastructure
+  maintenance including all details that the application needs in order to make
+  decisions about its affected service. The alarm payload can hold a link to
+  the infrastructure admin tool API for replies and for other possible
+  information. There are many steps of communication between the admin tool and
+  the application, so the payload needed for the information passed is very
+  similar. Because of this, the same event type can be used, with a variable
+  like `state` to tell the application what action is needed for each event.
+  If a project has not subscribed to the alarm, the admin tool responsible for
+  the maintenance will assume it can do maintenance operations without
+  interaction with the application on top of it.
+
+- The infrastructure admin needs to send an event about infrastructure
+  maintenance telling when the maintenance starts and another when it ends.
+  This admin-level event should include the host name. It could be consumed by
+  any admin-level infrastructure entity. In this document it is consumed by the
+  `Inspector`, which in `OPNFV Doctor project`_ terms is the infrastructure
+  entity responsible for automatic host fault management. Automated actions
+  clearly need to be disabled during planned maintenance.
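+
+As an illustration, the application manager could subscribe to the
+project-specific maintenance event type with an AODH event alarm. The event
+type ``maintenance.scheduled`` and the consumer endpoint below are assumptions
+taken from the sample configuration elsewhere in this patch, not a fixed part
+of the design:
+
+.. code-block:: bash
+
+    # Hypothetical sketch: let the application manager (VNFM) receive project
+    # maintenance events as webhooks. The alarm fires on every
+    # maintenance.scheduled event for this project and POSTs the event payload
+    # to the given alarm-action URL.
+    openstack alarm create \
+      --name maintenance_scheduled \
+      --type event \
+      --event-type maintenance.scheduled \
+      --alarm-action http://<app-manager-ip>:12348/maintenance
+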
Before maintenance starts application needs to be able to make switch over for
his ACT-STBY service affected, do operation to move service to not effected part
-of infra or give a hint for admin operation like migration that can be
+of infrastructure or give a hint for admin operation like migration that can be
automatically issued by admin tool according to agreed policy.
-Flow diagram::
-
- admin alarming project controller inspector
- | service app manager | |
- | 1. | | | |
- +------------------------->+ |
- +<-------------------------+ |
- | 2. | | | |
- +------>+ 3. | | |
- | +-------->+ 4. | |
- | | +------->+ |
- | | 5. +<-------+ |
- +<----------------+ | |
- | | 6. | |
- +------------------------->+ |
- +<-------------------------+ 7. |
- +------------------------------------->+
- | 8. | | | |
- +------>+ 9. | | |
- | +-------->+ | |
- +--------------------------------------+
- | 10. |
- +--------------------------------------+
- | 11. | | | |
- +------------------------->+ |
- +<-------------------------+ |
- | 12. | | | |
- +------>+-------->+ | 13. |
- +------------------------------------->+
- +-------+---------+--------+-----------+
-
-Concepts used below:
-
-- `full maintenance`: This means maintenance will take a longer time and
- resource should be emptied, meaning container or VM need to be moved or
- deleted. Admin might need to test resource to work after maintenance.
-
-- `reboot`: Only a reboot is needed and admin does not need separate testing
- after that. Container or VM can be left in place if so wanted.
-
-- `notification`: Notification to rabbitmq.
-
-Admin makes a planned maintenance session where he sets
-a `maintenance_session_id` that is a unique ID for all the hardware resources he
-is going to have the maintenance at the same time. Mostly maintenance should be
-done node by node, meaning a single compute node at a time would be in single
-planned maintenance session having unique `maintenance_session_id`. This ID will
-be carried trough the whole session in all places and can be used to query
-maintenance in admin tool API. Project running a Telco application should set
-a specific role for admin tool to know it cannot do planned maintenance unless
-project has agreed actions to be done for its VMs or containers. This means the
-project has configured itself to get alarms upon planned maintenance and it is
-capable of agreeing needed actions. Admin is supposed to use an admin tool to
-automate maintenance process partially or entirely.
-
-The flow of a successful planned maintenance session as in OpenStack example
-case:
-
-1. Admin disables nova-compute in order to do planned maintenance on a compute
- host and gets ACK from the API call. This action needs to be done to ensure
- no thing will be placed in this compute host by any user. Action is always
- done regardless the whole compute will be affected or not.
-2. Admin sends a project specific maintenance notification with state
- `planned maintenance`. This includes detailed information about maintenance,
- like when it is going to start, is it `reboot` or `full maintenance`
- including the information about project containers or VMs running on host or
- the part of it that will need maintenance. Also default action like
- migration will be mentioned that will be issued by admin before maintenance
- starts if no other action is set by project. In case project has a specific
- role set, planned maintenance cannot start unless project has agreed the
- admin action. Available admin actions are also listed in notification.
-3. Application manager of the project receives AODH alarm about the same.
-4. Application manager can do switch over to his ACT-STBY service, delete and
- re-instantiate his service on not affected resource if so wanted.
-5. Application manager may call admin tool API to give preferred instructions
- for leaving VMs and containers in place or do admin action to migrate them.
- In case admin does not receive this instruction before maintenance is to
- start it will do the pre-configured default action like migration to
- projects without a specific role to say project need to agree the action.
- VMs or Containers can be left on host if type of maintenance is just `reboot`.
-6. Admin does possible actions to VMs and containers and receives an ACK.
-7. In case everything went ok, Admin sends admin type of maintenance
- notification with state `in maintenance`. This notification can be consumed
- by Inspector and other cloud services to know there is ongoing maintenance
- which means things like automatic fault management actions for the hardware
- resources should be disabled.
-8. If maintenance type is `reboot` and project is still having containers or
- VMs running on affected hardware resource, Admin sends project specific
- maintenance notification with state updated to `in maintenance`. If project
- do not have anything left running on affected hardware resource, state will
- be `maintenance over` instead. If maintenance can not be performed for some
- reason state should be `maintenance cancelled`. In this case last operation
- remaining for admin is to re-enable nova-compute service, ensure
- everything is running and not to proceed any further steps.
-9. Application manager of the project receives AODH alarm about the same.
-10. Admin will do the maintenance. This is out of Doctor scope.
-11. Admin enables nova-compute service when maintenance is over and host can be
- put back to production. An ACK is received from API call.
-12. In case project had left containers or VMs on hardware resource over
- maintenance, Admin sends project specific maintenance notification with
- state updated to `maintenance over`.
-13. Admin sends admin type of maintenance notification with state updated to
- `maintenance over`. Inspector and other
- cloud services can consume this to know hardware resource is back in use.
+There should be at least one empty host compatible with the hosts under
+maintenance in order to have a smooth `rolling maintenance`. For this to be
+possible, down-scaling the application instances should also be possible.
+
+The infrastructure admin should have a tool that is responsible for hosting a
+maintenance workflow session with the needed APIs for the admin and for
+applications. The group of hosts in a single maintenance session should always
+have the same physical capabilities, so the rolling maintenance can be
+guaranteed.
+
+The flow diagram is meant to be as high level as possible. It does not try to
+be complete, but to show the most important interfaces needed between the VNFM
+and the infrastructure admin. For example, error handling is not shown and can
+be defined later on.
+
+Flow diagram:
+
+.. figure:: images/maintenance-workflow.png
+ :alt: Work flow in OpenStack
+
+Flow diagram step by step:
+
+- The infrastructure admin creates a maintenance session to maintain and
+  upgrade a certain group of hardware. At least the compute hardware in a
+  single session should have the same capabilities, like the number of VCPUs,
+  to ensure the maintenance can be done node by node in a rolling fashion. The
+  maintenance session needs to have a `session_id`, a unique ID that is carried
+  throughout all events and can be used in the APIs needed when interacting
+  with the session. The maintenance session needs to know when the maintenance
+  will start and what capabilities the possible upgrade of the infrastructure
+  will bring to the application payload on top of it. It is a matter of the
+  implementation to define in more detail whether more data is needed when
+  creating a session or whether it is defined in the admin tool configuration
+  (a request sketch for creating a session is given after this step list).
+
+  There can be several parallel maintenance sessions and a single session can
+  include the payload of multiple projects. Typically a maintenance session
+  should include a similar type of compute hardware, so that moving instances
+  between the compute hosts is guaranteed to work.
+
+- State `MAINTENANCE` `project event` and reply `ACK_MAINTENANCE`. Immediately
+  after a maintenance session is created, the infrastructure admin tool will
+  send a project-specific notification which the application manager can
+  consume by subscribing to the AODH alarm for this event. As explained
+  earlier, all `project event`s will only be sent if the project subscribes to
+  the alarm; otherwise the interaction with the application is simply skipped
+  and operations could be forced.
+
+  The state `MAINTENANCE` event should at least include:
+
+  - `session_id` to reference the correct maintenance session.
+  - `state` as `MAINTENANCE` to identify the event action needed.
+  - `instance_ids` to tell the project which of its instances will be affected
+    by the maintenance. This might be a link to the admin tool project-specific
+    API, as AODH variables are limited to strings of 255 characters.
+  - `reply_url` for the application to call the admin tool project-specific API
+    to answer `ACK_MAINTENANCE`, including the `session_id` (a reply sketch is
+    given after this list).
+  - `project_id` to identify the project.
+  - `actions_at` time stamp to indicate when the maintenance workflow will
+    start. The `ACK_MAINTENANCE` reply is needed before that time.
+  - `metadata` to include key-value pairs of capabilities coming with the
+    maintenance operation, like 'openstack_version': 'Queens'.
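+
+A minimal sketch of the `ACK_MAINTENANCE` reply, assuming the admin tool
+accepts a JSON body over HTTP at the `reply_url` it advertised in the alarm
+payload; the exact method and schema are implementation specific and only
+illustrative here:
+
+.. code-block:: bash
+
+    # reply_url and session_id are taken from the received MAINTENANCE event.
+    curl -X PUT "$reply_url" \
+      -H "Content-Type: application/json" \
+      -d "{\"session_id\": \"$session_id\", \"state\": \"ACK_MAINTENANCE\"}"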
+
+- Optional state `DOWN_SCALE` `project event` and reply `ACK_DOWN_SCALE`. When
+  it is time to start the maintenance workflow, as the time reaches the
+  `actions_at` defined in the previous `state event`, the admin tool needs to
+  check whether there is already an empty compute host as needed by the
+  `rolling maintenance`. In case there is no empty host, the admin tool can ask
+  the application to down-scale by sending the project-specific `DOWN_SCALE`
+  `state event`.
+
+  The state `DOWN_SCALE` event should at least include:
+
+  - `session_id` to reference the correct maintenance session.
+  - `state` as `DOWN_SCALE` to identify the event action needed.
+  - `reply_url` for the application to call the admin tool project-specific API
+    to answer `ACK_DOWN_SCALE`, including the `session_id`.
+  - `project_id` to identify the project.
+  - `actions_at` time stamp to indicate the last moment to send
+    `ACK_DOWN_SCALE`. This gives the application time to finish some ongoing
+    transactions before down-scaling its instances, which guarantees zero
+    downtime for its service.
+
+- Optional state `PREPARE_MAINTENANCE` `project event` and reply
+  `ACK_PREPARE_MAINTENANCE`. In case there is still no empty compute host after
+  down-scaling the applications, the admin tool needs to analyze the situation
+  on the compute hosts under maintenance. It needs to choose the compute node
+  that is now almost empty or otherwise has the least critical instances
+  running, if possible, for example by looking at whether there are floating
+  IPs. When a compute host is chosen, a `PREPARE_MAINTENANCE` `state event` can
+  be sent to the projects having instances running on this host to migrate them
+  to other compute hosts. It might also be possible to have another round of
+  the `DOWN_SCALE` `state event` if necessary, but this is not proposed here.
+
+  The state `PREPARE_MAINTENANCE` event should at least include:
+
+  - `session_id` to reference the correct maintenance session.
+  - `state` as `PREPARE_MAINTENANCE` to identify the event action needed.
+  - `instance_ids` to tell the project which of its instances will be affected
+    by the `state event`. This might be a link to the admin tool
+    project-specific API, as AODH variables are limited to strings of 255
+    characters.
+  - `reply_url` for the application to call the admin tool project-specific API
+    to answer `ACK_PREPARE_MAINTENANCE`, including the `session_id` and
+    `instance_ids` with a list of key-value pairs with `instance_id` as the key
+    and the chosen action from the allowed actions given via `allowed_actions`
+    as the value (see the reply sketch after this list).
+  - `project_id` to identify the project.
+  - `actions_at` time stamp to indicate the last moment to send
+    `ACK_PREPARE_MAINTENANCE`. This gives the application time to finish some
+    ongoing transactions within its instances and make a possible switch over.
+    This guarantees zero downtime for its service.
+  - `allowed_actions` to tell what the admin tool supports as actions to move
+    instances to another compute host. Typically a list like:
+    `['MIGRATE', 'LIVE_MIGRATE']`
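+
+A minimal sketch of the `ACK_PREPARE_MAINTENANCE` reply with per-instance
+action choices; as above, the HTTP method, JSON schema and shell variables are
+assumptions used only to illustrate the information that needs to be passed
+back:
+
+.. code-block:: bash
+
+    # Choose LIVE_MIGRATE for one instance and MIGRATE for another, picking
+    # from the allowed_actions received in the PREPARE_MAINTENANCE event.
+    curl -X PUT "$reply_url" \
+      -H "Content-Type: application/json" \
+      -d "{\"session_id\": \"$session_id\",
+           \"state\": \"ACK_PREPARE_MAINTENANCE\",
+           \"instance_ids\": {\"$instance1_id\": \"LIVE_MIGRATE\",
+                              \"$instance2_id\": \"MIGRATE\"}}"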
+
+- Optional state `INSTANCE_ACTION_DONE` `project event`. In case the admin tool
+  needed to take an action to move an instance, like migrating it to another
+  compute host, this `state event` will be sent to tell that the operation is
+  complete.
+
+  The state `INSTANCE_ACTION_DONE` event should at least include:
+
+  - `session_id` to reference the correct maintenance session.
+  - `instance_ids` to tell the project which of its instances had the admin
+    action done.
+  - `project_id` to identify the project.
+
+- At this state it is guaranteed that there is an empty compute host. It would
+  be maintained first through the `IN_MAINTENANCE` and `MAINTENANCE_COMPLETE`
+  steps, but following the flow chart, `PLANNED_MAINTENANCE` will be explained
+  next.
+
+- Optional state `PLANNED_MAINTENANCE` `project event` and reply
+  `ACK_PLANNED_MAINTENANCE`. In case the compute host to be maintained has
+  instances, the projects owning those should receive this `state event`. When
+  a project receives this `state event`, it knows that instances moved to
+  another compute host as a result of the actions will now go to a host that is
+  already maintained. This means the host might have new capabilities that the
+  project can take into use. This gives the project the possibility to also
+  upgrade its instances to support the new capabilities as part of the action
+  chosen to move the instances.
+
+  The state `PLANNED_MAINTENANCE` event should at least include:
+
+  - `session_id` to reference the correct maintenance session.
+  - `state` as `PLANNED_MAINTENANCE` to identify the event action needed.
+  - `instance_ids` to tell the project which of its instances will be affected
+    by the event. This might be a link to the admin tool project-specific API,
+    as AODH variables are limited to strings of 255 characters.
+  - `reply_url` for the application to call the admin tool project-specific API
+    to answer `ACK_PLANNED_MAINTENANCE`, including the `session_id` and
+    `instance_ids` with a list of key-value pairs with `instance_id` as the key
+    and the chosen action from the allowed actions given via `allowed_actions`
+    as the value.
+  - `project_id` to identify the project.
+  - `actions_at` time stamp to indicate the last moment to send
+    `ACK_PLANNED_MAINTENANCE`. This gives the application time to finish some
+    ongoing transactions within its instances and make a possible switch over.
+    This guarantees zero downtime for its service.
+  - `allowed_actions` to tell what the admin tool supports as actions to move
+    instances to another compute host. Typically a list like:
+    `['MIGRATE', 'LIVE_MIGRATE', 'OWN_ACTION']`
+
+    `OWN_ACTION` means that the application may want to re-instantiate its
+    instance, perhaps to take into use a new capability coming with the
+    infrastructure maintenance. The re-instantiated instance will go to an
+    already maintained host having the new capability.
+  - `metadata` to include key-value pairs of capabilities coming with the
+    maintenance operation, like 'openstack_version': 'Queens'.
+
+- State `IN_MAINTENANCE` and `MAINTENANCE_COMPLETE` `admin event`s. Just before
+  a host goes into maintenance, the `IN_MAINTENANCE` `state event` will be sent
+  to indicate that the host is entering maintenance. The host is then taken out
+  of production and can be powered off, replaced, or rebooted during the
+  operation. During the maintenance and upgrade the host might be moved to the
+  admin's own host aggregate, so it can be tested to work before being put back
+  into production. After the maintenance is complete, the `MAINTENANCE_COMPLETE`
+  `state event` will be sent to tell that the host is back in use. Adding or
+  removing a host is not yet included in this concept, but can be addressed
+  later. A sketch of subscribing to these `admin event`s is given after the
+  list below.
+
+  The state `IN_MAINTENANCE` and `MAINTENANCE_COMPLETE` events should at least
+  include:
+
+  - `session_id` to reference the correct maintenance session.
+  - `state` as `IN_MAINTENANCE` or `MAINTENANCE_COMPLETE` to indicate the host
+    state.
+  - `project_id` to identify the admin project needed by the AODH alarm.
+  - `host` to indicate the host name.
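+
+A sketch of how an admin-level entity such as the Inspector could subscribe to
+these `admin event`s. The event type ``maintenance.host`` and the consumer URL
+are assumptions borrowed from the sample configuration in this patch; the
+Doctor Sample Inspector port 12345 is used only as an example:
+
+.. code-block:: bash
+
+    # Hypothetical sketch: forward admin maintenance events to the Inspector so
+    # it can disable automatic fault management actions for hosts that are
+    # IN_MAINTENANCE and re-enable them on MAINTENANCE_COMPLETE.
+    openstack alarm create \
+      --name maintenance_host \
+      --type event \
+      --event-type maintenance.host \
+      --alarm-action http://127.0.0.1:12345/maintenance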
+
+- State `MAINTENANCE_COMPLETE` `project event` and reply
+  `MAINTENANCE_COMPLETE_ACK`. After all compute nodes in the maintenance
+  session have gone through the maintenance operation, this `state event` can
+  be sent to all projects that had instances running on any of those nodes. If
+  a down-scale was done, the application can now scale back up to full
+  operation.
+
+  - `session_id` to reference the correct maintenance session.
+  - `state` as `MAINTENANCE_COMPLETE` to identify the event action needed.
+  - `instance_ids` to tell the project which of its instances are currently
+    running on the hosts maintained in this maintenance session. This might be
+    a link to the admin tool project-specific API, as AODH variables are
+    limited to strings of 255 characters.
+  - `reply_url` for the application to call the admin tool project-specific API
+    to answer `ACK_MAINTENANCE`, including the `session_id`.
+  - `project_id` to identify the project.
+  - `actions_at` time stamp to indicate when the maintenance workflow will
+    start.
+  - `metadata` to include key-value pairs of capabilities coming with the
+    maintenance operation, like 'openstack_version': 'Queens'.
+
+- At the end the admin tool maintenance session can enter the
+  `MAINTENANCE_COMPLETE` state and the session can be removed.
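+
+As referenced in the first step above, a maintenance session could be created
+with a single call against a hypothetical admin tool API. The endpoint, port
+and field names are assumptions for this sketch only, not a defined interface:
+
+.. code-block:: bash
+
+    # Hypothetical sketch: the infrastructure admin creates a rolling
+    # maintenance session for two compute hosts, starting at the given time and
+    # advertising the capabilities the upgrade will bring.
+    curl -X POST http://<admin-tool-ip>:12347/maintenance \
+      -H "Content-Type: application/json" \
+      -d '{"hosts": ["compute-1", "compute-2"],
+           "maintenance_at": "2018-11-21T23:00:00",
+           "metadata": {"openstack_version": "Queens"},
+           "workflow": "default"}'
+
+The returned `session_id` would then be used in all subsequent events and API
+calls of the session.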
+
+Benefits
+========
+
+- The application is guaranteed zero downtime as it is aware of the maintenance
+  action affecting its payload. The application is made aware of the
+  maintenance time window to make sure it can prepare for it.
+- The application gets to know the new capabilities coming with the
+  infrastructure maintenance and upgrade and can utilize those (e.g. do its own
+  upgrade).
+- Any application supporting the interaction being defined could be running on
+  top of the same infrastructure provider. No vendor lock-in for the
+  application.
+- Any infrastructure component can be aware of the host(s) under maintenance
+  via the `admin event`s about host state. No vendor lock-in for infrastructure
+  components.
+- Generic messaging makes it possible to use the same concept with different
+  types of clouds and application payloads. `instance_ids` will uniquely
+  identify any type of instance and a similar notification payload can be used
+  regardless of whether we are in OpenStack. The workflow just needs to support
+  the particular cloud infrastructure management to support a different cloud.
+- No additional hardware is needed during maintenance operations as down- and
+  up-scaling can be supported for the applications. This is optional, if no
+  extensive spare capacity is available for the maintenance - as is typically
+  the case in Telco environments.
+- Parallel maintenance sessions for different groups of hardware. The same
+  session should include hardware with the same capabilities to guarantee
+  `rolling maintenance` actions.
+- Multi-tenancy support. Project-specific messaging about maintenance.
+
+Future considerations
+=====================
+
+- Pluggable architecture for the infrastructure admin tool to handle different
+  clouds and payloads.
+- Pluggable architecture to handle specific maintenance/upgrade cases like an
+  OpenStack upgrade between specific versions or admin testing before giving
+  the host back to production.
+- Support for user-specific details that need to be taken into account in
+  admin-side actions (e.g. run a script, ...).
+- (Re-)use existing implementations like Mistral for workflows.
+- Scaling hardware resources. Allow a critical application to be scaled at the
+  same time in a controlled fashion, or retire the application.
POC
---
-There was a `Maintenance POC`_ for planned maintenance in the OPNFV Beijing
-summit to show the basic concept of using framework defined by the project.
+There was a `Maintenance POC`_ demo 'How to gain VNF zero down-time during
+Infrastructure Maintenance and Upgrade' at the OCP and ONS summit in March 2018.
+A similar concept is also being implemented as a new test case scenario in the
+`OPNFV Doctor project`_.
-.. _DOCTOR-52: https://jira.opnfv.org/browse/DOCTOR-52
.. _OPNFV Doctor project: https://wiki.opnfv.org/doctor
.. _use cases: http://artifacts.opnfv.org/doctor/docs/requirements/02-use_cases.html#nvfi-maintenance
.. _architecture: http://artifacts.opnfv.org/doctor/docs/requirements/03-architecture.html#nfvi-maintenance
.. _implementation: http://artifacts.opnfv.org/doctor/docs/requirements/05-implementation.html#nfvi-maintenance
-.. _planned maintenance session: https://lists.opnfv.org/pipermail/opnfv-tech-discuss/2017-June/016677.html
-.. _Maintenance POC: https://wiki.opnfv.org/download/attachments/5046291/Doctor%20Maintenance%20PoC%202017.pptx?version=1&modificationDate=1498182869000&api=v2
+.. _Maintenance POC: https://youtu.be/7q496Tutzlo
diff --git a/docs/development/index.rst b/docs/development/index.rst
index 2dc16a82..a7d2817b 100644
--- a/docs/development/index.rst
+++ b/docs/development/index.rst
@@ -2,18 +2,18 @@
.. http://creativecommons.org/licenses/by/4.0
.. (c) 2016 OPNFV.
+.. _development:
-======
-Doctor
-======
+===========
+Development
+===========
.. toctree::
:maxdepth: 2
- ./design/index.rst
- ./requirements/index.rst
- ./manuals/index.rst
- ./overview/functest_scenario/index.rst
+ ./design/index
+ ./overview/index
+ ./requirements/index
Indices
=======
diff --git a/docs/development/overview/functest_scenario/images/figure-p1.png b/docs/development/overview/functest_scenario/images/figure-p1.png
deleted file mode 100755
index e963d8bd..00000000
--- a/docs/development/overview/functest_scenario/images/figure-p1.png
+++ /dev/null
Binary files differ
diff --git a/docs/development/overview/index.rst b/docs/development/overview/index.rst
index 956e73e3..f6d78d57 100644
--- a/docs/development/overview/index.rst
+++ b/docs/development/overview/index.rst
@@ -3,11 +3,12 @@
.. _doctor-overview:
-************************
-Doctor Development Guide
-************************
+********
+Overview
+********
.. toctree::
:maxdepth: 2
+ overview.rst
testing.rst
diff --git a/docs/development/overview/overview.rst b/docs/development/overview/overview.rst
new file mode 100644
index 00000000..21f5439e
--- /dev/null
+++ b/docs/development/overview/overview.rst
@@ -0,0 +1,52 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+Platform overview
+"""""""""""""""""
+
+The Doctor platform provides these features since the `Danube Release <https://wiki.opnfv.org/display/SWREL/Danube>`_:
+
+* Immediate Notification
+* Consistent resource state awareness for compute host down
+* Valid compute host status given to VM owner
+
+These features enable high availability of Network Services on top of
+the virtualized infrastructure. Immediate notification allows VNF managers
+(VNFM) to process recovery actions promptly once a failure has occurred.
+The same framework can also be utilized to make the VNFM aware of
+infrastructure maintenance.
+
+Consistency of resource state is necessary to execute recovery actions
+properly in the VIM.
+
+The ability to query host status gives the VM owner the possibility to get
+consistent state information through an API in case of a compute host
+fault.
+
+The Doctor platform consists of the following components:
+
+* OpenStack Compute (Nova)
+* OpenStack Networking (Neutron)
+* OpenStack Telemetry (Ceilometer)
+* OpenStack Alarming (AODH)
+* Doctor Sample Inspector, OpenStack Congress or OpenStack Vitrage
+* Doctor Sample Monitor or any monitor supported by Congress or Vitrage
+
+.. note::
+    The Doctor Sample Monitor is used in Doctor testing. However, in a real
+    implementation like Vitrage, several other monitors are supported.
+
+You can see an overview of the Doctor platform and how components interact in
+:numref:`figure-p1`.
+
+
+The maintenance use case provides these features since the `Iruya Release <https://wiki.opnfv.org/display/SWREL/Iruya>`_:
+
+* Infrastructure maintenance and upgrade workflow
+* Interaction between the VNFM and the infrastructure maintenance workflow
+
+Since the `Jerma Release <https://wiki.opnfv.org/display/SWREL/Jerma>`_, the
+maintenance use case also supports the 'ETSI FEAT03' implementation to have
+infrastructure maintenance and upgrade fully optimized while keeping zero
+impact on the VNF service.
+
diff --git a/docs/development/overview/testing.rst b/docs/development/overview/testing.rst
deleted file mode 100644
index 8d0e90e6..00000000
--- a/docs/development/overview/testing.rst
+++ /dev/null
@@ -1,96 +0,0 @@
-.. This work is licensed under a Creative Commons Attribution 4.0 International License.
-.. http://creativecommons.org/licenses/by/4.0
-
-==============
-Testing Doctor
-==============
-
-You have two options to test Doctor functions with the script developed
-for doctor CI.
-
-You need to install OpenStack and other OPNFV components except Doctor Sample
-Inspector, Sample Monitor and Sample Consumer, as these will be launched in
-this script. You are encouraged to use OPNFV offcial installers, but you can
-also deploy all components with other installers such as devstack or manual
-operation. In those cases, the versions of all components shall be matched with
-the versions of them in OPNFV specific release.
-
-Run Test Script
-===============
-
-Doctor project has own testing script under `doctor/tests`_. This test script
-can be used for functional testing agained an OPNFV deployment.
-
-.. _doctor/tests: https://gerrit.opnfv.org/gerrit/gitweb?p=doctor.git;a=tree;f=tests;
-
-Before running this script, make sure OpenStack env parameters are set properly
-following `OpenStack CLI manual`_, so that Doctor Inspector can operate
-OpenStack services.
-
-.. _OpenStack CLI manual: https://docs.openstack.org/user-guide/common/cli-set-environment-variables-using-openstack-rc.html
-
-Run Bash Test Script
-~~~~~~~~~~~~~~~~~~~~
-
-You can run the bash script as follows:
-
-.. code-block:: bash
-
- git clone https://gerrit.opnfv.org/gerrit/doctor
- cd doctor/tests
- export INSTALLER_TYPE=local
- export INSPECTOR_TYPE=sample
- ./run.sh
-
-INSTALLER_TYPE can be 'apex', 'fuel', 'joid' and 'local'(default). If you are
-not using OPNFV installers in this option, chose 'local'.
-INSPECTOR_TYPE can be specified either 'sample'(default) or 'congress'.
-
-For testing with stable version, checkout stable branch of doctor repo before
-'./run.sh'.
-
-The bash test script will be deprecated(only bug fixes) after E Release.
-
-Run Python Test Script
-~~~~~~~~~~~~~~~~~~~~~~
-
-You can run the python script as follows:
-
-.. code-block:: bash
-
- git clone https://gerrit.opnfv.org/gerrit/doctor
- cd doctor && tox
-
-You can see all the configurations with default values in sample configuration
-file `doctor.sample.conf`_. And you can also modify the file to meet your
-environment and then run the test.
-
-.. _doctor.sample.conf: https://git.opnfv.org/doctor/tree/etc/doctor.sample.conf
-
-Run Functest Suite
-==================
-
-Functest supports Doctor testing by triggering the test script above in a
-Functest container. You can run the Doctor test with the following steps:
-
-.. code-block:: bash
-
- DOCKER_TAG=latest
- docker pull docker.io/opnfv/functest-features:${DOCKER_TAG}
- docker run --privileged=true -id \
- -e INSTALLER_TYPE=${INSTALLER_TYPE} \
- -e INSTALLER_IP=${INSTALLER_IP} \
- -e INSPECTOR_TYPE=sample \
- docker.io/opnfv/functest-features:${DOCKER_TAG} /bin/bash
- docker exec <container_id> functest env prepare
- docker exec <container_id> functest testcase run doctor-notification
-
-See `Functest Userguide`_ for more information.
-
-.. _Functest Userguide: http://docs.opnfv.org/en/latest/submodules/functest/docs/testing/user/userguide/index.html
-
-For testing with stable version, change DOCKER_TAG to 'stable' or other release
-tag identifier.
-
-Tips
-====
diff --git a/docs/development/requirements/index.rst b/docs/development/requirements/index.rst
index fceaebf0..ccc35cb8 100644
--- a/docs/development/requirements/index.rst
+++ b/docs/development/requirements/index.rst
@@ -3,9 +3,9 @@
.. _doctor-requirements:
-****************************************
-Doctor: Fault Management and Maintenance
-****************************************
+**********************************************
+Requirements: Fault Management and Maintenance
+**********************************************
:Project: Doctor, https://wiki.opnfv.org/doctor
:Editors: Ashiq Khan (NTT DOCOMO), Gerald Kunzmann (NTT DOCOMO)
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 00000000..b8e8bfd0
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,17 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. SPDX-License-Identifier: CC-BY-4.0
+.. (c) Open Platform for NFV Project, Inc. and its contributors
+
+.. _doctor:
+
+=========================================
+Fault Management and Maintenance (Doctor)
+=========================================
+
+.. toctree::
+ :numbered:
+ :maxdepth: 2
+
+ development/index
+ release/index
+ testing/index
diff --git a/docs/release/configguide/feature.configuration.rst b/docs/release/configguide/feature.configuration.rst
index ddd9c7c1..8fbff50e 100644
--- a/docs/release/configguide/feature.configuration.rst
+++ b/docs/release/configguide/feature.configuration.rst
@@ -10,34 +10,38 @@ Ceilometer and Aodh (Doctor Notifier) except Doctor Monitor.
After major components of OPNFV are deployed, you can setup Doctor functions
by following instructions in this section. You can also learn detailed
-steps in setup_installer() under `doctor/tests`_.
+steps for all supported installers under `doctor/doctor_tests/installer`_.
-.. _doctor/tests: https://gerrit.opnfv.org/gerrit/gitweb?p=doctor.git;a=tree;f=tests;
+.. _doctor/doctor_tests/installer: https://git.opnfv.org/doctor/tree/doctor_tests/installer
Doctor Inspector
----------------
-You need to configure one of Doctor Inspector below.
+You need to configure one of the Doctor Inspectors below. You can also learn
+detailed steps for all supported Inspectors under `doctor/doctor_tests/inspector`_.
-**Doctor Sample Inspector**
+.. _doctor/doctor_tests/inspector: https://git.opnfv.org/doctor/tree/doctor_tests/inspector
+
+
+**Sample Inspector**
Sample Inspector is intended to show minimum functions of Doctor Inspector.
-Doctor Sample Inspector suggested to be placed in one of the controller nodes,
-but it can be put on any host where Doctor Monitor can reach and access
-the OpenStack Controller (Nova).
+The Sample Inspector is suggested to be placed on one of the controller nodes,
+but it can be put on any host from which the Sample Inspector can reach and
+access the OpenStack controller services (e.g. Nova, Neutron).
-Make sure OpenStack env parameters are set properly, so that Doctor Inspector
+Make sure OpenStack env parameters are set properly, so that Sample Inspector
can issue admin actions such as compute host force-down and state update of VM.
-Then, you can configure Doctor Inspector as follows:
+Then, you can configure Sample Inspector as follows:
.. code-block:: bash
- git clone https://gerrit.opnfv.org/gerrit/doctor -b stable/danube
- cd doctor/tests
+ git clone https://gerrit.opnfv.org/gerrit/doctor
+ cd doctor/doctor_tests/inspector
INSPECTOR_PORT=12345
- python inspector.py $INSPECTOR_PORT > inspector.log 2>&1 &
+ python sample.py $INSPECTOR_PORT > inspector.log 2>&1 &
**Congress**
@@ -45,9 +49,9 @@ OpenStack `Congress`_ is a Governance as a Service (previously Policy as a
Service). Congress implements Doctor Inspector as it can inspect a fault
situation and propagate errors onto other entities.
-.. _Congress: https://wiki.openstack.org/wiki/Congress
+.. _Congress: https://governance.openstack.org/tc/reference/projects/congress.html
-Congress is deployed by OPNFV installers. You need to enable doctor
+Congress is deployed by the OPNFV Apex installer. You need to enable the doctor
datasource driver and set policy rules. By the example configuration below,
Congress will force down nova compute service when it received a fault event
of that compute host. Also, Congress will set the state of all VMs running on
@@ -55,7 +59,12 @@ that host from ACTIVE to ERROR state.
.. code-block:: bash
- openstack congress datasource create doctor doctor
+ openstack congress datasource create doctor "doctor"
+
+ openstack congress datasource create --config api_version=$NOVA_MICRO_VERSION \
+ --config username=$OS_USERNAME --config tenant_name=$OS_TENANT_NAME \
+ --config password=$OS_PASSWORD --config auth_url=$OS_AUTH_URL \
+ nova "nova21"
openstack congress policy rule create \
--name host_down classification \
@@ -125,27 +134,82 @@ support the Doctor use case. This can be done in a few steps:
4. Restart the vitrage-graph and vitrage-notifier services
-Doctor Monitor
---------------
-
-**Doctor Sample Monitor**
+Doctor Monitors
+---------------
Doctor Monitors are suggested to be placed in one of the controller nodes,
but those can be put on any host which is reachable to target compute host and
accessible by the Doctor Inspector.
-You need to configure Monitors for all compute hosts one by one.
+You need to configure Monitors for all compute hosts one by one. You can also learn detailed
+steps for all supported monitors under `doctor/doctor_tests/monitor`_.
-Make sure OpenStack env parameters are set properly, so that Doctor Inspector
-can issue admin actions such as compute host force-down and state update of VM.
+.. _doctor/doctor_tests/monitor: https://git.opnfv.org/doctor/tree/doctor_tests/monitor
-Then, you can configure the Doctor Monitor as follows (Example for Apex deployment):
+**Sample Monitor**
+
+You can configure the Sample Monitor as follows (example for an Apex deployment):
.. code-block:: bash
- git clone https://gerrit.opnfv.org/gerrit/doctor -b stable/danube
- cd doctor/tests
+ git clone https://gerrit.opnfv.org/gerrit/doctor
+ cd doctor/doctor_tests/monitor
INSPECTOR_PORT=12345
COMPUTE_HOST='overcloud-novacompute-1.localdomain.com'
COMPUTE_IP=192.30.9.5
- sudo python monitor.py "$COMPUTE_HOST" "$COMPUTE_IP" \
+ sudo python sample.py "$COMPUTE_HOST" "$COMPUTE_IP" \
"http://127.0.0.1:$INSPECTOR_PORT/events" > monitor.log 2>&1 &
+
+**Collectd Monitor**
+
+OpenStack components
+====================
+
+In OPNFV, for Doctor testing you can have all OpenStack components configured
+as needed. Here is a sample of the needed configuration modifications.
+
+Ceilometer
+----------
+
+/etc/ceilometer/event_definitions.yaml:
+
+.. code-block:: yaml
+
+    # Maintenance use case needs these new event definitions to be added
+    - event_type: maintenance.scheduled
+      traits:
+        actions_at:
+          fields: payload.maintenance_at
+          type: datetime
+        allowed_actions:
+          fields: payload.allowed_actions
+        host_id:
+          fields: payload.host_id
+        instances:
+          fields: payload.instances
+        metadata:
+          fields: payload.metadata
+        project_id:
+          fields: payload.project_id
+        reply_url:
+          fields: payload.reply_url
+        session_id:
+          fields: payload.session_id
+        state:
+          fields: payload.state
+    - event_type: maintenance.host
+      traits:
+        host:
+          fields: payload.host
+        project_id:
+          fields: payload.project_id
+        session_id:
+          fields: payload.session_id
+        state:
+          fields: payload.state
+
+/etc/ceilometer/event_pipeline.yaml:
+
+.. code-block:: yaml
+
+    # Maintenance and fault management use cases both need these publishers to be added
+    - notifier://
+    - notifier://?topic=alarm.all
+
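+With these event definitions and the pipeline entries in place, a project can
+subscribe to the maintenance events by creating an Aodh event alarm. The
+following is only a hedged sketch: the alarm name and the notification URL of
+the `app manager` (VNFM) are hypothetical placeholders, not values mandated by
+Doctor.
+
+.. code-block:: bash
+
+    # Example event alarm on maintenance.scheduled; the --alarm-action URL is
+    # a placeholder for the app manager endpoint that should receive the alarm.
+    openstack alarm create --name maintenance_scheduled \
+        --type event --event-type maintenance.scheduled \
+        --alarm-action http://<app-manager-ip>:<port>/maintenance
+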
+Nova
+----
+
+/etc/nova/nova.conf:
+
+.. code-block:: ini
+
+    cpu_allocation_ratio=1.0
diff --git a/docs/release/configguide/index.rst b/docs/release/configguide/index.rst
index b1e7c33d..c2331115 100644
--- a/docs/release/configguide/index.rst
+++ b/docs/release/configguide/index.rst
@@ -3,9 +3,9 @@
.. _doctor-configguide:
-*************************
-Doctor Installation Guide
-*************************
+**************************
+Doctor Configuration Guide
+**************************
.. toctree::
:maxdepth: 2
diff --git a/docs/release/index.rst b/docs/release/index.rst
index 8a1bf405..67eb4c5f 100644
--- a/docs/release/index.rst
+++ b/docs/release/index.rst
@@ -2,14 +2,18 @@
.. http://creativecommons.org/licenses/by/4.0
.. (c) 2017 OPNFV.
+.. _release:
-======
-Doctor
-======
+=======
+Release
+=======
.. toctree::
:maxdepth: 2
+ ./configguide/index.rst
./installation/index.rst
+ ./release-notes/index.rst
+ ./scenarios/fault_management/fault_management.rst
+ ./scenarios/maintenance/maintenance.rst
./userguide/index.rst
-
diff --git a/docs/development/manuals/index.rst b/docs/release/installation/index.rst
index f705f94a..f6527e5d 100644
--- a/docs/development/manuals/index.rst
+++ b/docs/release/installation/index.rst
@@ -1,13 +1,13 @@
.. This work is licensed under a Creative Commons Attribution 4.0 International License.
.. http://creativecommons.org/licenses/by/4.0
-.. _doctor-manuals:
+.. _doctor-configguide:
-*******
-Manuals
-*******
+*************************
+Doctor Installation Guide
+*************************
.. toctree::
+ :maxdepth: 2
-.. include:: mark-host-down_manual.rst
-.. include:: get-valid-server-state.rst
+ installation.rst
diff --git a/docs/release/installation/installation.rst b/docs/release/installation/installation.rst
new file mode 100644
index 00000000..564f19fd
--- /dev/null
+++ b/docs/release/installation/installation.rst
@@ -0,0 +1,44 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+Doctor Installation
+====================
+
+You can clone the doctor project on the OPNFV installer jumphost, or if you are
+not in an OPNFV environment, you can clone Doctor to the DevStack controller
+node:
+
+.. code-block:: bash
+
+    git clone https://gerrit.opnfv.org/gerrit/doctor
+
+On the DevStack controller, here is a sample of what Doctor testing will
+require for sample fault management testing and for maintenance testing using
+Fenix:
+
+.. code-block:: bash
+
+ git clone https://github.com/openstack/devstack -b stable/train
+
+.. code-block:: bash
+
+ cd devstack
+ vi local.conf
+
+.. code-block:: bash
+
+ [[local|localrc]]
+ GIT_BASE=https://git.openstack.org
+ HOST_IP=<host_ip>
+ ADMIN_PASSWORD=admin
+ DATABASE_PASSWORD=admin
+ RABBIT_PASSWORD=admin
+ SERVICE_PASSWORD=admin
+ LOGFILE=/opt/stack/stack.sh.log
+
+ PUBLIC_INTERFACE=eth0
+
+ CEILOMETER_EVENT_ALARM=True
+
+ ENABLED_SERVICES=key,rabbit,mysql,fenix-engine,fenix-api,aodh-evaluator,aodh-notifier,aodh-api
+
+ enable_plugin ceilometer https://git.openstack.org/openstack/ceilometer stable/train
+ enable_plugin aodh https://git.openstack.org/openstack/aodh stable/train
+ enable_plugin gnocchi https://github.com/openstack/gnocchi
+ enable_plugin fenix https://opendev.org/x/fenix master
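+
+With local.conf in place, deployment is started as with any other DevStack
+deployment. This is only the standard DevStack step, not a Doctor-specific one:
+
+.. code-block:: bash
+
+    cd devstack
+    ./stack.sh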
diff --git a/docs/release/release-notes/index.rst b/docs/release/release-notes/index.rst
index 2e6d46e1..a0e30501 100644
--- a/docs/release/release-notes/index.rst
+++ b/docs/release/release-notes/index.rst
@@ -10,4 +10,4 @@ Doctor Release Notes
.. toctree::
:maxdepth: 2
- releasenotes.rst
+ release-notes.rst
diff --git a/docs/release/release-notes/release-notes.rst b/docs/release/release-notes/release-notes.rst
new file mode 100644
index 00000000..b525335e
--- /dev/null
+++ b/docs/release/release-notes/release-notes.rst
@@ -0,0 +1,146 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+
+This document provides the release notes for the Jerma version of Doctor.
+
+Important notes
+===============
+
+The Jerma release has mainly been about finalizing maintenance use case testing
+to support the ETSI FEAT03 defined interaction between VNFM and infrastructure.
+This is mainly to have infrastructure maintenance and upgrade operations
+optimized to be as fast as they can be, while keeping the VNFs on top with zero
+impact on their service.
+
+Furthermore, this is the final release of Doctor, and deeper testing is moving
+to upstream projects such as Fenix for the maintenance use case. Also, in this
+release we have made sure that all Doctor testing, and any deeper testing with
+the upstream projects, can be done in DevStack. This also makes DevStack the
+most important installer.
+
+Summary
+=======
+
+Jerma Doctor framework uses OpenStack Train integrated into its test cases.
+
+Release Data
+============
+
+Doctor changes
+
+- Maintenance use case updated to support latest version of Fenix.
+- Maintenance use case now supports ETSI FEAT03 optimization with Fenix.
+- Doctor testing is now preferably done in a DevStack environment, where one
+  can easily select an OpenStack release from Rocky to Ussuri to test Doctor
+  functionality. The latest OPNFV Fuel can also be used for the OpenStack
+  version it supports.
+
+Doctor CI
+
+- Doctor is tested with the Fuel installer.
+- The fault management use case is tested with the sample inspector.
+- The maintenance use case is tested with the sample implementation and against
+  the latest Fenix version. This includes the new ETSI FEAT03 optimization.
+
+Version change
+^^^^^^^^^^^^^^
+
+Module version changes
+~~~~~~~~~~~~~~~~~~~~~~
+
+- OpenStack has changed to Train
+
+Document version changes
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+All documentation is updated to OPNFV unified format according to
+documentation guidelines. Small updates in many documents.
+
+Reason for version
+^^^^^^^^^^^^^^^^^^
+
+N/A
+
+Feature additions
+~~~~~~~~~~~~~~~~~
+
++--------------------+--------------------------------------------+
+| **JIRA REFERENCE** | **SLOGAN** |
++--------------------+--------------------------------------------+
+| DOCTOR-137 | VNFM maintenance with ETSI changes |
++--------------------+--------------------------------------------+
+| DOCTOR-136 | DevStack support |
++--------------------+--------------------------------------------+
+
+
+Deliverables
+------------
+
+Software deliverables
+=====================
+
+None
+
+Documentation deliverables
+==========================
+
+https://git.opnfv.org/doctor/tree/docs
+
+Known Limitations, Issues and Workarounds
+=========================================
+
+System Limitations
+^^^^^^^^^^^^^^^^^^
+
+Maintenance test case requirements:
+
+- Minimum number of nodes: 1 Controller, 3 Computes
+- Min number of VCPUs: 2 VCPUs for each compute
+
+Known issues
+^^^^^^^^^^^^
+
+None
+
+Workarounds
+^^^^^^^^^^^
+
+None
+
+Test Result
+===========
+
+Doctor CI results with TEST_CASE='fault_management' and INSPECTOR_TYPE=sample
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++--------------------------------------+--------------+
+| **TEST-SUITE** | **Results:** |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='fuel' | SUCCESS |
++--------------------------------------+--------------+
+
+Doctor CI results with TEST_CASE='maintenance' and INSPECTOR_TYPE=sample
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++--------------------------------------+--------------+
+| **TEST-SUITE** | **Results:** |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='fuel' | SUCCESS |
+| ADMIN_TOOL_TYPE='fenix' *) | |
++--------------------------------------+--------------+
+
+*) Sample implementation not updated according to latest upstream Fenix
+ and is currently not being tested.
+
+References
+==========
+
+For more information about the OPNFV Doctor latest work, please see:
+
+https://wiki.opnfv.org/display/doctor/Doctor+Home
+
+Further information about ETSI FEAT03 optimization can be found from Fenix
+Documentation:
+
+https://fenix.readthedocs.io/en/latest
diff --git a/docs/release/release-notes/releasenotes.rst b/docs/release/release-notes/releasenotes_euphrates.rst
index 29565953..29565953 100644
--- a/docs/release/release-notes/releasenotes.rst
+++ b/docs/release/release-notes/releasenotes_euphrates.rst
diff --git a/docs/release/release-notes/releasenotes_fraser.rst b/docs/release/release-notes/releasenotes_fraser.rst
new file mode 100644
index 00000000..f1cf9d7e
--- /dev/null
+++ b/docs/release/release-notes/releasenotes_fraser.rst
@@ -0,0 +1,100 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+===================================
+OPNFV Doctor release notes (Fraser)
+===================================
+
+This document provides an overview of the Doctor project in the OPNFV Fraser
+release, including new features, known issues and documentation updates.
+
+Version history
+===============
+
++------------+----------+--------------+-------------+
+| **Date** | **Ver.** | **Author** | **Comment** |
++============+==========+==============+=============+
+| 2018-06-25 | 6.2.0 | Tomi Juvonen | |
+| 2018-05-25 | 6.1.0 | Tomi Juvonen | |
+| 2018-04-23 | 6.0.0 | Tomi Juvonen | |
++------------+----------+--------------+-------------+
+
+Important notes
+===============
+
+OPNFV Doctor project started as a requirement project and identified gaps
+between "as-is" open source software (OSS) and an "ideal" platform for NFV.
+Based on this analysis, the Doctor project proposed missing features to
+upstream OSS projects. After those features were implemented, OPNFV installer
+projects integrated the features to the OPNFV platform and the OPNFV
+infra/testing projects verified the functionalities in the OPNFV Labs. After
+Euphrates release Doctor also graduated and became a mature project. This means
+it has completed the implementation of the fault management use case. Based on
+this implementation, Doctor has now started to implement the second use case on
+maintenance.
+
+For the Fraser release, the Doctor project completed re-factoring the testing
+code in Python, added support for more installers and started working on the
+maintenance use case. Doctor now supports the Apex, Fuel, Joid, Compass and
+Daisy installers.
+
+New features
+============
+
+Doctor now supports Vitrage as Inspector for the local installer.
+
+Installer support and verification status
+=========================================
+
+Integrated features
+-------------------
+
+- The enhancement work on the Doctor testing code, done by re-factoring it in
+  Python, is now complete.
+- Lint support for the code changes was added.
+- Doctor now supports Vitrage as Inspector for the local installer.
+
+OPNFV installer support matrix
+------------------------------
+
+Doctor already has support for several installers for fault management testing.
+This work also continued in the Fraser release. Here are the latest additions [*]
+
++-----------+--------------+--------------+-----------------+--------------+--------------+
+| Installer | Aodh | Nova: Force | Nova: Get valid | Congress | Vitrage |
+| | integration | compute down | service status | integration | integration |
++===========+==============+==============+=================+==============+==============+
+| Apex | Available | Available | Available | Available | N/A |
++-----------+--------------+--------------+-----------------+--------------+--------------+
+| Fuel | Available | Available | Available | TBC | N/A |
+| (MCP) | | | | | |
++-----------+--------------+--------------+-----------------+--------------+--------------+
+| Joid | Available | TBC | TBC | Available | N/A |
+| | Not verified | | | Not verified | |
++-----------+--------------+--------------+-----------------+--------------+--------------+
+| Compass | Available | TBC | TBC | Available | N/A |
+| | Not verified | | | Not verified | |
++-----------+--------------+--------------+-----------------+--------------+--------------+
+| Daisy* | Available | TBC | TBC | TBC | N/A |
+| | | | | | |
++-----------+--------------+--------------+-----------------+--------------+--------------+
+| Local | Available | TBC | TBC | Available | Available* |
+| | Not verified | | | Not verified | Not verified |
++-----------+--------------+--------------+-----------------+--------------+--------------+
+
+Note: Local installer is devstack.
+
+Note: 'Not verified' means that we didn't verify the functionality by having
+our own test scenario running in OPNFV CI pipeline yet.
+
+Documentation updates
+=====================
+
+No major updates
+
+Known issues
+============
+
+- Testing code for `port-data-plane-status` in the Doctor repository was disabled
+  in 5.0, as we have a problem with the neutron client load in the CI job container.
+- Maintenance test case work was started in Fraser. Some initial test case code
+  is available, however it is not yet fully implemented in this release.
diff --git a/docs/release/release-notes/releasenotes_gambia.rst b/docs/release/release-notes/releasenotes_gambia.rst
new file mode 100644
index 00000000..142bfacf
--- /dev/null
+++ b/docs/release/release-notes/releasenotes_gambia.rst
@@ -0,0 +1,303 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+
+This document provides the release notes for the Gambia release of Doctor.
+
+Important notes
+===============
+
+In the Gambia release, Doctor has been working on our second use case,
+maintenance. The design guideline is now done and a test case exists with sample
+maintenance workflow code implemented in Doctor. Work has also started to have
+the real implementation done in the OpenStack Fenix project
+https://wiki.openstack.org/wiki/Fenix.
+
+Doctor CI testing has now moved to use tox on the jumphost instead of running
+tests through the features container. Also, in Apex we use OpenStack services
+running in containers. Functest daily testing supports the Doctor fault
+management test case for the Apex, Daisy and Fuel installers. This testing is
+done through the features container.
+
+In this release, Doctor has not been working on the fault management use case,
+as the basic framework is already done. However, we might need to get back to
+it later to better meet the tough industry requirements as well as requirements
+from edge, containers and 5G.
+
+
+Summary
+=======
+
+Gambia Doctor framework uses OpenStack Queens integrated into its test cases.
+Compared to the previous release, the Heat project is also being used in the
+maintenance test case.
+
+Release Data
+============
+
+Doctor changes
+
++------------------------------------------+----------------------------------------------------------+
+| **commit-ID** | **Subject** |
++------------------------------------------+----------------------------------------------------------+
+| 5b3f5937e7b861fca46b2a6b2d6708866b800f95 | fix building docs |
++------------------------------------------+----------------------------------------------------------+
+| 2ca5924081ce4784f599437707bd32807aa155ce | Fix SSH client connection reset |
++------------------------------------------+----------------------------------------------------------+
+| baac6579556f8216b36db0d0f87f9c2d4f8b4ef5 | Support Apex with services in containers |
++------------------------------------------+----------------------------------------------------------+
+| 23bf63c4616040cb0d69cd26238af2a4a7c00a90 | fix the username to login undercloud in Apex |
++------------------------------------------+----------------------------------------------------------+
+| 61eb3927ada784cc3dffb5ddd17f66e47871f708 | Local Documentation Builds |
++------------------------------------------+----------------------------------------------------------+
+| 0f1dd4314b9e0247d9af7af6df2410462423aeca | Updated from global requirements |
++------------------------------------------+----------------------------------------------------------+
+| 2d4a9f0c0a93797da6534583f6e74553a4b634be | Fix links to remove references to submodules |
++------------------------------------------+----------------------------------------------------------+
+| 3ddc2392b0ed364eede49ff006d64df3ea456350 | Gambia release notes |
++------------------------------------------+----------------------------------------------------------+
+| 825a0a0dd5e8028129b782ed21c549586257b1c5 | delete doctor datasource in congress when cleanup |
++------------------------------------------+----------------------------------------------------------+
+| fcf53129ab2b18b84571faff13d7cb118b3a41b3 | run profile even the notification time is larger than 1S |
++------------------------------------------+----------------------------------------------------------+
+| 495965d0336d42fc36494c81fd15cee2f34c96e9 | Update and add test case |
++------------------------------------------+----------------------------------------------------------+
+| da25598a6a31abe0579ffed12d1719e5ff75f9a7 | bugfix: add doctor datasource in congress |
++------------------------------------------+----------------------------------------------------------+
+| f9e1e3b1ae4be80bc2dc61d9c4213c81c091ea72 | Update the maintenance design document |
++------------------------------------------+----------------------------------------------------------+
+| 4639f15e6db2f1480b41f6fbfd11d70312d4e421 | Add maintenance test code |
++------------------------------------------+----------------------------------------------------------+
+| b54cbc5dd2d32fcb27238680b4657ed384d021c5 | Add setup and cleanup for maintenance test |
++------------------------------------------+----------------------------------------------------------+
+| b2bb504032ac81a2ed3f404113b097d9ce3d7f14 | bugfix: kill the stunnel when cleanup |
++------------------------------------------+----------------------------------------------------------+
+| eaeb3c0f9dc9e6645a159d0a78b9fc181fce53d4 | add ssh_keyfile for connect to installer in Apex |
++------------------------------------------+----------------------------------------------------------+
+| dcbe7bf1c26052b0e95d209254e7273aa1eaace1 | Add tox and test case to testing document |
++------------------------------------------+----------------------------------------------------------+
+| 0f607cb5efd91ee497346b7f792dfa844d15595c | enlarge the time of link down |
++------------------------------------------+----------------------------------------------------------+
+| 1351038a65739b8d799820de515178326ad05f7b | bugfix: fix the filename of ssh tunnel |
++------------------------------------------+----------------------------------------------------------+
+| e70bf248daac03eee6b449cd1654d2ee6265dd8c | Use py34 instead of py35 |
++------------------------------------------+----------------------------------------------------------+
+| 2a60d460eaf018951456451077b7118b60219b32 | add INSPECTOR_TYPE and TEST_CASE to tox env |
++------------------------------------------+----------------------------------------------------------+
+| 2043ceeb08c1eca849daeb2b3696d385425ba061 | [consumer] fix default value for port number |
++------------------------------------------+----------------------------------------------------------+
+
+Releng changes
+
++------------------------------------------+-----------------------------------------------------------------------+
+| **commit-ID** | **Subject** |
++------------------------------------------+-----------------------------------------------------------------------+
+| c87309f5a75ccc5d595f708817b97793c24c4387 | Add Doctor maintenance job |
++------------------------------------------+-----------------------------------------------------------------------+
+| bd16a9756ffd0743e143f0f2f966da8dd666c7a3 | remove congress test in Daisy |
++------------------------------------------+-----------------------------------------------------------------------+
+| c47aaaa53c91aae93877f2532c72374beaa4eabe | remove fuel job in Doctor |
++------------------------------------------+-----------------------------------------------------------------------+
+| ab2fed2522eaf82ea7c63dd05008a37c56e825d0 | use 'workspace-cleanup' plugin in publisher |
++------------------------------------------+-----------------------------------------------------------------------+
+| 3aaed5cf40092744f1b87680b9205a2901baecf3 | clean the workspace in the publisher |
++------------------------------------------+-----------------------------------------------------------------------+
+| 50151eb3717edd4ddd996f3705fbe1732de7f3b7 | run tox with 'sudo' |
++------------------------------------------+-----------------------------------------------------------------------+
+| a3adc85ecb52f5d19ec4e9c49ca1ac35aa429ff9 | remove inspector variable form job template |
++------------------------------------------+-----------------------------------------------------------------------+
+| adfbaf2a3e8487e4c9152bf864a653a0425b8582 | run doctor tests with different inspectors in sequence |
++------------------------------------------+-----------------------------------------------------------------------+
+| 2e98e56224cd550cb3bf9798e420eece28139bd9 | add the ssh_key info if the key_file is exist |
++------------------------------------------+-----------------------------------------------------------------------+
+| c109c271018e9a85d94be1b9b468338d64589684 | prepare installer info for doctor test |
++------------------------------------------+-----------------------------------------------------------------------+
+| 57cbefc7160958eae1d49e4753779180a25864af | use py34 for tox |
++------------------------------------------+-----------------------------------------------------------------------+
+| 3547754e808a581b09c9d22e013a7d986d9f6cd1 | specify the cacert file when it exits |
++------------------------------------------+-----------------------------------------------------------------------+
+| ef4f36aa1c2ff0819d73cde44f84b99a42e15c7e | bugfix: wrong usage of '!include-raw' |
++------------------------------------------+-----------------------------------------------------------------------+
+| 0e0e0d4cb71fb27b1789a2bef2d3c4ff313e67ff | use tox instead of functest for doctor CI jobs |
++------------------------------------------+-----------------------------------------------------------------------+
+| 5b22f1b95feacaec0380f6a7543cbf510b628451 | pass value to parameters |
++------------------------------------------+-----------------------------------------------------------------------+
+| 44ab0cea07fa2a734c4f6b80776ad48fd006d1b8 | Doctor job bugfix: fix the scenario |
++------------------------------------------+-----------------------------------------------------------------------+
+| 17617f1c0a78c7bdad0d11d329a6c7e119cbbddd | bugfix: run doctor tests parallelly |
++------------------------------------------+-----------------------------------------------------------------------+
+| 811e4ef7f4c37b7bc246afc34ff880c014ecc05d | delete 'opnfv-build-ubuntu-defaults' parameters for doctor verify job |
++------------------------------------------+-----------------------------------------------------------------------+
+| 0705f31ab5bc54c073df120cbe0fe62cf10f9a81 | delete the 'node' parameter in 'doctor-slave-parameter' macro |
++------------------------------------------+-----------------------------------------------------------------------+
+| 304151b15f9d7241db8c5fea067cafe048287d84 | fix the default node label for doctor test |
++------------------------------------------+-----------------------------------------------------------------------+
+| a6963f92f015a33b44b27199886952205499b44c | Fix project name |
++------------------------------------------+-----------------------------------------------------------------------+
+| f122bfed998b3b0e0178106a7538377c609c6512 | add a default value for SSH_KEY |
++------------------------------------------+-----------------------------------------------------------------------+
+
+Version change
+^^^^^^^^^^^^^^
+
+Module version changes
+~~~~~~~~~~~~~~~~~~~~~~
+
+- OpenStack has changed from Pike-1 to Queens-1
+
+Document version changes
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+These documents have been updated in Gambia release
+
+- Testing document
+ docs/development/overview/testing.rst
+- Doctor scenario in functest
+ docs/development/overview/functest_scenario/doctor-scenario-in-functest.rst
+- Maintenance design guideline
+ docs/development/design/maintenance-design-guideline.rst
+
+Reason for version
+^^^^^^^^^^^^^^^^^^
+
+Documentation is updated due to tox usage in testing and adding maintenance
+use case related documentation.
+
+Feature additions
+~~~~~~~~~~~~~~~~~
+
++--------------------+--------------------------------------------------------+
+| **JIRA REFERENCE** | **SLOGAN** |
++--------------------+--------------------------------------------------------+
+| DOCTOR-106 | Maintenance scenario |
++--------------------+--------------------------------------------------------+
+| DOCTOR-125 | Maintenance design document according to our test case |
++--------------------+--------------------------------------------------------+
+| DOCTOR-126 | Use Tox instead of Functest for doctor CI jobs |
++--------------------+--------------------------------------------------------+
+| DOCTOR-127 | Maintenance test POD |
++--------------------+--------------------------------------------------------+
+| DOCTOR-130 | Apex with containers |
++--------------------+--------------------------------------------------------+
+
+
+
+Deliverables
+------------
+
+
+Software deliverables
+=====================
+
+None
+
+Documentation deliverables
+==========================
+
+https://git.opnfv.org/doctor/tree/docs
+
+Known Limitations, Issues and Workarounds
+=========================================
+
+System Limitations
+^^^^^^^^^^^^^^^^^^
+
+Maintenance test case requirements:
+
+- Minimum number of nodes: 1 Controller, 3 Computes
+- Min number of VCPUs: 2 VCPUs for each compute
+
+Known issues
+^^^^^^^^^^^^
+
+None
+
+Workarounds
+^^^^^^^^^^^
+
+None
+
+Test Result
+===========
+
+Doctor CI results with TEST_CASE='fault_management' and INSPECTOR_TYPE=sample
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++--------------------------------------+--------------+
+| **TEST-SUITE** | **Results:** |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Apex' | SUCCESS |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Compass' | N/A |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Daisy' | SUCCESS |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Fuel' | No POD |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Joid' | N/A |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Local' | N/A |
++--------------------------------------+--------------+
+
+Doctor CI results with TEST_CASE='fault_management' and INSPECTOR_TYPE=congress
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++--------------------------------------+--------------+
+| **TEST-SUITE** | **Results:** |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Apex' | FAILED |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Compass' | N/A |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Daisy' | N/A |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Fuel' | No POD |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Joid' | N/A |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Local' | N/A |
++--------------------------------------+--------------+
+
+
+Doctor Functest results with TEST_CASE='fault_management'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++--------------------------------------+--------------+
+| **TEST-SUITE** | **Results:** |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Apex' | skipped |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Compass' | N/A |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Daisy' | skipped |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Fuel' | skipped |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Joid' | N/A |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Local' | N/A |
++--------------------------------------+--------------+
+
+Note: Installer Functest currently does not test features, or it skips running
+the project test cases
+
+Doctor CI results with TEST_CASE='maintenance'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++--------------------------------------+--------------+
+| **TEST-SUITE** | **Results:** |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Apex' | SUCCESS |
++--------------------------------------+--------------+
+
+Doctor Functest results with TEST_CASE='maintenance'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+N/A - Needs a special target and currently there is only a sample implementation
+
+References
+==========
+
+For more information about the OPNFV Doctor latest work, please see:
+
+https://wiki.opnfv.org/display/doctor/Doctor+Home
diff --git a/docs/release/release-notes/releasenotes_iruya.rst b/docs/release/release-notes/releasenotes_iruya.rst
new file mode 100644
index 00000000..92775557
--- /dev/null
+++ b/docs/release/release-notes/releasenotes_iruya.rst
@@ -0,0 +1,129 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+
+This document provides the release notes for Iruya version of Doctor.
+
+Important notes
+===============
+
+In the Iruya release there have not been many changes.
+
+All testing is now done with the Fuel installer. The maintenance use case
+is now only tested against the latest upstream Fenix. Only the sample inspector
+is tested, as Fuel does not support Vitrage or Congress.
+
+Summary
+=======
+
+Iruya Doctor framework uses OpenStack Stein integrated into its test cases.
+
+Release Data
+============
+
+Doctor changes
+
+- Maintenance use case updated to support the latest version of Fenix, running
+  in a container on the controller node
+- Maintenance use case now supports the Fuel installer
+- Doctor updated to use OpenStack Stein and only Python 3.6
+- Testing only the sample inspector due to lacking installer support for
+  Vitrage and Congress
+
+Releng changes
+
+- Doctor testing running with python 3.6 and with sample inspector
+- Doctor is only tested with Fuel installer
+
+Version change
+^^^^^^^^^^^^^^
+
+Module version changes
+~~~~~~~~~~~~~~~~~~~~~~
+
+- OpenStack has changed from Rocky to Stein since previous Hunter release.
+
+Document version changes
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+N/A
+
+Reason for version
+^^^^^^^^^^^^^^^^^^
+
+N/A
+
+Feature additions
+~~~~~~~~~~~~~~~~~
+
++--------------------+--------------------------------------------------------------+
+| **JIRA REFERENCE** | **SLOGAN** |
++--------------------+--------------------------------------------------------------+
+| DOCTOR-134 | Update Doctor maintenance use case to work with latest Fenix |
++--------------------+--------------------------------------------------------------+
+
+Deliverables
+------------
+
+Software deliverables
+=====================
+
+None
+
+Documentation deliverables
+==========================
+
+https://git.opnfv.org/doctor/tree/docs
+
+Known Limitations, Issues and Workarounds
+=========================================
+
+System Limitations
+^^^^^^^^^^^^^^^^^^
+
+Maintenance test case requirements:
+
+- Minimum number of nodes: 1 Controller, 3 Computes
+- Min number of VCPUs: 2 VCPUs for each compute
+
+Known issues
+^^^^^^^^^^^^
+
+None
+
+Workarounds
+^^^^^^^^^^^
+
+None
+
+Test Result
+===========
+
+Doctor CI results with TEST_CASE='fault_management' and INSPECTOR_TYPE=sample
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++--------------------------------------+--------------+
+| **TEST-SUITE** | **Results:** |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='fuel' | SUCCESS |
++--------------------------------------+--------------+
+
+Doctor CI results with TEST_CASE='maintenance' and INSPECTOR_TYPE=sample
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++--------------------------------------+--------------+
+| **TEST-SUITE** | **Results:** |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='fuel' | SUCCESS |
+| ADMIN_TOOL_TYPE='fenix' *) | |
++--------------------------------------+--------------+
+
+*) Sample implementation not updated according to latest upstream Fenix
+ and is currently not being tested.
+
+References
+==========
+
+For more information about the OPNFV Doctor latest work, please see:
+
+https://wiki.opnfv.org/display/doctor/Doctor+Home
diff --git a/docs/development/overview/functest_scenario/doctor-scenario-in-functest.rst b/docs/release/scenarios/fault_management/fault_management.rst
index b3d73d5c..99371201 100644
--- a/docs/development/overview/functest_scenario/doctor-scenario-in-functest.rst
+++ b/docs/release/scenarios/fault_management/fault_management.rst
@@ -2,53 +2,19 @@
.. http://creativecommons.org/licenses/by/4.0
+Running test cases
+""""""""""""""""""
-Platform overview
-"""""""""""""""""
+Functest will call the "doctor_tests/main.py" in Doctor to run the test job.
+Doctor testing can also be triggered by tox on OPNFV installer jumphost. Tox
+is normally used for functional, module and coding style testing in Python
+project.
-Doctor platform provides these features in `Danube Release <https://wiki.opnfv.org/display/SWREL/Danube>`_:
+Currently the 'MCP' and 'devstack' installers are supported.
-* Immediate Notification
-* Consistent resource state awareness for compute host down
-* Valid compute host status given to VM owner
-These features enable high availability of Network Services on top of
-the virtualized infrastructure. Immediate notification allows VNF managers
-(VNFM) to process recovery actions promptly once a failure has occurred.
-
-Consistency of resource state is necessary to execute recovery actions
-properly in the VIM.
-
-Ability to query host status gives VM owner the possibility to get
-consistent state information through an API in case of a compute host
-fault.
-
-The Doctor platform consists of the following components:
-
-* OpenStack Compute (Nova)
-* OpenStack Telemetry (Ceilometer)
-* OpenStack Alarming (Aodh)
-* Doctor Inspector
-* Doctor Monitor
-
-.. note::
- Doctor Inspector and Monitor are sample implementations for reference.
-
-You can see an overview of the Doctor platform and how components interact in
-:numref:`figure-p1`.
-
-.. figure:: ./images/figure-p1.png
- :name: figure-p1
- :width: 100%
-
- Doctor platform and typical sequence
-
-Detailed information on the Doctor architecture can be found in the Doctor
-requirements documentation:
-http://artifacts.opnfv.org/doctor/docs/requirements/05-implementation.html
-
-Use case
-""""""""
+Fault management use case
+"""""""""""""""""""""""""
* A consumer of the NFVI wants to receive immediate notifications about faults
in the NFVI affecting the proper functioning of the virtual resources.
@@ -67,7 +33,8 @@ configuration.
Detailed workflow information is as follows:
* Consumer(VNFM): (step 0) creates resources (network, server/instance) and an
- event alarm on state down notification of that server/instance
+ event alarm on state down notification of that server/instance or Neutron
+ port.
* Monitor: (step 1) periodically checks nodes, such as ping from/to each
dplane nic to/from gw of node, (step 2) once it fails to send out event
@@ -75,29 +42,26 @@ Detailed workflow information is as follows:
* Inspector: when it receives an event, it will (step 3) mark the host down
("mark-host-down"), (step 4) map the PM to VM, and change the VM status to
- down
+ down. In the network failure case, the Neutron port is also changed to down.
-* Controller: (step 5) sends out instance update event to Ceilometer
+* Controller: (step 5) sends out an instance update event to Ceilometer. In the
+  network failure case, the Neutron port is also changed to down and a
+  corresponding event is sent to Ceilometer.
-* Notifier: (step 6) Ceilometer transforms and passes the event to Aodh,
- (step 7) Aodh will evaluate event with the registered alarm definitions,
+* Notifier: (step 6) Ceilometer transforms and passes the events to AODH,
+ (step 7) AODH will evaluate events with the registered alarm definitions,
then (step 8) it will fire the alarm to the "consumer" who owns the
instance
* Consumer(VNFM): (step 9) receives the event and (step 10) recreates a new
instance
-Test case
-"""""""""
-
-Functest will call the "run.sh" script in Doctor to run the test job.
+Fault management test case
+""""""""""""""""""""""""""
-Currently, only 'Apex' and 'local' installer are supported. The test also
-can run successfully in 'fuel' installer with the modification of some
-configurations of OpenStack in the script. But still need 'fuel' installer
-to support these configurations.
+Functest will call the 'doctor-test' command in Doctor to run the test job.
-The "run.sh" script will execute the following steps.
+The following steps are executed:
Firstly, get the installer ip according to the installer type. Then ssh to
the installer node to get the private key for accessing to the cloud. As
diff --git a/docs/release/scenarios/maintenance/images/Fault-management-design.png b/docs/release/scenarios/maintenance/images/Fault-management-design.png
new file mode 100644
index 00000000..6d98cdec
--- /dev/null
+++ b/docs/release/scenarios/maintenance/images/Fault-management-design.png
Binary files differ
diff --git a/docs/development/overview/functest_scenario/images/LICENSE b/docs/release/scenarios/maintenance/images/LICENSE
index 21a2d03d..21a2d03d 100644
--- a/docs/development/overview/functest_scenario/images/LICENSE
+++ b/docs/release/scenarios/maintenance/images/LICENSE
diff --git a/docs/release/scenarios/maintenance/images/Maintenance-design.png b/docs/release/scenarios/maintenance/images/Maintenance-design.png
new file mode 100644
index 00000000..8f21db6a
--- /dev/null
+++ b/docs/release/scenarios/maintenance/images/Maintenance-design.png
Binary files differ
diff --git a/docs/release/scenarios/maintenance/images/Maintenance-workflow.png b/docs/release/scenarios/maintenance/images/Maintenance-workflow.png
new file mode 100644
index 00000000..9b65fd59
--- /dev/null
+++ b/docs/release/scenarios/maintenance/images/Maintenance-workflow.png
Binary files differ
diff --git a/docs/release/scenarios/maintenance/maintenance.rst b/docs/release/scenarios/maintenance/maintenance.rst
new file mode 100644
index 00000000..ecfe76b1
--- /dev/null
+++ b/docs/release/scenarios/maintenance/maintenance.rst
@@ -0,0 +1,120 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+
+Maintenance use case
+""""""""""""""""""""
+
+* A consumer of the NFVI wants to interact with NFVI maintenance, upgrade and
+  scaling, and to have graceful retirement. By receiving notifications about
+  these NFVI events and responding to them within a given time window, the
+  consumer can guarantee zero downtime for its service.
+
+The maintenance use case adds an `admin tool` and an `app manager` component to
+the Doctor platform. An overview of the maintenance components can be seen in
+:numref:`figure-p2`.
+
+.. figure:: ./images/Maintenance-design.png
+ :name: figure-p2
+ :width: 100%
+
+ Doctor platform components in maintenance use case
+
+In the maintenance use case, the `app manager` (VNFM) will subscribe to
+maintenance notifications triggered by project specific alarms through AODH.
+This is how it gets to know about the different NFVI maintenance, upgrade and
+scaling operations that affect its instances. The `app manager` can do the
+actions depicted in `green color` or tell the `admin tool` to do the admin
+actions depicted in `orange color`.
+
+Any infrastructure component, like the `Inspector`, can subscribe to maintenance
+notifications triggered by host specific alarms through AODH. Subscribing to
+these notifications needs admin privileges, and they tell when a host is taken
+out of use for maintenance and when it is put back into production.
+
+Maintenance test case
+"""""""""""""""""""""
+
+The maintenance test case is currently running in our Apex CI and is executed by
+tox. This is because of the special limitation mentioned below and the fact that
+we currently only have a sample implementation as a proof of concept; we also
+support the unofficial OpenStack project Fenix. The environment variable
+TEST_CASE='maintenance' needs to be used when executing "doctor_tests/main.py",
+and ADMIN_TOOL_TYPE='fenix' if you want to test with Fenix instead of the sample
+implementation. The test case workflow can be seen in :numref:`figure-p3`.
+
+.. figure:: ./images/Maintenance-workflow.png
+ :name: figure-p3
+ :width: 100%
+
+ Maintenance test case workflow
+
+In the test case, all compute capacity will be consumed with project (VNF)
+instances. Because of the redundant services on instances and the empty compute
+node needed for maintenance, the test case needs at least 3 compute nodes in the
+system. There will be 2 instances on each compute, so the minimum number of
+VCPUs is also 2. Regardless of how many compute nodes there are, the application
+will always have 2 redundant instances (ACT-STDBY) on different compute nodes,
+and the rest of the compute capacity will be filled with non-redundant
+instances.
+
+For each project specific maintenance message there is a time window for the
+`app manager` to take any needed action. This will guarantee zero downtime for
+its service. All replies back are done by calling the `admin tool` API given in
+the message.
+
+The following steps are executed:
+
+The infrastructure admin will call the `admin tool` API to trigger maintenance
+for compute hosts having instances belonging to a VNF.
+
+A project specific `MAINTENANCE` notification is triggered to tell the
+`app manager` that its instances are going to be hit by infrastructure
+maintenance at a specific point in time. The `app manager` will call the
+`admin tool` API to answer back with `ACK_MAINTENANCE`.
+
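+The reply is a call to the API endpoint given in the notification payload. As a
+hedged illustration only, assuming the payload carries the `reply_url` and
+`session_id` traits defined for the maintenance.scheduled event, and using the
+sample `admin tool` default port 12347, the answer could look roughly like this
+(the exact URL and payload format depend on the `admin tool` or Fenix version
+used):
+
+.. code-block:: bash
+
+    # Hypothetical reply; the real URL arrives in the notification as reply_url.
+    curl -X PUT http://<admin-tool-ip>:12347/maintenance/<session_id> \
+        -H "Content-Type: application/json" \
+        -d '{"state": "ACK_MAINTENANCE"}'
+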
+When the time comes to start the actual maintenance workflow in the `admin tool`,
+a `DOWN_SCALE` notification is triggered, as there is no empty compute node for
+maintenance (or compute upgrade). The project receives the corresponding alarm,
+scales down its instances and calls the `admin tool` API to answer back with
+`ACK_DOWN_SCALE`.
+
+As it might happen that instances are not scaled down (removed) from a single
+compute node, the `admin tool` might need to figure out which compute node should
+be made empty first and send `PREPARE_MAINTENANCE` to the project, telling which
+instance needs to be migrated to get the needed empty compute. The `app manager`
+makes sure it is ready to migrate the instance and calls the `admin tool` API to
+answer back with `ACK_PREPARE_MAINTENANCE`. The `admin tool` will make the
+migration and answer `ADMIN_ACTION_DONE`, so the `app manager` knows the instance
+can be used again.
+
+:numref:`figure-p3` next has a light blue section of actions to be done for each
+compute. However, as we now have one empty compute, we will maintain/upgrade that
+one first. So on the first round, we can directly put the compute into
+maintenance and send the admin level, host specific `IN_MAINTENANCE` message.
+This is caught by the `Inspector`, so it knows the host is down for maintenance.
+The `Inspector` can now disable any automatic fault management actions for the
+host, as it may be down on purpose. After the `admin tool` has completed the
+maintenance/upgrade, a `MAINTENANCE_COMPLETE` message is sent to tell that the
+host is back in production.
+
+On the next rounds we always have instances on the compute, so we need a
+`PLANNED_MAINTENANCE` message to tell that those instances are now going to be
+hit by maintenance. When the `app manager` receives this message, it knows that
+instances to be moved away from the compute will now move to an already
+maintained/upgraded host. In the test case no upgrade is done on the application
+side to upgrade instances according to new infrastructure capabilities, but this
+could be done here, as this information is also passed in the message. This
+might be just upgrading some RPMs, but also completely re-instantiating an
+instance with a new flavor. Now, if the application runs the active side of a
+redundant instance on this compute, a switchover will be done. After the
+`app manager` is ready it will call the `admin tool` API to answer back with
+`ACK_PLANNED_MAINTENANCE`. In the test case the answer is `migrate`, so the
+`admin tool` will migrate the instances and reply `ADMIN_ACTION_DONE`, and then
+the `app manager` knows the instances can be used again. Then we are ready to do
+the actual maintenance as previously, through the `IN_MAINTENANCE` and
+`MAINTENANCE_COMPLETE` steps.
+
+After all computes are maintained, the `admin tool` can send
+`MAINTENANCE_COMPLETE` to tell that the maintenance/upgrade is now complete. For
+the `app manager` this means it can scale back to full capacity.
+
+There is currently a sample implementation of the VNFM and the test case. On the
+infrastructure side there is a sample implementation of the 'admin_tool', and
+there is also support for OpenStack Fenix, which extends the use case to
+support 'ETSI FEAT03' for VNFM interaction and to optimize the whole
+infrastructure maintenance and upgrade.
diff --git a/docs/release/userguide/feature.userguide.rst b/docs/release/userguide/feature.userguide.rst
index 0dde4f26..0783e0fd 100644
--- a/docs/release/userguide/feature.userguide.rst
+++ b/docs/release/userguide/feature.userguide.rst
@@ -9,7 +9,7 @@ Doctor. The implementation is based on OpenStack and related components. The
Monitor can be realized by a sample Python-based implementation provided in the
Doctor code repository. The Controller is realized by OpenStack Nova, Neutron
and Cinder for compute, network and storage, respectively. The Inspector can be
-realized by OpenStack Congress or a sample Python-based implementation also
+realized by OpenStack Congress, Vitrage or a sample Python-based implementation also
available in the code repository of Doctor. The Notifier is realized by
OpenStack Aodh.
@@ -26,18 +26,21 @@ Immediate Notification
Immediate notification can be used by creating 'event' type alarm via
OpenStack Alarming (Aodh) API with relevant internal components support.
-See, upstream spec document:
-http://specs.openstack.org/openstack/ceilometer-specs/specs/liberty/event-alarm-evaluator.html
+See:
+- Upstream spec document:
+https://specs.openstack.org/openstack/ceilometer-specs/specs/liberty/event-alarm-evaluator.html
+- Aodh official documentation:
+https://docs.openstack.org/aodh/latest
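+
+As a hedged sketch (the alarm name, instance UUID and notification URL below are
+placeholders, and the exact query used by Doctor may differ), such an event
+alarm can be created with the Aodh CLI:
+
+.. code-block:: bash
+
+    # Fire an alarm to the consumer when the instance state goes down
+    openstack alarm create --name instance_down --type event \
+        --event-type compute.instance.update \
+        --query "traits.instance_id=string::<instance-uuid>" \
+        --alarm-action "http://<consumer-ip>:12346/"
+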
An example of a consumer of this notification can be found in the Doctor
repository. It can be executed as follows:
.. code-block:: bash
- git clone https://gerrit.opnfv.org/gerrit/doctor -b stable/danube
- cd doctor/tests
+ git clone https://gerrit.opnfv.org/gerrit/doctor
+ cd doctor/doctor_tests/consumer
CONSUMER_PORT=12346
- python consumer.py "$CONSUMER_PORT" > consumer.log 2>&1 &
+ python sample.py "$CONSUMER_PORT" > consumer.log 2>&1 &
Consistent resource state awareness
-----------------------------------
@@ -46,9 +49,10 @@ Resource state of compute host can be changed/updated according to a trigger
from a monitor running outside of OpenStack Compute (Nova) by using
force-down API.
-See
-http://artifacts.opnfv.org/doctor/danube/manuals/mark-host-down_manual.html
-for more detail.
+See:
+* Upstream spec document: https://specs.openstack.org/openstack/nova-specs/specs/liberty/implemented/mark-host-down.html
+* Upstream Compute API reference document: https://developer.openstack.org/api-ref/compute
+* Doctor Mark Host Down Manual: https://git.opnfv.org/doctor/tree/docs/development/manuals/mark-host-down_manual.rst
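+
+As a hedged example (the host name is a placeholder; force-down requires compute
+API microversion 2.11 or later and admin credentials):
+
+.. code-block:: bash
+
+    # Mark the nova-compute service on a failed host as forced down
+    openstack --os-compute-api-version 2.11 \
+        compute service set --down <failed-host> nova-compute
+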
Valid compute host status given to VM owner
-------------------------------------------
@@ -56,6 +60,42 @@ Valid compute host status given to VM owner
The resource state of a compute host can be retrieved by a user with the
OpenStack Compute (Nova) servers API.
-See
-http://artifacts.opnfv.org/doctor/danube/manuals/get-valid-server-state.html
-for more detail.
+See:
+* Upstream spec document: https://specs.openstack.org/openstack/nova-specs/specs/mitaka/implemented/get-valid-server-state.html
+* Upstream Compute API reference document: https://developer.openstack.org/api-ref/compute
+* Doctor Get Valid Server State Manual: https://git.opnfv.org/doctor/tree/docs/development/manuals/get-valid-server-state.rst
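+
+A hedged example of querying it (the server name is a placeholder; the
+host_status field requires compute API microversion 2.16 or later and suitable
+policy):
+
+.. code-block:: bash
+
+    openstack --os-compute-api-version 2.16 server show <server> -c host_status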
+
+Port data plane status update
+-----------------------------
+
+Port data plane status can be changed/updated in the case of issues in the underlying data plane
+affecting connectivity from/to Neutron ports.
+
+See:
+* Upstream spec document: https://specs.openstack.org/openstack/neutron-specs/specs/pike/port-data-plane-status.html
+* Upstream Networking API reference document: https://developer.openstack.org/api-ref/network
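+
+A hedged sketch, assuming a python-openstackclient version that already includes
+data plane status support (the port ID is a placeholder):
+
+.. code-block:: bash
+
+    openstack port set --data-plane-status DOWN <port-id>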
+
+Doctor driver (Congress)
+------------------------
+
+The Doctor driver can be notified about NFVI failures that have been detected by monitoring systems.
+
+See:
+* Upstream spec document: https://specs.openstack.org/openstack/congress-specs/specs/mitaka/push-type-datasource-driver.html
+* Congress official documentation: https://docs.openstack.org/congress/latest
+
+Event API (Vitrage)
+-------------------
+With this API, monitoring systems can push events to the Doctor datasource.
+
+See:
+* Upstream spec document: https://specs.openstack.org/openstack/vitrage-specs/specs/ocata/event-api.html
+* Vitrage official documentation: https://docs.openstack.org/vitrage/latest
+
+Doctor datasource (Vitrage)
+---------------------------
+After receiving events from monitoring systems, the Doctor datasource identifies the affected resources based on the resource topology.
+
+See:
+* Upstream spec document: https://specs.openstack.org/openstack/vitrage-specs/specs/ocata/doctor-datasource.html
+
diff --git a/docs/development/manuals/get-valid-server-state.rst b/docs/release/userguide/get-valid-server-state.rst
index 824ea3c2..824ea3c2 100644
--- a/docs/development/manuals/get-valid-server-state.rst
+++ b/docs/release/userguide/get-valid-server-state.rst
diff --git a/docs/release/userguide/index.rst b/docs/release/userguide/index.rst
index eee855dc..577072c7 100644
--- a/docs/release/userguide/index.rst
+++ b/docs/release/userguide/index.rst
@@ -11,3 +11,6 @@ Doctor User Guide
:maxdepth: 2
feature.userguide.rst
+ get-valid-server-state.rst
+ mark-host-down_manual.rst
+ monitors.rst
diff --git a/docs/development/manuals/mark-host-down_manual.rst b/docs/release/userguide/mark-host-down_manual.rst
index 3815205d..3815205d 100644
--- a/docs/development/manuals/mark-host-down_manual.rst
+++ b/docs/release/userguide/mark-host-down_manual.rst
diff --git a/docs/development/manuals/monitors.rst b/docs/release/userguide/monitors.rst
index 0d22b1de..eeb5e226 100644
--- a/docs/development/manuals/monitors.rst
+++ b/docs/release/userguide/monitors.rst
@@ -23,7 +23,8 @@ calculated by using the difference of time at which compute node sends notificat
control node and the time at which consumer is notified. The time on control and compute
node has to be synchronized for this reason. For further details on setting up collectd
on the compute node, use the following link:
-http://docs.opnfv.org/en/stable-danube/submodules/barometer/docs/release/userguide/feature.userguide.html#id18
+:doc:`<barometer:release/userguide/feature.userguide>`
+
Collectd monitors an interface managed by OVS. If the interface is not assigned
an IP, the user has to provide the name of the interface to be monitored. The command to
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 00000000..9fde2df2
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,2 @@
+lfdocs-conf
+sphinx_opnfv_theme
diff --git a/docs/testing/developer/index.rst b/docs/testing/developer/index.rst
new file mode 100644
index 00000000..dfbcfa74
--- /dev/null
+++ b/docs/testing/developer/index.rst
@@ -0,0 +1,13 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. SPDX-License-Identifier: CC-BY-4.0
+.. (c) Open Platform for NFV Project, Inc. and its contributors
+
+*********
+Developer
+*********
+
+.. toctree::
+ :numbered:
+ :maxdepth: 2
+
+ testing.rst
diff --git a/docs/testing/developer/testing.rst b/docs/testing/developer/testing.rst
new file mode 100644
index 00000000..6a929130
--- /dev/null
+++ b/docs/testing/developer/testing.rst
@@ -0,0 +1,82 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+==============
+Testing Doctor
+==============
+
+You have two options to test Doctor functions with the script developed
+for doctor CI.
+
+You need to install OpenStack and other OPNFV components except Doctor Sample
+Inspector, Sample Monitor and Sample Consumer, as these will be launched in
+this script. You are encouraged to use OPNFV official installers, but you can
+also deploy all components with other installers such as devstack or manual
+operation. In those cases, the versions of all components shall match the
+versions used in the specific OPNFV release.
+
+Run Test Script
+===============
+
+The Doctor project has its own testing script under `doctor/doctor_tests`_. This
+test script can be used for functional testing against an OPNFV deployment.
+
+.. _doctor/doctor_tests: https://git.opnfv.org/doctor/tree/doctor_tests
+
+Before running this script, make sure OpenStack env parameters are set properly
+(See e.g. `OpenStackClient Configuration`_), so that Doctor Inspector can operate
+OpenStack services.
+
+.. _OpenStackClient Configuration: https://docs.openstack.org/python-openstackclient/latest/configuration/index.html
+
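+A minimal example of such environment settings (all values are placeholders for
+your own deployment):
+
+.. code-block:: bash
+
+    export OS_AUTH_URL=http://<keystone-ip>/identity/v3
+    export OS_USERNAME=admin
+    export OS_PASSWORD=admin
+    export OS_PROJECT_NAME=admin
+    export OS_USER_DOMAIN_NAME=Default
+    export OS_PROJECT_DOMAIN_NAME=Default
+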
+Doctor now supports different test cases and for that you might want to
+export TEST_CASE with different values:
+
+.. code-block:: bash
+
+ #Fault management (default)
+ export TEST_CASE='fault_management'
+ #Maintenance (requires 3 compute nodes)
+ export TEST_CASE='maintenance'
+ #Run both tests cases
+ export TEST_CASE='all'
+
+ #Use Fenix in maintenance testing instead of sample admin_tool
+ #This is only for the 'maintenance' test case
+ export ADMIN_TOOL_TYPE='fenix'
+ export APP_MANAGER_TYPE='vnfm'
+
+ #Run in different installer jumphost 'fuel' or 'apex'
+ #In multinode DevStack you run Doctor in controller node
+ #with value export APP_MANAGER_TYPE=vnfm
+ export INSTALLER_TYPE='fuel'
+
+Run Python Test Script
+~~~~~~~~~~~~~~~~~~~~~~
+
+You can run the python script as follows:
+
+.. code-block:: bash
+
+ git clone https://gerrit.opnfv.org/gerrit/doctor
+ cd doctor && tox
+
+You can see all the configurations with their default values in the sample
+configuration file `doctor.sample.conf`_. You can also modify the file to match
+your environment and then run the test.
+
+.. _doctor.sample.conf: https://git.opnfv.org/doctor/tree/etc/doctor.sample.conf
+
+In OPNFV testing environment jumphost you can run Doctor testing as follows
+using tox:
+
+.. code-block:: bash
+
+ source overcloudrc
+ export INSTALLER_IP=${INSTALLER_IP}
+ export INSTALLER_TYPE=${INSTALLER_TYPE}
+ git clone https://gerrit.opnfv.org/gerrit/doctor
+ cd doctor
+ sudo -E tox
+
+Note! In DevStack you run Doctor on the controller node.
diff --git a/docs/testing/index.rst b/docs/testing/index.rst
new file mode 100644
index 00000000..3fae9568
--- /dev/null
+++ b/docs/testing/index.rst
@@ -0,0 +1,15 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. SPDX-License-Identifier: CC-BY-4.0
+.. (c) Open Platform for NFV Project, Inc. and its contributors
+
+.. _testing:
+
+=======
+Testing
+=======
+
+.. toctree::
+ :maxdepth: 2
+
+ ./developer/index.rst
+ ./user/index.rst
diff --git a/docs/testing/user/index.rst b/docs/testing/user/index.rst
new file mode 100644
index 00000000..1be9c7eb
--- /dev/null
+++ b/docs/testing/user/index.rst
@@ -0,0 +1,13 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. SPDX-License-Identifier: CC-BY-4.0
+.. (c) Open Platform for NFV Project, Inc. and its contributors
+
+****
+User
+****
+
+.. toctree::
+ :numbered:
+ :maxdepth: 2
+
+ testing.rst
diff --git a/docs/testing/user/testing.rst b/docs/testing/user/testing.rst
new file mode 100644
index 00000000..6172d26a
--- /dev/null
+++ b/docs/testing/user/testing.rst
@@ -0,0 +1,30 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+Run Functest Suite (obsolete)
+=============================
+
+Functest supports Doctor testing by triggering the Doctor test script inside a
+Functest container. You can run the Doctor test with the following steps:
+
+.. code-block:: bash
+
+ DOCKER_TAG=latest
+ docker pull docker.io/opnfv/functest-features:${DOCKER_TAG}
+ docker run --privileged=true -id \
+ -e INSTALLER_TYPE=${INSTALLER_TYPE} \
+ -e INSTALLER_IP=${INSTALLER_IP} \
+ -e INSPECTOR_TYPE=sample \
+ docker.io/opnfv/functest-features:${DOCKER_TAG} /bin/bash
+ docker exec <container_id> functest testcase run doctor-notification
+
+See the :doc:`Functest user guide <functest:testing/user/userguide>` for more
+information.
+
+
+For testing with a stable version, change DOCKER_TAG to 'stable' or to another
+release tag identifier.
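+
+For example (a minimal sketch):
+
+.. code-block:: bash
+
+ #run against a released Functest image instead of 'latest'
+ DOCKER_TAG=stable
+ docker pull docker.io/opnfv/functest-features:${DOCKER_TAG}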
+
+Tips
+====
diff --git a/doctor_tests/admin_tool/__init__.py b/doctor_tests/admin_tool/__init__.py
new file mode 100644
index 00000000..3417a334
--- /dev/null
+++ b/doctor_tests/admin_tool/__init__.py
@@ -0,0 +1,37 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+from oslo_config import cfg
+from oslo_utils import importutils
+import os
+
+OPTS = [
+ cfg.StrOpt('type',
+ default=os.environ.get('ADMIN_TOOL_TYPE', 'sample'),
+ choices=['sample', 'fenix'],
+ help='the component of doctor admin_tool',
+ required=True),
+ cfg.StrOpt('ip',
+ default='0.0.0.0',
+ help='the ip of admin_tool',
+ required=True),
+ cfg.IntOpt('port',
+ default=12347,
+ help='the port of doctor admin_tool',
+ required=True),
+]
+
+
+_admin_tool_name_class_mapping = {
+ 'sample': 'doctor_tests.admin_tool.sample.SampleAdminTool'
+}
+
+
+def get_admin_tool(trasport_url, conf, log):
+ admin_tool_class = _admin_tool_name_class_mapping.get(conf.admin_tool.type)
+ return importutils.import_object(admin_tool_class, trasport_url, conf, log)
diff --git a/doctor_tests/admin_tool/base.py b/doctor_tests/admin_tool/base.py
new file mode 100644
index 00000000..0f0b2dcd
--- /dev/null
+++ b/doctor_tests/admin_tool/base.py
@@ -0,0 +1,26 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import abc
+import six
+
+
+@six.add_metaclass(abc.ABCMeta)
+class BaseAdminTool(object):
+
+ def __init__(self, conf, log):
+ self.conf = conf
+ self.log = log
+
+ @abc.abstractmethod
+ def start(self):
+ pass
+
+ @abc.abstractmethod
+ def stop(self):
+ pass
diff --git a/doctor_tests/admin_tool/fenix/Dockerfile b/doctor_tests/admin_tool/fenix/Dockerfile
new file mode 100644
index 00000000..202380eb
--- /dev/null
+++ b/doctor_tests/admin_tool/fenix/Dockerfile
@@ -0,0 +1,34 @@
+FROM gliderlabs/alpine:3.6
+
+ARG BRANCH=master
+ARG OPENSTACK=master
+
+EXPOSE 12347
+
+RUN echo "Building Fenix container against OpenStack $OPENSTACK" && \
+ echo "Building Fenix with $BRANCH" && \
+ mkdir /etc/fenix && \
+ mkdir -p /var/tmp/fenix
+WORKDIR /var/tmp/fenix
+COPY fenix*.conf /etc/fenix/
+
+RUN apk --no-cache add ca-certificates && \
+ apk --no-cache add --update python3 sshpass py-pip git curl && \
+ apk --no-cache add --virtual .build-deps --update \
+ python3-dev build-base linux-headers libffi-dev \
+ openssl-dev libjpeg-turbo-dev && \
+ curl https://opendev.org/openstack/requirements/raw/branch/$OPENSTACK/upper-constraints.txt > upper-constraints.txt && \
+ if [ ! -e /usr/bin/pip ]; then ln -s pip3 /usr/bin/pip ; fi && \
+ if [ ! -e /usr/bin/python ]; then ln -sf /usr/bin/python3 /usr/bin/python; fi && \
+ pip3 install --upgrade pip && \
+ pip3 install alembic aodhclient decorator flask Flask-RESTful eventlet jsonschema \
+ keystoneauth1 keystonemiddleware python-novaclient oslo.config pecan \
+ oslo.db oslo.log oslo.messaging oslo.serialization oslo.service oslo_policy \
+ oslotest oslo.utils pbr pymysql six sqlalchemy -cupper-constraints.txt && \
+ git clone https://opendev.org/x/fenix -b $BRANCH /fenix && \
+ rm -fr /var/tmp/fenix
+COPY run /fenix
+COPY keystonercv3 /fenix
+WORKDIR /fenix
+RUN python3 setup.py install
+CMD ./run
diff --git a/doctor_tests/admin_tool/fenix/run b/doctor_tests/admin_tool/fenix/run
new file mode 100755
index 00000000..50ae68e7
--- /dev/null
+++ b/doctor_tests/admin_tool/fenix/run
@@ -0,0 +1,32 @@
+#!/bin/sh
+. keystonercv3
+
+# Start the first process
+nohup python3 /fenix/fenix/cmd/engine.py > /var/log/fenix-engine.log&
+status=$?
+if [ $status -ne 0 ]; then
+ echo "Failed to start engine.py: $status"
+ exit $status
+fi
+
+# Start the second process
+nohup python3 /fenix/fenix/cmd/api.py > /var/log/fenix-api.log&
+status=$?
+if [ $status -ne 0 ]; then
+ echo "Failed to start api.py: $status"
+ exit $status
+fi
+
+echo "started Fenix: engine and api"
+while sleep 60; do
+ ps aux |grep "cmd/engine.py" |grep -q -v grep
+ PROCESS_1_STATUS=$?
+ ps aux |grep "cmd/api.py" |grep -q -v grep
+ PROCESS_2_STATUS=$?
+ # If the greps above find anything, they exit with 0 status
+ # If they are not both 0, then something is wrong
+ if [ $PROCESS_1_STATUS -ne 0 -o $PROCESS_2_STATUS -ne 0 ]; then
+ echo "One of the processes has already exited."
+ exit 1
+ fi
+done
diff --git a/doctor_tests/admin_tool/sample.py b/doctor_tests/admin_tool/sample.py
new file mode 100644
index 00000000..a71f43a1
--- /dev/null
+++ b/doctor_tests/admin_tool/sample.py
@@ -0,0 +1,739 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import datetime
+from flask import Flask
+from flask import request
+import json
+from novaclient.exceptions import BadRequest
+import oslo_messaging as messaging
+import requests
+import time
+from threading import Thread
+from traceback import format_exc
+from uuid import uuid1 as generate_uuid
+
+from doctor_tests.admin_tool.base import BaseAdminTool
+from doctor_tests.identity_auth import get_identity_auth
+from doctor_tests.identity_auth import get_session
+from doctor_tests.os_clients import aodh_client
+from doctor_tests.os_clients import nova_client
+
+
+class SampleAdminTool(BaseAdminTool):
+
+ def __init__(self, trasport_url, conf, log):
+ super(SampleAdminTool, self).__init__(conf, log)
+ self.trasport_url = trasport_url
+ self.app = None
+
+ def start(self):
+ self.log.info('sample admin tool start......')
+ self.app = AdminTool(self.trasport_url, self.conf, self, self.log)
+ self.app.start()
+
+ def stop(self):
+ self.log.info('sample admin tool stop......')
+ if not self.app:
+ return
+ headers = {
+ 'Content-Type': 'application/json',
+ 'Accept': 'application/json',
+ }
+ url = 'http://%s:%d/shutdown'\
+ % (self.conf.admin_tool.ip,
+ self.conf.admin_tool.port)
+ requests.post(url, data='', headers=headers)
+
+
+class AdminMain(Thread):
+
+ def __init__(self, trasport_url, session_id, data, parent, conf, log):
+ Thread.__init__(self)
+ self.session_id = session_id
+ self.parent = parent
+ self.log = log
+ self.conf = conf
+ self.url = 'http://%s:%s' % (conf.admin_tool.ip, conf.admin_tool.port)
+ self.projects_state = dict() # current state for each project
+ self.proj_server_actions = dict() # actions for each project server
+ self.projects_servers = dict() # servers processed in current state
+ self.maint_proj_servers = dict() # servers under whole maintenance
+ self.hosts = data['hosts']
+ self.maintenance_at = data['maintenance_at']
+ self.computes_disabled = list()
+ self.metadata = data['metadata']
+ self.auth = get_identity_auth(project=self.conf.doctor_project)
+ self.state = data['state']
+ self.aodh = aodh_client(self.conf.aodh_version,
+ get_session(auth=self.auth))
+ self.nova = nova_client(self.conf.nova_version,
+ get_session(auth=self.auth))
+ self.log.info('transport_url %s' % trasport_url)
+ transport = messaging.get_transport(self.conf, trasport_url)
+ self.notif_proj = messaging.Notifier(transport,
+ 'maintenance.planned',
+ driver='messaging',
+ topics=['notifications'])
+ self.notif_proj = self.notif_proj.prepare(publisher_id='admin_tool')
+ self.notif_admin = messaging.Notifier(transport,
+ 'maintenance.host',
+ driver='messaging',
+ topics=['notifications'])
+ self.notif_admin = self.notif_admin.prepare(publisher_id='admin_tool')
+ self.stopped = False
+ self.log.info('Admin tool session %s initialized' % self.session_id)
+
+ def cleanup(self):
+ for host in self.computes_disabled:
+ self.log.info('enable nova-compute on %s' % host)
+ self.nova.services.enable(host, 'nova-compute')
+
+ def _projects_not_in_wanted_states(self, wanted_states):
+ if len([v for v in self.projects_state.values()
+ if v not in wanted_states]):
+ return True
+ else:
+ return False
+
+ def projects_not_in_state(self, state):
+ if len([v for v in self.projects_state.values()
+ if v != state]):
+ return True
+ else:
+ return False
+
+ def wait_projects_state(self, wanted_states, wait_seconds):
+ retries = wait_seconds
+ while (retries > 0 and
+ self._projects_not_in_wanted_states(wanted_states)):
+ time.sleep(1)
+ retries = retries - 1
+ if self._projects_not_in_wanted_states(wanted_states):
+ self.log.error('Admin tool session %s: projects in invalid states '
+ '%s' % (self.session_id, self.projects_state))
+ return False
+ else:
+ self.log.info('all projects replied')
+ return True
+
+ def _project_notify(self, project_id, instance_ids, allowed_actions,
+ actions_at, state, metadata):
+ reply_url = '%s/maintenance/%s/%s' % (self.url, self.session_id,
+ project_id)
+
+ payload = dict(project_id=project_id,
+ instance_ids=instance_ids,
+ allowed_actions=allowed_actions,
+ state=state,
+ actions_at=actions_at,
+ session_id=self.session_id,
+ metadata=metadata,
+ reply_url=reply_url)
+
+ self.log.debug('Sending "maintenance.scheduled" to project: %s' %
+ payload)
+
+ self.notif_proj.info({'some': 'context'}, 'maintenance.scheduled',
+ payload)
+
+ def _admin_notify(self, project, host, state, session_id):
+ payload = dict(project_id=project, host=host, state=state,
+ session_id=session_id)
+
+ self.log.debug('Sending "maintenance.host": %s' % payload)
+
+ self.notif_admin.info({'some': 'context'}, 'maintenance.host', payload)
+
+ def in_scale(self):
+ for project in self.projects_servers:
+ self.log.info('SCALE_IN to project %s' % project)
+ self.log.debug('instance_ids %s' % self.projects_servers[project])
+ instance_ids = '%s/maintenance/%s/%s' % (self.url, self.session_id,
+ project)
+ allowed_actions = []
+ wait_seconds = 120
+ actions_at = (datetime.datetime.utcnow() +
+ datetime.timedelta(seconds=wait_seconds)
+ ).strftime('%Y-%m-%d %H:%M:%S')
+ state = self.state
+ metadata = self.metadata
+ self._project_notify(project, instance_ids,
+ allowed_actions, actions_at, state,
+ metadata)
+ allowed_states = ['ACK_SCALE_IN', 'NACK_SCALE_IN']
+ if not self.wait_projects_state(allowed_states, wait_seconds):
+ self.state = 'MAINTENANCE_FAILED'
+ if self.projects_not_in_state('ACK_SCALE_IN'):
+ self.log.error('%s: all states not ACK_SCALE_IN' %
+ self.session_id)
+ self.state = 'MAINTENANCE_FAILED'
+
+ def maintenance(self):
+ for project in self.projects_servers:
+ self.log.info('\nMAINTENANCE to project %s\n' % project)
+ self.log.debug('instance_ids %s' % self.projects_servers[project])
+ instance_ids = '%s/maintenance/%s/%s' % (self.url, self.session_id,
+ project)
+ allowed_actions = []
+ actions_at = self.maintenance_at
+ state = self.state
+ metadata = self.metadata
+ maint_at = self.str_to_datetime(self.maintenance_at)
+ td = maint_at - datetime.datetime.utcnow()
+ wait_seconds = int(td.total_seconds())
+ if wait_seconds < 10:
+ raise Exception('Admin tool session %s: No time for project to'
+ ' answer: %s' %
+ (self.session_id, wait_seconds))
+ self._project_notify(project, instance_ids,
+ allowed_actions, actions_at, state,
+ metadata)
+ allowed_states = ['ACK_MAINTENANCE', 'NACK_MAINTENANCE']
+ if not self.wait_projects_state(allowed_states, wait_seconds):
+ self.state = 'MAINTENANCE_FAILED'
+ if self.projects_not_in_state('ACK_MAINTENANCE'):
+ self.log.error('%s: all states not ACK_MAINTENANCE' %
+ self.session_id)
+ self.state = 'MAINTENANCE_FAILED'
+
+ def maintenance_complete(self):
+ for project in self.projects_servers:
+ self.log.info('MAINTENANCE_COMPLETE to project %s' % project)
+ instance_ids = '%s/maintenance/%s/%s' % (self.url, self.session_id,
+ project)
+ allowed_actions = []
+ wait_seconds = 120
+ actions_at = (datetime.datetime.utcnow() +
+ datetime.timedelta(seconds=wait_seconds)
+ ).strftime('%Y-%m-%d %H:%M:%S')
+ state = 'MAINTENANCE_COMPLETE'
+ metadata = self.metadata
+ self._project_notify(project, instance_ids,
+ allowed_actions, actions_at, state,
+ metadata)
+ allowed_states = ['ACK_MAINTENANCE_COMPLETE',
+ 'NACK_MAINTENANCE_COMPLETE']
+ if not self.wait_projects_state(allowed_states, wait_seconds):
+ self.state = 'MAINTENANCE_FAILED'
+ if self.projects_not_in_state('ACK_MAINTENANCE_COMPLETE'):
+ self.log.error('%s: all states not ACK_MAINTENANCE_COMPLETE' %
+ self.session_id)
+ self.state = 'MAINTENANCE_FAILED'
+
+ def need_in_scale(self, host_servers):
+ room_for_instances = 0
+ for host in host_servers:
+ instances = 0
+ for project in host_servers[host]:
+ for instance in host_servers[host][project]:
+ instances += 1
+ room_for_instances += (2 - instances)
+ self.log.info('there is room for %d instances' % room_for_instances)
+ if room_for_instances > 1:
+ return False
+ else:
+ return True
+
+ def find_host_to_be_empty(self, host_servers):
+ host_to_be_empty = None
+ host_nonha_instances = 0
+ for host in host_servers:
+ ha_instances = 0
+ nonha_instances = 0
+ for project in host_servers[host]:
+ for instance in host_servers[host][project]:
+ if ('doctor_ha_app_' in
+ host_servers[host][project][instance]):
+ ha_instances += 1
+ else:
+ nonha_instances += 1
+ self.log.info('host %s has %d ha and %d non ha instances' %
+ (host, ha_instances, nonha_instances))
+ if ha_instances == 0:
+ if host_to_be_empty:
+ if nonha_instances < host_nonha_instances:
+ host_to_be_empty = host
+ host_nonha_instances = nonha_instances
+ else:
+ host_to_be_empty = host
+ host_nonha_instances = nonha_instances
+ self.log.info('host %s selected to be empty' % host_to_be_empty)
+ return host_to_be_empty
+
+ def make_compute_host_empty(self, host, projects_servers, statebase):
+ state = statebase
+ state_ack = 'ACK_%s' % statebase
+ state_nack = 'NACK_%s' % statebase
+ for project in projects_servers:
+ # self.projects_servers must have servers under action
+ self.projects_servers[project] = projects_servers[project].copy()
+ self.log.info('%s to project %s' % (state, project))
+ self.project_servers_log_info(project, projects_servers)
+ instance_ids = '%s/maintenance/%s/%s' % (self.url, self.session_id,
+ project)
+ allowed_actions = ['MIGRATE', 'LIVE_MIGRATE', 'OWN_ACTION']
+ wait_seconds = 120
+ actions_at = (datetime.datetime.utcnow() +
+ datetime.timedelta(seconds=wait_seconds)
+ ).strftime('%Y-%m-%d %H:%M:%S')
+ metadata = self.metadata
+ self._project_notify(project, instance_ids,
+ allowed_actions, actions_at, state,
+ metadata)
+ allowed_states = [state_ack, state_nack]
+ if not self.wait_projects_state(allowed_states, wait_seconds):
+ self.state = 'MAINTENANCE_FAILED'
+ elif self.projects_not_in_state(state_ack):
+ self.log.error('%s: all states not %s' %
+ (self.session_id, state_ack))
+ self.state = 'MAINTENANCE_FAILED'
+ else:
+ self.actions_to_have_empty_host(host)
+
+ def notify_action_done(self, project, instance_id):
+ instance_ids = instance_id
+ allowed_actions = []
+ actions_at = None
+ state = "INSTANCE_ACTION_DONE"
+ metadata = None
+ self._project_notify(project, instance_ids, allowed_actions,
+ actions_at, state, metadata)
+
+ def actions_to_have_empty_host(self, host):
+ retry = 0
+ while len(self.proj_server_actions) == 0:
+ time.sleep(2)
+ if retry == 10:
+ raise Exception('Admin tool session %s: project server actions'
+ ' not set' % self.session_id)
+ retry += 1
+ for project in self.proj_server_actions:
+ for server, action in self.proj_server_actions[project].items():
+ self.log.info('Action %s server %s: %s' % (action, server,
+ self.projects_servers[project][server]))
+ if action == 'MIGRATE':
+ self.migrate_server(server)
+ self.notify_action_done(project, server)
+ elif action == 'OWN_ACTION':
+ pass
+ else:
+ raise Exception('Admin tool session %s: server %s action '
+ '%s not supported' %
+ (self.session_id, server, action))
+ self.proj_server_actions = dict()
+ self._wait_host_empty(host)
+
+ def migrate_server(self, server_id):
+ server = self.nova.servers.get(server_id)
+ vm_state = server.__dict__.get('OS-EXT-STS:vm_state')
+ self.log.info('server %s state %s' % (server_id, vm_state))
+ last_vm_state = vm_state
+ retry_migrate = 5
+ while True:
+ try:
+ server.migrate()
+ time.sleep(5)
+ retries = 36
+ while vm_state != 'resized' and retries > 0:
+ # try to confirm within 3min
+ server = self.nova.servers.get(server_id)
+ vm_state = server.__dict__.get('OS-EXT-STS:vm_state')
+ if vm_state == 'resized':
+ server.confirm_resize()
+ self.log.info('server %s migration confirmed' %
+ server_id)
+ return
+ if last_vm_state != vm_state:
+ self.log.info('server %s state: %s' % (server_id,
+ vm_state))
+ if vm_state == 'error':
+ raise Exception('server %s migration failed, state: %s'
+ % (server_id, vm_state))
+ time.sleep(5)
+ retries = retries - 1
+ last_vm_state = vm_state
+ # Timeout waiting for the state to change
+ break
+
+ except BadRequest:
+ if retry_migrate == 0:
+ raise Exception('server %s migrate failed' % server_id)
+ # Might take time for scheduler to sync inconsistent instance
+ # list for host
+ retry_time = 180 - (retry_migrate * 30)
+ self.log.info('server %s migrate failed, retry in %s sec'
+ % (server_id, retry_time))
+ time.sleep(retry_time)
+ except Exception as e:
+ self.log.error('server %s migration failed, Exception=%s' %
+ (server_id, e))
+ self.log.error(format_exc())
+ raise Exception('server %s migration failed, state: %s' %
+ (server_id, vm_state))
+ finally:
+ retry_migrate = retry_migrate - 1
+ raise Exception('server %s migration timeout, state: %s' %
+ (server_id, vm_state))
+
+ def _wait_host_empty(self, host):
+ hid = self.nova.hypervisors.search(host)[0].id
+ vcpus_used_last = 0
+ # wait 4min to get host empty
+ for j in range(48):
+ hvisor = self.nova.hypervisors.get(hid)
+ vcpus_used = hvisor.__getattr__('vcpus_used')
+ if vcpus_used > 0:
+ if vcpus_used_last == 0:
+ self.log.info('%s still has %d vcpus reserved. wait...'
+ % (host, vcpus_used))
+ elif vcpus_used != vcpus_used_last:
+ self.log.info('%s still has %d vcpus reserved. wait...'
+ % (host, vcpus_used))
+ vcpus_used_last = vcpus_used
+ time.sleep(5)
+ else:
+ self.log.info('%s empty' % host)
+ return
+ raise Exception('%s host not empty' % host)
+
+ def projects_listen_alarm(self, match_event):
+ match_projects = ([str(alarm['project_id']) for alarm in
+ self.aodh.alarm.list() if
+ str(alarm['event_rule']['event_type']) ==
+ match_event])
+ all_projects_match = True
+ for project in list(self.projects_state):
+ if project not in match_projects:
+ self.log.error('Admin tool session %s: project %s not '
+ 'listening to %s' %
+ (self.session_id, project, match_event))
+ all_projects_match = False
+ return all_projects_match
+
+ def project_servers_log_info(self, project, host_servers):
+ info = 'Project servers:\n'
+ for server in host_servers[project]:
+ info += (' %s: %s\n' %
+ (server, host_servers[project][server]))
+ self.log.info('%s' % info)
+
+ def servers_log_info(self, host_servers):
+ info = '\n'
+ for host in self.hosts:
+ info += '%s:\n' % host
+ if host in host_servers:
+ for project in host_servers[host]:
+ info += ' %s:\n' % project
+ for server in host_servers[host][project]:
+ info += (' %s: %s\n' %
+ (server, host_servers[host][project][server]))
+ self.log.info('%s' % info)
+
+ def update_server_info(self):
+ opts = {'all_tenants': True}
+ servers = self.nova.servers.list(search_opts=opts)
+ self.projects_servers = dict()
+ host_servers = dict()
+ for server in servers:
+ try:
+ host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
+ project = str(server.tenant_id)
+ server_name = str(server.name)
+ server_id = str(server.id)
+ except Exception:
+ raise Exception('can not get params from server=%s' %
+ server)
+ if host not in self.hosts:
+ continue
+ if host not in host_servers:
+ host_servers[host] = dict()
+ if project not in host_servers[host]:
+ host_servers[host][project] = dict()
+ if project not in self.projects_servers:
+ self.projects_servers[project] = dict()
+ if project not in self.projects_state:
+ self.projects_state[project] = None
+ host_servers[host][project][server_id] = server_name
+ self.projects_servers[project][server_id] = server_name
+ return host_servers
+
+ def str_to_datetime(self, dt_str):
+ mdate, mtime = dt_str.split()
+ year, month, day = map(int, mdate.split('-'))
+ hours, minutes, seconds = map(int, mtime.split(':'))
+ return datetime.datetime(year, month, day, hours, minutes, seconds)
+
+ def host_maintenance(self, host):
+ self.log.info('maintaining host %s' % host)
+ # no real maintenance is performed in this sample implementation
+ time.sleep(5)
+
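+ # Maintenance session state machine, as handled in run() below:
+ #   MAINTENANCE -> SCALE_IN | PREPARE_MAINTENANCE | PLANNED_MAINTENANCE
+ #   SCALE_IN -> PREPARE_MAINTENANCE
+ #   PREPARE_MAINTENANCE -> PLANNED_MAINTENANCE
+ #   PLANNED_MAINTENANCE -> PLANNED_MAINTENANCE_COMPLETE
+ #   PLANNED_MAINTENANCE_COMPLETE -> MAINTENANCE_DONE
+ # Any state may end in MAINTENANCE_FAILED if a project NACKs or times out.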
+ def run(self):
+ while (self.state not in ['MAINTENANCE_DONE', 'MAINTENANCE_FAILED'] and
+ not self.stopped):
+ self.log.info('--==session %s: processing state %s==--' %
+ (self.session_id, self.state))
+ if self.state == 'MAINTENANCE':
+ host_servers = self.update_server_info()
+ self.servers_log_info(host_servers)
+
+ if not self.projects_listen_alarm('maintenance.scheduled'):
+ raise Exception('not all projects are listening to the '
+ 'maintenance alarm')
+ self.maintenance()
+ if self.state == 'MAINTENANCE_FAILED':
+ continue
+ maint_at = self.str_to_datetime(self.maintenance_at)
+ if maint_at > datetime.datetime.utcnow():
+ time_now = (datetime.datetime.utcnow().strftime(
+ '%Y-%m-%d %H:%M:%S'))
+ self.log.info('Time now: %s maintenance starts: %s....' %
+ (time_now, self.maintenance_at))
+ td = maint_at - datetime.datetime.utcnow()
+ time.sleep(td.total_seconds())
+ time_now = (datetime.datetime.utcnow().strftime(
+ '%Y-%m-%d %H:%M:%S'))
+ self.log.info('Time to start maintenance: %s' %
+ time_now)
+
+ # check if we have empty compute host
+ # True -> PLANNED_MAINTENANCE
+ # False -> check if we can migrate VMs to get empty host
+ # True -> PREPARE_MAINTENANCE
+ # False -> SCALE_IN
+ maintenance_empty_hosts = ([h for h in self.hosts if h not in
+ host_servers])
+
+ if len(maintenance_empty_hosts) == 0:
+ if self.need_in_scale(host_servers):
+ self.log.info('Need to down scale')
+ self.state = 'SCALE_IN'
+ else:
+ self.log.info('Free capacity, but need empty host')
+ self.state = 'PREPARE_MAINTENANCE'
+ else:
+ self.log.info('Empty host available for maintenance')
+ self.state = 'PLANNED_MAINTENANCE'
+ self.log.info('--==State change from MAINTENANCE to %s==--'
+ % self.state)
+ elif self.state == 'SCALE_IN':
+ # Test case is hard coded to have all compute capacity used
+ # We need to down scale to have one empty compute host
+ self.update_server_info()
+ self.in_scale()
+ if self.state == 'MAINTENANCE_FAILED':
+ continue
+ self.state = 'PREPARE_MAINTENANCE'
+ host_servers = self.update_server_info()
+ self.servers_log_info(host_servers)
+ self.log.info('--==State change from SCALE_IN to'
+ ' %s==--' % self.state)
+
+ elif self.state == 'PREPARE_MAINTENANCE':
+ # Down scaling might not have freed capacity on a single
+ # compute host, so we may need to arrange the free capacity
+ # onto a single compute host
+ self.maint_proj_servers = self.projects_servers.copy()
+ maintenance_empty_hosts = ([h for h in self.hosts if h not in
+ host_servers])
+ if len(maintenance_empty_hosts) == 0:
+ self.log.info('no empty hosts for maintenance')
+ if self.need_in_scale(host_servers):
+ raise Exception('Admin tool session %s: Not enough '
+ 'free capacity for maintenance' %
+ self.session_id)
+ host = self.find_host_to_be_empty(host_servers)
+ if host:
+ self.make_compute_host_empty(host, host_servers[host],
+ 'PREPARE_MAINTENANCE')
+ if self.state == 'MAINTENANCE_FAILED':
+ continue
+ else:
+ # We do not currently support another down scale if
+ # first was not enough
+ raise Exception('Admin tool session %s: No host '
+ 'candidate to be emptied' %
+ self.session_id)
+ else:
+ for host in maintenance_empty_hosts:
+ self.log.info('%s already empty '
+ 'for maintenance' % host)
+ self.state = 'PLANNED_MAINTENANCE'
+ host_servers = self.update_server_info()
+ self.servers_log_info(host_servers)
+ self.log.info('--==State change from PREPARE_MAINTENANCE to %s'
+ '==--' % self.state)
+ elif self.state == 'PLANNED_MAINTENANCE':
+ maintenance_hosts = list()
+ maintenance_empty_hosts = list()
+ # TODO: This should be the admin project; hack for now to make it work
+ admin_project = list(self.projects_state)[0]
+ for host in self.hosts:
+ self.log.info('disable nova-compute on host %s' % host)
+ self.nova.services.disable_log_reason(host, 'nova-compute',
+ 'maintenance')
+ self.computes_disabled.append(host)
+ if host in host_servers and len(host_servers[host]):
+ maintenance_hosts.append(host)
+ else:
+ maintenance_empty_hosts.append(host)
+ self.log.info('--==Start to maintain empty hosts==--\n%s' %
+ maintenance_empty_hosts)
+ self.update_server_info()
+ for host in maintenance_empty_hosts:
+ # the scheduler may have stale information, so check that the
+ # just down-scaled host is really empty
+ self._wait_host_empty(host)
+ self.log.info('IN_MAINTENANCE host %s' % host)
+ self._admin_notify(admin_project, host, 'IN_MAINTENANCE',
+ self.session_id)
+ self.host_maintenance(host)
+ self._admin_notify(admin_project, host,
+ 'MAINTENANCE_COMPLETE',
+ self.session_id)
+ self.nova.services.enable(host, 'nova-compute')
+ self.computes_disabled.remove(host)
+ self.log.info('MAINTENANCE_COMPLETE host %s' % host)
+ self.log.info('--==Start to maintain occupied hosts==--\n%s' %
+ maintenance_hosts)
+ for host in maintenance_hosts:
+ self.log.info('PLANNED_MAINTENANCE host %s' % host)
+ self.make_compute_host_empty(host, host_servers[host],
+ 'PLANNED_MAINTENANCE')
+ if self.state == 'MAINTENANCE_FAILED':
+ continue
+ self.log.info('IN_MAINTENANCE host %s' % host)
+ self._admin_notify(admin_project, host, 'IN_MAINTENANCE',
+ self.session_id)
+ self.host_maintenance(host)
+ self._admin_notify(admin_project, host,
+ 'MAINTENANCE_COMPLETE',
+ self.session_id)
+ self.nova.services.enable(host, 'nova-compute')
+ self.computes_disabled.remove(host)
+ self.log.info('MAINTENANCE_COMPLETE host %s' % host)
+ self.state = 'PLANNED_MAINTENANCE_COMPLETE'
+ host_servers = self.update_server_info()
+ self.servers_log_info(host_servers)
+ elif self.state == 'PLANNED_MAINTENANCE_COMPLETE':
+ self.log.info('Projects still need to up scale back to full '
+ 'capacity')
+ self.maintenance_complete()
+ if self.state == 'MAINTENANCE_FAILED':
+ continue
+ host_servers = self.update_server_info()
+ self.servers_log_info(host_servers)
+ self.state = 'MAINTENANCE_DONE'
+ else:
+ raise Exception('Admin tool session %s: session in invalid '
+ 'state %s' % (self.session_id, self.state))
+ self.log.info('--==Maintenance session %s: %s==--' %
+ (self.session_id, self.state))
+
+ def project_input(self, project_id, data):
+ self.log.debug('Admin tool session %s: project %s input' %
+ (self.session_id, project_id))
+ if 'instance_actions' in data:
+ self.proj_server_actions[project_id] = (
+ data['instance_actions'].copy())
+ self.projects_state[project_id] = data['state']
+
+ def project_get_instances(self, project_id):
+ ret = list(self.projects_servers[project_id])
+ self.log.debug('Admin tool session %s: project %s GET return: %s' %
+ (self.session_id, project_id, ret))
+ return ret
+
+ def stop(self):
+ self.stopped = True
+
+
+class AdminTool(Thread):
+
+ def __init__(self, trasport_url, conf, admin_tool, log):
+ Thread.__init__(self)
+ self.admin_tool = admin_tool
+ self.log = log
+ self.conf = conf
+ self.maint_sessions = {}
+ self.projects = {}
+ self.maintenance_hosts = []
+ self.trasport_url = trasport_url
+
+ def run(self):
+ app = Flask('admin_tool')
+
+ @app.route('/maintenance', methods=['POST'])
+ def admin_maintenance_api_post():
+ data = json.loads(request.data.decode('utf8'))
+ self.log.info('maintenance message: %s' % data)
+ session_id = str(generate_uuid())
+ self.log.info('creating session: %s' % session_id)
+ self.maint_sessions[session_id] = (
+ AdminMain(self.trasport_url,
+ session_id,
+ data,
+ self,
+ self.conf,
+ self.log))
+ self.maint_sessions[session_id].start()
+ reply = json.dumps({'session_id': session_id,
+ 'state': 'ACK_%s' % data['state']})
+ self.log.debug('reply: %s' % reply)
+ return reply, 200, None
+
+ @app.route('/maintenance/<session_id>', methods=['GET'])
+ def admin_maintenance_api_get(session_id=None):
+ self.log.debug('Admin get maintenance')
+ reply = json.dumps({'state':
+ self.maint_sessions[session_id].state})
+ self.log.info('reply: %s' % reply)
+ return reply, 200, None
+
+ @app.route('/maintenance/<session_id>/<projet_id>', methods=['PUT'])
+ def project_maintenance_api_put(session_id=None, projet_id=None):
+ data = json.loads(request.data.decode('utf8'))
+ self.log.debug('%s project put: %s' % (projet_id, data))
+ self.project_input(session_id, projet_id, data)
+ return 'OK'
+
+ @app.route('/maintenance/<session_id>/<projet_id>', methods=['GET'])
+ def project_maintenance_api_get(session_id=None, projet_id=None):
+ self.log.debug('%s project get %s' % (projet_id, session_id))
+ instances = self.project_get_instances(session_id, projet_id)
+ reply = json.dumps({'instance_ids': instances})
+ self.log.debug('%s reply: %s' % (projet_id, reply))
+ return reply, 200, None
+
+ @app.route('/maintenance/<session_id>', methods=['DELETE'])
+ def remove_session(session_id=None):
+ self.log.info('remove session %s'
+ % session_id)
+ self.maint_sessions[session_id].cleanup()
+ self.maint_sessions[session_id].stop()
+ del self.maint_sessions[session_id]
+ return 'OK'
+
+ @app.route('/shutdown', methods=['POST'])
+ def shutdown():
+ self.log.info('shutdown admin_tool server at %s' % time.time())
+ func = request.environ.get('werkzeug.server.shutdown')
+ if func is None:
+ raise RuntimeError('Not running with the Werkzeug Server')
+ func()
+ return 'admin_tool app shutting down...'
+
+ app.run(host=self.conf.admin_tool.ip, port=self.conf.admin_tool.port)
+
+ def project_input(self, session_id, project_id, data):
+ self.maint_sessions[session_id].project_input(project_id, data)
+
+ def project_get_instances(self, session_id, project_id):
+ return self.maint_sessions[session_id].project_get_instances(
+ project_id)
diff --git a/doctor_tests/app_manager/__init__.py b/doctor_tests/app_manager/__init__.py
new file mode 100644
index 00000000..c2f75918
--- /dev/null
+++ b/doctor_tests/app_manager/__init__.py
@@ -0,0 +1,40 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+from oslo_config import cfg
+from oslo_utils import importutils
+import os
+
+
+OPTS = [
+ cfg.StrOpt('type',
+ default=os.environ.get('APP_MANAGER_TYPE', 'sample'),
+ choices=['sample', 'vnfm'],
+ help='the component of doctor app manager',
+ required=True),
+ cfg.StrOpt('ip',
+ default='127.0.0.1',
+ help='the ip of app manager',
+ required=True),
+ cfg.IntOpt('port',
+ default=12348,
+ help='the port of doctor app manager',
+ required=True),
+]
+
+
+_app_manager_name_class_mapping = {
+ 'sample': 'doctor_tests.app_manager.sample.SampleAppManager',
+ 'vnfm': 'doctor_tests.app_manager.vnfm.VNFM',
+}
+
+
+def get_app_manager(stack, conf, log):
+ app_manager_class = (
+ _app_manager_name_class_mapping.get(conf.app_manager.type))
+ return importutils.import_object(app_manager_class, stack, conf, log)
diff --git a/doctor_tests/app_manager/base.py b/doctor_tests/app_manager/base.py
new file mode 100644
index 00000000..0d424083
--- /dev/null
+++ b/doctor_tests/app_manager/base.py
@@ -0,0 +1,26 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import abc
+import six
+
+
+@six.add_metaclass(abc.ABCMeta)
+class BaseAppManager(object):
+
+ def __init__(self, conf, log):
+ self.conf = conf
+ self.log = log
+
+ @abc.abstractmethod
+ def start(self):
+ pass
+
+ @abc.abstractmethod
+ def stop(self):
+ pass
diff --git a/doctor_tests/app_manager/sample.py b/doctor_tests/app_manager/sample.py
new file mode 100644
index 00000000..7ca35b97
--- /dev/null
+++ b/doctor_tests/app_manager/sample.py
@@ -0,0 +1,265 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+from flask import Flask
+from flask import request
+import json
+import yaml
+import time
+from threading import Thread
+import requests
+
+from doctor_tests.app_manager.base import BaseAppManager
+from doctor_tests.identity_auth import get_identity_auth
+from doctor_tests.identity_auth import get_session
+from doctor_tests.os_clients import neutron_client
+from doctor_tests.os_clients import nova_client
+
+
+class SampleAppManager(BaseAppManager):
+
+ def __init__(self, stack, conf, log):
+ super(SampleAppManager, self).__init__(conf, log)
+ self.stack = stack
+ self.app = None
+
+ def start(self):
+ self.log.info('sample app manager start......')
+ self.app = AppManager(self.stack, self.conf, self, self.log)
+ self.app.start()
+
+ def stop(self):
+ self.log.info('sample app manager stop......')
+ if not self.app:
+ return
+ headers = {
+ 'Content-Type': 'application/json',
+ 'Accept': 'application/json',
+ }
+ url = 'http://%s:%d/shutdown'\
+ % (self.conf.app_manager.ip,
+ self.conf.app_manager.port)
+ requests.post(url, data='', headers=headers)
+
+
+class AppManager(Thread):
+
+ def __init__(self, stack, conf, app_manager, log):
+ Thread.__init__(self)
+ self.stack = stack
+ self.conf = conf
+ self.port = self.conf.app_manager.port
+ self.app_manager = app_manager
+ self.log = log
+ self.intance_ids = None
+ self.auth = get_identity_auth(project=self.conf.doctor_project)
+ self.session = get_session(auth=self.auth)
+ self.nova = nova_client(self.conf.nova_version,
+ self.session)
+ self.neutron = neutron_client(session=self.session)
+ self.headers = {
+ 'Content-Type': 'application/json',
+ 'Accept': 'application/json'}
+ if self.conf.admin_tool.type == 'fenix':
+ self.headers['X-Auth-Token'] = self.session.get_token()
+ self.orig_number_of_instances = self.number_of_instances()
+ self.ha_instances = self.get_ha_instances()
+ self.floating_ip = None
+ self.active_instance_id = self.active_instance_id()
+
+ def active_instance_id(self):
+ for instance in self.ha_instances:
+ network_interfaces = next(iter(instance.addresses.values()))
+ for network_interface in network_interfaces:
+ _type = network_interface.get('OS-EXT-IPS:type')
+ if _type == "floating":
+ if not self.floating_ip:
+ self.floating_ip = network_interface.get('addr')
+ self.log.debug('active_instance: %s %s' %
+ (instance.name, instance.id))
+ return instance.id
+ raise Exception("No active instance found")
+
+ def switch_over_ha_instance(self):
+ for instance in self.ha_instances:
+ if instance.id != self.active_instance_id:
+ self.log.info('Switch over to: %s %s' % (instance.name,
+ instance.id))
+ # Deprecated, need to use neutron instead
+ # instance.add_floating_ip(self.floating_ip)
+ port = self.neutron.list_ports(device_id=instance.id)['ports'][0]['id'] # noqa
+ floating_id = self.neutron.list_floatingips(floating_ip_address=self.floating_ip)['floatingips'][0]['id'] # noqa
+ self.neutron.update_floatingip(floating_id, {'floatingip': {'port_id': port}}) # noqa
+ # Have to update ha_instances as floating_ip changed
+ self.ha_instances = self.get_ha_instances()
+ self.active_instance_id = instance.id
+ break
+
+ def get_instance_ids(self):
+ ret = list()
+ for instance in self.nova.servers.list(detailed=False):
+ ret.append(instance.id)
+ return ret
+
+ def get_ha_instances(self):
+ ha_instances = list()
+ for instance in self.nova.servers.list(detailed=True):
+ if "doctor_ha_app_" in instance.name:
+ ha_instances.append(instance)
+ self.log.debug('ha_instances: %s' % instance.name)
+ return ha_instances
+
+ def _alarm_data_decoder(self, data):
+ if "[" in data or "{" in data:
+ # string to list or dict removing unicode
+ data = yaml.safe_load(data.replace("u'", "'"))
+ return data
+
+ def _alarm_traits_decoder(self, data):
+ return ({str(t[0]): self._alarm_data_decoder(str(t[2]))
+ for t in data['reason_data']['event']['traits']})
+
+ def get_session_instance_ids(self, url, session_id):
+ ret = requests.get(url, data=None, headers=self.headers)
+ if ret.status_code != 200:
+ raise Exception(ret.text)
+ self.log.info('get_instance_ids %s' % ret.json())
+ return ret.json()['instance_ids']
+
+ def scale_instances(self, number_of_instances):
+ number_of_instances_before = self.number_of_instances()
+
+ parameters = self.stack.parameters
+ parameters['nonha_intances'] += number_of_instances
+ self.stack.update(self.stack.stack_name,
+ self.stack.stack_id,
+ self.stack.template,
+ parameters=parameters,
+ files=self.stack.files)
+
+ number_of_instances_after = self.number_of_instances()
+ if (number_of_instances_before + number_of_instances !=
+ number_of_instances_after):
+ self.log.error('scale_instances with: %d from: %d ends up to: %d'
+ % (number_of_instances, number_of_instances_before,
+ number_of_instances_after))
+ raise Exception('scale_instances failed')
+
+ self.log.info('scaled instances from %d to %d' %
+ (number_of_instances_before,
+ number_of_instances_after))
+
+ def number_of_instances(self):
+ return len(self.nova.servers.list(detailed=False))
+
+ def run(self):
+ app = Flask('app_manager')
+
+ @app.route('/maintenance', methods=['POST'])
+ def maintenance_alarm():
+ data = json.loads(request.data.decode('utf8'))
+ try:
+ payload = self._alarm_traits_decoder(data)
+ except Exception:
+ payload = ({t[0]: t[2] for t in
+ data['reason_data']['event']['traits']})
+ self.log.error('cannot parse alarm data: %s' % payload)
+ raise Exception('sample app manager cannot parse alarm. '
+ 'Possibly trait data over 256 char')
+
+ self.log.info('sample app manager received data = %s' % payload)
+
+ state = payload['state']
+ reply_state = None
+ reply = dict()
+
+ self.log.info('sample app manager state: %s' % state)
+
+ if state == 'MAINTENANCE':
+ instance_ids = (self.get_session_instance_ids(
+ payload['instance_ids'],
+ payload['session_id']))
+ reply['instance_ids'] = instance_ids
+ reply_state = 'ACK_MAINTENANCE'
+
+ elif state == 'SCALE_IN':
+ # scale down 2 instances, which equals the VCPUs of a single
+ # compute node
+ self.scale_instances(-2)
+ reply['instance_ids'] = self.get_instance_ids()
+ reply_state = 'ACK_SCALE_IN'
+
+ elif state == 'MAINTENANCE_COMPLETE':
+ # possibly need to upscale
+ number_of_instances = self.number_of_instances()
+ if self.orig_number_of_instances > number_of_instances:
+ scale_instances = (self.orig_number_of_instances -
+ number_of_instances)
+ self.scale_instances(scale_instances)
+ reply_state = 'ACK_MAINTENANCE_COMPLETE'
+
+ elif state == 'PREPARE_MAINTENANCE':
+ if "MIGRATE" not in payload['allowed_actions']:
+ raise Exception('MIGRATE not supported')
+
+ instance_ids = (self.get_session_instance_ids(
+ payload['instance_ids'],
+ payload['session_id']))
+ self.log.info('sample app manager got instances: %s' %
+ instance_ids)
+ instance_actions = dict()
+ for instance_id in instance_ids:
+ instance_actions[instance_id] = "MIGRATE"
+ if instance_id == self.active_instance_id:
+ self.switch_over_ha_instance()
+ reply['instance_actions'] = instance_actions
+ reply_state = 'ACK_PREPARE_MAINTENANCE'
+
+ elif state == 'PLANNED_MAINTENANCE':
+ if "MIGRATE" not in payload['allowed_actions']:
+ raise Exception('MIGRATE not supported')
+
+ instance_ids = (self.get_session_instance_ids(
+ payload['instance_ids'],
+ payload['session_id']))
+ self.log.info('sample app manager got instances: %s' %
+ instance_ids)
+ instance_actions = dict()
+ for instance_id in instance_ids:
+ instance_actions[instance_id] = "MIGRATE"
+ if instance_id == self.active_instance_id:
+ self.switch_over_ha_instance()
+ reply['instance_actions'] = instance_actions
+ reply_state = 'ACK_PLANNED_MAINTENANCE'
+
+ elif state == 'INSTANCE_ACTION_DONE':
+ self.log.info('%s' % payload['instance_ids'])
+
+ else:
+ raise Exception('sample app manager received event with'
+ ' unknown state %s' % state)
+
+ if reply_state:
+ reply['session_id'] = payload['session_id']
+ reply['state'] = reply_state
+ url = payload['reply_url']
+ self.log.info('sample app manager reply: %s' % reply)
+ requests.put(url, data=json.dumps(reply), headers=self.headers)
+
+ return 'OK'
+
+ @app.route('/shutdown', methods=['POST'])
+ def shutdown():
+ self.log.info('shutdown app manager server at %s' % time.time())
+ func = request.environ.get('werkzeug.server.shutdown')
+ if func is None:
+ raise RuntimeError('Not running with the Werkzeug Server')
+ func()
+ return 'app manager shutting down...'
+
+ app.run(host="0.0.0.0", port=self.port)
diff --git a/doctor_tests/app_manager/vnfm.py b/doctor_tests/app_manager/vnfm.py
new file mode 100644
index 00000000..68fdbb88
--- /dev/null
+++ b/doctor_tests/app_manager/vnfm.py
@@ -0,0 +1,441 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+from flask import Flask
+from flask import request
+import json
+import requests
+from threading import Thread
+import time
+import uuid
+import yaml
+
+from doctor_tests.app_manager.base import BaseAppManager
+from doctor_tests.identity_auth import get_identity_auth
+from doctor_tests.identity_auth import get_session
+from doctor_tests.os_clients import neutron_client
+from doctor_tests.os_clients import nova_client
+from doctor_tests.os_clients import keystone_client
+
+
+class VNFM(BaseAppManager):
+
+ def __init__(self, stack, conf, log):
+ super(VNFM, self).__init__(conf, log)
+ self.stack = stack
+ self.app = None
+
+ def start(self):
+ self.log.info('VNFM start......')
+ self.app = VNFManager(self.stack, self.conf, self, self.log)
+ self.app.start()
+
+ def stop(self):
+ self.log.info('VNFM stop......')
+ if not self.app:
+ return
+ self.app.delete_constraints()
+ headers = {
+ 'Content-Type': 'application/json',
+ 'Accept': 'application/json',
+ }
+ url = 'http://%s:%d/shutdown'\
+ % (self.conf.app_manager.ip,
+ self.conf.app_manager.port)
+ requests.post(url, data='', headers=headers)
+
+
+class VNFManager(Thread):
+
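+ # Test VNFM thread: registers instance and instance-group constraints
+ # with the maintenance (Fenix) endpoint found in Keystone, answers
+ # maintenance state notifications posted to /maintenance and keeps the
+ # floating IP on one of the HA instances during switch-overs.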
+ def __init__(self, stack, conf, app_manager, log):
+ Thread.__init__(self)
+ self.stack = stack
+ self.conf = conf
+ self.port = self.conf.app_manager.port
+ self.app_manager = app_manager
+ self.log = log
+ self.intance_ids = None
+ self.auth = get_identity_auth(project=self.conf.doctor_project)
+ self.session = get_session(auth=self.auth)
+ self.keystone = keystone_client(
+ self.conf.keystone_version, self.session)
+ self.nova = nova_client(self.conf.nova_version,
+ self.session)
+ self.neutron = neutron_client(session=self.session)
+ self.headers = {
+ 'Content-Type': 'application/json',
+ 'Accept': 'application/json'}
+ if self.conf.admin_tool.type == 'fenix':
+ self.headers['X-Auth-Token'] = self.session.get_token()
+ self.orig_number_of_instances = self.number_of_instances()
+ # List of instances
+ self.ha_instances = []
+ self.nonha_instances = []
+ # Different instance_id specific constraints {instanse_id: {},...}
+ self.instance_constraints = None
+ # Update existing instances to instance lists
+ self.update_instances()
+ nonha_instances = len(self.nonha_instances)
+ if nonha_instances < 7:
+ self.scale = 2
+ self.max_impacted = 2
+ else:
+ self.scale = int((nonha_instances) / 2)
+ self.max_impacted = self.scale - 1
+ self.log.info('Init nonha_instances: %s scale: %s: max_impacted %s' %
+ (nonha_instances, self.scale, self.max_impacted))
+ # Different instance groups constraints dict
+ self.ha_group = None
+ self.nonha_group = None
+ # Floating IP used in HA instance
+ self.floating_ip = None
+ # VNF project_id
+ self.project_id = None
+ # HA instance_id that is active / has floating IP
+ self.active_instance_id = self.active_instance_id()
+
+ services = self.keystone.services.list()
+ for service in services:
+ if service.type == 'maintenance':
+ self.log.info('maintenance service: %s:%s type %s'
+ % (service.name, service.id, service.type))
+ maint_id = service.id
+ self.maint_endpoint = [ep.url for ep in self.keystone.endpoints.list()
+ if ep.service_id == maint_id and
+ ep.interface == 'public'][0]
+ self.log.info('maintenance endpoint: %s' % self.maint_endpoint)
+ self.update_constraints_lock = False
+ self.update_constraints()
+
+ def delete_remote_instance_constraints(self, instance_id):
+ url = "%s/instance/%s" % (self.maint_endpoint, instance_id)
+ self.log.info('DELETE: %s' % url)
+ ret = requests.delete(url, data=None, headers=self.headers)
+ if ret.status_code != 200 and ret.status_code != 204:
+ raise Exception(ret.text)
+
+ def update_remote_instance_constraints(self, instance):
+ url = "%s/instance/%s" % (self.maint_endpoint, instance["instance_id"])
+ self.log.info('PUT: %s' % url)
+ ret = requests.put(url, data=json.dumps(instance),
+ headers=self.headers)
+ if ret.status_code != 200 and ret.status_code != 204:
+ raise Exception(ret.text)
+
+ def delete_remote_group_constraints(self, instance_group):
+ url = "%s/instance_group/%s" % (self.maint_endpoint,
+ instance_group["group_id"])
+ self.log.info('DELETE: %s' % url)
+ ret = requests.delete(url, data=None, headers=self.headers)
+ if ret.status_code != 200 and ret.status_code != 204:
+ raise Exception(ret.text)
+
+ def update_remote_group_constraints(self, instance_group):
+ url = "%s/instance_group/%s" % (self.maint_endpoint,
+ instance_group["group_id"])
+ self.log.info('PUT: %s' % url)
+ ret = requests.put(url, data=json.dumps(instance_group),
+ headers=self.headers)
+ if ret.status_code != 200 and ret.status_code != 204:
+ raise Exception(ret.text)
+
+ def delete_constraints(self):
+ if self.conf.admin_tool.type == 'fenix':
+ self.headers['X-Auth-Token'] = self.session.get_token()
+ for instance_id in self.instance_constraints:
+ self.delete_remote_instance_constraints(instance_id)
+ self.delete_remote_group_constraints(self.nonha_group)
+ self.delete_remote_group_constraints(self.ha_group)
+
+ def update_constraints(self):
+ while self.update_constraints_lock:
+ self.log.info('Waiting update_constraints_lock...')
+ time.sleep(1)
+ self.update_constraints_lock = True
+ self.log.info('Update constraints')
+ if self.project_id is None:
+ self.project_id = self.keystone.projects.list(
+ name=self.conf.doctor_project)[0].id
+ if self.nonha_group is None:
+ # Nova does not support grouping instances that do not belong to
+ # an anti-affinity server_group, but all instances still need grouping
+ self.nonha_group = {
+ "group_id": str(uuid.uuid4()),
+ "project_id": self.project_id,
+ "group_name": "doctor_nonha_app_group",
+ "anti_affinity_group": False,
+ "max_instances_per_host": 0,
+ "max_impacted_members": self.max_impacted,
+ "recovery_time": 2,
+ "resource_mitigation": True}
+ self.log.info('create doctor_nonha_app_group constraints: %s'
+ % self.nonha_group)
+ self.update_remote_group_constraints(self.nonha_group)
+ if self.ha_group is None:
+ group_id = [sg.id for sg in self.nova.server_groups.list()
+ if sg.name == "doctor_ha_app_group"][0]
+ self.ha_group = {
+ "group_id": group_id,
+ "project_id": self.project_id,
+ "group_name": "doctor_ha_app_group",
+ "anti_affinity_group": True,
+ "max_instances_per_host": 1,
+ "max_impacted_members": 1,
+ "recovery_time": 4,
+ "resource_mitigation": True}
+ self.log.info('create doctor_ha_app_group constraints: %s'
+ % self.ha_group)
+ self.update_remote_group_constraints(self.ha_group)
+ instance_constraints = {}
+ for ha_instance in self.ha_instances:
+ instance = {
+ "instance_id": ha_instance.id,
+ "project_id": self.project_id,
+ "group_id": self.ha_group["group_id"],
+ "instance_name": ha_instance.name,
+ "max_interruption_time": 120,
+ "migration_type": "MIGRATE",
+ "resource_mitigation": True,
+ "lead_time": 40}
+ self.log.info('create ha instance constraints: %s'
+ % instance)
+ instance_constraints[ha_instance.id] = instance
+ for nonha_instance in self.nonha_instances:
+ instance = {
+ "instance_id": nonha_instance.id,
+ "project_id": self.project_id,
+ "group_id": self.nonha_group["group_id"],
+ "instance_name": nonha_instance.name,
+ "max_interruption_time": 120,
+ "migration_type": "MIGRATE",
+ "resource_mitigation": True,
+ "lead_time": 40}
+ self.log.info('create nonha instance constraints: %s'
+ % instance)
+ instance_constraints[nonha_instance.id] = instance
+ if not self.instance_constraints:
+ # Initial instance constraints
+ self.log.info('create initial instances constraints...')
+ for instance in [instance_constraints[i] for i
+ in instance_constraints]:
+ self.update_remote_instance_constraints(instance)
+ self.instance_constraints = instance_constraints.copy()
+ else:
+ self.log.info('check instances constraints changes...')
+ added = [i for i in instance_constraints.keys()
+ if i not in self.instance_constraints]
+ deleted = [i for i in self.instance_constraints.keys()
+ if i not in instance_constraints]
+ modified = [i for i in instance_constraints.keys()
+ if (i not in added and i not in deleted and
+ instance_constraints[i] !=
+ self.instance_constraints[i])]
+ for instance_id in deleted:
+ self.delete_remote_instance_constraints(instance_id)
+ updated = added + modified
+ for instance in [instance_constraints[i] for i in updated]:
+ self.update_remote_instance_constraints(instance)
+ if updated or deleted:
+ # Some instance constraints have changed
+ self.instance_constraints = instance_constraints.copy()
+ self.update_constraints_lock = False
+
+ def active_instance_id(self):
+ # Need to retry, as it takes some time after the Heat template is
+ # done before the floating IP is in place
+ retry = 5
+ while retry > 0:
+ for instance in self.ha_instances:
+ network_interfaces = next(iter(instance.addresses.values()))
+ for network_interface in network_interfaces:
+ _type = network_interface.get('OS-EXT-IPS:type')
+ if _type == "floating":
+ if not self.floating_ip:
+ self.floating_ip = network_interface.get('addr')
+ self.log.debug('active_instance: %s %s' %
+ (instance.name, instance.id))
+ return instance.id
+ time.sleep(2)
+ self.update_instances()
+ retry -= 1
+ raise Exception("No active instance found")
+
+ def switch_over_ha_instance(self):
+ for instance in self.ha_instances:
+ if instance.id != self.active_instance_id:
+ self.log.info('Switch over to: %s %s' % (instance.name,
+ instance.id))
+ # Deprecated, need to use neutron instead
+ # instance.add_floating_ip(self.floating_ip)
+ port = self.neutron.list_ports(device_id=instance.id)['ports'][0]['id'] # noqa
+ floating_id = self.neutron.list_floatingips(floating_ip_address=self.floating_ip)['floatingips'][0]['id'] # noqa
+ self.neutron.update_floatingip(floating_id, {'floatingip': {'port_id': port}}) # noqa
+ # Have to update ha_instances as floating_ip changed
+ self.update_instances()
+ self.active_instance_id = instance.id
+ break
+
+ def get_instance_ids(self):
+ ret = list()
+ for instance in self.nova.servers.list(detailed=False):
+ ret.append(instance.id)
+ return ret
+
+ def update_instances(self):
+ instances = self.nova.servers.list(detailed=True)
+ self.ha_instances = [i for i in instances
+ if "doctor_ha_app_" in i.name]
+ self.nonha_instances = [i for i in instances
+ if "doctor_nonha_app_" in i.name]
+
+ def _alarm_data_decoder(self, data):
+ if "[" in data or "{" in data:
+ # string to list or dict removing unicode
+ data = yaml.safe_load(data.replace("u'", "'"))
+ return data
+
+ def _alarm_traits_decoder(self, data):
+ return ({str(t[0]): self._alarm_data_decoder(str(t[2]))
+ for t in data['reason_data']['event']['traits']})
+
+ def get_session_instance_ids(self, url, session_id):
+ ret = requests.get(url, data=None, headers=self.headers)
+ if ret.status_code != 200:
+ raise Exception(ret.text)
+ self.log.info('get_instance_ids %s' % ret.json())
+ return ret.json()['instance_ids']
+
+ def scale_instances(self, number_of_instances):
+ number_of_instances_before = self.number_of_instances()
+
+ parameters = self.stack.parameters
+ parameters['nonha_intances'] += number_of_instances
+ self.stack.update(self.stack.stack_name,
+ self.stack.stack_id,
+ self.stack.template,
+ parameters=parameters,
+ files=self.stack.files)
+
+ number_of_instances_after = self.number_of_instances()
+ if (number_of_instances_before + number_of_instances !=
+ number_of_instances_after):
+ self.log.error('scale_instances with: %d from: %d ends up to: %d'
+ % (number_of_instances, number_of_instances_before,
+ number_of_instances_after))
+ raise Exception('scale_instances failed')
+
+ self.log.info('scaled instances from %d to %d' %
+ (number_of_instances_before,
+ number_of_instances_after))
+
+ def number_of_instances(self):
+ return len(self.nova.servers.list(detailed=False))
+
+ def run(self):
+ app = Flask('VNFM')
+
+ @app.route('/maintenance', methods=['POST'])
+ def maintenance_alarm():
+ data = json.loads(request.data.decode('utf8'))
+ try:
+ payload = self._alarm_traits_decoder(data)
+ except Exception:
+ payload = ({t[0]: t[2] for t in
+ data['reason_data']['event']['traits']})
+ self.log.error('cannot parse alarm data: %s' % payload)
+ raise Exception('VNFM cannot parse alarm. '
+ 'Possibly trait data over 256 char')
+
+ self.log.info('VNFM received data = %s' % payload)
+
+ state = payload['state']
+ reply_state = None
+ reply = dict()
+
+ self.log.info('VNFM state: %s' % state)
+
+ if state == 'MAINTENANCE':
+ instance_ids = (self.get_session_instance_ids(
+ payload['instance_ids'],
+ payload['session_id']))
+ my_instance_ids = self.get_instance_ids()
+ invalid_instances = (
+ [instance_id for instance_id in instance_ids
+ if instance_id not in my_instance_ids])
+ if invalid_instances:
+ self.log.error('Invalid instances: %s' % invalid_instances)
+ reply_state = 'NACK_MAINTENANCE'
+ else:
+ reply_state = 'ACK_MAINTENANCE'
+
+ elif state == 'SCALE_IN':
+                # scale down by "self.scale" instances, i.e. enough VCPUs to
+                # free at least a single compute node
+ self.scale_instances(-self.scale)
+ reply_state = 'ACK_SCALE_IN'
+
+ elif state == 'MAINTENANCE_COMPLETE':
+ # possibly need to upscale
+ number_of_instances = self.number_of_instances()
+ if self.orig_number_of_instances > number_of_instances:
+ scale_instances = (self.orig_number_of_instances -
+ number_of_instances)
+ self.scale_instances(scale_instances)
+ reply_state = 'ACK_MAINTENANCE_COMPLETE'
+
+ elif state == 'PREPARE_MAINTENANCE':
+                # TBD from constraints
+ if "MIGRATE" not in payload['allowed_actions']:
+ raise Exception('MIGRATE not supported')
+ instance_ids = payload['instance_ids'][0]
+ self.log.info('VNFM got instance: %s' % instance_ids)
+ if instance_ids == self.active_instance_id:
+ self.switch_over_ha_instance()
+                    # optional also in constraints
+ reply['instance_action'] = "MIGRATE"
+ reply_state = 'ACK_PREPARE_MAINTENANCE'
+
+ elif state == 'PLANNED_MAINTENANCE':
+                # TBD from constraints
+ if "MIGRATE" not in payload['allowed_actions']:
+ raise Exception('MIGRATE not supported')
+ instance_ids = payload['instance_ids'][0]
+ self.log.info('VNFM got instance: %s' % instance_ids)
+ if instance_ids == self.active_instance_id:
+ self.switch_over_ha_instance()
+                    # optional also in constraints
+ reply['instance_action'] = "MIGRATE"
+ reply_state = 'ACK_PLANNED_MAINTENANCE'
+
+ elif state == 'INSTANCE_ACTION_DONE':
+                # TBD: check that the action was done in the allowed window
+ self.log.info('%s' % payload['instance_ids'])
+ else:
+ raise Exception('VNFM received event with'
+ ' unknown state %s' % state)
+
+ if reply_state:
+ if self.conf.admin_tool.type == 'fenix':
+ self.headers['X-Auth-Token'] = self.session.get_token()
+ reply['state'] = reply_state
+ url = payload['reply_url']
+ self.log.info('VNFM reply: %s' % reply)
+ requests.put(url, data=json.dumps(reply), headers=self.headers)
+
+ return 'OK'
+
+ @app.route('/shutdown', methods=['POST'])
+ def shutdown():
+ self.log.info('shutdown VNFM server at %s' % time.time())
+ func = request.environ.get('werkzeug.server.shutdown')
+ if func is None:
+ raise RuntimeError('Not running with the Werkzeug Server')
+ func()
+ return 'VNFM shutting down...'
+
+ app.run(host="0.0.0.0", port=self.port)
diff --git a/doctor_tests/common/constants.py b/doctor_tests/common/constants.py
index 088ff633..201f3fc4 100644
--- a/doctor_tests/common/constants.py
+++ b/doctor_tests/common/constants.py
@@ -12,6 +12,10 @@ from collections import namedtuple
Host = namedtuple('Host', ['name', 'ip'])
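+# Helper to check whether the configured admin tool is Fenix.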
+def is_fenix(conf):
+ return conf.admin_tool.type == 'fenix'
+
+
class Inspector(object):
CONGRESS = 'congress'
SAMPLE = 'sample'
diff --git a/doctor_tests/common/utils.py b/doctor_tests/common/utils.py
index 1a84c824..67ca4f4b 100644
--- a/doctor_tests/common/utils.py
+++ b/doctor_tests/common/utils.py
@@ -10,6 +10,7 @@ import json
import os
import paramiko
import re
+import subprocess
def load_json_file(full_path):
@@ -67,7 +68,7 @@ class SSHClient(object):
def __del__(self):
self.client.close()
- def ssh(self, command):
+ def ssh(self, command, raise_enabled=True):
if self.log:
self.log.info("Executing: %s" % command)
stdin, stdout, stderr = self.client.exec_command(command)
@@ -75,7 +76,7 @@ class SSHClient(object):
output = list()
for line in stdout.read().splitlines():
output.append(line.decode('utf-8'))
- if ret:
+ if ret and raise_enabled:
if self.log:
self.log.info("*** FAILED to run command %s (%s)"
% (command, ret))
@@ -97,6 +98,27 @@ class SSHClient(object):
ftp.close()
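+# Minimal local stand-in for SSHClient: runs commands with subprocess on the
+# local host and mirrors the (ret, output) return convention of ssh().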
+class LocalSSH(object):
+
+ def __init__(self, log):
+ self.log = log
+ self.log.info('Init local ssh client')
+
+ def ssh(self, cmd):
+ ret = 0
+ output = "%s failed!!!" % cmd
+ try:
+ output = subprocess.check_output((cmd), shell=True,
+ universal_newlines=True)
+ except subprocess.CalledProcessError:
+ ret = 1
+ return ret, output
+
+ def scp(self, src_file, dst_file):
+ return subprocess.check_output("cp %s %s" % (src_file, dst_file),
+ shell=True)
+
+
def run_async(func):
from threading import Thread
from functools import wraps
diff --git a/doctor_tests/config.py b/doctor_tests/config.py
index dc05c0d8..cea1f0c9 100644
--- a/doctor_tests/config.py
+++ b/doctor_tests/config.py
@@ -11,6 +11,8 @@ import itertools
from oslo_config import cfg
from doctor_tests import alarm
+from doctor_tests import admin_tool
+from doctor_tests import app_manager
from doctor_tests import consumer
from doctor_tests import image
from doctor_tests import instance
@@ -30,6 +32,8 @@ def list_opts():
('monitor', monitor.OPTS),
('inspector', inspector.OPTS),
('consumer', consumer.OPTS),
+ ('admin_tool', admin_tool.OPTS),
+ ('app_manager', app_manager.OPTS),
('DEFAULT', itertools.chain(
os_clients.OPTS,
image.OPTS,
diff --git a/doctor_tests/consumer/__init__.py b/doctor_tests/consumer/__init__.py
index 2c66a547..e5a36506 100644
--- a/doctor_tests/consumer/__init__.py
+++ b/doctor_tests/consumer/__init__.py
@@ -21,7 +21,7 @@ OPTS = [
help='the ip of consumer',
required=True),
cfg.IntOpt('port',
- default='12346',
+ default=12346,
help='the port of doctor consumer',
required=True),
]
diff --git a/doctor_tests/consumer/base.py b/doctor_tests/consumer/base.py
index b6c4b34e..e21b0802 100644
--- a/doctor_tests/consumer/base.py
+++ b/doctor_tests/consumer/base.py
@@ -16,6 +16,15 @@ class BaseConsumer(object):
def __init__(self, conf, log):
self.conf = conf
self.log = log
+ self._notified_time = None
+
+ @property
+ def notified_time(self):
+ return self._notified_time
+
+ @notified_time.setter
+ def notified_time(self, notified_time):
+ self._notified_time = notified_time
@abc.abstractmethod
def start(self):
diff --git a/doctor_tests/consumer/sample.py b/doctor_tests/consumer/sample.py
index eaf5fabd..c7dcf4f9 100644
--- a/doctor_tests/consumer/sample.py
+++ b/doctor_tests/consumer/sample.py
@@ -54,7 +54,9 @@ class ConsumerApp(Thread):
@app.route('/failure', methods=['POST'])
def event_posted():
- self.log.info('doctor consumer notified at %s' % time.time())
+ notified_time = time.time()
+ self.log.info('doctor consumer notified at %s' % notified_time)
+ self.consumer.notified_time = notified_time
data = json.loads(request.data.decode('utf8'))
self.log.info('sample consumer received data = %s' % data)
return 'OK'
diff --git a/doctor_tests/identity_auth.py b/doctor_tests/identity_auth.py
index 0d429597..62b45ede 100644
--- a/doctor_tests/identity_auth.py
+++ b/doctor_tests/identity_auth.py
@@ -40,4 +40,5 @@ def get_session(auth=None):
"""Get a user credentials auth session."""
if auth is None:
auth = get_identity_auth()
- return session.Session(auth=auth)
+ return session.Session(auth=auth,
+ verify=os.environ.get('OS_CACERT'))
diff --git a/doctor_tests/image.py b/doctor_tests/image.py
index 9961b22d..50841ef6 100644
--- a/doctor_tests/image.py
+++ b/doctor_tests/image.py
@@ -7,7 +7,11 @@
# http://www.apache.org/licenses/LICENSE-2.0
##############################################################################
import os
-import urllib.request
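+# urlopen lives in urllib.request on Python 3 and in urllib2 on Python 2.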
+try:
+ from urllib.request import urlopen
+except Exception:
+ from urllib2 import urlopen
+
from oslo_config import cfg
@@ -46,11 +50,14 @@ class Image(object):
def create(self):
self.log.info('image create start......')
-
images = {image.name: image for image in self.glance.images.list()}
+ if self.conf.image_name == 'cirros':
+ cirros = [image for image in images if 'cirros' in image]
+ if cirros:
+ self.conf.image_name = cirros[0]
if self.conf.image_name not in images:
if not os.path.exists(self.conf.image_filename):
- resp = urllib.request.urlopen(self.conf.image_download_url)
+ resp = urlopen(self.conf.image_download_url)
with open(self.conf.image_filename, "wb") as file:
file.write(resp.read())
self.image = \
diff --git a/doctor_tests/inspector/__init__.py b/doctor_tests/inspector/__init__.py
index 31291baf..50365a61 100644
--- a/doctor_tests/inspector/__init__.py
+++ b/doctor_tests/inspector/__init__.py
@@ -42,6 +42,10 @@ _inspector_name_class_mapping = {
}
-def get_inspector(conf, log):
+def get_inspector(conf, log, transport_url=None):
inspector_class = _inspector_name_class_mapping[conf.inspector.type]
- return importutils.import_object(inspector_class, conf, log)
+ if conf.inspector.type == 'sample':
+ return importutils.import_object(inspector_class, conf, log,
+ transport_url)
+ else:
+ return importutils.import_object(inspector_class, conf, log)
diff --git a/doctor_tests/inspector/base.py b/doctor_tests/inspector/base.py
index a6eae451..967dc9ba 100644
--- a/doctor_tests/inspector/base.py
+++ b/doctor_tests/inspector/base.py
@@ -16,6 +16,24 @@ class BaseInspector(object):
def __init__(self, conf, log):
self.conf = conf
self.log = log
+ self._host_down_time = None
+ self._vm_down_time = None
+
+ @property
+ def host_down_time(self):
+ return self._host_down_time
+
+ @host_down_time.setter
+ def host_down_time(self, host_down_time):
+ self._host_down_time = host_down_time
+
+ @property
+ def vm_down_time(self):
+ return self._vm_down_time
+
+ @vm_down_time.setter
+ def vm_down_time(self, vm_down_time):
+ self._vm_down_time = vm_down_time
@abc.abstractmethod
def get_inspector_url(self):
diff --git a/doctor_tests/inspector/congress.py b/doctor_tests/inspector/congress.py
index fb747ec5..7f918fb2 100644
--- a/doctor_tests/inspector/congress.py
+++ b/doctor_tests/inspector/congress.py
@@ -31,6 +31,8 @@ class CongressInspector(BaseInspector):
def __init__(self, conf, log):
super(CongressInspector, self).__init__(conf, log)
+ self.is_create_doctor_datasource = False
+ self.doctor_datasource_id = None
self.auth = get_identity_auth()
self.congress = congress_client(get_session(auth=self.auth))
self._init_driver_and_ds()
@@ -48,12 +50,6 @@ class CongressInspector(BaseInspector):
'version < nova_api_min_version(%s)'
% self.nova_api_min_version)
- # create doctor datasource if it's not exist
- if self.doctor_datasource not in datasources:
- self.congress.create_datasource(
- body={'driver': self.doctor_driver,
- 'name': self.doctor_datasource})
-
# check whether doctor driver exist
drivers = \
{driver['id']: driver for driver in
@@ -61,6 +57,14 @@ class CongressInspector(BaseInspector):
if self.doctor_driver not in drivers:
raise Exception('Do not support doctor driver in congress')
+ # create doctor datasource if it's not exist
+ if self.doctor_datasource not in datasources:
+ response = self.congress.create_datasource(
+ body={'driver': self.doctor_driver,
+ 'name': self.doctor_datasource})
+ self.doctor_datasource_id = response['id']
+ self.is_create_doctor_datasource = True
+
self.policy_rules = \
{rule['name']: rule for rule in
self.congress.list_policy_rules(self.policy)['results']}
@@ -86,6 +90,9 @@ class CongressInspector(BaseInspector):
for rule_name in self.rules.keys():
self._del_rule(rule_name)
+ if self.is_create_doctor_datasource:
+ self.congress.delete_datasource(self.doctor_datasource_id)
+
def _add_rule(self, rule_name, rule):
if rule_name not in self.policy_rules:
self.congress.create_policy_rule(self.policy,
diff --git a/doctor_tests/inspector/sample.py b/doctor_tests/inspector/sample.py
index fadfd3d4..c44db95d 100644
--- a/doctor_tests/inspector/sample.py
+++ b/doctor_tests/inspector/sample.py
@@ -10,9 +10,11 @@ import collections
from flask import Flask
from flask import request
import json
+import oslo_messaging
import time
from threading import Thread
import requests
+import yaml
from doctor_tests.common import utils
from doctor_tests.identity_auth import get_identity_auth
@@ -25,7 +27,7 @@ from doctor_tests.inspector.base import BaseInspector
class SampleInspector(BaseInspector):
event_type = 'compute.host.down'
- def __init__(self, conf, log):
+    def __init__(self, conf, log, transport_url):
super(SampleInspector, self).__init__(conf, log)
self.inspector_url = self.get_inspector_url()
self.novaclients = list()
@@ -42,6 +44,17 @@ class SampleInspector(BaseInspector):
self.hostnames = list()
self.app = None
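+        # Optional notifier used to emit compute.instance.update events; left
+        # as None if the notification transport cannot be set up.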
+ try:
+ transport = oslo_messaging.get_notification_transport(self.conf,
+                                                              transport_url)
+ self.notif = oslo_messaging.Notifier(transport,
+ 'compute.instance.update',
+ driver='messaging',
+ topics=['notifications'])
+ self.notif = self.notif.prepare(publisher_id='sample')
+ except Exception:
+ self.notif = None
+
def _init_novaclients(self):
self.NUMBER_OF_CLIENTS = self.conf.instance_count
auth = get_identity_auth(project=self.conf.doctor_project)
@@ -53,13 +66,13 @@ class SampleInspector(BaseInspector):
def _init_servers_list(self):
self.servers.clear()
opts = {'all_tenants': True}
- servers = self.nova.servers.list(search_opts=opts)
+ servers = self.nova.servers.list(detailed=True, search_opts=opts)
for server in servers:
try:
host = server.__dict__.get('OS-EXT-SRV-ATTR:host')
self.servers[host].append(server)
self.log.debug('get hostname=%s from server=%s'
- % (host, server))
+ % (host, str(server.name)))
except Exception as e:
self.log.info('can not get hostname from server=%s, error=%s'
% (server, e))
@@ -96,20 +109,60 @@ class SampleInspector(BaseInspector):
event_type = event['type']
if event_type == self.event_type:
self.hostnames.append(hostname)
+ if self.notif is not None:
+ thr0 = self._send_notif(hostname)
thr1 = self._disable_compute_host(hostname)
thr2 = self._vms_reset_state('error', hostname)
if self.conf.inspector.update_neutron_port_dp_status:
thr3 = self._set_ports_data_plane_status('DOWN', hostname)
+ if self.notif is not None:
+ thr0.join()
thr1.join()
thr2.join()
if self.conf.inspector.update_neutron_port_dp_status:
thr3.join()
+ def _alarm_data_decoder(self, data):
+ if "[" in data or "{" in data:
+ # string to list or dict removing unicode
+            data = yaml.safe_load(data.replace("u'", "'"))
+ return data
+
+ def _alarm_traits_decoder(self, data):
+ return ({str(t[0]): self._alarm_data_decoder(str(t[2]))
+ for t in data['reason_data']['event']['traits']})
+
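+    # Invoked via the /maintenance endpoint: the sample inspector only logs
+    # that automatic fault management would be disabled or re-enabled for the
+    # host under maintenance.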
+ def maintenance(self, data):
+ try:
+ payload = self._alarm_traits_decoder(data)
+ except Exception:
+ payload = ({t[0]: t[2] for t in
+ data['reason_data']['event']['traits']})
+ self.log.error('cannot parse alarm data: %s' % payload)
+            raise Exception('sample inspector cannot parse alarm. '
+                            'Possibly trait data over 256 chars')
+ self.log.info('sample inspector received data = %s' % payload)
+
+ state = payload['state']
+ host = payload['host']
+
+ if state == 'IN_MAINTENANCE':
+ self.log.info("sample inspector: disable %s automatic fault "
+ "management" % host)
+ elif state == 'MAINTENANCE_COMPLETE':
+ self.log.info("sample inspector: enable %s automatic fault "
+ "management" % host)
+ else:
+            raise Exception("sample inspector couldn't handle state: %s"
+                            % state)
+
@utils.run_async
def _disable_compute_host(self, hostname):
self.nova.services.force_down(hostname, 'nova-compute', True)
+
+ hostdown_time = time.time()
+ self.host_down_time = hostdown_time
self.log.info('doctor mark host(%s) down at %s'
- % (hostname, time.time()))
+ % (hostname, hostdown_time))
@utils.run_async
def _vms_reset_state(self, state, hostname):
@@ -117,8 +170,10 @@ class SampleInspector(BaseInspector):
@utils.run_async
def _vm_reset_state(nova, server, state):
nova.servers.reset_state(server, state)
- self.log.info('doctor mark vm(%s) error at %s'
- % (server, time.time()))
+ vmdown_time = time.time()
+ self.vm_down_time = vmdown_time
+ self.log.info('doctor mark vm(%s) %s at %s'
+ % (server, state, vmdown_time))
thrs = []
for nova, server in zip(self.novaclients, self.servers[hostname]):
@@ -128,6 +183,26 @@ class SampleInspector(BaseInspector):
t.join()
@utils.run_async
+ def _send_notif(self, hostname):
+
+ @utils.run_async
+ def _send_notif(server):
+ payload = dict(tenant_id=server.tenant_id,
+ instance_id=server.id,
+ state="error")
+ self.notif.info({'some': 'context'}, 'compute.instance.update',
+ payload)
+ self.log.info('doctor compute.instance.update vm(%s) error %s'
+ % (server, time.time()))
+
+ thrs = []
+ for server in self.servers[hostname]:
+ t = _send_notif(server)
+ thrs.append(t)
+ for t in thrs:
+ t.join()
+
+ @utils.run_async
def _set_ports_data_plane_status(self, status, hostname):
body = {'data_plane_status': status}
@@ -168,7 +243,12 @@ class InspectorApp(Thread):
self.inspector.handle_events(events)
return "OK"
- @app.route('/shutdown', methods=['POST'])
+ @app.route('/maintenance', methods=['POST'])
+ def maintenance():
+ self.inspector.maintenance(request.json)
+ return "OK"
+
+ @app.route('/events/shutdown', methods=['POST'])
def shutdown():
self.log.info('shutdown inspector app server at %s' % time.time())
func = request.environ.get('werkzeug.server.shutdown')
diff --git a/doctor_tests/installer/__init__.py b/doctor_tests/installer/__init__.py
index 1ee59d99..00a01667 100644
--- a/doctor_tests/installer/__init__.py
+++ b/doctor_tests/installer/__init__.py
@@ -13,24 +13,25 @@ from oslo_utils import importutils
OPTS = [
cfg.StrOpt('type',
- default=os.environ.get('INSTALLER_TYPE', 'local'),
- choices=['local', 'apex', 'daisy'],
+ default=os.environ.get('INSTALLER_TYPE', 'devstack'),
+ choices=['apex', 'daisy', 'fuel', 'devstack'],
help='the type of installer',
required=True),
cfg.StrOpt('ip',
default=os.environ.get('INSTALLER_IP', '127.0.0.1'),
help='the ip of installer'),
- cfg.StrOpt('username',
- default='root',
- help='the user name for login installer server',
- required=True),
+ cfg.StrOpt('key_file',
+ default=os.environ.get('SSH_KEY', None),
+ help='the key for user to login installer server',
+ required=False),
]
_installer_name_class_mapping = {
- 'local': 'doctor_tests.installer.local.LocalInstaller',
'apex': 'doctor_tests.installer.apex.ApexInstaller',
- 'daisy': 'doctor_tests.installer.daisy.DaisyInstaller'
+ 'daisy': 'doctor_tests.installer.daisy.DaisyInstaller',
+ 'fuel': 'doctor_tests.installer.mcp.McpInstaller',
+ 'devstack': 'doctor_tests.installer.devstack.DevstackInstaller'
}
diff --git a/doctor_tests/installer/apex.py b/doctor_tests/installer/apex.py
index 90304a27..3ec2100c 100644
--- a/doctor_tests/installer/apex.py
+++ b/doctor_tests/installer/apex.py
@@ -6,13 +6,10 @@
# which accompanies this distribution, and is available at
# http://www.apache.org/licenses/LICENSE-2.0
##############################################################################
-import getpass
-import grp
-import os
-import pwd
-import stat
-import subprocess
+import time
+from doctor_tests.common.constants import Inspector
+from doctor_tests.common.constants import is_fenix
from doctor_tests.common.utils import get_doctor_test_root_dir
from doctor_tests.common.utils import SSHClient
from doctor_tests.installer.base import BaseInstaller
@@ -20,25 +17,35 @@ from doctor_tests.installer.base import BaseInstaller
class ApexInstaller(BaseInstaller):
node_user_name = 'heat-admin'
- cm_set_script = 'set_ceilometer.py'
- cm_restore_script = 'restore_ceilometer.py'
+ installer_username = 'stack'
+ cm_set_script = 'set_config.py'
+ nc_set_compute_script = 'set_compute_config.py'
+ cg_set_script = 'set_congress.py'
+ fe_set_script = 'set_fenix.sh'
+ cm_restore_script = 'restore_config.py'
+ nc_restore_compute_script = 'restore_compute_config.py'
+ cg_restore_script = 'restore_congress.py'
+ ac_restart_script = 'restart_aodh.py'
+ ac_restore_script = 'restore_aodh.py'
+ python = 'python'
def __init__(self, conf, log):
super(ApexInstaller, self).__init__(conf, log)
self.client = SSHClient(self.conf.installer.ip,
- self.conf.installer.username,
+ self.installer_username,
+ key_filename=self.conf.installer.key_file,
look_for_keys=True)
self.key_file = None
self.controllers = list()
- self.controller_clients = list()
- self.servers = list()
- self.test_dir = get_doctor_test_root_dir()
+ self.computes = list()
def setup(self):
self.log.info('Setup Apex installer start......')
-
- self.get_ssh_key_from_installer()
- self.get_controller_ips()
+ self.key_file = self.get_ssh_key_from_installer()
+ self._get_overcloud_conf()
+ if is_fenix(self.conf):
+ self._copy_overcloudrc_to_controllers()
+ self.create_flavor()
self.set_apply_patches()
self.setup_stunnel()
@@ -48,97 +55,156 @@ class ApexInstaller(BaseInstaller):
server.terminate()
def get_ssh_key_from_installer(self):
- self.log.info('Get SSH keys from Apex installer......')
-
- if self.key_file is not None:
- self.log.info('Already have SSH keys from Apex installer......')
- return self.key_file
-
- ssh_key = '{0}/{1}'.format(self.test_dir, 'instack_key')
- self.client.scp('/home/stack/.ssh/id_rsa', ssh_key, method='get')
- user = getpass.getuser()
- uid = pwd.getpwnam(user).pw_uid
- gid = grp.getgrnam(user).gr_gid
- os.chown(ssh_key, uid, gid)
- os.chmod(ssh_key, stat.S_IREAD)
- self.key_file = ssh_key
- return self.key_file
-
- def get_controller_ips(self):
- self.log.info('Get controller ips from Apex installer......')
-
- command = "source stackrc; " \
- "nova list | grep ' overcloud-controller-[0-9] ' " \
- "| sed -e 's/^.*ctlplane=//' |awk '{print $1}'"
- ret, controllers = self.client.ssh(command)
- if ret:
- raise Exception('Exec command to get controller ips'
- 'in Apex installer failed, ret=%s, output=%s'
- % (ret, controllers))
- self.log.info('Get controller_ips:%s from Apex installer'
- % controllers)
- self.controllers = controllers
+ key_path = '/home/stack/.ssh/id_rsa'
+ return self._get_ssh_key(self.client, key_path)
+
+ def _copy_overcloudrc_to_controllers(self):
+ for ip in self.controllers:
+ cmd = "scp overcloudrc %s@%s:" % (self.node_user_name, ip)
+ self._run_cmd_remote(self.client, cmd)
+
+ def _get_overcloud_conf(self):
+ self.log.info('Get overcloud config details from Apex installer'
+ '......')
+
+ command = "source stackrc; nova list | grep ' overcloud-'"
+ raw_ips_list = self._run_cmd_remote(self.client, command)
+ for line in raw_ips_list:
+ ip = line.split('ctlplane=', 1)[1].split(" ", 1)[0]
+ if 'overcloud-controller-' in line:
+ self.controllers.append(ip)
+ elif 'overcloud-novacompute-' in line:
+ self.computes.append(ip)
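+        # A containerized overcloud is detected from the deploy command used
+        # by the installer.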
+ command = "grep docker /home/stack/deploy_command"
+ self.use_containers = self._check_cmd_remote(self.client, command)
+ self.log.info('controller_ips:%s' % self.controllers)
+ self.log.info('compute_ips:%s' % self.computes)
+ self.log.info('use_containers:%s' % self.use_containers)
def get_host_ip_from_hostname(self, hostname):
- self.log.info('Get host ip from host name in Apex installer......')
+ self.log.info('Get host ip by hostname=%s from Apex installer......'
+ % hostname)
hostname_in_undercloud = hostname.split('.')[0]
-
command = "source stackrc; nova show %s | awk '/ ctlplane network /{print $5}'" % (hostname_in_undercloud) # noqa
- ret, host_ip = self.client.ssh(command)
- if ret:
- raise Exception('Exec command to get host ip from hostname(%s)'
- 'in Apex installer failed, ret=%s, output=%s'
- % (hostname, ret, host_ip))
- self.log.info('Get host_ip:%s from host_name:%s in Apex installer'
- % (host_ip, hostname))
- return host_ip[0]
-
- def setup_stunnel(self):
- self.log.info('Setup ssh stunnel in controller nodes'
- 'in Apex installer......')
- for node_ip in self.controllers:
- cmd = ("ssh -o UserKnownHostsFile=/dev/null"
- " -o StrictHostKeyChecking=no"
- " -i %s %s@%s -R %s:localhost:%s"
- " sleep 600 > ssh_tunnel.%s.log"
- " 2>&1 < /dev/null &"
- % (self.key_file,
- self.node_user_name,
- node_ip,
- self.conf.consumer.port,
- self.conf.consumer.port,
- node_ip))
- server = subprocess.Popen(cmd, shell=True)
- self.servers.append(server)
- server.communicate()
+ host_ips = self._run_cmd_remote(self.client, command)
+ return host_ips[0]
+
+ def _set_docker_restart_cmd(self, service):
+ # There can be multiple instances running so need to restart all
+ cmd = "for container in `sudo docker ps | grep "
+ cmd += service
+ cmd += " | awk '{print $1}'`; do sudo docker restart $container; \
+ done;"
+ return cmd
def set_apply_patches(self):
self.log.info('Set apply patches start......')
+ fenix_files = None
+
+ set_scripts = [self.cm_set_script]
+
+ if self.use_containers:
+ restart_cmd = (self._set_docker_restart_cmd(
+ "ceilometer-notification"))
+ set_scripts.append(self.ac_restart_script)
+ else:
+ restart_cmd = 'sudo systemctl restart' \
+ ' openstack-ceilometer-notification.service'
+
+ if self.conf.test_case != 'fault_management':
+ if self.use_containers:
+ restart_cmd += self._set_docker_restart_cmd("nova-scheduler")
+ if is_fenix(self.conf):
+ set_scripts.append(self.fe_set_script)
+ testdir = get_doctor_test_root_dir()
+ fenix_files = ["Dockerfile", "run"]
+ else:
+ restart_cmd += ' openstack-nova-scheduler.service'
+ set_scripts.append(self.nc_set_compute_script)
+
+ if self.conf.inspector.type == Inspector.CONGRESS:
+ if self.use_containers:
+ restart_cmd += self._set_docker_restart_cmd("congress-server")
+ else:
+ restart_cmd += ' openstack-congress-server.service'
+ set_scripts.append(self.cg_set_script)
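+        # Copy and run the configuration scripts on every controller; when
+        # Fenix is used, its Dockerfile and run script are copied as well.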
for node_ip in self.controllers:
client = SSHClient(node_ip, self.node_user_name,
key_filename=self.key_file)
- self.controller_clients.append(client)
- self._ceilometer_apply_patches(client, self.cm_set_script)
+ if fenix_files is not None:
+ for fenix_file in fenix_files:
+ src_file = '{0}/{1}/{2}'.format(testdir,
+ 'admin_tool/fenix',
+ fenix_file)
+ client.scp(src_file, fenix_file)
+ self._run_apply_patches(client,
+ restart_cmd,
+ set_scripts,
+ python=self.python)
+ time.sleep(5)
+
+        self.log.info('Set apply patches to compute nodes......')
+
+ if self.conf.test_case != 'fault_management':
+ if self.use_containers:
+ restart_cmd = self._set_docker_restart_cmd("nova")
+ else:
+ restart_cmd = 'sudo systemctl restart' \
+ ' openstack-nova-compute.service'
+ for node_ip in self.computes:
+ client = SSHClient(node_ip, self.node_user_name,
+ key_filename=self.key_file)
+ self._run_apply_patches(client,
+ restart_cmd,
+ [self.nc_set_compute_script],
+ python=self.python)
+ time.sleep(5)
def restore_apply_patches(self):
self.log.info('restore apply patches start......')
- for client in self.controller_clients:
- self._ceilometer_apply_patches(client, self.cm_restore_script)
-
- def _ceilometer_apply_patches(self, ssh_client, script_name):
- installer_dir = os.path.dirname(os.path.realpath(__file__))
- script_abs_path = '{0}/{1}/{2}'.format(installer_dir,
- 'common', script_name)
-
- ssh_client.scp(script_abs_path, script_name)
- cmd = 'sudo python %s' % script_name
- ret, output = ssh_client.ssh(cmd)
- if ret:
- raise Exception('Do the ceilometer command in controller'
- ' node failed, ret=%s, cmd=%s, output=%s'
- % (ret, cmd, output))
- ssh_client.ssh('sudo systemctl restart '
- 'openstack-ceilometer-notification.service')
+ restore_scripts = [self.cm_restore_script]
+
+ if self.use_containers:
+ restart_cmd = (self._set_docker_restart_cmd(
+ "ceilometer-notification"))
+ restore_scripts.append(self.ac_restore_script)
+ else:
+ restart_cmd = 'sudo systemctl restart' \
+ ' openstack-ceilometer-notification.service'
+
+ if self.conf.test_case != 'fault_management':
+ if self.use_containers:
+ restart_cmd += self._set_docker_restart_cmd("nova-scheduler")
+ else:
+ restart_cmd += ' openstack-nova-scheduler.service'
+ restore_scripts.append(self.nc_restore_compute_script)
+
+ if self.conf.inspector.type == Inspector.CONGRESS:
+ if self.use_containers:
+ restart_cmd += self._set_docker_restart_cmd("congress-server")
+ else:
+ restart_cmd += ' openstack-congress-server.service'
+ restore_scripts.append(self.cg_restore_script)
+
+ for node_ip in self.controllers:
+ client = SSHClient(node_ip, self.node_user_name,
+ key_filename=self.key_file)
+ self._run_apply_patches(client,
+ restart_cmd,
+ restore_scripts,
+ python=self.python)
+
+ if self.conf.test_case != 'fault_management':
+ if self.use_containers:
+ restart_cmd = self._set_docker_restart_cmd("nova-compute")
+ else:
+ restart_cmd = 'sudo systemctl restart' \
+ ' openstack-nova-compute.service'
+            for node_ip in self.computes:
+                client = SSHClient(node_ip, self.node_user_name,
+                                   key_filename=self.key_file)
+                self._run_apply_patches(
+                    client, restart_cmd,
+                    [self.nc_restore_compute_script],
+                    python=self.python)
diff --git a/doctor_tests/installer/base.py b/doctor_tests/installer/base.py
index dcb5b1d8..de4d2f2e 100644
--- a/doctor_tests/installer/base.py
+++ b/doctor_tests/installer/base.py
@@ -7,7 +7,18 @@
# http://www.apache.org/licenses/LICENSE-2.0
##############################################################################
import abc
+import getpass
+import grp
+import os
+import pwd
import six
+import stat
+import subprocess
+import time
+
+from doctor_tests.common import utils
+from doctor_tests.identity_auth import get_session
+from doctor_tests.os_clients import nova_client
@six.add_metaclass(abc.ABCMeta)
@@ -15,6 +26,8 @@ class BaseInstaller(object):
def __init__(self, conf, log):
self.conf = conf
self.log = log
+ self.servers = list()
+ self.use_containers = False
@abc.abstractproperty
def node_user_name(self):
@@ -35,3 +48,144 @@ class BaseInstaller(object):
@abc.abstractmethod
def cleanup(self):
pass
+
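+    # Make sure the flavor used by the test instances exists
+    # (512 MB RAM, 1 vCPU, 1 GB disk).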
+ def create_flavor(self):
+ self.nova = \
+ nova_client(self.conf.nova_version,
+ get_session())
+ flavors = {flavor.name: flavor for flavor in self.nova.flavors.list()}
+ if self.conf.flavor not in flavors:
+ self.nova.flavors.create(self.conf.flavor, 512, 1, 1)
+
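+    # Reverse tunnels (-R) expose the locally running consumer, app manager
+    # and inspector endpoints to the controller nodes; a forward tunnel (-L)
+    # is opened for the Fenix admin tool API.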
+ def setup_stunnel(self):
+ self.log.info('Setup ssh stunnel in %s installer......'
+ % self.conf.installer.type)
+ tunnels = [self.conf.consumer.port]
+ if self.conf.test_case == 'maintenance':
+ tunnel_uptime = 1200
+ tunnels += [self.conf.app_manager.port, self.conf.inspector.port]
+ elif self.conf.test_case == 'all':
+ tunnel_uptime = 1800
+ tunnels += [self.conf.app_manager.port, self.conf.inspector.port]
+ else:
+ tunnel_uptime = 600
+
+ for node_ip in self.controllers:
+ for port in tunnels:
+ self.log.info('tunnel for port %s' % port)
+ cmd = ("ssh -o UserKnownHostsFile=/dev/null"
+ " -o StrictHostKeyChecking=no"
+ " -i %s %s@%s -R %s:localhost:%s"
+ " sleep %s > ssh_tunnel.%s.%s"
+ " 2>&1 < /dev/null "
+ % (self.key_file,
+ self.node_user_name,
+ node_ip,
+ port,
+ port,
+ tunnel_uptime,
+ node_ip,
+ port))
+ server = subprocess.Popen('exec ' + cmd, shell=True)
+ self.servers.append(server)
+ if self.conf.admin_tool.type == 'fenix':
+ port = self.conf.admin_tool.port
+ self.log.info('tunnel for port %s' % port)
+ cmd = ("ssh -o UserKnownHostsFile=/dev/null"
+ " -o StrictHostKeyChecking=no"
+ " -i %s %s@%s -L %s:localhost:%s"
+ " sleep %s > ssh_tunnel.%s.%s"
+ " 2>&1 < /dev/null "
+ % (self.key_file,
+ self.node_user_name,
+ node_ip,
+ port,
+ port,
+ tunnel_uptime,
+ node_ip,
+ port))
+ server = subprocess.Popen('exec ' + cmd, shell=True)
+ self.servers.append(server)
+
+ def _get_ssh_key(self, client, key_path):
+ self.log.info('Get SSH keys from %s installer......'
+ % self.conf.installer.type)
+
+ if self.key_file is not None:
+ self.log.info('Already have SSH keys from %s installer......'
+ % self.conf.installer.type)
+ return self.key_file
+
+ ssh_key = '{0}/{1}'.format(utils.get_doctor_test_root_dir(),
+ 'instack_key')
+ client.scp(key_path, ssh_key, method='get')
+ user = getpass.getuser()
+ uid = pwd.getpwnam(user).pw_uid
+ gid = grp.getgrnam(user).gr_gid
+ os.chown(ssh_key, uid, gid)
+ os.chmod(ssh_key, stat.S_IREAD)
+ return ssh_key
+
+ @abc.abstractmethod
+ def get_transport_url(self):
+ pass
+
+ def _run_cmd_remote(self, client, command):
+ self.log.info('Run command=%s in %s installer......'
+ % (command, self.conf.installer.type))
+
+ ret, output = client.ssh(command)
+ if ret:
+            raise Exception('Exec command in %s installer failed, '
+                            'ret=%s, output=%s'
+ % (self.conf.installer.type,
+ ret, output))
+ self.log.info('Output=%s command=%s in %s installer'
+ % (output, command, self.conf.installer.type))
+ return output
+
+ def _check_cmd_remote(self, client, command):
+ self.log.info('Check command=%s return in %s installer......'
+ % (command, self.conf.installer.type))
+
+ ret, output = client.ssh(command, raise_enabled=False)
+ self.log.info('return %s' % ret)
+ if ret == 0:
+ ret = True
+ else:
+ ret = False
+ return ret
+
+ @utils.run_async
+ def _run_apply_patches(self, client, restart_cmd, script_names,
+ python='python3'):
+ installer_dir = os.path.dirname(os.path.realpath(__file__))
+ if isinstance(script_names, list):
+ for script_name in script_names:
+ script_abs_path = '{0}/{1}/{2}'.format(installer_dir,
+ 'common', script_name)
+ if self.conf.installer.type == "devstack":
+ script_name = "/opt/stack/%s" % script_name
+ try:
+ client.scp(script_abs_path, script_name)
+ except Exception:
+ client.scp(script_abs_path, script_name)
+ try:
+ if ".py" in script_name:
+ cmd = 'sudo %s %s' % (python, script_name)
+ else:
+ cmd = 'sudo chmod 700 %s;sudo ./%s' % (script_name,
+ script_name)
+ ret, output = client.ssh(cmd)
+ self.log.info('Command %s output %s' % (cmd, output))
+ except Exception:
+ ret, output = client.ssh(cmd)
+ self.log.info('Command %s output %s' % (cmd, output))
+ if ret:
+ raise Exception('Do the command in remote'
+ ' node failed, ret=%s, cmd=%s, output=%s'
+ % (ret, cmd, output))
+ if 'nova' in restart_cmd or 'devstack@n-' in restart_cmd:
+ # Make sure scheduler has proper cpu_allocation_ratio
+ time.sleep(5)
+ client.ssh(restart_cmd)
diff --git a/doctor_tests/installer/common/congress.py b/doctor_tests/installer/common/congress.py
deleted file mode 100644
index cc58c390..00000000
--- a/doctor_tests/installer/common/congress.py
+++ /dev/null
@@ -1,51 +0,0 @@
-##############################################################################
-# Copyright (c) 2017 ZTE Corporation and others.
-#
-# All rights reserved. This program and the accompanying materials
-# are made available under the terms of the Apache License, Version 2.0
-# which accompanies this distribution, and is available at
-# http://www.apache.org/licenses/LICENSE-2.0
-##############################################################################
-
-
-def set_doctor_driver_conf(ssh_client, restart_cmd):
- cg_set_cmd = '''#!/bin/bash
-co_conf=/etc/congress/congress.conf
-co_conf_bak=/etc/congress/congress.conf.bak
-co_entry="congress.datasources.doctor_driver.DoctorDriver"
-if sudo grep -e "^drivers.*$co_entry" $co_conf; then
- echo "NOTE: congress is configured as we needed"
-else
- echo "modify the congress config"
- sudo cp $co_conf $co_conf_bak
- sudo sed -i -e "/^drivers/s/$/,$co_entry/" $co_conf
- %s
-fi
- ''' % (restart_cmd)
-
- ret, output = ssh_client.ssh(cg_set_cmd)
- if ret:
- raise Exception('Do the congress command in controller node failed...'
- 'ret=%s, cmd=%s, output=%s'
- % (ret, cg_set_cmd, output))
-
-
-def restore_doctor_driver_conf(ssh_client, restart_cmd):
- cg_restore_cmd = '''#!/bin/bash
-co_conf=/etc/congress/congress.conf
-co_conf_bak=/etc/congress/congress.conf.bak
-if [ -e $co_conf_bak ]; then
- echo "restore the congress config"
- sudo cp $co_conf_bak $co_conf
- sudo rm $co_conf_bak
- %s
-else
- echo "Do not need to restore the congress config"
-fi
- ''' % (restart_cmd)
-
- ret, output = ssh_client.ssh(cg_restore_cmd)
- if ret:
- raise Exception('Do the congress command in controller node failed...'
- 'ret=%s, cmd=%s, output=%s'
- % (ret, cg_restore_cmd, output))
diff --git a/doctor_tests/installer/common/restart_aodh.py b/doctor_tests/installer/common/restart_aodh.py
new file mode 100644
index 00000000..4473bdca
--- /dev/null
+++ b/doctor_tests/installer/common/restart_aodh.py
@@ -0,0 +1,42 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import socket
+import subprocess
+
+
+def restart_aodh_event_alarm():
+ # Restart aodh-evaluator docker with localhost as controller host ip
+ # This makes our alarm sending look the same as without container
+
+ orig_docker_id = subprocess.check_output("docker ps | grep aodh-evaluator "
+ "| awk '{print $1}'", shell=True)
+ get_docker_startup = (
+ 'docker inspect --format=\'{{range .Config.Env}} -e "{{.}}" {{end}} '
+ '{{range .Mounts}} -v {{.Source}}:{{.Destination}}{{if .Mode}}:'
+ '{{.Mode}}{{end}}{{end}} -ti {{.Config.Image}}\''
+ )
+ docker_start = subprocess.check_output("%s %s" % (get_docker_startup,
+ orig_docker_id), shell=True)
+ with open("orig_docker_id", 'w') as oid:
+ oid.write(orig_docker_id)
+ oid.close()
+ subprocess.check_output("docker stop %s" % orig_docker_id, shell=True)
+ ip = socket.gethostbyname(socket.gethostname())
+
+ ae_start = '-d --add-host="localhost:%s" %s' % (ip, docker_start)
+ subprocess.check_output("docker run %s" % ae_start, shell=True)
+ new_docker_id = subprocess.check_output("docker ps | grep aodh-evaluator "
+ " | awk '{print $1}'", shell=True)
+ if orig_docker_id == new_docker_id:
+ raise Exception("Docker ids matching!")
+ with open("new_docker_id", 'w') as nid:
+ nid.write(new_docker_id)
+ nid.close()
+
+restart_aodh_event_alarm()
diff --git a/doctor_tests/installer/common/restore_aodh.py b/doctor_tests/installer/common/restore_aodh.py
new file mode 100644
index 00000000..b55eae8d
--- /dev/null
+++ b/doctor_tests/installer/common/restore_aodh.py
@@ -0,0 +1,32 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import subprocess
+
+
+def restore_aodh_event_alarm():
+ # Remove modified docker and restore original
+ orig = "orig_docker_id"
+ new = "new_docker_id"
+ if os.path.isfile(orig):
+ with open("orig_docker_id", 'r') as oid:
+ orig_docker_id = oid.read()
+ oid.close()
+ if os.path.isfile(new):
+ with open("new_docker_id", 'r') as nid:
+ new_docker_id = nid.read()
+ nid.close()
+ subprocess.check_output("docker stop %s" % new_docker_id,
+ shell=True)
+ subprocess.check_output("docker rm %s" % new_docker_id, shell=True)
+ os.remove(new)
+ subprocess.check_output("docker start %s" % orig_docker_id, shell=True)
+ os.remove(orig)
+
+restore_aodh_event_alarm()
diff --git a/doctor_tests/installer/common/restore_ceilometer.py b/doctor_tests/installer/common/restore_ceilometer.py
deleted file mode 100644
index d25b9ede..00000000
--- a/doctor_tests/installer/common/restore_ceilometer.py
+++ /dev/null
@@ -1,27 +0,0 @@
-##############################################################################
-# Copyright (c) 2017 ZTE Corporation and others.
-#
-# All rights reserved. This program and the accompanying materials
-# are made available under the terms of the Apache License, Version 2.0
-# which accompanies this distribution, and is available at
-# http://www.apache.org/licenses/LICENSE-2.0
-##############################################################################
-import os
-import shutil
-
-ep_file = '/etc/ceilometer/event_pipeline.yaml'
-ep_file_bak = '/etc/ceilometer/event_pipeline.yaml.bak'
-
-
-def restore_ep_config():
-
- if not os.path.isfile(ep_file_bak):
- print('Bak_file:%s does not exist.' % ep_file_bak)
- else:
- print('restore')
- shutil.copyfile(ep_file_bak, ep_file)
- os.remove(ep_file_bak)
- return
-
-
-restore_ep_config()
diff --git a/doctor_tests/installer/common/restore_compute_config.py b/doctor_tests/installer/common/restore_compute_config.py
new file mode 100644
index 00000000..82e10a66
--- /dev/null
+++ b/doctor_tests/installer/common/restore_compute_config.py
@@ -0,0 +1,26 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import shutil
+
+
+def restore_cpu_allocation_ratio():
+ for nova_file_bak in ["/var/lib/config-data/puppet-generated/nova_libvirt/etc/nova/nova.bak", # noqa
+ "/var/lib/config-data/puppet-generated/nova/etc/nova/nova.bak", # noqa
+ "/etc/nova/nova.bak"]:
+ if os.path.isfile(nova_file_bak):
+ nova_file = nova_file_bak.replace(".bak", ".conf")
+ print('restoring nova.bak.')
+ shutil.copyfile(nova_file_bak, nova_file)
+ os.remove(nova_file_bak)
+ return
+ print('nova.bak does not exist.')
+ return
+
+restore_cpu_allocation_ratio()
diff --git a/doctor_tests/installer/common/restore_config.py b/doctor_tests/installer/common/restore_config.py
new file mode 100644
index 00000000..5cb83b27
--- /dev/null
+++ b/doctor_tests/installer/common/restore_config.py
@@ -0,0 +1,48 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import shutil
+
+
+cbase = "/var/lib/config-data/puppet-generated/ceilometer"
+if not os.path.isdir(cbase):
+ cbase = ""
+
+
+def restore_ep_config():
+ ep_file = cbase + '/etc/ceilometer/event_pipeline.yaml'
+ ep_file_bak = cbase + '/etc/ceilometer/event_pipeline.yaml.bak'
+
+ if not os.path.isfile(ep_file_bak):
+ print('Bak_file:%s does not exist.' % ep_file_bak)
+ else:
+ print('restore')
+ shutil.copyfile(ep_file_bak, ep_file)
+ os.remove(ep_file_bak)
+ return
+
+
+def restore_ed_config():
+ ed_file = cbase + '/etc/ceilometer/event_definitions.yaml'
+ ed_file_bak = cbase + '/etc/ceilometer/event_definitions.bak'
+
+ if not os.path.isfile(ed_file_bak):
+ print("Bak_file doesn't exist: %s." % ed_file_bak)
+ else:
+ print('restore: %s' % ed_file)
+ if os.stat(ed_file_bak).st_size == 0:
+ print('Bak_file empty, so removing also: %s' % ed_file)
+ os.remove(ed_file)
+ else:
+ shutil.copyfile(ed_file_bak, ed_file)
+ os.remove(ed_file_bak)
+ return
+
+restore_ep_config()
+restore_ed_config()
diff --git a/doctor_tests/installer/common/restore_congress.py b/doctor_tests/installer/common/restore_congress.py
new file mode 100644
index 00000000..576f1b16
--- /dev/null
+++ b/doctor_tests/installer/common/restore_congress.py
@@ -0,0 +1,29 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import shutil
+
+
+def restore_drivers_config():
+ co_base = "/var/lib/config-data/puppet-generated/congress"
+ if not os.path.isdir(co_base):
+ co_base = ""
+ co_conf = co_base + "/etc/congress/congress.conf"
+ co_conf_bak = co_base + "/etc/congress/congress.conf.bak"
+
+ if not os.path.isfile(co_conf_bak):
+ print('Bak_file:%s does not exist.' % co_conf_bak)
+ else:
+ print('restore: %s' % co_conf)
+ shutil.copyfile(co_conf_bak, co_conf)
+ os.remove(co_conf_bak)
+ return
+
+
+restore_drivers_config()
diff --git a/doctor_tests/installer/common/set_ceilometer.py b/doctor_tests/installer/common/set_ceilometer.py
deleted file mode 100644
index 4050aaef..00000000
--- a/doctor_tests/installer/common/set_ceilometer.py
+++ /dev/null
@@ -1,45 +0,0 @@
-##############################################################################
-# Copyright (c) 2017 ZTE Corporation and others.
-#
-# All rights reserved. This program and the accompanying materials
-# are made available under the terms of the Apache License, Version 2.0
-# which accompanies this distribution, and is available at
-# http://www.apache.org/licenses/LICENSE-2.0
-##############################################################################
-import os
-import shutil
-import yaml
-
-ep_file = '/etc/ceilometer/event_pipeline.yaml'
-ep_file_bak = '/etc/ceilometer/event_pipeline.yaml.bak'
-event_notifier_topic = 'notifier://?topic=alarm.all'
-
-
-def set_notifier_topic():
- config_modified = False
-
- if not os.path.isfile(ep_file):
- raise Exception("File doesn't exist: %s." % ep_file)
-
- with open(ep_file, 'r') as file:
- config = yaml.safe_load(file)
-
- sinks = config['sinks']
- for sink in sinks:
- if sink['name'] == 'event_sink':
- publishers = sink['publishers']
- if event_notifier_topic not in publishers:
- print('Add event notifier in ceilometer')
- publishers.append(event_notifier_topic)
- config_modified = True
- else:
- print('NOTE: event notifier is configured'
- 'in ceilometer as we needed')
-
- if config_modified:
- shutil.copyfile(ep_file, ep_file_bak)
- with open(ep_file, 'w+') as file:
- file.write(yaml.safe_dump(config))
-
-
-set_notifier_topic()
diff --git a/doctor_tests/installer/common/set_compute_config.py b/doctor_tests/installer/common/set_compute_config.py
new file mode 100644
index 00000000..615f1895
--- /dev/null
+++ b/doctor_tests/installer/common/set_compute_config.py
@@ -0,0 +1,53 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import shutil
+
+
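+# Force cpu_allocation_ratio to 1.0 so that scheduled VCPUs map 1:1 to host
+# CPUs; the maintenance test scales instances assuming this when freeing a
+# whole compute node.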
+def set_cpu_allocation_ratio():
+ nova_file_bak = None
+ for nova_file in ["/var/lib/config-data/puppet-generated/nova_libvirt/etc/nova/nova.conf", # noqa
+ "/var/lib/config-data/puppet-generated/nova/etc/nova/nova.conf", # noqa
+ "/etc/nova/nova.conf"]:
+ if os.path.isfile(nova_file):
+ nova_file_bak = nova_file.replace(".conf", ".bak")
+ break
+
+ if nova_file_bak is None:
+ raise Exception("Could not find nova.conf")
+ # TODO (tojuvone): Unfortunately ConfigParser did not produce working conf
+ fcheck = open(nova_file)
+ found_list = ([ca for ca in fcheck.readlines() if "cpu_allocation_ratio"
+ in ca])
+ fcheck.close()
+ change = False
+ found = False
+ if found_list and len(found_list):
+ for car in found_list:
+ if car.startswith('#'):
+ continue
+ if car.startswith('cpu_allocation_ratio'):
+ found = True
+ if "1.0" not in car.split('=')[1]:
+ change = True
+ if not found or change:
+ # need to add or change
+ shutil.copyfile(nova_file, nova_file_bak)
+ fin = open(nova_file_bak)
+ fout = open(nova_file, "wt")
+ for line in fin:
+ if change and line.startswith("cpu_allocation_ratio"):
+            line = "cpu_allocation_ratio=1.0\n"
+ if not found and line.startswith("[DEFAULT]"):
+ line += "cpu_allocation_ratio=1.0\n"
+ fout.write(line)
+ fin.close()
+ fout.close()
+
+set_cpu_allocation_ratio()
diff --git a/doctor_tests/installer/common/set_config.py b/doctor_tests/installer/common/set_config.py
new file mode 100644
index 00000000..e66d4c2c
--- /dev/null
+++ b/doctor_tests/installer/common/set_config.py
@@ -0,0 +1,163 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import shutil
+import yaml
+
+
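+# On containerized deployments the ceilometer configuration lives under the
+# puppet-generated config-data path; otherwise the plain /etc path is used.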
+cbase = "/var/lib/config-data/puppet-generated/ceilometer"
+if not os.path.isdir(cbase):
+ cbase = ""
+
+
+def set_notifier_topic():
+ ep_file = cbase + '/etc/ceilometer/event_pipeline.yaml'
+ ep_file_bak = cbase + '/etc/ceilometer/event_pipeline.yaml.bak'
+ event_notifier_topic = 'notifier://?topic=alarm.all'
+ config_modified = False
+
+ if not os.path.isfile(ep_file):
+ raise Exception("File doesn't exist: %s." % ep_file)
+
+ with open(ep_file, 'r') as file:
+ config = yaml.safe_load(file)
+
+ sinks = config['sinks']
+ for sink in sinks:
+ if sink['name'] == 'event_sink':
+ publishers = sink['publishers']
+ if event_notifier_topic not in publishers:
+ print('Add event notifier in ceilometer')
+ publishers.append(event_notifier_topic)
+ config_modified = True
+ else:
+                print('NOTE: event notifier is already configured '
+                      'in ceilometer as needed')
+
+ if config_modified:
+ shutil.copyfile(ep_file, ep_file_bak)
+ with open(ep_file, 'w+') as file:
+ file.write(yaml.safe_dump(config))
+
+
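+# Add the event definitions needed by the maintenance test case so that
+# compute.instance.update, maintenance.scheduled and maintenance.host
+# notifications are turned into events with the traits used above.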
+def set_event_definitions():
+ ed_file = cbase + '/etc/ceilometer/event_definitions.yaml'
+ ed_file_bak = cbase + '/etc/ceilometer/event_definitions.bak'
+ orig_ed_file_exist = True
+ modify_config = False
+
+ if not os.path.isfile(ed_file):
+ # Deployment did not modify file, so it did not exist
+ src_file = '/etc/ceilometer/event_definitions.yaml'
+ if not os.path.isfile(src_file):
+ config = []
+ orig_ed_file_exist = False
+ else:
+ shutil.copyfile('/etc/ceilometer/event_definitions.yaml', ed_file)
+ if orig_ed_file_exist:
+ with open(ed_file, 'r') as file:
+ config = yaml.safe_load(file)
+
+ et_list = [et['event_type'] for et in config]
+
+ if 'compute.instance.update' in et_list:
+        print('NOTE: compute.instance.update already configured')
+ else:
+ print('NOTE: add compute.instance.update to event_definitions.yaml')
+ modify_config = True
+ instance_update = {
+ 'event_type': 'compute.instance.update',
+ 'traits': {
+ 'deleted_at': {'fields': 'payload.deleted_at',
+ 'type': 'datetime'},
+ 'disk_gb': {'fields': 'payload.disk_gb',
+ 'type': 'int'},
+ 'display_name': {'fields': 'payload.display_name'},
+ 'ephemeral_gb': {'fields': 'payload.ephemeral_gb',
+ 'type': 'int'},
+ 'host': {'fields': 'publisher_id.`split(., 1, 1)`'},
+ 'instance_id': {'fields': 'payload.instance_id'},
+ 'instance_type': {'fields': 'payload.instance_type'},
+ 'instance_type_id': {'fields': 'payload.instance_type_id',
+ 'type': 'int'},
+ 'launched_at': {'fields': 'payload.launched_at',
+ 'type': 'datetime'},
+ 'memory_mb': {'fields': 'payload.memory_mb',
+ 'type': 'int'},
+ 'old_state': {'fields': 'payload.old_state'},
+ 'os_architecture': {
+ 'fields':
+ "payload.image_meta.'org.openstack__1__architecture'"},
+ 'os_distro': {
+ 'fields':
+ "payload.image_meta.'org.openstack__1__os_distro'"},
+ 'os_version': {
+ 'fields':
+ "payload.image_meta.'org.openstack__1__os_version'"},
+ 'resource_id': {'fields': 'payload.instance_id'},
+ 'root_gb': {'fields': 'payload.root_gb',
+ 'type': 'int'},
+ 'service': {'fields': 'publisher_id.`split(., 0, -1)`'},
+ 'state': {'fields': 'payload.state'},
+ 'tenant_id': {'fields': 'payload.tenant_id'},
+ 'user_id': {'fields': 'payload.user_id'},
+ 'vcpus': {'fields': 'payload.vcpus', 'type': 'int'}
+ }
+ }
+ config.append(instance_update)
+
+ if 'maintenance.scheduled' in et_list:
+        print('NOTE: maintenance.scheduled already configured')
+ else:
+ print('NOTE: add maintenance.scheduled to event_definitions.yaml')
+ modify_config = True
+ mscheduled = {
+ 'event_type': 'maintenance.scheduled',
+ 'traits': {
+ 'allowed_actions': {'fields': 'payload.allowed_actions'},
+ 'instance_ids': {'fields': 'payload.instance_ids'},
+ 'reply_url': {'fields': 'payload.reply_url'},
+ 'actions_at': {'fields': 'payload.actions_at',
+ 'type': 'datetime'},
+ 'reply_at': {'fields': 'payload.reply_at', 'type': 'datetime'},
+ 'state': {'fields': 'payload.state'},
+ 'session_id': {'fields': 'payload.session_id'},
+ 'project_id': {'fields': 'payload.project_id'},
+ 'metadata': {'fields': 'payload.metadata'}
+ }
+ }
+ config.append(mscheduled)
+
+ if 'maintenance.host' in et_list:
+        print('NOTE: maintenance.host already configured')
+ else:
+ print('NOTE: add maintenance.host to event_definitions.yaml')
+ modify_config = True
+ mhost = {
+ 'event_type': 'maintenance.host',
+ 'traits': {
+ 'host': {'fields': 'payload.host'},
+ 'project_id': {'fields': 'payload.project_id'},
+ 'state': {'fields': 'payload.state'},
+ 'session_id': {'fields': 'payload.session_id'}
+ }
+ }
+ config.append(mhost)
+
+ if modify_config:
+ if orig_ed_file_exist:
+ shutil.copyfile(ed_file, ed_file_bak)
+ else:
+ with open(ed_file_bak, 'w+') as file:
+ file.close()
+ with open(ed_file, 'w+') as file:
+ file.write(yaml.safe_dump(config))
+
+set_notifier_topic()
+set_event_definitions()
diff --git a/doctor_tests/installer/common/set_congress.py b/doctor_tests/installer/common/set_congress.py
new file mode 100644
index 00000000..7961df32
--- /dev/null
+++ b/doctor_tests/installer/common/set_congress.py
@@ -0,0 +1,39 @@
+##############################################################################
+# Copyright (c) 2018 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+from six.moves import configparser
+import os
+import shutil
+
+
+def set_drivers_config():
+ co_base = "/var/lib/config-data/puppet-generated/congress"
+ if not os.path.isdir(co_base):
+ co_base = ""
+ co_conf = co_base + "/etc/congress/congress.conf"
+ co_conf_bak = co_base + "/etc/congress/congress.conf.bak"
+ doctor_driver = "congress.datasources.doctor_driver.DoctorDriver"
+ config_modified = False
+
+ config = configparser.ConfigParser()
+ config.read(co_conf)
+ drivers = config.get('DEFAULT', 'drivers')
+
+ if doctor_driver not in drivers:
+ config_modified = True
+ drivers += ',' + doctor_driver
+
+ config.set('DEFAULT', 'drivers', drivers)
+
+ if config_modified:
+ shutil.copyfile(co_conf, co_conf_bak)
+ with open(co_conf, 'w') as configfile:
+ config.write(configfile)
+
+
+set_drivers_config()
diff --git a/doctor_tests/installer/common/set_fenix.sh b/doctor_tests/installer/common/set_fenix.sh
new file mode 100644
index 00000000..bd1eae47
--- /dev/null
+++ b/doctor_tests/installer/common/set_fenix.sh
@@ -0,0 +1,106 @@
+#!/usr/bin/env bash
+
+##############################################################################
+# Copyright (c) 2019 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+
+# Make sure docker is installed, as Fenix runs in a container
+docker -v >/dev/null || {
+echo "Fenix needs docker to be installed..."
+ver=`grep "UBUNTU_CODENAME" /etc/os-release | cut -d '=' -f 2`
+curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
+add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $ver stable"
+apt install apt-transport-https ca-certificates curl software-properties-common
+apt update
+apt-cache policy docker-ce
+apt-get install -y docker-ce docker-ce-cli containerd.io
+dpkg -r --force-depends golang-docker-credential-helpers
+}
+
+docker ps | grep fenix -q && {
+REMOTE=`git ls-remote https://opendev.org/x/fenix HEAD | awk '{ print $1}'`
+LOCAL=`docker exec -t fenix git rev-parse @`
+if [[ "$LOCAL" =~ "$REMOTE" ]]; then
+    # The strings differ in trailing characters, so use a substring match
+    # instead of strict equality
+ echo "Fenix start: Already running latest $LOCAL equals $REMOTE"
+ exit 0
+else
+ echo "Fenix container needs to be recreated $LOCAL not $REMOTE"
+ # Remove previous container
+ for img in `docker image list | grep "^fenix" | awk '{print $1}'`; do
+ for dock in `docker ps --all -f "ancestor=$img" | grep "$img" | awk '{print $1}'`; do
+ docker stop $dock; docker rm $dock;
+ done;
+ docker image rm $img;
+ done
+fi
+} || echo "Fenix container needs to be created..."
+
+cp /root/keystonercv3 .
+
+transport=`grep -m1 "^transport" /etc/nova/nova.conf`
+. keystonercv3
+
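+# Generate minimal fenix.conf and fenix-api.conf from the credentials in
+# keystonercv3 and the transport/database settings found in nova.conf.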
+echo "[DEFAULT]" > fenix.conf
+echo "port = 12347" >> fenix.conf
+echo $transport >> fenix.conf
+
+echo "[database]" >> fenix.conf
+MYSQLIP=`grep -m1 "^connection" /etc/nova/nova.conf | sed -e "s/.*@//;s/\/.*//"`
+echo "connection = mysql+pymysql://fenix:fenix@$MYSQLIP/fenix" >> fenix.conf
+
+echo "[service_user]" >> fenix.conf
+echo "os_auth_url = $OS_AUTH_URL" >> fenix.conf
+echo "os_username = $OS_USERNAME" >> fenix.conf
+echo "os_password = $OS_PASSWORD" >> fenix.conf
+echo "os_user_domain_name = $OS_USER_DOMAIN_NAME" >> fenix.conf
+echo "os_project_name = $OS_PROJECT_NAME" >> fenix.conf
+echo "os_project_domain_name = $OS_PROJECT_DOMAIN_NAME" >> fenix.conf
+
+echo "[DEFAULT]" > fenix-api.conf
+echo "port = 12347" >> fenix-api.conf
+echo $transport >> fenix-api.conf
+
+echo "[keystone_authtoken]" >> fenix-api.conf
+echo "auth_url = $OS_AUTH_URL" >> fenix-api.conf
+echo "auth_type = password" >> fenix-api.conf
+echo "project_domain_name = $OS_PROJECT_DOMAIN_NAME" >> fenix-api.conf
+echo "project_name = $OS_PROJECT_NAME" >> fenix-api.conf
+echo "user_domain_name = $OS_PROJECT_DOMAIN_NAME" >> fenix-api.conf
+echo "password = $OS_PASSWORD" >> fenix-api.conf
+echo "username = $OS_USERNAME" >> fenix-api.conf
+echo "cafile = /opt/stack/data/ca-bundle.pem" >> fenix-api.conf
+
+openstack service list | grep -q maintenance || {
+openstack service create --name fenix --enable maintenance
+openstack endpoint create --region $OS_REGION_NAME --enable fenix public http://localhost:12347/v1
+}
+
+# Mysql pw
+# MYSQLPW=`cat /var/lib/config-data/mysql/etc/puppet/hieradata/service_configs.json | grep mysql | grep root_password | awk -F": " '{print $2}' | awk -F"\"" '{print $2}'`
+MYSQLPW=root
+
+# Fenix DB
+[ `mysql -uroot -p$MYSQLPW -e "SELECT host, user FROM mysql.user;" | grep fenix | wc -l` -eq 0 ] && {
+ mysql -uroot -p$MYSQLPW -hlocalhost -e "CREATE USER 'fenix'@'localhost' IDENTIFIED BY 'fenix';"
+ mysql -uroot -p$MYSQLPW -hlocalhost -e "GRANT ALL PRIVILEGES ON fenix.* TO 'fenix'@'' identified by 'fenix';FLUSH PRIVILEGES;"
+}
+mysql -ufenix -pfenix -hlocalhost -e "DROP DATABASE IF EXISTS fenix;"
+mysql -ufenix -pfenix -hlocalhost -e "CREATE DATABASE fenix CHARACTER SET utf8;"
+
+# Build Fenix container and run it
+chmod 700 run
+docker build --build-arg OPENSTACK=master --build-arg BRANCH=master --network host $PWD -t fenix | tail -1
+docker run --network host -d --name fenix -p 12347:12347 -ti fenix
+if [ $? -eq 0 ]; then
+ echo "Fenix start: OK"
+else
+ echo "Fenix start: FAILED"
+fi
+# To debug check log from fenix container
+# docker exec -ti fenix tail -f /var/log/fenix-engine.log
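
set_fenix.sh recreates the Fenix container only when the commit inside the running container no longer matches upstream HEAD. A rough Python equivalent of that freshness check, for illustration only (the container name fenix and the opendev URL are taken from the script; subprocess.run with capture_output needs Python 3.7+):

    import subprocess

    def fenix_container_is_current():
        """Compare upstream Fenix HEAD with the commit inside the running container."""
        remote = subprocess.run(
            ['git', 'ls-remote', 'https://opendev.org/x/fenix', 'HEAD'],
            capture_output=True, text=True, check=True).stdout.split()[0]
        # No pseudo-tty here, so the output has no trailing '\r' and can be
        # compared for equality (the shell script has to use a substring match).
        local = subprocess.run(
            ['docker', 'exec', 'fenix', 'git', 'rev-parse', 'HEAD'],
            capture_output=True, text=True, check=True).stdout.strip()
        return local == remote
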
diff --git a/doctor_tests/installer/common/vitrage.py b/doctor_tests/installer/common/vitrage.py
index 30a73f5d..801adff5 100644
--- a/doctor_tests/installer/common/vitrage.py
+++ b/doctor_tests/installer/common/vitrage.py
@@ -9,8 +9,11 @@
import os
+vi_base = "/var/lib/config-data/puppet-generated/vitrage"
+if not os.path.isdir(vi_base):
+ vi_base = ""
vitrage_template_file = \
- '/etc/vitrage/templates/vitrage_host_down_scenarios.yaml'
+ vi_base + '/etc/vitrage/templates/vitrage_host_down_scenarios.yaml'
template = """
metadata:
diff --git a/doctor_tests/installer/daisy.py b/doctor_tests/installer/daisy.py
index 3b1fbb24..e4499d9c 100644
--- a/doctor_tests/installer/daisy.py
+++ b/doctor_tests/installer/daisy.py
@@ -6,38 +6,27 @@
# which accompanies this distribution, and is available at
# http://www.apache.org/licenses/LICENSE-2.0
##############################################################################
-import getpass
-import grp
-import os
-import pwd
-import stat
-import subprocess
-
-from doctor_tests.common.utils import get_doctor_test_root_dir
from doctor_tests.common.utils import SSHClient
-from doctor_tests.identity_auth import get_session
from doctor_tests.installer.base import BaseInstaller
-from doctor_tests.os_clients import nova_client
class DaisyInstaller(BaseInstaller):
node_user_name = 'root'
+ installer_username = 'root'
def __init__(self, conf, log):
super(DaisyInstaller, self).__init__(conf, log)
self.client = SSHClient(self.conf.installer.ip,
- self.conf.installer.username,
+ self.installer_username,
password='r00tme')
self.key_file = None
self.controllers = list()
- self.servers = list()
- self.test_dir = get_doctor_test_root_dir()
def setup(self):
self.log.info('Setup Daisy installer start......')
- self.get_ssh_key_from_installer()
- self.get_controller_ips()
+ self.key_file = self.get_ssh_key_from_installer()
+ self.controllers = self.get_controller_ips()
self.create_flavor()
self.setup_stunnel()
@@ -46,38 +35,21 @@ class DaisyInstaller(BaseInstaller):
server.terminate()
def get_ssh_key_from_installer(self):
- self.log.info('Get SSH keys from Daisy installer......')
-
- if self.key_file is not None:
- self.log.info('Already have SSH keys from Daisy installer......')
- return self.key_file
-
- ssh_key = '{0}/{1}'.format(self.test_dir, 'instack_key')
- self.client.scp('/root/.ssh/id_dsa', ssh_key, method='get')
- user = getpass.getuser()
- uid = pwd.getpwnam(user).pw_uid
- gid = grp.getgrnam(user).gr_gid
- os.chown(ssh_key, uid, gid)
- os.chmod(ssh_key, stat.S_IREAD)
- self.key_file = ssh_key
- return self.key_file
+ key_path = '/root/.ssh/id_dsa'
+ return self._get_ssh_key(self.client, key_path)
def get_controller_ips(self):
self.log.info('Get controller ips from Daisy installer......')
command = "source daisyrc_admin; " \
"daisy host-list | grep 'CONTROLLER_LB' | cut -d '|' -f 3 "
- ret, controllers = self.client.ssh(command)
- if ret:
- raise Exception('Exec command to get controller ips'
- 'in Daisy installer failed'
- 'ret=%s, output=%s' % (ret, controllers))
- controller_ips = []
- for controller in controllers:
- controller_ips.append(self.get_host_ip_from_hostname(controller))
+ controller_names = self._run_cmd_remote(self.client, command)
+ controllers = \
+ [self.get_host_ip_from_hostname(controller)
+ for controller in controller_names]
self.log.info('Get controller_ips:%s from Daisy installer'
- % controller_ips)
- self.controllers = controller_ips
+ % controllers)
+ return controllers
def get_host_ip_from_hostname(self, hostname):
self.log.info('Get host ip from host name......')
@@ -87,26 +59,3 @@ class DaisyInstaller(BaseInstaller):
self.log.info('Get host_ip:%s from host_name:%s'
% (host_ip, hostname))
return host_ip
-
- def create_flavor(self):
- self.nova = \
- nova_client(self.conf.nova_version,
- get_session())
- flavors = {flavor.name: flavor for flavor in self.nova.flavors.list()}
- if self.conf.flavor not in flavors:
- self.nova.flavors.create(self.conf.flavor, 512, 1, 1)
-
- def setup_stunnel(self):
- self.log.info('Setup ssh stunnel in controller nodes'
- 'in Daisy installer......')
- for node_ip in self.controllers:
- cmd = ("ssh -o UserKnownHostsFile=/dev/null"
- " -o StrictHostKeyChecking=no"
- " -i %s %s@%s -R %s:localhost:%s"
- " sleep 600 > ssh_tunnel.%s 2>&1 < /dev/null &"
- % (self.key_file, self.node_user_name,
- node_ip, self.conf.consumer.port,
- self.conf.consumer.port, node_ip))
- server = subprocess.Popen(cmd, shell=True)
- self.servers.append(server)
- server.communicate()
diff --git a/doctor_tests/installer/devstack.py b/doctor_tests/installer/devstack.py
new file mode 100644
index 00000000..02f3601a
--- /dev/null
+++ b/doctor_tests/installer/devstack.py
@@ -0,0 +1,151 @@
+##############################################################################
+# Copyright (c) 2019 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import socket
+import time
+
+from doctor_tests.common.utils import SSHClient
+from doctor_tests.common.utils import LocalSSH
+from doctor_tests.identity_auth import get_session
+from doctor_tests.installer.base import BaseInstaller
+from doctor_tests.os_clients import nova_client
+
+
+class DevstackInstaller(BaseInstaller):
+ node_user_name = None
+ cm_set_script = 'set_config.py'
+ nc_set_compute_script = 'set_compute_config.py'
+ cm_restore_script = 'restore_config.py'
+ nc_restore_compute_script = 'restore_compute_config.py'
+ ac_restart_script = 'restart_aodh.py'
+ ac_restore_script = 'restore_aodh.py'
+ python = 'python'
+
+ def __init__(self, conf, log):
+ super(DevstackInstaller, self).__init__(conf, log)
+ # Run Doctor under the user's home. sudo hides other env params that could be used
+ home, self.node_user_name = (iter(os.environ.get('VIRTUAL_ENV')
+ .split('/', 3)[1:3]))
+ # Migration needs to work so ssh should have proper key defined
+ self.key_file = '/%s/%s/.ssh/id_rsa' % (home, self.node_user_name)
+ self.log.info('ssh uses: %s and %s' % (self.node_user_name,
+ self.key_file))
+ self.controllers = ([ip for ip in
+ socket.gethostbyname_ex(socket.gethostname())[2]
+ if not ip.startswith('127.')] or
+ [[(s.connect(('8.8.8.8', 53)),
+ s.getsockname()[0], s.close())
+ for s in [socket.socket(socket.AF_INET,
+ socket.SOCK_DGRAM)]][0][1]])
+ conf.admin_tool.ip = self.controllers[0]
+ self.computes = list()
+ self.nova = nova_client(conf.nova_version, get_session())
+
+ def setup(self):
+ self.log.info('Setup Devstack installer start......')
+ self._get_devstack_conf()
+ self.create_flavor()
+ self.set_apply_patches()
+
+ def cleanup(self):
+ self.restore_apply_patches()
+
+ def get_ssh_key_from_installer(self):
+ return self.key_file
+
+ def get_transport_url(self):
+ client = LocalSSH(self.log)
+ cmd = 'sudo grep -m1 "^transport_url" /etc/nova/nova.conf'
+ ret, url = client.ssh(cmd)
+ url = url.split("= ", 1)[1][:-1]
+ self.log.info('get_transport_url %s' % url)
+ return url
+
+ def get_host_ip_from_hostname(self, hostname):
+ return [hvisor.__getattr__('host_ip') for hvisor in self.hvisors
+ if hvisor.__getattr__('hypervisor_hostname') == hostname][0]
+
+ def _get_devstack_conf(self):
+ self.log.info('Get devstack config details for Devstack installer'
+ '......')
+ self.hvisors = self.nova.hypervisors.list(detailed=True)
+ self.log.info('checking hypervisors.......')
+ self.computes = [hvisor.__getattr__('host_ip') for hvisor in
+ self.hvisors]
+ self.use_containers = False
+ self.log.info('controller_ips:%s' % self.controllers)
+ self.log.info('compute_ips:%s' % self.computes)
+ self.log.info('use_containers:%s' % self.use_containers)
+
+ def _set_docker_restart_cmd(self, service):
+ # There can be multiple instances running so need to restart all
+ cmd = "for container in `sudo docker ps | grep "
+ cmd += service
+ cmd += " | awk '{print $1}'`; do sudo docker restart $container; \
+ done;"
+ return cmd
+
+ def set_apply_patches(self):
+ self.log.info('Set apply patches start......')
+
+ set_scripts = [self.cm_set_script]
+
+ restart_cmd = 'sudo systemctl restart' \
+ ' devstack@ceilometer-anotification.service'
+
+ client = LocalSSH(self.log)
+ self._run_apply_patches(client,
+ restart_cmd,
+ set_scripts,
+ python=self.python)
+ time.sleep(7)
+
+ self.log.info('Set apply patches for compute nodes......')
+
+ if self.conf.test_case != 'fault_management':
+ restart_cmd = 'sudo systemctl restart' \
+ ' devstack@n-cpu.service'
+ for node_ip in self.computes:
+ client = SSHClient(node_ip, self.node_user_name,
+ key_filename=self.key_file)
+ self._run_apply_patches(client,
+ restart_cmd,
+ [self.nc_set_compute_script],
+ python=self.python)
+ time.sleep(7)
+
+ def restore_apply_patches(self):
+ self.log.info('restore apply patches start......')
+
+ restore_scripts = [self.cm_restore_script]
+
+ restart_cmd = 'sudo systemctl restart' \
+ ' devstack@ceilometer-anotification.service'
+
+ if self.conf.test_case != 'fault_management':
+ restart_cmd += ' devstack@n-sch.service'
+ restore_scripts.append(self.nc_restore_compute_script)
+
+ client = LocalSSH(self.log)
+ self._run_apply_patches(client,
+ restart_cmd,
+ restore_scripts,
+ python=self.python)
+
+ if self.conf.test_case != 'fault_management':
+
+ restart_cmd = 'sudo systemctl restart' \
+ ' devstack@n-cpu.service'
+ for node_ip in self.computes:
+ client = SSHClient(node_ip, self.node_user_name,
+ key_filename=self.key_file)
+ self._run_apply_patches(
+ client, restart_cmd,
+ [self.nc_restore_compute_script],
+ python=self.python)
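
DevstackInstaller picks the controller address from the local host: it prefers a non-loopback address reported by gethostbyname_ex() and otherwise reads the source address of a UDP socket connected to a public resolver. The nested comprehension above is compact; unrolled, the same idea looks roughly like this sketch (the 8.8.8.8:53 target mirrors the code; connecting a UDP socket sends no traffic):

    import socket

    def local_ip():
        """Best-effort local address discovery, mirroring the logic above."""
        addrs = [ip for ip in socket.gethostbyname_ex(socket.gethostname())[2]
                 if not ip.startswith('127.')]
        if addrs:
            return addrs[0]
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            s.connect(('8.8.8.8', 53))  # picks a routable source address, sends nothing
            return s.getsockname()[0]
        finally:
            s.close()
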
diff --git a/doctor_tests/installer/local.py b/doctor_tests/installer/local.py
deleted file mode 100644
index fee14f33..00000000
--- a/doctor_tests/installer/local.py
+++ /dev/null
@@ -1,118 +0,0 @@
-##############################################################################
-# Copyright (c) 2017 ZTE Corporation and others.
-#
-# All rights reserved. This program and the accompanying materials
-# are made available under the terms of the Apache License, Version 2.0
-# which accompanies this distribution, and is available at
-# http://www.apache.org/licenses/LICENSE-2.0
-##############################################################################
-import os
-import shutil
-import subprocess
-
-from doctor_tests.installer.base import BaseInstaller
-from doctor_tests.installer.common.vitrage import \
- set_vitrage_host_down_template
-from doctor_tests.common.constants import Inspector
-from doctor_tests.common.utils import load_json_file
-from doctor_tests.common.utils import write_json_file
-
-
-class LocalInstaller(BaseInstaller):
- node_user_name = 'root'
-
- nova_policy_file = '/etc/nova/policy.json'
- nova_policy_file_backup = '%s%s' % (nova_policy_file, '.bak')
-
- def __init__(self, conf, log):
- super(LocalInstaller, self).__init__(conf, log)
- self.policy_modified = False
- self.add_policy_file = False
-
- def setup(self):
- self.get_ssh_key_from_installer()
- self.set_apply_patches()
-
- def cleanup(self):
- self.restore_apply_patches()
-
- def get_ssh_key_from_installer(self):
- self.log.info('Assuming SSH keys already exchanged with computer'
- 'for local installer type')
- return None
-
- def get_host_ip_from_hostname(self, hostname):
- self.log.info('Get host ip from host name in local installer......')
-
- cmd = "getent hosts %s | awk '{ print $1 }'" % (hostname)
- server = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
- stdout, stderr = server.communicate()
- host_ip = stdout.strip().decode("utf-8")
-
- self.log.info('Get host_ip:%s from host_name:%s in local installer'
- % (host_ip, hostname))
- return host_ip
-
- def set_apply_patches(self):
- self._set_nova_policy()
- if self.conf.inspector.type == Inspector.VITRAGE:
- set_vitrage_host_down_template()
- os.system('sudo systemctl restart devstack@vitrage-graph.service')
-
- def restore_apply_patches(self):
- self._restore_nova_policy()
-
- def _set_nova_policy(self):
- host_status_policy = 'os_compute_api:servers:show:host_status'
- host_status_rule = 'rule:admin_or_owner'
- policy_data = {
- 'context_is_admin': 'role:admin',
- 'owner': 'user_id:%(user_id)s',
- 'admin_or_owner': 'rule:context_is_admin or rule:owner',
- host_status_policy: host_status_rule
- }
-
- if os.path.isfile(self.nova_policy_file):
- data = load_json_file(self.nova_policy_file)
- if host_status_policy in data:
- rule_origion = data[host_status_policy]
- if host_status_rule == rule_origion:
- self.log.info('Do not need to modify nova policy.')
- self.policy_modified = False
- else:
- # update the host_status_policy
- data[host_status_policy] = host_status_rule
- self.policy_modified = True
- else:
- # add the host_status_policy, if the admin_or_owner is not
- # defined, add it also
- for policy, rule in policy_data.items():
- if policy not in data:
- data[policy] = rule
- self.policy_modified = True
- if self.policy_modified:
- self.log.info('Nova policy is Modified.')
- shutil.copyfile(self.nova_policy_file,
- self.nova_policy_file_backup)
- else:
- # file does not exit, create a new one and add the policy
- self.log.info('Nova policy file not exist. Creating a new one')
- data = policy_data
- self.add_policy_file = True
-
- if self.policy_modified or self.add_policy_file:
- write_json_file(self.nova_policy_file, data)
- os.system('sudo systemctl restart devstack@n-api.service')
-
- def _restore_nova_policy(self):
- if self.policy_modified:
- shutil.copyfile(self.nova_policy_file_backup,
- self.nova_policy_file)
- os.remove(self.nova_policy_file_backup)
- elif self.add_policy_file:
- os.remove(self.nova_policy_file)
-
- if self.add_policy_file or self.policy_modified:
- os.system('sudo systemctl restart devstack@n-api.service')
- self.add_policy_file = False
- self.policy_modified = False
diff --git a/doctor_tests/installer/mcp.py b/doctor_tests/installer/mcp.py
new file mode 100644
index 00000000..7659c9e2
--- /dev/null
+++ b/doctor_tests/installer/mcp.py
@@ -0,0 +1,207 @@
+##############################################################################
+# Copyright (c) 2019 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+from os.path import isfile
+import re
+import time
+
+from doctor_tests.common.constants import is_fenix
+from doctor_tests.common.utils import get_doctor_test_root_dir
+from doctor_tests.common.utils import SSHClient
+from doctor_tests.installer.base import BaseInstaller
+
+
+class McpInstaller(BaseInstaller):
+ node_user_name = 'ubuntu'
+
+ cm_set_script = 'set_config.py'
+ nc_set_compute_script = 'set_compute_config.py'
+ fe_set_script = 'set_fenix.sh'
+ cm_restore_script = 'restore_config.py'
+ nc_restore_compute_script = 'restore_compute_config.py'
+ ac_restart_script = 'restart_aodh.py'
+ ac_restore_script = 'restore_aodh.py'
+ python = 'python3'
+
+ def __init__(self, conf, log):
+ super(McpInstaller, self).__init__(conf, log)
+ self.key_file = self.get_ssh_key_from_installer()
+ self.client = SSHClient(self.conf.installer.ip,
+ self.node_user_name,
+ key_filename=self.key_file,
+ look_for_keys=True)
+ self.controllers = list()
+ self.controller_clients = list()
+ self.computes = list()
+
+ def setup(self):
+ self.log.info('Setup MCP installer start......')
+ self.get_node_ips()
+ self.create_flavor()
+ if is_fenix(self.conf):
+ self.set_apply_patches()
+ self.setup_stunnel()
+
+ def cleanup(self):
+ if is_fenix(self.conf):
+ self.restore_apply_patches()
+ for server in self.servers:
+ server.terminate()
+
+ def get_ssh_key_from_installer(self):
+ self.log.info('Get SSH keys from MCP......')
+
+ # Prefer the MCP key at /var/lib/opnfv/mcp.rsa if it exists
+ ssh_key = '/root/.ssh/id_rsa'
+ mcp_key = '/var/lib/opnfv/mcp.rsa'
+ return mcp_key if isfile(mcp_key) else ssh_key
+
+ def get_transport_url(self):
+ client = SSHClient(self.controllers[0], self.node_user_name,
+ key_filename=self.key_file)
+ try:
+ cmd = 'sudo grep -m1 "^transport_url" /etc/nova/nova.conf'
+ ret, url = client.ssh(cmd)
+
+ if ret:
+ raise Exception('Exec command to get transport from '
+ 'controller(%s) in MCP installer failed, '
+ 'ret=%s, output=%s'
+ % (self.controllers[0], ret, url))
+ elif self.controllers[0] not in url:
+ # need to use ip instead of hostname
+ url = (re.sub("@.*:", "@%s:" % self.controllers[0],
+ url[0].split("=", 1)[1]))
+ except Exception:
+ cmd = 'grep -i "^rabbit" /etc/nova/nova.conf'
+ ret, lines = client.ssh(cmd)
+ if ret:
+ raise Exception('Exec command to get transport from '
+ 'controller(%s) in MCP installer failed, '
+ 'ret=%s, output=%s'
+ % (self.controllers[0], ret, lines))
+ else:
+ for line in lines.split('\n'):
+ if line.startswith("rabbit_userid"):
+ rabbit_userid = line.split("=")[1].strip()
+ if line.startswith("rabbit_port"):
+ rabbit_port = line.split("=")[1].strip()
+ if line.startswith("rabbit_password"):
+ rabbit_password = line.split("=")[1].strip()
+ url = "rabbit://%s:%s@%s:%s/?ssl=0" % (rabbit_userid,
+ rabbit_password,
+ self.controllers[0],
+ rabbit_port)
+ self.log.info('get_transport_url %s' % url)
+ return url
+
+ def _copy_overcloudrc_to_controllers(self):
+ for ip in self.controllers:
+ cmd = "scp overcloudrc %s@%s:" % (self.node_user_name, ip)
+ self._run_cmd_remote(self.client, cmd)
+
+ def get_node_ips(self):
+ self.log.info('Get node ips from Mcp installer......')
+
+ command = 'sudo salt "*" --out yaml pillar.get _param:single_address'
+ node_details = self._run_cmd_remote(self.client, command)
+
+ self.controllers = [line.split()[1] for line in node_details
+ if line.startswith("ctl")]
+ self.computes = [line.split()[1] for line in node_details
+ if line.startswith("cmp")]
+
+ self.log.info('controller_ips:%s' % self.controllers)
+ self.log.info('compute_ips:%s' % self.computes)
+
+ def get_host_ip_from_hostname(self, hostname):
+ command = "sudo salt --out yaml '%s*' " \
+ "pillar.get _param:single_address |" \
+ "awk '{print $2}'" % hostname
+ host_ips = self._run_cmd_remote(self.client, command)
+ return host_ips[0]
+
+ def set_apply_patches(self):
+ self.log.info('Set apply patches start......')
+ fenix_files = None
+ set_scripts = [self.cm_set_script]
+ thrs = []
+
+ restart_cmd = 'sudo systemctl restart' \
+ ' ceilometer-agent-notification.service'
+
+ if self.conf.test_case != 'fault_management':
+ if is_fenix(self.conf):
+ set_scripts.append(self.fe_set_script)
+ testdir = get_doctor_test_root_dir()
+ fenix_files = ["Dockerfile", "run"]
+ restart_cmd += ' nova-scheduler.service'
+ set_scripts.append(self.nc_set_compute_script)
+
+ for node_ip in self.controllers:
+ client = SSHClient(node_ip, self.node_user_name,
+ key_filename=self.key_file)
+ if fenix_files is not None:
+ for fenix_file in fenix_files:
+ src_file = '{0}/{1}/{2}'.format(testdir,
+ 'admin_tool/fenix',
+ fenix_file)
+ client.scp(src_file, fenix_file)
+ thrs.append(self._run_apply_patches(client,
+ restart_cmd,
+ set_scripts,
+ python=self.python))
+ time.sleep(5)
+
+ self.log.info('Set apply patches for compute nodes......')
+
+ if self.conf.test_case != 'fault_management':
+ restart_cmd = 'sudo systemctl restart nova-compute.service'
+ for node_ip in self.computes:
+ client = SSHClient(node_ip, self.node_user_name,
+ key_filename=self.key_file)
+ thrs.append(self._run_apply_patches(
+ client,
+ restart_cmd,
+ [self.nc_set_compute_script],
+ python=self.python))
+ time.sleep(5)
+ # If the Fenix container is built, it needs to be ready before continuing
+ for thr in thrs:
+ thr.join()
+
+ def restore_apply_patches(self):
+ self.log.info('restore apply patches start......')
+
+ restore_scripts = [self.cm_restore_script]
+
+ restore_scripts.append(self.ac_restore_script)
+ restart_cmd = 'sudo systemctl restart' \
+ ' ceilometer-agent-notification.service'
+
+ if self.conf.test_case != 'fault_management':
+ restart_cmd += ' nova-scheduler.service'
+ restore_scripts.append(self.nc_restore_compute_script)
+
+ for node_ip in self.controllers:
+ client = SSHClient(node_ip, self.node_user_name,
+ key_filename=self.key_file)
+ self._run_apply_patches(client,
+ restart_cmd,
+ restore_scripts,
+ python=self.python)
+
+ if self.conf.test_case != 'fault_management':
+ restart_cmd = 'sudo systemctl restart nova-compute.service'
+ for node_ip in self.computes:
+ client = SSHClient(node_ip, self.node_user_name,
+ key_filename=self.key_file)
+ self._run_apply_patches(
+ client, restart_cmd,
+ [self.nc_restore_compute_script],
+ python=self.python)
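
McpInstaller.get_transport_url() falls back to assembling a RabbitMQ URL from legacy rabbit_* options when transport_url is missing from nova.conf. A standalone sketch of that parsing, with a hypothetical helper name and input text, could look like:

    def rabbit_url_from_options(nova_conf_text, controller_ip):
        """Build a transport URL from legacy rabbit_* options, as in the fallback above."""
        opts = {}
        for line in nova_conf_text.splitlines():
            if line.startswith(('rabbit_userid', 'rabbit_password', 'rabbit_port')):
                key, _, value = line.partition('=')
                opts[key.strip()] = value.strip()
        return 'rabbit://%s:%s@%s:%s/?ssl=0' % (opts['rabbit_userid'],
                                                opts['rabbit_password'],
                                                controller_ip,
                                                opts['rabbit_port'])

    # e.g. rabbit_url_from_options(open('/etc/nova/nova.conf').read(), '10.20.0.3')
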
diff --git a/doctor_tests/main.py b/doctor_tests/main.py
index f54b6403..7573faec 100644
--- a/doctor_tests/main.py
+++ b/doctor_tests/main.py
@@ -1,5 +1,5 @@
##############################################################################
-# Copyright (c) 2017 ZTE Corporation and others.
+# Copyright (c) 2019 ZTE Corporation and others.
#
# All rights reserved. This program and the accompanying materials
# are made available under the terms of the Apache License, Version 2.0
@@ -8,28 +8,19 @@
##############################################################################
import os
from os.path import isfile, join
-import random
import sys
import time
+from traceback import format_exc
-from doctor_tests.alarm import Alarm
-from doctor_tests.common.constants import Host
-from doctor_tests.common.utils import match_rep_in_file
from doctor_tests import config
-from doctor_tests.consumer import get_consumer
from doctor_tests.identity_auth import get_identity_auth
from doctor_tests.identity_auth import get_session
from doctor_tests.image import Image
-from doctor_tests.instance import Instance
-from doctor_tests.inspector import get_inspector
from doctor_tests.installer import get_installer
import doctor_tests.logger as doctor_log
-from doctor_tests.network import Network
-from doctor_tests.monitor import get_monitor
+from doctor_tests.scenario.fault_management import FaultManagement
from doctor_tests.os_clients import nova_client
-from doctor_tests.profiler_poc import main as profiler_main
-from doctor_tests.scenario.common import calculate_notification_time
-from doctor_tests.scenario.network_failure import NetworkFault
+from doctor_tests.scenario.maintenance import Maintenance
from doctor_tests.user import User
@@ -44,95 +35,65 @@ class DoctorTest(object):
self.conf = conf
self.image = Image(self.conf, LOG)
self.user = User(self.conf, LOG)
- self.network = Network(self.conf, LOG)
- self.instance = Instance(self.conf, LOG)
- self.alarm = Alarm(self.conf, LOG)
self.installer = get_installer(self.conf, LOG)
- self.inspector = get_inspector(self.conf, LOG)
- self.monitor = get_monitor(self.conf,
- self.inspector.get_inspector_url(),
- LOG)
- self.consumer = get_consumer(self.conf, LOG)
- self.fault = NetworkFault(self.conf, self.installer, LOG)
auth = get_identity_auth(project=self.conf.doctor_project)
self.nova = nova_client(self.conf.nova_version,
get_session(auth=auth))
- self.down_host = None
def setup(self):
# prepare the cloud env
self.installer.setup()
-
# preparing VM image...
self.image.create()
# creating test user...
self.user.create()
- def setup_fault_management(self):
- # user settings...
- self.user.update_quota()
-
- # creating VM...
- self.network.create()
- self.instance.create()
- self.instance.wait_for_vm_launch()
-
- # creating alarm...
- self.alarm.create()
-
- # starting doctor sample components...
- # tbd tojuvone: move inspector and consumer to common setup
- # when they support updating VMs via instance.create and
- # instance.delete alarm
-
- self.inspector.start()
- self.consumer.start()
- self.down_host = self.get_host_info_for_random_vm()
- self.monitor.start(self.down_host)
-
def test_fault_management(self):
- try:
- LOG.info('doctor fault management test starting.......')
-
- # prepare test env
- self.setup_fault_management()
-
- # wait for aodh alarms are updated in caches for event evaluator,
- # sleep time should be larger than event_alarm_cache_ttl
- # (default 60)
- time.sleep(60)
-
- # injecting host failure...
- # NOTE (umar) add INTERFACE_NAME logic to host injection
-
- self.fault.start(self.down_host)
- time.sleep(10)
-
- # verify the test results
- # NOTE (umar) copy remote monitor.log file when monitor=collectd
- self.check_host_status(self.down_host.name, 'down')
-
- notification_time = calculate_notification_time(LogFile)
- if notification_time < 1 and notification_time > 0:
- LOG.info('doctor fault management test successfully, '
- 'notification_time=%s' % notification_time)
- else:
+ retry = 2
+ # Retry once if notified_time is None
+ while retry > 0:
+ try:
+ self.fault_management = None
+ LOG.info('doctor fault management test starting.......')
+ transport_url = self.installer.get_transport_url()
+ self.fault_management = \
+ FaultManagement(self.conf, self.installer, self.user, LOG,
+ transport_url)
+
+ # prepare test env
+ self.fault_management.setup()
+
+ # wait until aodh alarms are updated in the caches of the event
+ # evaluator; sleep time should be larger than
+ # event_alarm_cache_ttl (default 60)
+ # (tojuvone) Fraser currently needs 120
+ time.sleep(120)
+
+ # injecting host failure...
+ # NOTE (umar) add INTERFACE_NAME logic to host injection
+ self.fault_management.start()
+ time.sleep(30)
+
+ # verify the test results
+ # NOTE (umar) copy remote monitor.log file when
+ # monitor=collectd
+ self.fault_management.check_host_status('down')
+ self.fault_management.check_notification_time()
+ retry = 0
+
+ except Exception as e:
LOG.error('doctor fault management test failed, '
- 'notification_time=%s' % notification_time)
+ 'Exception=%s' % e)
+ if 'notified_time=None' in str(e):
+ retry -= 1
+ LOG.info('doctor fault management retry')
+ continue
+ LOG.error(format_exc())
sys.exit(1)
-
- if self.conf.profiler_type:
- LOG.info('doctor fault management test begin to run '
- 'profile.......')
- self.collect_logs()
- self.run_profiler()
- except Exception as e:
- LOG.error('doctor fault management test failed, '
- 'Exception=%s' % e)
- sys.exit(1)
- finally:
- self.cleanup_fault_management()
+ finally:
+ if self.fault_management is not None:
+ self.fault_management.cleanup()
def _amount_compute_nodes(self):
services = self.nova.services.list(binary='nova-compute')
@@ -145,109 +106,62 @@ class DoctorTest(object):
LOG.info('not enough compute nodes, skipping doctor '
'maintenance test')
return
+ elif self.conf.installer.type not in ['apex', 'fuel', 'devstack']:
+ LOG.info('not supported installer, skipping doctor '
+ 'maintenance test')
+ return
try:
+ maintenance = None
LOG.info('doctor maintenance test starting.......')
- # TODO (tojuvone) test setup and actual test
+ transport_url = self.installer.get_transport_url()
+ maintenance = Maintenance(transport_url, self.conf, LOG)
+ maintenance.setup_maintenance(self.user)
+
+ # wait until aodh alarms are updated in the caches of the event
+ # evaluator; sleep time should be larger than event_alarm_cache_ttl
+ # (default 60)
+ LOG.info('wait aodh for 120s.......')
+ time.sleep(120)
+
+ session_id = maintenance.start_maintenance()
+ maintenance.wait_maintenance_complete(session_id)
+
+ LOG.info('doctor maintenance complete.......')
+
except Exception as e:
LOG.error('doctor maintenance test failed, Exception=%s' % e)
+ LOG.error(format_exc())
sys.exit(1)
- # TODO (tojuvone) finally: test case specific cleanup
+ finally:
+ if maintenance is not None:
+ maintenance.cleanup_maintenance()
def run(self):
"""run doctor tests"""
try:
LOG.info('doctor test starting.......')
+
# prepare common test env
self.setup()
+
if self.conf.test_case == 'all':
self.test_fault_management()
self.test_maintenance()
else:
- getattr(self, self.conf.test_case)()
+ function = 'test_%s' % self.conf.test_case
+ if hasattr(self, function):
+ getattr(self, function)()
+ else:
+ raise Exception('Cannot find function <%s> in '
+ 'DoctorTest, see config manual'
+ % function)
except Exception as e:
LOG.error('doctor test failed, Exception=%s' % e)
+ LOG.error(format_exc())
sys.exit(1)
finally:
self.cleanup()
- def get_host_info_for_random_vm(self):
- num = random.randint(0, self.conf.instance_count - 1)
- vm_name = "%s%d" % (self.conf.instance_basename, num)
-
- servers = \
- {getattr(server, 'name'): server
- for server in self.nova.servers.list()}
- server = servers.get(vm_name)
- if not server:
- raise \
- Exception('Can not find instance: vm_name(%s)' % vm_name)
- host_name = server.__dict__.get('OS-EXT-SRV-ATTR:hypervisor_hostname')
- host_ip = self.installer.get_host_ip_from_hostname(host_name)
-
- LOG.info('Get host info(name:%s, ip:%s) which vm(%s) launched at'
- % (host_name, host_ip, vm_name))
- return Host(host_name, host_ip)
-
- def check_host_status(self, hostname, state):
- service = self.nova.services.list(host=hostname,
- binary='nova-compute')
- host_state = service[0].__dict__.get('state')
- assert host_state == state
-
- def unset_forced_down_hosts(self):
- if self.down_host:
- self.nova.services.force_down(self.down_host.name,
- 'nova-compute', False)
- time.sleep(2)
- self.check_host_status(self.down_host.name, 'up')
-
- def collect_logs(self):
- self.fault.get_disable_network_log()
-
- def run_profiler(self):
-
- net_down_log_file = self.fault.get_disable_network_log()
- reg = '(?<=doctor set link down at )\d+.\d+'
- linkdown = float(match_rep_in_file(reg, net_down_log_file).group(0))
-
- reg = '(.* doctor mark vm.* error at )(\d+.\d+)'
- vmdown = float(match_rep_in_file(reg, LogFile).group(2))
-
- reg = '(.* doctor mark host.* down at )(\d+.\d+)'
- hostdown = float(match_rep_in_file(reg, LogFile).group(2))
-
- reg = '(?<=doctor monitor detected at )\d+.\d+'
- detected = float(match_rep_in_file(reg, LogFile).group(0))
-
- reg = '(?<=doctor consumer notified at )\d+.\d+'
- notified = float(match_rep_in_file(reg, LogFile).group(0))
-
- # TODO(yujunz) check the actual delay to verify time sync status
- # expected ~1s delay from $trigger to $linkdown
- relative_start = linkdown
- os.environ['DOCTOR_PROFILER_T00'] = \
- str(int((linkdown - relative_start) * 1000))
- os.environ['DOCTOR_PROFILER_T01'] = \
- str(int((detected - relative_start) * 1000))
- os.environ['DOCTOR_PROFILER_T03'] = \
- str(int((vmdown - relative_start) * 1000))
- os.environ['DOCTOR_PROFILER_T04'] = \
- str(int((hostdown - relative_start) * 1000))
- os.environ['DOCTOR_PROFILER_T09'] = \
- str(int((notified - relative_start) * 1000))
-
- profiler_main(log=LOG)
-
- def cleanup_fault_management(self):
- self.unset_forced_down_hosts()
- self.inspector.stop()
- self.monitor.stop()
- self.consumer.stop()
- self.alarm.delete()
- self.instance.delete()
- self.network.delete()
- self.fault.cleanup()
-
def cleanup(self):
self.installer.cleanup()
self.image.delete()
diff --git a/doctor_tests/maintenance_hot_tpl.yaml b/doctor_tests/maintenance_hot_tpl.yaml
new file mode 100644
index 00000000..e2e47023
--- /dev/null
+++ b/doctor_tests/maintenance_hot_tpl.yaml
@@ -0,0 +1,119 @@
+---
+heat_template_version: 2017-02-24
+description: Doctor Maintenance test case
+
+parameters:
+ ext_net:
+ type: string
+ default: external
+ flavor_vcpus:
+ type: number
+ default: 24
+ maint_image:
+ type: string
+ default: cirros
+ ha_intances:
+ type: number
+ default: 2
+ nonha_intances:
+ type: number
+ default: 4
+ app_manager_alarm_url:
+ type: string
+ default: http://0.0.0.0:12348/maintenance
+ inpector_alarm_url:
+ type: string
+ default: http://0.0.0.0:12345/maintenance
+
+
+resources:
+ int_net:
+ type: OS::Neutron::Net
+
+ int_subnet:
+ type: OS::Neutron::Subnet
+ properties:
+ network_id: {get_resource: int_net}
+ cidr: "9.9.9.0/24"
+ dns_nameservers: ["8.8.8.8"]
+ ip_version: 4
+
+ int_router:
+ type: OS::Neutron::Router
+ properties:
+ external_gateway_info: {network: {get_param: ext_net}}
+
+ int_interface:
+ type: OS::Neutron::RouterInterface
+ properties:
+ router_id: {get_resource: int_router}
+ subnet: {get_resource: int_subnet}
+
+ maint_instance_flavor:
+ type: OS::Nova::Flavor
+ properties:
+ name: doctor_maint_flavor
+ ram: 512
+ vcpus: {get_param: flavor_vcpus}
+ disk: 1
+
+ ha_app_svrgrp:
+ type: OS::Nova::ServerGroup
+ properties:
+ name: doctor_ha_app_group
+ policies: ['anti-affinity']
+
+ floating_ip:
+ type: OS::Nova::FloatingIP
+ properties:
+ pool: {get_param: ext_net}
+
+ multi_ha_instances:
+ type: OS::Heat::ResourceGroup
+ properties:
+ count: {get_param: ha_intances}
+ resource_def:
+ type: OS::Nova::Server
+ properties:
+ name: doctor_ha_app_%index%
+ flavor: {get_resource: maint_instance_flavor}
+ image: {get_param: maint_image}
+ networks:
+ - network: {get_resource: int_net}
+ scheduler_hints:
+ group: {get_resource: ha_app_svrgrp}
+
+ multi_nonha_instances:
+ type: OS::Heat::ResourceGroup
+ properties:
+ count: {get_param: nonha_intances}
+ resource_def:
+ type: OS::Nova::Server
+ properties:
+ name: doctor_nonha_app_%index%
+ flavor: {get_resource: maint_instance_flavor}
+ image: {get_param: maint_image}
+ networks:
+ - network: {get_resource: int_net}
+
+ association:
+ type: OS::Nova::FloatingIPAssociation
+ properties:
+ floating_ip: {get_resource: floating_ip}
+ server_id: {get_attr: [multi_ha_instances, resource.0]}
+
+ app_manager_alarm:
+ type: OS::Aodh::EventAlarm
+ properties:
+ alarm_actions:
+ - {get_param: app_manager_alarm_url}
+ event_type: "maintenance.scheduled"
+ repeat_actions: true
+
+ inpector_alarm:
+ type: OS::Aodh::EventAlarm
+ properties:
+ alarm_actions:
+ - {get_param: inpector_alarm_url}
+ event_type: "maintenance.host"
+ repeat_actions: true
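
The template above is normally instantiated through doctor_tests/stack.py and the heat_client added in os_clients.py. A minimal sketch of launching it directly with python-heatclient, assuming the template sits at doctor_tests/maintenance_hot_tpl.yaml and using example parameter values (the real test derives them from the hypervisor inventory in scenario/maintenance.py):

    from heatclient.common import template_utils

    from doctor_tests.identity_auth import get_session
    from doctor_tests.os_clients import heat_client

    heat = heat_client('1', get_session())
    files, template = template_utils.get_template_contents(
        'doctor_tests/maintenance_hot_tpl.yaml')

    # Parameter names mirror the template above, including its spelling.
    heat.stacks.create(stack_name='doctor_test_maintenance',
                       template=template,
                       files=files,
                       parameters={'ext_net': 'external',
                                   'flavor_vcpus': 24,
                                   'maint_image': 'cirros',
                                   'ha_intances': 2,
                                   'nonha_intances': 4})
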
diff --git a/doctor_tests/monitor/base.py b/doctor_tests/monitor/base.py
index 119c8a1c..c2341225 100644
--- a/doctor_tests/monitor/base.py
+++ b/doctor_tests/monitor/base.py
@@ -17,6 +17,15 @@ class BaseMonitor(object):
self.conf = conf
self.log = log
self.inspector_url = inspector_url
+ self._detected_time = None
+
+ @property
+ def detected_time(self):
+ return self._detected_time
+
+ @detected_time.setter
+ def detected_time(self, detected_time):
+ self._detected_time = detected_time
@abc.abstractmethod
def start(self, host):
diff --git a/doctor_tests/monitor/sample.py b/doctor_tests/monitor/sample.py
index c207cd9f..9b21750b 100644
--- a/doctor_tests/monitor/sample.py
+++ b/doctor_tests/monitor/sample.py
@@ -94,7 +94,9 @@ class Pinger(Thread):
sock.sendto(self.ICMP_ECHO_MESSAGE, (self.ip_addr, 0))
sock.recv(4096)
except socket.timeout:
- self.log.info("doctor monitor detected at %s" % time.time())
+ detected_time = time.time()
+ self.log.info("doctor monitor detected at %s" % detected_time)
+ self.monitor.detected_time = detected_time
self.monitor.report_error(self.hostname)
self.log.info("ping timeout, quit monitoring...")
self._stopped = True
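
The detected_time recorded here is later compared with the consumer's notified_time in FaultManagement.check_notification_time(); the pass criterion boils down to a check such as the following sketch (the helper name is illustrative, the one-second budget matches the check in fault_management.py):

    def notification_within_budget(detected_time, notified_time, budget=1.0):
        """True if the consumer was notified within `budget` seconds of detection."""
        if detected_time is None or notified_time is None:
            return False
        return 0 < (notified_time - detected_time) < budget
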
diff --git a/doctor_tests/os_clients.py b/doctor_tests/os_clients.py
index 640281df..7ab4e9b4 100644
--- a/doctor_tests/os_clients.py
+++ b/doctor_tests/os_clients.py
@@ -11,6 +11,7 @@ from oslo_config import cfg
import aodhclient.client as aodhclient
from congressclient.v1 import client as congressclient
import glanceclient.client as glanceclient
+import heatclient.client as heatclient
from keystoneclient import client as ks_client
from neutronclient.v2_0 import client as neutronclient
import novaclient.client as novaclient
@@ -23,6 +24,7 @@ OPTS = [
cfg.StrOpt('aodh_version', default='2', help='aodh version'),
cfg.StrOpt('vitrage_version', default='1', help='vitrage version'),
cfg.StrOpt('keystone_version', default='v3', help='keystone version'),
+ cfg.StrOpt('heat_version', default='1', help='heat version'),
]
@@ -31,6 +33,11 @@ def glance_client(version, session):
session=session)
+def heat_client(version, session):
+ return heatclient.Client(version=version,
+ session=session)
+
+
def keystone_client(version, session):
return ks_client.Client(version=version,
session=session)
diff --git a/doctor_tests/scenario/__init__.py b/doctor_tests/scenario/__init__.py
index 9e7cd11d..4278bb26 100644
--- a/doctor_tests/scenario/__init__.py
+++ b/doctor_tests/scenario/__init__.py
@@ -6,12 +6,20 @@
# which accompanies this distribution, and is available at
# http://www.apache.org/licenses/LICENSE-2.0
##############################################################################
+import os
+
from oslo_config import cfg
OPTS = [
cfg.StrOpt('test_case',
- default='all',
- help='the name of test case',
+ default=os.environ.get('TEST_CASE', 'fault_management'),
+ choices=['all', 'fault_management', 'maintenance'],
+ help="A name of test case to be executed,"
+ " choices are 'all', 'fault_management' or 'maintenance'."
+ " Set 'all' to execute all the test cases existing in"
+ " this repo. Default is 'fault_management'. Another test"
+ " case can be specified only if a function named"
+ " test_<test_case>() was implemented in DoctorTest.",
required=False),
]
diff --git a/doctor_tests/scenario/common.py b/doctor_tests/scenario/common.py
deleted file mode 100644
index a7240c00..00000000
--- a/doctor_tests/scenario/common.py
+++ /dev/null
@@ -1,26 +0,0 @@
-##############################################################################
-# Copyright (c) 2017 ZTE Corporation and others.
-#
-# All rights reserved. This program and the accompanying materials
-# are made available under the terms of the Apache License, Version 2.0
-# which accompanies this distribution, and is available at
-# http://www.apache.org/licenses/LICENSE-2.0
-##############################################################################
-from doctor_tests.common.utils import match_rep_in_file
-
-
-def calculate_notification_time(log_file):
-
- reg = '(?<=doctor monitor detected at )\d+.\d+'
- result = match_rep_in_file(reg, log_file)
- if not result:
- raise Exception('Can not match detected time')
- detected = result.group(0)
-
- reg = '(?<=doctor consumer notified at )\d+.\d+'
- result = match_rep_in_file(reg, log_file)
- if not result:
- raise Exception('Can not match notified time')
- notified = result.group(0)
-
- return float(notified) - float(detected)
diff --git a/doctor_tests/scenario/fault_management.py b/doctor_tests/scenario/fault_management.py
new file mode 100644
index 00000000..0271dffe
--- /dev/null
+++ b/doctor_tests/scenario/fault_management.py
@@ -0,0 +1,233 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import random
+import time
+
+from doctor_tests.alarm import Alarm
+from doctor_tests.common.constants import Host
+from doctor_tests.common.utils import get_doctor_test_root_dir
+from doctor_tests.common.utils import match_rep_in_file
+from doctor_tests.common.utils import SSHClient
+from doctor_tests.consumer import get_consumer
+from doctor_tests.identity_auth import get_identity_auth
+from doctor_tests.identity_auth import get_session
+from doctor_tests.instance import Instance
+from doctor_tests.inspector import get_inspector
+from doctor_tests.monitor import get_monitor
+from doctor_tests.network import Network
+from doctor_tests.profiler_poc import main as profiler_main
+from doctor_tests.os_clients import nova_client
+
+
+LINK_DOWN_SCRIPT = """
+#!/bin/bash -x
+dev=$(sudo ip a | awk '/ {compute_ip}\//{{print $NF}}')
+sleep 1
+sudo ip link set $dev down
+echo "doctor set link down at" $(date "+%s.%N")
+sleep 30
+sudo ip link set $dev up
+sleep 1
+"""
+
+
+class FaultManagement(object):
+
+ def __init__(self, conf, installer, user, log, transport_url):
+ self.conf = conf
+ self.log = log
+ self.user = user
+ self.installer = installer
+ auth = get_identity_auth(project=self.conf.doctor_project)
+ self.nova = nova_client(self.conf.nova_version,
+ get_session(auth=auth))
+ self.test_dir = get_doctor_test_root_dir()
+ self.down_host = None
+ self.GetLog = False
+ self.disable_network_log = None
+ self.network = Network(self.conf, log)
+ self.instance = Instance(self.conf, log)
+ self.alarm = Alarm(self.conf, log)
+ self.inspector = get_inspector(self.conf, log, transport_url)
+ self.monitor = get_monitor(self.conf,
+ self.inspector.get_inspector_url(),
+ log)
+ self.consumer = get_consumer(self.conf, log)
+
+ def setup(self):
+ self.log.info('fault management setup......')
+
+ # user settings...
+ self.user.update_quota()
+
+ # creating VM...
+ self.network.create()
+ self.instance.create()
+ self.instance.wait_for_vm_launch()
+
+ # creating alarm...
+ self.alarm.create()
+
+ # starting doctor sample components...
+ # tbd tojuvone: move inspector and consumer to common setup
+ # when they support updating VMs via instance.create and
+ # instance.delete alarm
+
+ self.inspector.start()
+ self.consumer.start()
+ self.down_host = self.get_host_info_for_random_vm()
+ self.monitor.start(self.down_host)
+
+ def start(self):
+ self.log.info('fault management start......')
+ self._set_link_down(self.down_host.ip)
+ self.log.info('fault management end......')
+
+ def cleanup(self):
+ self.log.info('fault management cleanup......')
+
+ self.get_disable_network_log()
+ self.unset_forced_down_hosts()
+ self.inspector.stop()
+ self.monitor.stop()
+ self.consumer.stop()
+ self.alarm.delete()
+ self.instance.delete()
+ self.network.delete()
+
+ def get_host_info_for_random_vm(self):
+ num = random.randint(0, self.conf.instance_count - 1)
+ vm_name = "%s%d" % (self.conf.instance_basename, num)
+
+ servers = {getattr(server, 'name'): server
+ for server in self.nova.servers.list()}
+ server = servers.get(vm_name)
+ if not server:
+ raise Exception('Can not find instance: vm_name(%s)' % vm_name)
+ # use the hostname without the domain name, which is mapped to the cell
+ hostname = \
+ server.__dict__.get('OS-EXT-SRV-ATTR:hypervisor_hostname')
+ host_name = hostname.split('.')[0]
+ host_ip = self.installer.get_host_ip_from_hostname(host_name)
+
+ self.log.info('Get host info(name:%s, ip:%s) which vm(%s) launched at'
+ % (host_name, host_ip, vm_name))
+ return Host(host_name, host_ip)
+
+ def unset_forced_down_hosts(self):
+ if self.down_host:
+ self.nova.services.force_down(self.down_host.name,
+ 'nova-compute', False)
+ time.sleep(2)
+ self.check_host_status('up')
+
+ def check_host_status(self, state):
+ service = self.nova.services.list(host=self.down_host.name,
+ binary='nova-compute')
+ host_state = service[0].__dict__.get('state')
+ assert host_state == state
+
+ def get_disable_network_log(self):
+ if self.GetLog:
+ self.log.info('Already got the disable_network.log '
+ 'from down_host......')
+ return self.disable_network_log
+ if self.down_host is not None:
+ client = SSHClient(
+ self.down_host.ip,
+ self.installer.node_user_name,
+ key_filename=self.installer.get_ssh_key_from_installer(),
+ look_for_keys=True,
+ log=self.log)
+
+ self.disable_network_log = \
+ '{0}/{1}'.format(self.test_dir,
+ 'disable_network.log')
+ client.scp('disable_network.log',
+ self.disable_network_log,
+ method='get')
+ self.log.info('Got the disable_network.log from '
+ 'down_host(host_name:%s, host_ip:%s)'
+ % (self.down_host.name, self.down_host.ip))
+ self.GetLog = True
+ return self.disable_network_log
+
+ def _set_link_down(self, compute_ip):
+ file_name = '{0}/{1}'.format(self.test_dir, 'disable_network.sh')
+ with open(file_name, 'w') as file:
+ file.write(LINK_DOWN_SCRIPT.format(compute_ip=compute_ip))
+ client = SSHClient(
+ compute_ip,
+ self.installer.node_user_name,
+ key_filename=self.installer.get_ssh_key_from_installer(),
+ look_for_keys=True,
+ log=self.log)
+ client.scp(file_name, 'disable_network.sh')
+ command = 'bash disable_network.sh > disable_network.log 2>&1 &'
+ client.ssh(command)
+
+ def check_notification_time(self):
+ if self.consumer.notified_time is None \
+ or self.monitor.detected_time is None:
+ raise Exception('doctor fault management test failed, '
+ 'detected_time=%s, notified_time=%s'
+ % (self.monitor.detected_time,
+ self.consumer.notified_time))
+ notification_time = \
+ self.consumer.notified_time - \
+ self.monitor.detected_time
+
+ self.log.info('doctor fault management notification_time=%s'
+ % notification_time)
+
+ if notification_time < 1 and notification_time > 0:
+ self.log.info('doctor fault management test successfully')
+ else:
+ if self.conf.profiler_type:
+ self.log.info('run doctor fault management profile.......')
+ self.run_profiler()
+
+ raise Exception('doctor fault management test failed, '
+ 'notification_time=%s' % notification_time)
+
+ if self.conf.profiler_type:
+ self.log.info('run doctor fault management profile.......')
+ self.run_profiler()
+
+ def run_profiler(self):
+
+ net_down_log_file = self.get_disable_network_log()
+ reg = '(?<=doctor set link down at )\d+.\d+'
+ linkdown = float(match_rep_in_file(reg, net_down_log_file).group(0))
+
+ vmdown = self.inspector.vm_down_time
+ hostdown = self.inspector.host_down_time
+ detected = self.monitor.detected_time
+ notified = self.consumer.notified_time
+
+ if None in [vmdown, hostdown, detected, notified]:
+ self.log.info('one of the time for profiler is None, return')
+ return
+
+ # TODO(yujunz) check the actual delay to verify time sync status
+ # expected ~1s delay from $trigger to $linkdown
+ relative_start = linkdown
+ os.environ['DOCTOR_PROFILER_T00'] = (
+ str(int((linkdown - relative_start) * 1000)))
+ os.environ['DOCTOR_PROFILER_T01'] = (
+ str(int((detected - relative_start) * 1000)))
+ os.environ['DOCTOR_PROFILER_T03'] = (
+ str(int((vmdown - relative_start) * 1000)))
+ os.environ['DOCTOR_PROFILER_T04'] = (
+ str(int((hostdown - relative_start) * 1000)))
+ os.environ['DOCTOR_PROFILER_T09'] = (
+ str(int((notified - relative_start) * 1000)))
+
+ profiler_main(log=self.log)
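
LINK_DOWN_SCRIPT uses doubled braces so that the awk action block survives str.format(); only {compute_ip} is substituted. A quick rendering for a hypothetical compute address shows the effect:

    from doctor_tests.scenario.fault_management import LINK_DOWN_SCRIPT

    # Only {compute_ip} is substituted; the doubled braces keep awk's block intact.
    print(LINK_DOWN_SCRIPT.format(compute_ip='192.0.2.15'))
    # The awk pattern becomes: / 192.0.2.15\//{print $NF}
    # i.e. it prints the name of the interface that carries the compute's address.
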
diff --git a/doctor_tests/scenario/maintenance.py b/doctor_tests/scenario/maintenance.py
new file mode 100644
index 00000000..e6cdcccd
--- /dev/null
+++ b/doctor_tests/scenario/maintenance.py
@@ -0,0 +1,250 @@
+##############################################################################
+# Copyright (c) 2019 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import datetime
+import json
+import requests
+import time
+
+from doctor_tests.admin_tool import get_admin_tool
+from doctor_tests.app_manager import get_app_manager
+from doctor_tests.common.utils import get_doctor_test_root_dir
+from doctor_tests.identity_auth import get_identity_auth
+from doctor_tests.identity_auth import get_session
+from doctor_tests.inspector import get_inspector
+from doctor_tests.os_clients import keystone_client
+from doctor_tests.os_clients import neutron_client
+from doctor_tests.os_clients import nova_client
+from doctor_tests.stack import Stack
+
+
+class Maintenance(object):
+
+ def __init__(self, transport_url, conf, log):
+ self.conf = conf
+ self.log = log
+ self.admin_session = get_session()
+ self.keystone = keystone_client(
+ self.conf.keystone_version, get_session())
+ self.nova = nova_client(conf.nova_version, get_session())
+ auth = get_identity_auth(project=self.conf.doctor_project)
+ self.neutron = neutron_client(get_session(auth=auth))
+ self.stack = Stack(self.conf, self.log)
+ if self.conf.installer.type == "devstack":
+ self.endpoint_ip = transport_url.split("@", 1)[1].split(":", 1)[0]
+ else:
+ self.endpoint_ip = self.conf.admin_tool.ip
+ self.endpoint = "http://%s:12347/" % self.endpoint_ip
+ if self.conf.admin_tool.type == 'sample':
+ self.admin_tool = get_admin_tool(transport_url, self.conf, self.log)
+ self.endpoint += 'maintenance'
+ else:
+ self.endpoint += 'v1/maintenance'
+ self.app_manager = get_app_manager(self.stack, self.conf, self.log)
+ self.inspector = get_inspector(self.conf, self.log, transport_url)
+
+ def get_external_network(self):
+ ext_net = None
+ networks = self.neutron.list_networks()['networks']
+ for network in networks:
+ if network['router:external']:
+ ext_net = network['name']
+ break
+ if ext_net is None:
+ raise Exception("external network not defined")
+ return ext_net
+
+ def setup_maintenance(self, user):
+ # each hypervisor needs to have the same amount of vcpus and they
+ # all need to be free before the test
+ hvisors = self.nova.hypervisors.list(detailed=True)
+ prev_vcpus = 0
+ prev_hostname = ''
+ self.log.info('checking hypervisors.......')
+ for hvisor in hvisors:
+ vcpus = hvisor.__getattr__('vcpus')
+ vcpus_used = hvisor.__getattr__('vcpus_used')
+ hostname = hvisor.__getattr__('hypervisor_hostname')
+ if vcpus < 2:
+ raise Exception('not enough vcpus (%d) on %s' %
+ (vcpus, hostname))
+ if vcpus_used > 0:
+ if self.conf.test_case == 'all':
+ # VCPU might not yet be free after fault_management test
+ self.log.info('%d vcpus used on %s, retry...'
+ % (vcpus_used, hostname))
+ time.sleep(15)
+ hvisor = self.nova.hypervisors.get(hvisor.id)
+ vcpus_used = hvisor.__getattr__('vcpus_used')
+ if vcpus_used > 0:
+ raise Exception('%d vcpus used on %s'
+ % (vcpus_used, hostname))
+ if prev_vcpus != 0 and prev_vcpus != vcpus:
+ raise Exception('%d vcpus on %s does not match '
+ '%d on %s'
+ % (vcpus, hostname,
+ prev_vcpus, prev_hostname))
+ prev_vcpus = vcpus
+ prev_hostname = hostname
+
+ # maintenance flavor is sized so that 2 instances fill a whole node
+ flavor_vcpus = int(vcpus / 2)
+ compute_nodes = len(hvisors)
+ amount_actstdby_instances = 2
+ amount_noredundancy_instances = 2 * compute_nodes - 2
+ self.log.info('testing %d computes with %d vcpus each'
+ % (compute_nodes, vcpus))
+ self.log.info('testing %d actstdby and %d noredundancy instances'
+ % (amount_actstdby_instances,
+ amount_noredundancy_instances))
+ max_instances = (amount_actstdby_instances +
+ amount_noredundancy_instances)
+ max_cores = compute_nodes * vcpus
+
+ user.update_quota(max_instances, max_cores)
+
+ test_dir = get_doctor_test_root_dir()
+ template_file = '{0}/{1}'.format(test_dir, 'maintenance_hot_tpl.yaml')
+ files, template = self.stack.get_hot_tpl(template_file)
+
+ ext_net = self.get_external_network()
+
+ parameters = {'ext_net': ext_net,
+ 'flavor_vcpus': flavor_vcpus,
+ 'maint_image': self.conf.image_name,
+ 'nonha_intances': amount_noredundancy_instances,
+ 'ha_intances': amount_actstdby_instances}
+
+ self.log.info('creating maintenance stack.......')
+ self.log.info('parameters: %s' % parameters)
+
+ self.stack.create('doctor_test_maintenance',
+ template,
+ parameters=parameters,
+ files=files)
+
+ if self.conf.admin_tool.type == 'sample':
+ self.admin_tool.start()
+ else:
+ # TBD Now we expect Fenix to be running on self.conf.admin_tool.port
+ pass
+ # Inspector before app_manager, as floating ip might come late
+ self.inspector.start()
+ self.app_manager.start()
+
+ def start_maintenance(self):
+ self.log.info('start maintenance.......')
+ hvisors = self.nova.hypervisors.list(detailed=True)
+ maintenance_hosts = list()
+ for hvisor in hvisors:
+ hostname = hvisor.__getattr__('hypervisor_hostname')
+ maintenance_hosts.append(hostname)
+ url = self.endpoint
+ headers = {
+ 'Content-Type': 'application/json',
+ 'Accept': 'application/json'}
+ if self.conf.admin_tool.type == 'fenix':
+ headers['X-Auth-Token'] = self.admin_session.get_token()
+ self.log.info('url %s headers %s' % (url, headers))
+ retries = 12
+ ret = None
+ while retries > 0:
+ # let's start maintenance 30sec from now, so projects will have
+ # time to ACK it before then
+ maintenance_at = (datetime.datetime.utcnow() +
+ datetime.timedelta(seconds=30)
+ ).strftime('%Y-%m-%d %H:%M:%S')
+
+ data = {'state': 'MAINTENANCE',
+ 'maintenance_at': maintenance_at,
+ 'metadata': {'openstack_version': 'Train'}}
+
+ if self.conf.app_manager.type == 'vnfm':
+ data['workflow'] = 'vnf'
+ else:
+ data['workflow'] = 'default'
+
+ if self.conf.admin_tool.type == 'sample':
+ data['hosts'] = maintenance_hosts
+ else:
+ data['hosts'] = []
+ try:
+ ret = requests.post(url, data=json.dumps(data),
+ headers=headers)
+ except Exception:
+ if retries == 0:
+ raise Exception('admin tool did not respond in 120s')
+ else:
+ self.log.info('admin tool not ready, retry in 10s')
+ retries = retries - 1
+ time.sleep(10)
+ continue
+ break
+ if not ret:
+ raise Exception("admin tool did not respond")
+ if ret.status_code != 200:
+ raise Exception(ret.text)
+ return ret.json()['session_id']
+
+ def remove_maintenance_session(self, session_id):
+ self.log.info('remove maintenance session %s.......' % session_id)
+
+ url = ('%s/%s' % (self.endpoint, session_id))
+
+ headers = {
+ 'Content-Type': 'application/json',
+ 'Accept': 'application/json'}
+
+ if self.conf.admin_tool.type == 'fenix':
+ headers['X-Auth-Token'] = self.admin_session.get_token()
+
+ ret = requests.delete(url, data=None, headers=headers)
+ if ret.status_code != 200:
+ raise Exception(ret.text)
+
+ def get_maintenance_state(self, session_id):
+
+ url = ('%s/%s' % (self.endpoint, session_id))
+
+ headers = {
+ 'Content-Type': 'application/json',
+ 'Accept': 'application/json'}
+
+ if self.conf.admin_tool.type == 'fenix':
+ headers['X-Auth-Token'] = self.admin_session.get_token()
+
+ ret = requests.get(url, data=None, headers=headers)
+ if ret.status_code != 200:
+ raise Exception(ret.text)
+ return ret.json()['state']
+
+ def wait_maintenance_complete(self, session_id):
+ retries = 90
+ state = None
+ time.sleep(300)
+ while (state not in ['MAINTENANCE_DONE', 'MAINTENANCE_FAILED'] and
+ retries > 0):
+ time.sleep(10)
+ state = self.get_maintenance_state(session_id)
+ retries = retries - 1
+ self.remove_maintenance_session(session_id)
+ self.log.info('maintenance %s ended with state %s' %
+ (session_id, state))
+ if state == 'MAINTENANCE_FAILED':
+ raise Exception('maintenance %s failed' % session_id)
+ elif retries == 0:
+ raise Exception('maintenance %s not completed within 20min' %
+ session_id)
+
+ def cleanup_maintenance(self):
+ if self.conf.admin_tool.type == 'sample':
+ self.admin_tool.stop()
+ self.app_manager.stop()
+ self.inspector.stop()
+ self.log.info('stack delete start.......')
+ self.stack.delete()
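
Taken together, the methods above reduce to one POST that returns a session_id, a GET poll loop until a terminal state, and a final DELETE. The following is a minimal, self-contained sketch of that client pattern, reusing the endpoint layout, payload fields and state names from the code above; it is illustrative only, not a replacement for the test class:

import datetime
import json
import time

import requests


def run_maintenance_session(endpoint, headers, hosts,
                            timeout=1200, interval=10):
    """POST a maintenance session, poll it to a terminal state, delete it.

    endpoint, headers and hosts mirror the values built in start_maintenance()
    above; timeout and interval are illustrative defaults only.
    """
    maintenance_at = (datetime.datetime.utcnow() +
                      datetime.timedelta(seconds=30)
                      ).strftime('%Y-%m-%d %H:%M:%S')
    data = {'state': 'MAINTENANCE',
            'maintenance_at': maintenance_at,
            'metadata': {'openstack_version': 'Train'},
            'workflow': 'default',
            'hosts': hosts}
    ret = requests.post(endpoint, data=json.dumps(data), headers=headers)
    if ret.status_code != 200:
        raise Exception(ret.text)
    session_id = ret.json()['session_id']

    state = None
    deadline = time.time() + timeout
    while time.time() < deadline:
        ret = requests.get('%s/%s' % (endpoint, session_id), headers=headers)
        if ret.status_code != 200:
            raise Exception(ret.text)
        state = ret.json()['state']
        if state in ('MAINTENANCE_DONE', 'MAINTENANCE_FAILED'):
            break
        time.sleep(interval)

    requests.delete('%s/%s' % (endpoint, session_id), headers=headers)
    return state
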
diff --git a/doctor_tests/scenario/network_failure.py b/doctor_tests/scenario/network_failure.py
deleted file mode 100644
index b55440ed..00000000
--- a/doctor_tests/scenario/network_failure.py
+++ /dev/null
@@ -1,84 +0,0 @@
-##############################################################################
-# Copyright (c) 2017 ZTE Corporation and others.
-#
-# All rights reserved. This program and the accompanying materials
-# are made available under the terms of the Apache License, Version 2.0
-# which accompanies this distribution, and is available at
-# http://www.apache.org/licenses/LICENSE-2.0
-##############################################################################
-from doctor_tests.identity_auth import get_session
-from doctor_tests.os_clients import nova_client
-from doctor_tests.common.utils import get_doctor_test_root_dir
-from doctor_tests.common.utils import SSHClient
-
-LINK_DOWN_SCRIPT = """
-#!/bin/bash -x
-dev=$(sudo ip a | awk '/ {compute_ip}\//{{print $NF}}')
-sleep 1
-sudo ip link set $dev down
-echo "doctor set link down at" $(date "+%s.%N")
-sleep 10
-sudo ip link set $dev up
-sleep 1
-"""
-
-
-class NetworkFault(object):
-
- def __init__(self, conf, installer, log):
- self.conf = conf
- self.log = log
- self.installer = installer
- self.nova = nova_client(self.conf.nova_version, get_session())
- self.test_dir = get_doctor_test_root_dir()
- self.host = None
- self.GetLog = False
- self.disable_network_log = None
-
- def start(self, host):
- self.log.info('fault inject start......')
- self._set_link_down(host.ip)
- self.host = host
- self.log.info('fault inject end......')
-
- def cleanup(self):
- self.log.info('fault inject cleanup......')
- self.get_disable_network_log()
-
- def get_disable_network_log(self):
- if self.GetLog:
- self.log.info('Already get the disable_netork.log '
- 'from down_host......')
- return self.disable_network_log
- if self.host is not None:
- client = SSHClient(
- self.host.ip,
- self.installer.node_user_name,
- key_filename=self.installer.get_ssh_key_from_installer(),
- look_for_keys=True,
- log=self.log)
-
- self.disable_network_log = '{0}/{1}'.format(self.test_dir,
- 'disable_network.log')
- client.scp('disable_network.log',
- self.disable_network_log,
- method='get')
- self.log.info('Get the disable_netork.log from'
- 'down_host(host_name:%s, host_ip:%s)'
- % (self.host.name, self.host.ip))
- self.GetLog = True
- return self.disable_network_log
-
- def _set_link_down(self, compute_ip):
- file_name = '{0}/{1}'.format(self.test_dir, 'disable_network.sh')
- with open(file_name, 'w') as file:
- file.write(LINK_DOWN_SCRIPT.format(compute_ip=compute_ip))
- client = SSHClient(
- compute_ip,
- self.installer.node_user_name,
- key_filename=self.installer.get_ssh_key_from_installer(),
- look_for_keys=True,
- log=self.log)
- client.scp(file_name, 'disable_network.sh')
- command = 'bash disable_network.sh > disable_network.log 2>&1 &'
- client.ssh(command)
diff --git a/doctor_tests/stack.py b/doctor_tests/stack.py
new file mode 100644
index 00000000..8a921beb
--- /dev/null
+++ b/doctor_tests/stack.py
@@ -0,0 +1,118 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import time
+
+from heatclient.common.template_utils import get_template_contents
+from heatclient import exc as heat_exception
+
+from doctor_tests.identity_auth import get_identity_auth
+from doctor_tests.identity_auth import get_session
+from doctor_tests.os_clients import heat_client
+
+
+class Stack(object):
+
+ def __init__(self, conf, log):
+ self.conf = conf
+ self.log = log
+ auth = get_identity_auth(project=self.conf.doctor_project)
+ self.heat = heat_client(self.conf.heat_version,
+ get_session(auth=auth))
+ self.stack_name = None
+ self.stack_id = None
+ self.template = None
+ self.parameters = {}
+ self.files = {}
+
+ # standard yaml.load does not work for a HOT template, as the date in
+ # heat_template_version would not be parsed as a string
+ def get_hot_tpl(self, template_file):
+ if not os.path.isfile(template_file):
+ raise Exception('File(%s) does not exist' % template_file)
+ return get_template_contents(template_file=template_file)
+
+ def _wait_stack_action_complete(self, action):
+ action_in_progress = '%s_IN_PROGRESS' % action
+ action_complete = '%s_COMPLETE' % action
+ action_failed = '%s_FAILED' % action
+
+ status = action_in_progress
+ stack_retries = 160
+ while status == action_in_progress and stack_retries > 0:
+ time.sleep(2)
+ try:
+ stack = self.heat.stacks.get(self.stack_name)
+ except heat_exception.HTTPNotFound:
+ if action == 'DELETE':
+ # The stack may disappear before a DELETE_COMPLETE status is ever seen
+ status = action_complete
+ break
+ else:
+ raise Exception('unable to get stack')
+ status = stack.stack_status
+ stack_retries = stack_retries - 1
+ if stack_retries == 0 and status != action_complete:
+ raise Exception("stack %s not completed within 5min, status:"
+ " %s" % (action, status))
+ elif status == action_complete:
+ self.log.info('stack %s %s' % (self.stack_name, status))
+ elif status == action_failed:
+ raise Exception("stack %s failed" % action)
+ else:
+ self.log.error('stack %s %s' % (self.stack_name, status))
+ raise Exception("stack %s unknown result" % action)
+
+ def wait_stack_delete(self):
+ self._wait_stack_action_complete('DELETE')
+
+ def wait_stack_create(self):
+ self._wait_stack_action_complete('CREATE')
+
+ def wait_stack_update(self):
+ self._wait_stack_action_complete('UPDATE')
+
+ def create(self, stack_name, template, parameters={}, files={}):
+ self.stack_name = stack_name
+ self.template = template
+ self.parameters = parameters
+ self.files = files
+ stack = self.heat.stacks.create(stack_name=self.stack_name,
+ files=files,
+ template=template,
+ parameters=parameters)
+ self.stack_id = stack['stack']['id']
+ try:
+ self.wait_stack_create()
+ except Exception:
+ # Stack creation does not always succeed at first; delete and retry once
+ self.log.info('retry creating maintenance stack.......')
+ self.delete()
+ time.sleep(5)
+ stack = self.heat.stacks.create(stack_name=self.stack_name,
+ files=files,
+ template=template,
+ parameters=parameters)
+ self.stack_id = stack['stack']['id']
+ self.wait_stack_create()
+
+ def update(self, stack_name, stack_id, template, parameters={}, files={}):
+ self.heat.stacks.update(stack_name=stack_name,
+ stack_id=stack_id,
+ files=files,
+ template=template,
+ parameters=parameters)
+ self.wait_stack_update()
+
+ def delete(self):
+ if self.stack_id is not None:
+ self.heat.stacks.delete(self.stack_name)
+ self.wait_stack_delete()
+ else:
+ self.log.info('no stack to delete')
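
As a usage note, the Stack helper above is a thin blocking wrapper around heatclient: get_hot_tpl() loads a HOT template via heatclient's get_template_contents(), and create()/update()/delete() poll until the action reaches a terminal state. A hedged sketch of how a caller might drive it; the function name, template path and parameters are placeholders, not values from the test suite:

from doctor_tests.stack import Stack


def build_and_teardown(conf, log, template_file, parameters):
    """Create a Heat stack from a HOT template and always clean it up.

    conf and log are the same objects the test harness passes around; this
    wrapper and its arguments are illustrative only.
    """
    stack = Stack(conf, log)
    # get_hot_tpl() wraps heatclient's get_template_contents(), which returns
    # the referenced files and the parsed template (unpacked in that order).
    files, template = stack.get_hot_tpl(template_file)
    stack.create('doctor_test_maintenance', template,
                 parameters=parameters, files=files)
    try:
        pass  # exercise the stack here
    finally:
        stack.delete()  # blocks until DELETE_COMPLETE or the stack is gone
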
diff --git a/doctor_tests/user.py b/doctor_tests/user.py
index fee3e1fb..2cd9757f 100644
--- a/doctor_tests/user.py
+++ b/doctor_tests/user.py
@@ -8,12 +8,12 @@
##############################################################################
import os
+from keystoneclient import exceptions as ks_exceptions
from oslo_config import cfg
from doctor_tests.identity_auth import get_session
from doctor_tests.os_clients import keystone_client
from doctor_tests.os_clients import nova_client
-from keystoneclient import exceptions as ks_exceptions
OPTS = [
@@ -53,10 +53,11 @@ class User(object):
def __init__(self, conf, log):
self.conf = conf
self.log = log
+ self.def_quota = None
+ self.restore_def_quota = False
self.keystone = keystone_client(
self.conf.keystone_version, get_session())
- self.nova = \
- nova_client(conf.nova_version, get_session())
+ self.nova = nova_client(conf.nova_version, get_session())
self.users = {}
self.projects = {}
self.roles = {}
@@ -83,10 +84,9 @@ class User(object):
domain=self.conf.doctor_domain_id)}
if self.conf.doctor_project not in self.projects:
self.log.info('create project......')
- test_project = \
- self.keystone.projects.create(
- self.conf.doctor_project,
- self.conf.doctor_domain_id)
+ test_project = self.keystone.projects.create(
+ self.conf.doctor_project,
+ self.conf.doctor_domain_id)
self.projects[test_project.name] = test_project
else:
self.log.info('project %s already created......'
@@ -129,7 +129,6 @@ class User(object):
def _add_user_role_in_project(self, is_admin=False):
"""add test user with test role in test project"""
-
project = self.projects.get(self.conf.doctor_project)
user_name = 'admin' if is_admin else self.conf.doctor_user
@@ -151,6 +150,13 @@ class User(object):
self.keystone.roles.grant(role, user=user, project=project)
roles_for_user[role_name] = role
+ def _restore_default_quota(self):
+ if self.def_quota is not None and self.restore_def_quota:
+ self.log.info('restore default quota......')
+ self.nova.quota_classes.update('default',
+ instances=self.def_quota.instances,
+ cores=self.def_quota.cores)
+
def delete(self):
"""delete the test user, project and role"""
self.log.info('user delete start......')
@@ -159,6 +165,8 @@ class User(object):
user = self.users.get(self.conf.doctor_user)
role = self.roles.get(self.conf.doctor_role)
+ self._restore_default_quota()
+
if project:
if 'admin' in self.roles_for_admin:
self.keystone.roles.revoke(
@@ -177,23 +185,45 @@ class User(object):
self.keystone.projects.delete(project)
self.log.info('user delete end......')
- def update_quota(self):
- self.log.info('user quota update start......')
+ def update_quota(self, instances=None, cores=None):
+ self.log.info('quota update start......')
project = self.projects.get(self.conf.doctor_project)
+
user = self.users.get(self.conf.doctor_user)
+ if instances is not None:
+ quota_instances = instances
+ else:
+ quota_instances = self.conf.quota_instances
+ if cores is not None:
+ quota_cores = cores
+ else:
+ quota_cores = self.conf.quota_cores
+
if project and user:
+ # default needs to be at least the same as with doctor_user
+ self.log.info('default quota update start......')
+
+ self.def_quota = self.nova.quota_classes.get('default')
+ if quota_instances > self.def_quota.instances:
+ self.restore_def_quota = True
+ self.nova.quota_classes.update('default',
+ instances=quota_instances)
+ if quota_cores > self.def_quota.cores:
+ self.restore_def_quota = True
+ self.nova.quota_classes.update('default',
+ cores=quota_cores)
+ self.log.info('user quota update start......')
self.quota = self.nova.quotas.get(project.id,
user_id=user.id)
- if self.conf.quota_instances > self.quota.instances:
- self.nova.quotas.update(
- project.id,
- instances=self.conf.quota_instances,
- user_id=user.id)
- if self.conf.quota_cores > self.quota.cores:
+ if quota_instances > self.quota.instances:
+ self.nova.quotas.update(project.id,
+ instances=quota_instances,
+ user_id=user.id)
+ if quota_cores > self.quota.cores:
self.nova.quotas.update(project.id,
- cores=self.conf.quota_cores,
+ cores=quota_cores,
user_id=user.id)
- self.log.info('user quota update end......')
else:
raise Exception('No project or role for update quota')
+ self.log.info('quota update end......')
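
The reworked update_quota() above raises the 'default' quota class before touching the per-project quota (per its own comment, the default needs to be at least as large as the doctor_user quota), and records whether it did so that _restore_default_quota() can undo the change in delete(). A reduced sketch of that ordering with the same novaclient calls; the helper itself is illustrative, and the project/user ids are assumed to exist:

def bump_quota(nova, project_id, user_id, instances, cores):
    """Raise the 'default' quota class first, then the per-project quota."""
    default = nova.quota_classes.get('default')
    if instances > default.instances:
        nova.quota_classes.update('default', instances=instances)
    if cores > default.cores:
        nova.quota_classes.update('default', cores=cores)

    current = nova.quotas.get(project_id, user_id=user_id)
    if instances > current.instances:
        nova.quotas.update(project_id, instances=instances, user_id=user_id)
    if cores > current.cores:
        nova.quotas.update(project_id, cores=cores, user_id=user_id)
    return default  # caller can restore this class-level quota later
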
diff --git a/etc/doctor.sample.conf b/etc/doctor.sample.conf
index 6eeea3a5..0e8a49bb 100644
--- a/etc/doctor.sample.conf
+++ b/etc/doctor.sample.conf
@@ -51,3 +51,6 @@
#type = sample
#ip = 127.0.0.1
#port = 12346
+
+[scenario]
+#test_case = fault_management
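
The new [scenario] section selects which test flow doctor-test runs. A minimal sketch of how the option could be registered and read with oslo.config, in the same style the suite uses for its other sections; the default, help text and value names are assumptions, not the project's actual registration code:

from oslo_config import cfg

OPTS = [
    cfg.StrOpt('test_case',
               default='fault_management',
               help='test case to run, e.g. fault_management or maintenance'),
]


def register_scenario_opts(conf=cfg.CONF):
    conf.register_opts(OPTS, group='scenario')
    return conf

# After conf(['--config-file', 'etc/doctor.sample.conf']) the runner can
# branch on conf.scenario.test_case.
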
diff --git a/requirements.txt b/requirements.txt
index ac29d795..1eab2a04 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,19 +1,22 @@
# The order of packages is significant, because pip processes them in the order
# of appearance. Changing the order has an impact on the overall integration
# process, which may cause wedges in the gate later.
-Flask!=0.11,<1.0,>=0.10 # BSD
-paramiko>=2.0 # LGPLv2.1+
+Flask!=0.11 # BSD
+paramiko # LGPLv2.1+
scp
-requests>=2.14.2 # Apache-2.0
-oslo.config!=4.3.0,!=4.4.0,>=4.0.0 # Apache-2.0
-python-openstackclient>=3.11.0 # Apache-2.0
-python-ceilometerclient>=2.5.0 # Apache-2.0
-aodhclient>=0.7.0 # Apache-2.0
-python-keystoneclient>=3.8.0 # Apache-2.0
-python-neutronclient>=6.3.0 # Apache-2.0
-python-novaclient>=9.0.0 # Apache-2.0
-python-congressclient<2000,>=1.3.0 # Apache-2.0
-python-glanceclient>=2.8.0 # Apache-2.0
-python-vitrageclient>=1.3.0 # Apache-2.0
-virtualenv>=13.1.0 # MIT
+requests # Apache-2.0
+oslo.config!=4.3.0,!=4.4.0 # Apache-2.0
+python-openstackclient # Apache-2.0
+oslo.messaging # Apache-2.0
+oslo.versionedobjects # Apache-2.0
+python-ceilometerclient # Apache-2.0
+aodhclient # Apache-2.0
+python-keystoneclient!=2.1.0 # Apache-2.0
+python-neutronclient # Apache-2.0
+python-novaclient # Apache-2.0
+python-congressclient<2000 # Apache-2.0
+python-glanceclient # Apache-2.0
+python-vitrageclient # Apache-2.0
+virtualenv # MIT
+python-heatclient # Apache-2.0
flake8<2.6.0,>=2.5.4 # MIT
diff --git a/tox.ini b/tox.ini
index 2eaadacd..2937c329 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,12 +1,12 @@
[tox]
minversion = 2.3.1
-envlist = py35, pep8
+envlist = py36,pep8,docs,docs-linkcheck
skipsdist = True
[testenv]
usedevelop = True
-install_command = pip install \
- -chttps://git.openstack.org/cgit/openstack/requirements/plain/upper-constraints.txt?h=stable/pike \
+install_command = pip3 install \
+ -chttps://git.openstack.org/cgit/openstack/requirements/plain/upper-constraints.txt?h=stable/stein \
{opts} {packages}
setenv = VIRTUAL_ENV={envdir}
deps = -r{toxinidir}/requirements.txt
@@ -21,15 +21,21 @@ passenv =
OS_PROJECT_DOMAIN_NAME
OS_PROJECT_DOMAIN_ID
OS_REGION_NAME
+ OS_CACERT
IMAGE_NAME
VM_COUNT
PROFILER_TYPE
CI_DEBUG
INSTALLER_TYPE
INSTALLER_IP
- PROFILER_TYPE
+ INSPECTOR_TYPE
+ ADMIN_TOOL_TYPE
+ TEST_CASE
+ SSH_KEY
+ APP_MANAGER_TYPE
changedir = {toxinidir}/doctor_tests
commands = doctor-test
+ /usr/bin/find {toxinidir} -type f -name '*.py[co]' -delete -o -type d -name __pycache__ -delete
[testenv:pep8]
changedir = {toxinidir}
@@ -46,3 +52,16 @@ enable-extensions=H106,H203
builtins = _
filename = *.py,app.wsgi
exclude=.venv,.git,.tox,dist,doc,*lib/python*,*egg,build,tests
+
+[testenv:docs]
+changedir = {toxinidir}
+deps = -rdocs/requirements.txt
+commands =
+ sphinx-build -b html -n -d {envtmpdir}/doctrees ./docs/ {toxinidir}/docs/_build/html
+ echo "Generated docs available in {toxinidir}/docs/_build/html"
+whitelist_externals = echo
+
+[testenv:docs-linkcheck]
+changedir = {toxinidir}
+deps = -rdocs/requirements.txt
+commands = sphinx-build -b linkcheck -d {envtmpdir}/doctrees ./docs/ {toxinidir}/docs/_build/linkcheck