-rw-r--r--  .gitignore | 2
-rw-r--r--  INFO | 27
-rw-r--r--  INFO.yaml | 24
-rw-r--r--  devstack/README.rst | 4
-rw-r--r--  devstack/local.conf.sample | 120
-rw-r--r--  docs/conf.py | 2
-rw-r--r--  docs/conf.yaml | 3
-rw-r--r--  docs/development/design/images/maintenance-workflow.png | bin 0 -> 81286 bytes
-rw-r--r--  docs/development/design/maintenance-design-guideline.rst | 426
-rw-r--r--  docs/development/index.rst | 14
-rwxr-xr-x  docs/development/overview/functest_scenario/images/figure-p1.png | bin 60756 -> 0 bytes
-rw-r--r--  docs/development/overview/index.rst | 7
-rw-r--r--  docs/development/overview/overview.rst | 52
-rw-r--r--  docs/development/requirements/index.rst | 6
-rw-r--r--  docs/index.rst | 17
-rw-r--r--  docs/release/configguide/feature.configuration.rst | 54
-rw-r--r--  docs/release/configguide/index.rst | 6
-rw-r--r--  docs/release/index.rst | 12
-rw-r--r--  docs/release/installation/index.rst (renamed from docs/development/manuals/index.rst) | 12
-rw-r--r--  docs/release/installation/installation.rst | 44
-rw-r--r--  docs/release/release-notes/index.rst | 2
-rw-r--r--  docs/release/release-notes/release-notes.rst | 146
-rw-r--r--  docs/release/release-notes/releasenotes_fraser.rst (renamed from docs/release/release-notes/releasenotes.rst) | 1
-rw-r--r--  docs/release/release-notes/releasenotes_gambia.rst | 303
-rw-r--r--  docs/release/release-notes/releasenotes_iruya.rst | 129
-rw-r--r--  docs/release/scenarios/fault_management/fault_management.rst (renamed from docs/development/overview/functest_scenario/doctor-scenario-in-functest.rst) | 78
-rw-r--r--  docs/release/scenarios/maintenance/images/Fault-management-design.png | bin 0 -> 237110 bytes
-rw-r--r--  docs/release/scenarios/maintenance/images/LICENSE (renamed from docs/development/overview/functest_scenario/images/LICENSE) | 0
-rw-r--r--  docs/release/scenarios/maintenance/images/Maintenance-design.png | bin 0 -> 316640 bytes
-rw-r--r--  docs/release/scenarios/maintenance/images/Maintenance-workflow.png | bin 0 -> 81286 bytes
-rw-r--r--  docs/release/scenarios/maintenance/maintenance.rst | 120
-rw-r--r--  docs/release/userguide/get-valid-server-state.rst (renamed from docs/development/manuals/get-valid-server-state.rst) | 0
-rw-r--r--  docs/release/userguide/index.rst | 3
-rw-r--r--  docs/release/userguide/mark-host-down_manual.rst (renamed from docs/development/manuals/mark-host-down_manual.rst) | 0
-rw-r--r--  docs/release/userguide/monitors.rst (renamed from docs/development/manuals/monitors.rst) | 3
-rw-r--r--  docs/requirements.txt | 2
-rw-r--r--  docs/testing/developer/index.rst | 13
-rw-r--r--  docs/testing/developer/testing.rst (renamed from docs/development/overview/testing.rst) | 55
-rw-r--r--  docs/testing/index.rst | 15
-rw-r--r--  docs/testing/user/index.rst | 13
-rw-r--r--  docs/testing/user/testing.rst | 30
-rw-r--r--  doctor_tests/admin_tool/__init__.py | 37
-rw-r--r--  doctor_tests/admin_tool/base.py | 26
-rw-r--r--  doctor_tests/admin_tool/fenix/Dockerfile | 34
-rwxr-xr-x  doctor_tests/admin_tool/fenix/run | 32
-rw-r--r--  doctor_tests/admin_tool/sample.py | 739
-rw-r--r--  doctor_tests/app_manager/__init__.py | 40
-rw-r--r--  doctor_tests/app_manager/base.py | 26
-rw-r--r--  doctor_tests/app_manager/sample.py | 265
-rw-r--r--  doctor_tests/app_manager/vnfm.py | 441
-rw-r--r--  doctor_tests/common/constants.py | 4
-rw-r--r--  doctor_tests/common/utils.py | 26
-rw-r--r--  doctor_tests/config.py | 4
-rw-r--r--  doctor_tests/consumer/__init__.py | 2
-rw-r--r--  doctor_tests/image.py | 13
-rw-r--r--  doctor_tests/inspector/__init__.py | 8
-rw-r--r--  doctor_tests/inspector/congress.py | 19
-rw-r--r--  doctor_tests/inspector/sample.py | 85
-rw-r--r--  doctor_tests/installer/__init__.py | 16
-rw-r--r--  doctor_tests/installer/apex.py | 176
-rw-r--r--  doctor_tests/installer/base.py | 113
-rw-r--r--  doctor_tests/installer/common/congress.py | 51
-rw-r--r--  doctor_tests/installer/common/restart_aodh.py | 42
-rw-r--r--  doctor_tests/installer/common/restore_aodh.py | 32
-rw-r--r--  doctor_tests/installer/common/restore_ceilometer.py | 27
-rw-r--r--  doctor_tests/installer/common/restore_compute_config.py | 26
-rw-r--r--  doctor_tests/installer/common/restore_config.py | 48
-rw-r--r--  doctor_tests/installer/common/restore_congress.py | 29
-rw-r--r--  doctor_tests/installer/common/set_ceilometer.py | 45
-rw-r--r--  doctor_tests/installer/common/set_compute_config.py | 53
-rw-r--r--  doctor_tests/installer/common/set_config.py | 163
-rw-r--r--  doctor_tests/installer/common/set_congress.py | 39
-rw-r--r--  doctor_tests/installer/common/set_fenix.sh | 106
-rw-r--r--  doctor_tests/installer/common/vitrage.py | 5
-rw-r--r--  doctor_tests/installer/daisy.py | 3
-rw-r--r--  doctor_tests/installer/devstack.py | 151
-rw-r--r--  doctor_tests/installer/local.py | 118
-rw-r--r--  doctor_tests/installer/mcp.py | 182
-rw-r--r--  doctor_tests/main.py | 111
-rw-r--r--  doctor_tests/maintenance_hot_tpl.yaml | 119
-rw-r--r--  doctor_tests/os_clients.py | 7
-rw-r--r--  doctor_tests/scenario/fault_management.py | 26
-rw-r--r--  doctor_tests/scenario/maintenance.py | 250
-rw-r--r--  doctor_tests/stack.py | 118
-rw-r--r--  doctor_tests/user.py | 66
-rw-r--r--  requirements.txt | 31
-rw-r--r--  tox.ini | 26
87 files changed, 5015 insertions(+), 710 deletions(-)
diff --git a/.gitignore b/.gitignore
index 6fc25628..b21ec655 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,7 +6,6 @@
/*.egg-info/
.eggs/
/build/
-/docs_build/
/docs_output/
/releng/
/tests/*.img
@@ -16,3 +15,4 @@
#Build results
.tox
+docs/_build/*
diff --git a/INFO b/INFO
deleted file mode 100644
index f722e870..00000000
--- a/INFO
+++ /dev/null
@@ -1,27 +0,0 @@
-Project: Fault Management project (doctor)
-Project Creation Date: December 2, 2014
-Project Category: Requirement
-Lifecycle State: Mature
-Primary Contact: Tomi Juvonen (tomi.juvonen@nokia.com)
-Project Lead: Tomi Juvonen (tomi.juvonen@nokia.com)
-Jira Project Name: Fault Management project
-Jira Project Prefix: DOCTOR
-Mailing list tag: [doctor]
-IRC: Server:freenode.net Channel:#opnfv-doctor
-Repository: doctor
-
-Committers:
-Ashiq Khan (NTT DOCOMO, khan@nttdocomo.com)
-Bertrand Souville (NTT DOCOMO, souville@docomolab-euro.com)
-Dong Wenjuan (ZTE, dong.wenjuan@zte.com.cn)
-Gerald Kunzmann (NTT DOCOMO, kunzmann@docomolab-euro.com)
-Ryota Mibu (NEC, r-mibu@cq.jp.nec.com)
-Serge Manning (Sprint, Serge.Manning@sprint.com)
-Tomi Juvonen (Nokia, tomi.juvonen@nokia.com)
-
-Link to TSC approval of the project: http://meetbot.opnfv.org/meetings/opnfv-meeting/2014/opnfv-meeting.2014-12-02-14.58.html
-Link(s) to approval of committer update:
-http://lists.opnfv.org/pipermail/opnfv-tsc/2015-June/000905.html
-http://lists.opnfv.org/pipermail/opnfv-tech-discuss/2015-June/003165.html
-http://lists.opnfv.org/pipermail/opnfv-tech-discuss/2016-June/011245.html
-http://lists.opnfv.org/pipermail/opnfv-tech-discuss/2016-July/011771.html
diff --git a/INFO.yaml b/INFO.yaml
index 3b9a3101..97acb69f 100644
--- a/INFO.yaml
+++ b/INFO.yaml
@@ -34,31 +34,12 @@ repositories:
- 'doctor'
committers:
- <<: *opnfv_doctor_ptl
- - name: 'Ashiq Khan'
- email: 'khan@nttdocomo.com'
- company: 'NTT DOCOMO'
- id: 'ashiq.khan'
- - name: 'Serge Manning'
- email: 'serge.manning@sprint.com'
- company: 'Sprint'
- id: 'sergem913'
- - name: 'Gerald Kunzmann'
- email: 'kunzmann@docomolab-euro.com'
- company: 'DOCOMO Euro-Labs'
- id: 'kunzmann'
- name: 'wenjuan dong'
email: 'dong.wenjuan@zte.com.cn'
company: 'ZTE'
id: 'dongwenjuan'
- - name: 'Bertrand Souville'
- email: 'souville@docomolab-euro.com'
- company: 'DOCOMO Euro-Labs'
- id: 'bertys'
- - name: 'Ryota Mibu'
- email: 'r-mibu@cq.jp.nec.com'
- company: 'NEC'
- id: 'r-mibu'
tsc:
+ # yamllint disable rule:line-length
approval: 'http//meetbot.opnfv.org/meetings/opnfv-meeting/2014/opnfv-meeting.2014-12-02-14.58.html'
changes:
- type: 'removal'
@@ -100,3 +81,6 @@ tsc:
- type: 'removal'
name: 'Peter Lee'
link: 'https://lists.opnfv.org/pipermail/opnfv-tsc/2018-March/004190.html'
+ - type: 'removal'
+ name: 'Bertrand Souville'
+ link: 'https://lists.opnfv.org/g/opnfv-tech-discuss/message/22344'
diff --git a/devstack/README.rst b/devstack/README.rst
index 91e8abfe..aaa18a7f 100644
--- a/devstack/README.rst
+++ b/devstack/README.rst
@@ -18,7 +18,9 @@ OPNFV Doctor in DevStack.
enable_plugin osprofiler https://git.openstack.org/openstack/osprofiler
enable_plugin doctor https://git.opnfv.org/doctor
-to the ``[[local|localrc]]`` section.
+to the ``[[local|localrc]]`` section. Alternatively, you can copy the provided
+``local.conf.sample``::
+
+ cp /<path-to-doctor>/devstack/local.conf.sample ${DEVSTACK_DIR}/local.conf
.. note:: The order of enabling plugins matters.
diff --git a/devstack/local.conf.sample b/devstack/local.conf.sample
new file mode 100644
index 00000000..2967714a
--- /dev/null
+++ b/devstack/local.conf.sample
@@ -0,0 +1,120 @@
+# Sample ``local.conf`` for user-configurable variables in ``stack.sh``
+
+# NOTE: Copy this file to the root DevStack directory for it to work properly.
+
+# ``local.conf`` is a user-maintained settings file that is sourced from ``stackrc``.
+# This gives it the ability to override any variables set in ``stackrc``.
+# Also, most of the settings in ``stack.sh`` are written to only be set if no
+# value has already been set; this lets ``local.conf`` effectively override the
+# default values.
+
+# This is a collection of some of the settings we have found to be useful
+# in our DevStack development environments. Additional settings are described
+# in https://docs.openstack.org/devstack/latest/configuration.html#local-conf
+# These should be considered as samples and are unsupported DevStack code.
+
+# The ``localrc`` section replaces the old ``localrc`` configuration file.
+# Note that if ``localrc`` is present it will be used in favor of this section.
+[[local|localrc]]
+
+# Minimal Contents
+# ----------------
+
+# While ``stack.sh`` is happy to run without ``localrc``, devlife is better when
+# there are a few minimal variables set:
+
+# If the ``*_PASSWORD`` variables are not set here you will be prompted to enter
+# values for them by ``stack.sh`` and they will be added to ``local.conf``.
+ADMIN_PASSWORD=devstack
+DATABASE_PASSWORD=$ADMIN_PASSWORD
+RABBIT_PASSWORD=$ADMIN_PASSWORD
+SERVICE_PASSWORD=$ADMIN_PASSWORD
+
+# ``HOST_IP`` and ``HOST_IPV6`` should be set manually for best results if
+# the NIC configuration of the host is unusual, i.e. ``eth1`` has the default
+# route but ``eth0`` is the public interface. They are auto-detected in
+# ``stack.sh`` but often is indeterminate on later runs due to the IP moving
+# from an Ethernet interface to a bridge on the host. Setting it here also
+# makes it available for ``openrc`` to include when setting ``OS_AUTH_URL``.
+# Neither is set by default.
+HOST_IP=127.0.0.1
+#HOST_IPV6=2001:db8::7
+
+
+# Logging
+# -------
+
+# By default ``stack.sh`` output only goes to the terminal where it runs. It can
+# be configured to additionally log to a file by setting ``LOGFILE`` to the full
+# path of the destination log file. A timestamp will be appended to the given name.
+LOGFILE=$DEST/logs/stack.sh.log
+
+# Old log files are automatically removed after 7 days to keep things neat. Change
+# the number of days by setting ``LOGDAYS``.
+LOGDAYS=2
+
+# Nova logs will be colorized if ``SYSLOG`` is not set; turn this off by setting
+# ``LOG_COLOR`` false.
+#LOG_COLOR=False
+
+
+# Using milestone-proposed branches
+# ---------------------------------
+
+# Uncomment these to grab the milestone-proposed branches from the
+# repos:
+#CINDER_BRANCH=milestone-proposed
+#GLANCE_BRANCH=milestone-proposed
+#HORIZON_BRANCH=milestone-proposed
+#KEYSTONE_BRANCH=milestone-proposed
+#KEYSTONECLIENT_BRANCH=milestone-proposed
+#NOVA_BRANCH=milestone-proposed
+#NOVACLIENT_BRANCH=milestone-proposed
+#NEUTRON_BRANCH=milestone-proposed
+#SWIFT_BRANCH=milestone-proposed
+
+# Using git versions of clients
+# -----------------------------
+# By default clients are installed from pip. See LIBS_FROM_GIT in
+# stackrc for details on getting clients from specific branches or
+# revisions. e.g.
+# LIBS_FROM_GIT="python-ironicclient"
+# IRONICCLIENT_BRANCH=refs/changes/44/2.../1
+
+# Swift
+# -----
+
+# Swift is now used as the back-end for the S3-like object store. Setting the
+# hash value is required and you will be prompted for it if Swift is enabled
+# so just set it to something already:
+SWIFT_HASH=66a3d6b56c1f479c8b4e70ab5c2000f5
+
+# For development purposes the default of 3 replicas is usually not required.
+# Set this to 1 to save some resources:
+SWIFT_REPLICAS=1
+
+# The data for Swift is stored by default in (``$DEST/data/swift``),
+# or (``$DATA_DIR/swift``) if ``DATA_DIR`` has been set, and can be
+# moved by setting ``SWIFT_DATA_DIR``. The directory will be created
+# if it does not exist.
+SWIFT_DATA_DIR=$DEST/data
+
+# OPNFV Doctor
+# ------------
+
+# Enable the required plugins
+# The order of enabling plugins matters
+enable_plugin aodh http://git.openstack.org/openstack/aodh
+enable_plugin panko https://git.openstack.org/openstack/panko
+enable_plugin ceilometer https://git.openstack.org/openstack/ceilometer
+enable_plugin osprofiler https://git.openstack.org/openstack/osprofiler
+enable_plugin doctor https://git.opnfv.org/doctor
+
+# To enable Python 3
+# USE_PYTHON3=True
+
+# To enable Congress as Doctor Inspector
+# enable_plugin congress https://git.openstack.org/openstack/congress
+
+# To enable Neutron port data plane status
+# Q_ML2_PLUGIN_EXT_DRIVERS=data_plane_status
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 00000000..3c9978bb
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,2 @@
+from docs_conf.conf import * # noqa: F401,F403
+master_doc = 'index'
diff --git a/docs/conf.yaml b/docs/conf.yaml
new file mode 100644
index 00000000..e08a41ab
--- /dev/null
+++ b/docs/conf.yaml
@@ -0,0 +1,3 @@
+---
+project_cfg: opnfv
+project: DOCTOR
diff --git a/docs/development/design/images/maintenance-workflow.png b/docs/development/design/images/maintenance-workflow.png
new file mode 100644
index 00000000..9b65fd59
--- /dev/null
+++ b/docs/development/design/images/maintenance-workflow.png
Binary files differ
diff --git a/docs/development/design/maintenance-design-guideline.rst b/docs/development/design/maintenance-design-guideline.rst
index 93c3cf4e..47002b96 100644
--- a/docs/development/design/maintenance-design-guideline.rst
+++ b/docs/development/design/maintenance-design-guideline.rst
@@ -5,151 +5,329 @@
Planned Maintenance Design Guideline
====================================
-.. NOTE::
- This is spec draft of design guideline for planned maintenance.
- JIRA ticket to track the update and collect comments: `DOCTOR-52`_.
-
-This document describes how one can implement planned maintenance by utilizing
-the `OPNFV Doctor project`_. framework and to meet the set requirements.
+This document describes how one can implement infrastructure maintenance in
+interaction with the VNFM by utilizing the `OPNFV Doctor project`_ framework
+and meet the set requirements. The document concentrates on OpenStack and VMs,
+while the concept designed is generic for any payload or even a different VIM.
+The admin tool should also cover controllers and other cloud hardware, but
+that is not the main focus of OPNFV Doctor and should be defined in more
+detail in the upstream implementation, as should any further detailed work.
Problem Description
===================
-Telco application need to know when planned maintenance is going to happen in
-order to guarantee zero down time in its operation. It needs to be possible to
-make own actions to have application running on not affected resource or give
+A telco application needs to know when infrastructure maintenance is going to
+happen in order to guarantee zero downtime in its operation. It needs to be
+possible to take its own actions to keep the application running on an
+unaffected resource, or to give
guidance to admin actions like migration. More details are defined in
requirement documentation: `use cases`_, `architecture`_ and `implementation`_.
-Also discussion in the OPNFV summit about `planned maintenance session`_.
Guidelines
==========
-Cloud admin needs to make a notification about planned maintenance including
-all details that application needs in order to make decisions upon his affected
-service. This notification payload can be consumed by application by subscribing
-to corresponding event alarm trough alarming service like OpenStack AODH.
+Concepts used:
+
+- `event`: Notification to RabbitMQ with a particular event type.
+
+- `state event`: Notification to RabbitMQ with a particular event type whose
+  payload includes a variable defining the state.
+
+- `project event`: Notification to RabbitMQ that is meant for a project. A
+  single event type is used with different payload and state information.
+
+- `admin event`: Notification to RabbitMQ that is meant for the admin or for
+  any infrastructure service. A single event type is used with different state
+  information.
+
+- `rolling maintenance`: Node-by-node rolling maintenance and upgrade where
+  a single node at a time is maintained after any application payload has
+  been moved away from the node.
+
+- `project` stands for `application` in the OpenStack context and both are
+  used in this document. `tenant` is often used for the same.
+
+The infrastructure admin needs to send notifications with two different event
+types: one meant for the admin and one for the project. The notification
+payload can be consumed by the application and the admin by subscribing to the
+corresponding event alarm through an alarming service like OpenStack AODH; a
+hedged subscription sketch is shown after the list below.
+
+- The infrastructure admin needs to make a notification about infrastructure
+  maintenance including all details that the application needs in order to
+  make decisions about its affected service. The alarm payload can hold a link
+  to the infrastructure admin tool API for the reply and for other possible
+  information. There are many steps of communication between the admin tool
+  and the application, so the payload needed for the information passed is
+  very similar. Because of this, the same event type can be used, with a
+  variable like `state` to tell the application what action is needed for each
+  event. If a project has not subscribed to the alarm, the admin tool
+  responsible for the maintenance will assume it can do maintenance operations
+  without interaction with the application on top of it.
+
+- The infrastructure admin needs to send an event about infrastructure
+  maintenance telling when the maintenance starts and another when it ends.
+  This admin-level event should include the host name and could be consumed by
+  any admin-level infrastructure entity. In this document it is consumed by
+  the `Inspector`, which in `OPNFV Doctor project`_ terms is the infrastructure
+  entity responsible for automatic host fault management. Automated actions
+  certainly need to be disabled during planned maintenance.
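+
+As a hedged illustration only, the following Python sketch shows how an
+application manager could create the project-specific event alarm with
+python-aodhclient. The event type name ``maintenance.scheduled`` follows the
+sample Ceilometer event definitions added elsewhere in this change, while the
+Keystone credentials and the consumer URL are placeholder assumptions.
+
+.. code-block:: python
+
+    # Hedged sketch: subscribe the application manager to the project-specific
+    # maintenance event alarm via OpenStack AODH. Credentials, endpoint and
+    # consumer URL are placeholders.
+    from keystoneauth1 import loading, session
+    from aodhclient import client as aodhclient
+
+    loader = loading.get_plugin_loader('password')
+    auth = loader.load_from_options(
+        auth_url='http://keystone.example:5000/v3', username='app_manager',
+        password='secret', project_name='demo',
+        user_domain_id='default', project_domain_id='default')
+    aodh = aodhclient.Client('2', session.Session(auth=auth))
+
+    # Event alarm: AODH will POST a notification to alarm_actions whenever a
+    # 'maintenance.scheduled' event is received from Ceilometer.
+    aodh.alarm.create({
+        'name': 'maintenance_alarm',
+        'type': 'event',
+        'event_rule': {'event_type': 'maintenance.scheduled'},
+        'alarm_actions': ['http://app-manager.example:12348/maintenance'],
+    })
+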
Before maintenance starts application needs to be able to make switch over for
his ACT-STBY service affected, do operation to move service to not effected part
-of infra or give a hint for admin operation like migration that can be
+of infrastructure or give a hint for admin operation like migration that can be
automatically issued by admin tool according to agreed policy.
-Flow diagram::
-
- admin alarming project controller inspector
- | service app manager | |
- | 1. | | | |
- +------------------------->+ |
- +<-------------------------+ |
- | 2. | | | |
- +------>+ 3. | | |
- | +-------->+ 4. | |
- | | +------->+ |
- | | 5. +<-------+ |
- +<----------------+ | |
- | | 6. | |
- +------------------------->+ |
- +<-------------------------+ 7. |
- +------------------------------------->+
- | 8. | | | |
- +------>+ 9. | | |
- | +-------->+ | |
- +--------------------------------------+
- | 10. |
- +--------------------------------------+
- | 11. | | | |
- +------------------------->+ |
- +<-------------------------+ |
- | 12. | | | |
- +------>+-------->+ | 13. |
- +------------------------------------->+
- +-------+---------+--------+-----------+
-
-Concepts used below:
-
-- `full maintenance`: This means maintenance will take a longer time and
- resource should be emptied, meaning container or VM need to be moved or
- deleted. Admin might need to test resource to work after maintenance.
-
-- `reboot`: Only a reboot is needed and admin does not need separate testing
- after that. Container or VM can be left in place if so wanted.
-
-- `notification`: Notification to rabbitmq.
-
-Admin makes a planned maintenance session where he sets
-a `maintenance_session_id` that is a unique ID for all the hardware resources he
-is going to have the maintenance at the same time. Mostly maintenance should be
-done node by node, meaning a single compute node at a time would be in single
-planned maintenance session having unique `maintenance_session_id`. This ID will
-be carried trough the whole session in all places and can be used to query
-maintenance in admin tool API. Project running a Telco application should set
-a specific role for admin tool to know it cannot do planned maintenance unless
-project has agreed actions to be done for its VMs or containers. This means the
-project has configured itself to get alarms upon planned maintenance and it is
-capable of agreeing needed actions. Admin is supposed to use an admin tool to
-automate maintenance process partially or entirely.
-
-The flow of a successful planned maintenance session as in OpenStack example
-case:
-
-1. Admin disables nova-compute in order to do planned maintenance on a compute
- host and gets ACK from the API call. This action needs to be done to ensure
- no thing will be placed in this compute host by any user. Action is always
- done regardless the whole compute will be affected or not.
-2. Admin sends a project specific maintenance notification with state
- `planned maintenance`. This includes detailed information about maintenance,
- like when it is going to start, is it `reboot` or `full maintenance`
- including the information about project containers or VMs running on host or
- the part of it that will need maintenance. Also default action like
- migration will be mentioned that will be issued by admin before maintenance
- starts if no other action is set by project. In case project has a specific
- role set, planned maintenance cannot start unless project has agreed the
- admin action. Available admin actions are also listed in notification.
-3. Application manager of the project receives AODH alarm about the same.
-4. Application manager can do switch over to his ACT-STBY service, delete and
- re-instantiate his service on not affected resource if so wanted.
-5. Application manager may call admin tool API to give preferred instructions
- for leaving VMs and containers in place or do admin action to migrate them.
- In case admin does not receive this instruction before maintenance is to
- start it will do the pre-configured default action like migration to
- projects without a specific role to say project need to agree the action.
- VMs or Containers can be left on host if type of maintenance is just `reboot`.
-6. Admin does possible actions to VMs and containers and receives an ACK.
-7. In case everything went ok, Admin sends admin type of maintenance
- notification with state `in maintenance`. This notification can be consumed
- by Inspector and other cloud services to know there is ongoing maintenance
- which means things like automatic fault management actions for the hardware
- resources should be disabled.
-8. If maintenance type is `reboot` and project is still having containers or
- VMs running on affected hardware resource, Admin sends project specific
- maintenance notification with state updated to `in maintenance`. If project
- do not have anything left running on affected hardware resource, state will
- be `maintenance over` instead. If maintenance can not be performed for some
- reason state should be `maintenance cancelled`. In this case last operation
- remaining for admin is to re-enable nova-compute service, ensure
- everything is running and not to proceed any further steps.
-9. Application manager of the project receives AODH alarm about the same.
-10. Admin will do the maintenance. This is out of Doctor scope.
-11. Admin enables nova-compute service when maintenance is over and host can be
- put back to production. An ACK is received from API call.
-12. In case project had left containers or VMs on hardware resource over
- maintenance, Admin sends project specific maintenance notification with
- state updated to `maintenance over`.
-13. Admin sends admin type of maintenance notification with state updated to
- `maintenance over`. Inspector and other
- cloud services can consume this to know hardware resource is back in use.
+There should be at least one empty host compatible with the host under
+maintenance in order to have a smooth `rolling maintenance`. For this to be
+possible, it should also be possible to scale down the application instances.
+
+The infrastructure admin should have a tool that is responsible for hosting a
+maintenance workflow session with the needed APIs for the admin and for the
+applications. The group of hosts in a single maintenance session should always
+have the same physical capabilities, so that the rolling maintenance can be
+guaranteed.
+
+The flow diagram is meant to be as high level as possible. It does not try to
+be complete, but to show the most important interfaces needed between the VNFM
+and the infrastructure admin. For example, error handling is missing and can
+be defined later on.
+
+Flow diagram:
+
+.. figure:: images/maintenance-workflow.png
+ :alt: Work flow in OpenStack
+
+Flow diagram step by step:
+
+- The infrastructure admin creates a maintenance session to maintain and
+  upgrade a certain group of hardware. At least the compute hardware in a
+  single session should have the same capabilities, like the number of VCPUs,
+  to ensure the maintenance can be done node by node in a rolling fashion. The
+  maintenance session needs to have a `session_id`, a unique ID that is
+  carried throughout all events and can be used in the APIs needed when
+  interacting with the session. The maintenance session also needs to know
+  when the maintenance will start and what capabilities the possible
+  infrastructure upgrade will bring to the application payload on top of it.
+  It is a matter of the implementation to define in more detail whether more
+  data is needed when creating a session or whether it is defined in the admin
+  tool configuration.
+
+  There can be several parallel maintenance sessions and a single session can
+  include the payload of multiple projects. Typically a maintenance session
+  should include a similar type of compute hardware, so that moving instances
+  between the compute hosts is guaranteed to work.
+
+- State `MAINTENANCE` `project event` and reply `ACK_MAINTENANCE`. Immediately
+  after a maintenance session is created, the infrastructure admin tool will
+  send a project-specific notification which the application manager can
+  consume by subscribing to the AODH alarm for this event. As explained
+  earlier, `project event`s are only sent if the project subscribes to the
+  alarm; otherwise there is simply no interaction with the application and
+  operations could be forced. A hedged payload sketch is given after this
+  list.
+
+  The state `MAINTENANCE` event should at least include:
+
+  - `session_id` to reference the correct maintenance session.
+  - `state` as `MAINTENANCE` to identify the event action needed.
+  - `instance_ids` to tell the project which of its instances will be affected
+    by the maintenance. This might be a link to an admin tool project-specific
+    API, as AODH variables are limited to strings of 255 characters.
+  - `reply_url` for the application to call the admin tool project-specific
+    API to answer `ACK_MAINTENANCE`, including the `session_id`.
+  - `project_id` to identify the project.
+  - `actions_at` time stamp to indicate when the maintenance workflow will
+    start. The `ACK_MAINTENANCE` reply is needed before that time.
+  - `metadata` to include key-value pairs of capabilities coming with the
+    maintenance operation, like 'openstack_version': 'Queens'
+
+- Optional state `DOWN_SCALE` `project event` and reply `ACK_DOWN_SCALE`. When
+  it is time to start the maintenance workflow, as the time reaches the
+  `actions_at` defined in the previous `state event`, the admin tool needs to
+  check if there is already an empty compute host needed by the
+  `rolling maintenance`. In case there is no empty host, the admin tool can
+  ask the application to scale down by sending a project-specific `DOWN_SCALE`
+  `state event`.
+
+  The state `DOWN_SCALE` event should at least include:
+
+  - `session_id` to reference the correct maintenance session.
+  - `state` as `DOWN_SCALE` to identify the event action needed.
+  - `reply_url` for the application to call the admin tool project-specific
+    API to answer `ACK_DOWN_SCALE`, including the `session_id`.
+  - `project_id` to identify the project.
+  - `actions_at` time stamp to indicate the last moment to send
+    `ACK_DOWN_SCALE`. This means the application has time to finish some
+    ongoing transactions before scaling down its instances. This guarantees
+    zero downtime for its service.
+
+- Optional state `PREPARE_MAINTENANCE` `project event` and reply
+  `ACK_PREPARE_MAINTENANCE`. In case there is still no empty compute host
+  after down scaling the applications, the admin tool needs to analyze the
+  situation on the compute hosts under maintenance. It needs to choose the
+  compute node that is now almost empty or otherwise has the least critical
+  instances running, if possible, for example by looking at whether there are
+  floating IPs. When the compute host is chosen, a `PREPARE_MAINTENANCE`
+  `state event` can be sent to the projects having instances running on this
+  host to migrate them to other compute hosts. It might also be possible to
+  have another round of the `DOWN_SCALE` `state event` if necessary, but this
+  is not proposed here.
+
+  The state `PREPARE_MAINTENANCE` event should at least include:
+
+  - `session_id` to reference the correct maintenance session.
+  - `state` as `PREPARE_MAINTENANCE` to identify the event action needed.
+  - `instance_ids` to tell the project which of its instances will be affected
+    by the `state event`. This might be a link to an admin tool
+    project-specific API, as AODH variables are limited to strings of 255
+    characters.
+  - `reply_url` for the application to call the admin tool project-specific
+    API to answer `ACK_PREPARE_MAINTENANCE`, including the `session_id` and
+    `instance_ids` with a list of key-value pairs, with the key as
+    `instance_id` and the chosen action from the allowed actions given via
+    `allowed_actions` as the value.
+  - `project_id` to identify the project.
+  - `actions_at` time stamp to indicate the last moment to send
+    `ACK_PREPARE_MAINTENANCE`. This means the application has time to finish
+    some ongoing transactions within its instances and make a possible
+    switch over. This guarantees zero downtime for its service.
+  - `allowed_actions` to tell what the admin tool supports as actions to move
+    instances to another compute host. Typically a list like:
+    `['MIGRATE', 'LIVE_MIGRATE']`
+
+- Optional state `INSTANCE_ACTION_DONE` `project event`. In case the admin
+  tool needed to take an action to move an instance, like migrating it to
+  another compute host, this `state event` will be sent to tell that the
+  operation is complete.
+
+  The state `INSTANCE_ACTION_DONE` event should at least include:
+
+  - `session_id` to reference the correct maintenance session.
+  - `instance_ids` to tell the project which of its instances had the admin
+    action done.
+  - `project_id` to identify the project.
+
+- At this point it is guaranteed that there is an empty compute host. It would
+  be maintained first through the `IN_MAINTENANCE` and `MAINTENANCE_COMPLETE`
+  steps, but following the flow chart, `PLANNED_MAINTENANCE` is explained
+  next.
+
+- Optional state `PLANNED_MAINTENANCE` `project event` and reply
+  `ACK_PLANNED_MAINTENANCE`. In case the compute host to be maintained has
+  instances, the projects owning those instances will receive this
+  `state event`. When a project receives this `state event`, it knows that
+  instances moved to another compute host as a resulting action will now go to
+  a host that is already maintained. This means the host might have new
+  capabilities that the project can take into use. This gives the project the
+  possibility to also upgrade its instances to support the new capabilities
+  over the action chosen to move the instances.
+
+  The state `PLANNED_MAINTENANCE` event should at least include:
+
+  - `session_id` to reference the correct maintenance session.
+  - `state` as `PLANNED_MAINTENANCE` to identify the event action needed.
+  - `instance_ids` to tell the project which of its instances will be affected
+    by the event. This might be a link to an admin tool project-specific API,
+    as AODH variables are limited to strings of 255 characters.
+  - `reply_url` for the application to call the admin tool project-specific
+    API to answer `ACK_PLANNED_MAINTENANCE`, including the `session_id` and
+    `instance_ids` with a list of key-value pairs, with the key as
+    `instance_id` and the chosen action from the allowed actions given via
+    `allowed_actions` as the value.
+  - `project_id` to identify the project.
+  - `actions_at` time stamp to indicate the last moment to send
+    `ACK_PLANNED_MAINTENANCE`. This means the application has time to finish
+    some ongoing transactions within its instances and make a possible switch
+    over. This guarantees zero downtime for its service.
+  - `allowed_actions` to tell what the admin tool supports as actions to move
+    instances to another compute host. Typically a list like:
+    `['MIGRATE', 'LIVE_MIGRATE', 'OWN_ACTION']`
+    `OWN_ACTION` means that the application may want to re-instantiate its
+    instance, perhaps to take into use a new capability coming with the
+    infrastructure maintenance. The re-instantiated instance will go to an
+    already maintained host having the new capability.
+  - `metadata` to include key-value pairs of capabilities coming with the
+    maintenance operation, like 'openstack_version': 'Queens'
+
+- State `IN_MAINTENANCE` and `MAINTENANCE_COMPLETE` `admin event`s. Just
+  before a host goes into maintenance, the `IN_MAINTENANCE` `state event` will
+  be sent to indicate the host is entering maintenance. The host is then taken
+  out of production and can be powered off, replaced, or rebooted during the
+  operation. During the maintenance and upgrade the host might be moved to the
+  admin's own host aggregate, so it can be tested to work before being put
+  back to production. After the maintenance is complete, the
+  `MAINTENANCE_COMPLETE` `state event` will be sent to indicate the host is
+  back in use. Adding or removing a host is not yet included in this concept,
+  but can be addressed later.
+
+  The state `IN_MAINTENANCE` and `MAINTENANCE_COMPLETE` events should at least
+  include:
+
+  - `session_id` to reference the correct maintenance session.
+  - `state` as `IN_MAINTENANCE` or `MAINTENANCE_COMPLETE` to indicate the host
+    state.
+  - `project_id` to identify the admin project needed by the AODH alarm.
+  - `host` to indicate the host name.
+
+- State `MAINTENANCE_COMPLETE` `project event` and reply
+  `MAINTENANCE_COMPLETE_ACK`. After all compute nodes in the maintenance
+  session have gone through the maintenance operation, this `state event` can
+  be sent to all projects that had instances running on any of those nodes. If
+  a down scale was done, the application could now scale up back to full
+  operation.
+
+  - `session_id` to reference the correct maintenance session.
+  - `state` as `MAINTENANCE_COMPLETE` to identify the event action needed.
+  - `instance_ids` to tell the project which of its instances are currently
+    running on hosts maintained in this maintenance session. This might be a
+    link to an admin tool project-specific API, as AODH variables are limited
+    to strings of 255 characters.
+  - `reply_url` for the application to call the admin tool project-specific
+    API to answer `ACK_MAINTENANCE`, including the `session_id`.
+  - `project_id` to identify the project.
+  - `actions_at` time stamp to indicate when the maintenance workflow will
+    start.
+  - `metadata` to include key-value pairs of capabilities coming with the
+    maintenance operation, like 'openstack_version': 'Queens'
+
+- At the end, the admin tool maintenance session can enter the
+  `MAINTENANCE_COMPLETE` state and the session can be removed.
+
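+To make the messaging above more concrete, the following is a hedged Python
+sketch of a `MAINTENANCE` `project event` payload and of the corresponding
+`ACK_MAINTENANCE` reply posted back to `reply_url`. The field names come from
+the lists above, while the admin tool URL layout, the example IDs and the HTTP
+method of the reply are illustrative assumptions rather than part of this
+guideline.
+
+.. code-block:: python
+
+    # Hedged sketch of a MAINTENANCE project event payload and its reply.
+    # All concrete values and the admin tool URL layout are examples only.
+    import requests
+
+    session_id = 'b86bfb3c-2f0e-4e2a-8a7d-111111111111'   # example only
+    project_id = '0d1cf8e6bbf0403ab0e1a8a1ae8236a8'       # example only
+    base_url = 'http://admin-tool.example:12347/v1/maintenance'  # assumed API
+    reply_url = '%s/%s/%s' % (base_url, session_id, project_id)
+
+    maintenance_event = {
+        'event_type': 'maintenance.scheduled',  # from the sample configuration
+        'payload': {
+            'session_id': session_id,
+            'state': 'MAINTENANCE',
+            # a link instead of a full list, as AODH traits are limited to
+            # strings of 255 characters
+            'instance_ids': reply_url,
+            'reply_url': reply_url,
+            'project_id': project_id,
+            'actions_at': '2018-11-22T08:00:00',
+            'metadata': {'openstack_version': 'Queens'},
+        },
+    }
+
+    # The application manager agrees to the maintenance before 'actions_at'.
+    ack = {'session_id': session_id, 'state': 'ACK_MAINTENANCE'}
+    requests.put(reply_url, json=ack, timeout=10)
+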
+Benefits
+========
+
+- The application is guaranteed zero downtime as it is aware of the
+  maintenance action affecting its payload. The application is made aware of
+  the maintenance time window to make sure it can prepare for it.
+- The application gets to know the new capabilities coming with infrastructure
+  maintenance and upgrade and can utilize those (for example, do its own
+  upgrade).
+- Any application supporting the interaction being defined could be running on
+  top of the same infrastructure provider. No vendor lock-in for the
+  application.
+- Any infrastructure component can be aware of the host(s) under maintenance
+  via `admin event`s about host state. No vendor lock-in for infrastructure
+  components.
+- Generic messaging makes it possible to use the same concept in different
+  types of clouds and application payloads. `instance_ids` will uniquely
+  identify any type of instance and a similar notification payload can be used
+  regardless of whether we are in OpenStack. The workflow just needs to
+  support the different cloud infrastructure management to support a different
+  cloud.
+- No additional hardware is needed during maintenance operations, as down- and
+  up-scaling can be supported for the applications. This is optional, for the
+  case where no extensive spare capacity is available for the maintenance, as
+  is typically the case in Telco environments.
+- Parallel maintenance sessions for different groups of hardware. The same
+  session should include hardware with the same capabilities to guarantee
+  `rolling maintenance` actions.
+- Multi-tenancy support. Project-specific messaging about maintenance.
+
+Future considerations
+=====================
+
+- Pluggable architecture for the infrastructure admin tool to handle different
+  clouds and payloads.
+- Pluggable architecture to handle specific maintenance/upgrade cases like an
+  OpenStack upgrade between specific versions or admin testing before giving
+  the host back to production.
+- Support for user-specific details that need to be taken into account in
+  admin-side actions (e.g. running a script, ...).
+- (Re-)use existing implementations like Mistral for workflows.
+- Scaling hardware resources. Allow a critical application to be scaled at the
+  same time in a controlled fashion, or retire the application.
POC
---
-There was a `Maintenance POC`_ for planned maintenance in the OPNFV Beijing
-summit to show the basic concept of using framework defined by the project.
+There was a `Maintenance POC`_ demo, 'How to gain VNF zero down-time during
+Infrastructure Maintenance and Upgrade', at the OCP and ONS summit in March
+2018. A similar concept is also being implemented as a new test case scenario
+in the `OPNFV Doctor project`_.
-.. _DOCTOR-52: https://jira.opnfv.org/browse/DOCTOR-52
.. _OPNFV Doctor project: https://wiki.opnfv.org/doctor
.. _use cases: http://artifacts.opnfv.org/doctor/docs/requirements/02-use_cases.html#nvfi-maintenance
.. _architecture: http://artifacts.opnfv.org/doctor/docs/requirements/03-architecture.html#nfvi-maintenance
.. _implementation: http://artifacts.opnfv.org/doctor/docs/requirements/05-implementation.html#nfvi-maintenance
-.. _planned maintenance session: https://lists.opnfv.org/pipermail/opnfv-tech-discuss/2017-June/016677.html
-.. _Maintenance POC: https://wiki.opnfv.org/download/attachments/5046291/Doctor%20Maintenance%20PoC%202017.pptx?version=1&modificationDate=1498182869000&api=v2
+.. _Maintenance POC: https://youtu.be/7q496Tutzlo
diff --git a/docs/development/index.rst b/docs/development/index.rst
index 2dc16a82..a7d2817b 100644
--- a/docs/development/index.rst
+++ b/docs/development/index.rst
@@ -2,18 +2,18 @@
.. http://creativecommons.org/licenses/by/4.0
.. (c) 2016 OPNFV.
+.. _development:
-======
-Doctor
-======
+===========
+Development
+===========
.. toctree::
:maxdepth: 2
- ./design/index.rst
- ./requirements/index.rst
- ./manuals/index.rst
- ./overview/functest_scenario/index.rst
+ ./design/index
+ ./overview/index
+ ./requirements/index
Indices
=======
diff --git a/docs/development/overview/functest_scenario/images/figure-p1.png b/docs/development/overview/functest_scenario/images/figure-p1.png
deleted file mode 100755
index e963d8bd..00000000
--- a/docs/development/overview/functest_scenario/images/figure-p1.png
+++ /dev/null
Binary files differ
diff --git a/docs/development/overview/index.rst b/docs/development/overview/index.rst
index 956e73e3..f6d78d57 100644
--- a/docs/development/overview/index.rst
+++ b/docs/development/overview/index.rst
@@ -3,11 +3,12 @@
.. _doctor-overview:
-************************
-Doctor Development Guide
-************************
+********
+Overview
+********
.. toctree::
:maxdepth: 2
+ overview.rst
testing.rst
diff --git a/docs/development/overview/overview.rst b/docs/development/overview/overview.rst
new file mode 100644
index 00000000..21f5439e
--- /dev/null
+++ b/docs/development/overview/overview.rst
@@ -0,0 +1,52 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+Platform overview
+"""""""""""""""""
+
+The Doctor platform provides these features since the `Danube Release <https://wiki.opnfv.org/display/SWREL/Danube>`_:
+
+* Immediate Notification
+* Consistent resource state awareness for compute host down
+* Valid compute host status given to VM owner
+
+These features enable high availability of Network Services on top of
+the virtualized infrastructure. Immediate notification allows VNF managers
+(VNFM) to process recovery actions promptly once a failure has occurred.
+The same framework can also be utilized to make the VNFM aware of
+infrastructure maintenance.
+
+Consistency of resource state is necessary to execute recovery actions
+properly in the VIM.
+
+The ability to query host status gives the VM owner the possibility to get
+consistent state information through an API in case of a compute host
+fault.
+
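+As a hedged illustration only, the following sketch reads the ``host_status``
+attribute of a server with python-novaclient. It assumes compute API
+microversion 2.16 or later and a policy that allows the caller to see
+``host_status``; the credentials and the server UUID are placeholders.
+
+.. code-block:: python
+
+    # Hedged sketch: query the valid server state (host_status) via Nova.
+    # Assumes microversion >= 2.16 and a policy exposing host_status.
+    from keystoneauth1 import loading, session
+    from novaclient import client as novaclient
+
+    loader = loading.get_plugin_loader('password')
+    auth = loader.load_from_options(
+        auth_url='http://keystone.example:5000/v3', username='demo',
+        password='secret', project_name='demo',
+        user_domain_id='default', project_domain_id='default')
+    nova = novaclient.Client('2.16', session=session.Session(auth=auth))
+
+    server = nova.servers.get('11111111-2222-3333-4444-555555555555')
+    # 'UP', 'DOWN', 'MAINTENANCE' or 'UNKNOWN' depending on the host state
+    print(getattr(server, 'host_status', 'hidden by policy'))
+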
+The Doctor platform consists of the following components:
+
+* OpenStack Compute (Nova)
+* OpenStack Networking (Neutron)
+* OpenStack Telemetry (Ceilometer)
+* OpenStack Alarming (AODH)
+* Doctor Sample Inspector, OpenStack Congress or OpenStack Vitrage
+* Doctor Sample Monitor or any monitor supported by Congress or Vitrage
+
+.. note::
+   The Doctor Sample Monitor is used in Doctor testing. However, in a real
+   implementation like Vitrage, several other monitors are supported.
+
+You can see an overview of the Doctor platform and how components interact in
+:numref:`figure-p1`.
+
+
+The maintenance use case provides these features since the `Iruya Release <https://wiki.opnfv.org/display/SWREL/Iruya>`_:
+
+* Infrastructure maintenance and upgrade workflow
+* Interaction between the VNFM and the infrastructure workflow
+
+Since the `Jerma Release <https://wiki.opnfv.org/display/SWREL/Jerma>`_, the
+maintenance use case also supports the 'ETSI FEAT03' implementation to have
+infrastructure maintenance and upgrade fully optimized while keeping zero
+impact on the VNF service.
+
diff --git a/docs/development/requirements/index.rst b/docs/development/requirements/index.rst
index fceaebf0..ccc35cb8 100644
--- a/docs/development/requirements/index.rst
+++ b/docs/development/requirements/index.rst
@@ -3,9 +3,9 @@
.. _doctor-requirements:
-****************************************
-Doctor: Fault Management and Maintenance
-****************************************
+**********************************************
+Requirements: Fault Management and Maintenance
+**********************************************
:Project: Doctor, https://wiki.opnfv.org/doctor
:Editors: Ashiq Khan (NTT DOCOMO), Gerald Kunzmann (NTT DOCOMO)
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 00000000..b8e8bfd0
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,17 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. SPDX-License-Identifier: CC-BY-4.0
+.. (c) Open Platform for NFV Project, Inc. and its contributors
+
+.. _doctor:
+
+=========================================
+Fault Management and Maintenance (Doctor)
+=========================================
+
+.. toctree::
+ :numbered:
+ :maxdepth: 2
+
+ development/index
+ release/index
+ testing/index
diff --git a/docs/release/configguide/feature.configuration.rst b/docs/release/configguide/feature.configuration.rst
index 64928eea..8fbff50e 100644
--- a/docs/release/configguide/feature.configuration.rst
+++ b/docs/release/configguide/feature.configuration.rst
@@ -159,3 +159,57 @@ You can configure the Sample Monitor as follows (Example for Apex deployment):
"http://127.0.0.1:$INSPECTOR_PORT/events" > monitor.log 2>&1 &
**Collectd Monitor**
+
+OpenStack components
+====================
+
+In OPNFV and with Doctor testing you can have all OpenStack components
+configured as needed. Here is a sample of the needed configuration
+modifications.
+
+Ceilometer
+----------
+
+/etc/ceilometer/event_definitions.yaml:
+# Maintenance use case needs new alarm definitions to be added
+- event_type: maintenance.scheduled
+ traits:
+ actions_at:
+ fields: payload.maintenance_at
+ type: datetime
+ allowed_actions:
+ fields: payload.allowed_actions
+ host_id:
+ fields: payload.host_id
+ instances:
+ fields: payload.instances
+ metadata:
+ fields: payload.metadata
+ project_id:
+ fields: payload.project_id
+ reply_url:
+ fields: payload.reply_url
+ session_id:
+ fields: payload.session_id
+ state:
+ fields: payload.state
+- event_type: maintenance.host
+ traits:
+ host:
+ fields: payload.host
+ project_id:
+ fields: payload.project_id
+ session_id:
+ fields: payload.session_id
+ state:
+ fields: payload.state
+
+/etc/ceilometer/event_pipeline.yaml:
+# Maintenance and Fault management both need these to be added
+ - notifier://
+ - notifier://?topic=alarm.all
+
+Nova
+----
+
+/etc/nova/nova.conf
+cpu_allocation_ratio=1.0
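+
+As a hedged illustration (not part of the required configuration), a consumer
+such as the application manager only needs a small HTTP endpoint to receive
+the notifications that AODH posts to the configured ``alarm_actions`` URL. The
+exact notification body depends on the AODH HTTP notifier and is treated here
+as an assumption; the port number is just an example.
+
+.. code-block:: python
+
+    # Hedged sketch of a minimal consumer endpoint for AODH alarm notifications.
+    import json
+    from http.server import BaseHTTPRequestHandler, HTTPServer
+
+    class AlarmHandler(BaseHTTPRequestHandler):
+        def do_POST(self):
+            length = int(self.headers.get('Content-Length', 0))
+            notification = json.loads(self.rfile.read(length) or b'{}')
+            # Log the alarm; a real VNFM would parse the maintenance payload,
+            # e.g. from the 'reason_data' of an event alarm (assumed field).
+            print('alarm received:', json.dumps(notification, indent=2))
+            self.send_response(200)
+            self.end_headers()
+
+    if __name__ == '__main__':
+        HTTPServer(('0.0.0.0', 12346), AlarmHandler).serve_forever()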
diff --git a/docs/release/configguide/index.rst b/docs/release/configguide/index.rst
index b1e7c33d..c2331115 100644
--- a/docs/release/configguide/index.rst
+++ b/docs/release/configguide/index.rst
@@ -3,9 +3,9 @@
.. _doctor-configguide:
-*************************
-Doctor Installation Guide
-*************************
+**************************
+Doctor Configuration Guide
+**************************
.. toctree::
:maxdepth: 2
diff --git a/docs/release/index.rst b/docs/release/index.rst
index 8a1bf405..67eb4c5f 100644
--- a/docs/release/index.rst
+++ b/docs/release/index.rst
@@ -2,14 +2,18 @@
.. http://creativecommons.org/licenses/by/4.0
.. (c) 2017 OPNFV.
+.. _release:
-======
-Doctor
-======
+=======
+Release
+=======
.. toctree::
:maxdepth: 2
+ ./configguide/index.rst
./installation/index.rst
+ ./release-notes/index.rst
+ ./scenarios/fault_management/fault_management.rst
+ ./scenarios/maintenance/maintenance.rst
./userguide/index.rst
-
diff --git a/docs/development/manuals/index.rst b/docs/release/installation/index.rst
index f705f94a..f6527e5d 100644
--- a/docs/development/manuals/index.rst
+++ b/docs/release/installation/index.rst
@@ -1,13 +1,13 @@
.. This work is licensed under a Creative Commons Attribution 4.0 International License.
.. http://creativecommons.org/licenses/by/4.0
-.. _doctor-manuals:
+.. _doctor-installation:
-*******
-Manuals
-*******
+*************************
+Doctor Installation Guide
+*************************
.. toctree::
+ :maxdepth: 2
-.. include:: mark-host-down_manual.rst
-.. include:: get-valid-server-state.rst
+ installation.rst
diff --git a/docs/release/installation/installation.rst b/docs/release/installation/installation.rst
new file mode 100644
index 00000000..564f19fd
--- /dev/null
+++ b/docs/release/installation/installation.rst
@@ -0,0 +1,44 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+Doctor Installation
+====================
+
+You can clone the doctor project on the OPNFV installer jumphost or, if you
+are not in an OPNFV environment, you can clone Doctor to the DevStack
+controller node::
+
+    git clone https://gerrit.opnfv.org/gerrit/doctor
+
+On the DevStack controller, here is a sample of what Doctor testing will
+require for sample fault management testing and for maintenance testing
+using Fenix:
+
+.. code-block:: bash
+
+ git clone https://github.com/openstack/devstack -b stable/train
+
+.. code-block:: bash
+
+    cd devstack
+    vi local.conf
+
+.. code-block:: bash
+
+ [[local|localrc]]
+ GIT_BASE=https://git.openstack.org
+ HOST_IP=<host_ip>
+ ADMIN_PASSWORD=admin
+ DATABASE_PASSWORD=admin
+ RABBIT_PASSWORD=admin
+ SERVICE_PASSWORD=admin
+ LOGFILE=/opt/stack/stack.sh.log
+
+ PUBLIC_INTERFACE=eth0
+
+ CEILOMETER_EVENT_ALARM=True
+
+ ENABLED_SERVICES=key,rabbit,mysql,fenix-engine,fenix-api,aodh-evaluator,aodh-notifier,aodh-api
+
+ enable_plugin ceilometer https://git.openstack.org/openstack/ceilometer stable/train
+ enable_plugin aodh https://git.openstack.org/openstack/aodh stable/train
+ enable_plugin gnocchi https://github.com/openstack/gnocchi
+ enable_plugin fenix https://opendev.org/x/fenix master
diff --git a/docs/release/release-notes/index.rst b/docs/release/release-notes/index.rst
index 2e6d46e1..a0e30501 100644
--- a/docs/release/release-notes/index.rst
+++ b/docs/release/release-notes/index.rst
@@ -10,4 +10,4 @@ Doctor Release Notes
.. toctree::
:maxdepth: 2
- releasenotes.rst
+ release-notes.rst
diff --git a/docs/release/release-notes/release-notes.rst b/docs/release/release-notes/release-notes.rst
new file mode 100644
index 00000000..b525335e
--- /dev/null
+++ b/docs/release/release-notes/release-notes.rst
@@ -0,0 +1,146 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+
+This document provides the release notes for the Jerma release of Doctor.
+
+Important notes
+===============
+
+The Jerma release has mainly been about finalizing maintenance use case
+testing supporting the ETSI FEAT03 defined interaction between VNFM and
+infrastructure. This is mainly to have infrastructure maintenance and upgrade
+operations optimized to be as fast as they can be while keeping VNFs on top
+with zero impact on their service.
+
+Furthermore, this is the final release of Doctor and the deeper testing is
+moving to upstream projects like Fenix for the maintenance. Also in this
+release we have made sure that all Doctor testing and any deeper testing with
+the upstream projects can be done in DevStack. This also makes DevStack the
+most important installer.
+
+Summary
+=======
+
+The Jerma Doctor framework uses OpenStack Train integrated into its test cases.
+
+Release Data
+============
+
+Doctor changes
+
+- Maintenance use case updated to support the latest version of Fenix.
+- Maintenance use case now supports the ETSI FEAT03 optimization with Fenix.
+- Doctor testing is now preferably done in a DevStack environment, where one
+  can easily select an OpenStack release from Rocky to Ussuri to test Doctor
+  functionality. The latest OPNFV Fuel can also be used for the OpenStack
+  version it supports.
+
+Doctor CI
+
+- Doctor is tested with the Fuel installer.
+- The fault management use case is tested with the sample inspector.
+- The maintenance use case is tested with the sample implementation and
+  against the latest Fenix version. This includes the new ETSI FEAT03
+  optimization.
+
+Version change
+^^^^^^^^^^^^^^
+
+Module version changes
+~~~~~~~~~~~~~~~~~~~~~~
+
+- OpenStack has been changed to Train
+
+Document version changes
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+All documentation is updated to OPNFV unified format according to
+documentation guidelines. Small updates in many documents.
+
+Reason for version
+^^^^^^^^^^^^^^^^^^
+
+N/A
+
+Feature additions
+~~~~~~~~~~~~~~~~~
+
++--------------------+--------------------------------------------+
+| **JIRA REFERENCE** | **SLOGAN** |
++--------------------+--------------------------------------------+
+| DOCTOR-137 | VNFM maintenance with ETSI changes |
++--------------------+--------------------------------------------+
+| DOCTOR-136 | DevStack support |
++--------------------+--------------------------------------------+
+
+
+Deliverables
+------------
+
+Software deliverables
+=====================
+
+None
+
+Documentation deliverables
+==========================
+
+https://git.opnfv.org/doctor/tree/docs
+
+Known Limitations, Issues and Workarounds
+=========================================
+
+System Limitations
+^^^^^^^^^^^^^^^^^^
+
+Maintenance test case requirements:
+
+- Minimum number of nodes: 1 Controller, 3 Computes
+- Minimum number of VCPUs: 2 VCPUs for each compute
+
+Known issues
+^^^^^^^^^^^^
+
+None
+
+Workarounds
+^^^^^^^^^^^
+
+None
+
+Test Result
+===========
+
+Doctor CI results with TEST_CASE='fault_management' and INSPECTOR_TYPE=sample
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++--------------------------------------+--------------+
+| **TEST-SUITE** | **Results:** |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='fuel' | SUCCESS |
++--------------------------------------+--------------+
+
+Doctor CI results with TEST_CASE='maintenance' and INSPECTOR_TYPE=sample
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++--------------------------------------+--------------+
+| **TEST-SUITE** | **Results:** |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='fuel' | SUCCESS |
+| ADMIN_TOOL_TYPE='fenix' *) | |
++--------------------------------------+--------------+
+
+*) The sample implementation is not updated to the latest upstream Fenix
+   and is currently not being tested.
+
+References
+==========
+
+For more information about the latest OPNFV Doctor work, please see:
+
+https://wiki.opnfv.org/display/doctor/Doctor+Home
+
+Further information about the ETSI FEAT03 optimization can be found in the
+Fenix documentation:
+
+https://fenix.readthedocs.io/en/latest
diff --git a/docs/release/release-notes/releasenotes.rst b/docs/release/release-notes/releasenotes_fraser.rst
index 67653415..f1cf9d7e 100644
--- a/docs/release/release-notes/releasenotes.rst
+++ b/docs/release/release-notes/releasenotes_fraser.rst
@@ -14,6 +14,7 @@ Version history
+------------+----------+--------------+-------------+
| **Date** | **Ver.** | **Author** | **Comment** |
+============+==========+==============+=============+
+| 2018-06-25 | 6.2.0 | Tomi Juvonen | |
| 2018-05-25 | 6.1.0 | Tomi Juvonen | |
| 2018-04-23 | 6.0.0 | Tomi Juvonen | |
+------------+----------+--------------+-------------+
diff --git a/docs/release/release-notes/releasenotes_gambia.rst b/docs/release/release-notes/releasenotes_gambia.rst
new file mode 100644
index 00000000..142bfacf
--- /dev/null
+++ b/docs/release/release-notes/releasenotes_gambia.rst
@@ -0,0 +1,303 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+
+This document provides the release notes for the Gambia release of Doctor.
+
+Important notes
+===============
+
+In the Gambia release, Doctor has been working on our second use case,
+maintenance. The design guideline is now done and a test case exists with
+sample maintenance workflow code implemented in Doctor. Work has also started
+to have the real implementation done in the OpenStack Fenix project
+https://wiki.openstack.org/wiki/Fenix.
+
+Doctor CI testing has now moved to use tox on the jumphost instead of running
+tests through the features container. Also, in Apex we use OpenStack services
+running in containers. Functest daily testing supports the Doctor fault
+management test case for the Apex, Daisy and Fuel installers. This testing is
+done through the features container.
+
+In this release, Doctor has not been working on the fault management use case,
+as the basic framework has already been done. However, we might need to get
+back to it later to better meet the tough industry requirements as well as
+requirements from edge, containers and 5G.
+
+
+Summary
+=======
+
+The Gambia Doctor framework uses OpenStack Queens integrated into its test
+cases. Compared to the previous release, the Heat project is also being used
+in the maintenance test case.
+
+Release Data
+============
+
+Doctor changes
+
++------------------------------------------+----------------------------------------------------------+
+| **commit-ID** | **Subject** |
++------------------------------------------+----------------------------------------------------------+
+| 5b3f5937e7b861fca46b2a6b2d6708866b800f95 | fix building docs |
++------------------------------------------+----------------------------------------------------------+
+| 2ca5924081ce4784f599437707bd32807aa155ce | Fix SSH client connection reset |
++------------------------------------------+----------------------------------------------------------+
+| baac6579556f8216b36db0d0f87f9c2d4f8b4ef5 | Support Apex with services in containers |
++------------------------------------------+----------------------------------------------------------+
+| 23bf63c4616040cb0d69cd26238af2a4a7c00a90 | fix the username to login undercloud in Apex |
++------------------------------------------+----------------------------------------------------------+
+| 61eb3927ada784cc3dffb5ddd17f66e47871f708 | Local Documentation Builds |
++------------------------------------------+----------------------------------------------------------+
+| 0f1dd4314b9e0247d9af7af6df2410462423aeca | Updated from global requirements |
++------------------------------------------+----------------------------------------------------------+
+| 2d4a9f0c0a93797da6534583f6e74553a4b634be | Fix links to remove references to submodules |
++------------------------------------------+----------------------------------------------------------+
+| 3ddc2392b0ed364eede49ff006d64df3ea456350 | Gambia release notes |
++------------------------------------------+----------------------------------------------------------+
+| 825a0a0dd5e8028129b782ed21c549586257b1c5 | delete doctor datasource in congress when cleanup |
++------------------------------------------+----------------------------------------------------------+
+| fcf53129ab2b18b84571faff13d7cb118b3a41b3 | run profile even the notification time is larger than 1S |
++------------------------------------------+----------------------------------------------------------+
+| 495965d0336d42fc36494c81fd15cee2f34c96e9 | Update and add test case |
++------------------------------------------+----------------------------------------------------------+
+| da25598a6a31abe0579ffed12d1719e5ff75f9a7 | bugfix: add doctor datasource in congress |
++------------------------------------------+----------------------------------------------------------+
+| f9e1e3b1ae4be80bc2dc61d9c4213c81c091ea72 | Update the maintenance design document |
++------------------------------------------+----------------------------------------------------------+
+| 4639f15e6db2f1480b41f6fbfd11d70312d4e421 | Add maintenance test code |
++------------------------------------------+----------------------------------------------------------+
+| b54cbc5dd2d32fcb27238680b4657ed384d021c5 | Add setup and cleanup for maintenance test |
++------------------------------------------+----------------------------------------------------------+
+| b2bb504032ac81a2ed3f404113b097d9ce3d7f14 | bugfix: kill the stunnel when cleanup |
++------------------------------------------+----------------------------------------------------------+
+| eaeb3c0f9dc9e6645a159d0a78b9fc181fce53d4 | add ssh_keyfile for connect to installer in Apex |
++------------------------------------------+----------------------------------------------------------+
+| dcbe7bf1c26052b0e95d209254e7273aa1eaace1 | Add tox and test case to testing document |
++------------------------------------------+----------------------------------------------------------+
+| 0f607cb5efd91ee497346b7f792dfa844d15595c | enlarge the time of link down |
++------------------------------------------+----------------------------------------------------------+
+| 1351038a65739b8d799820de515178326ad05f7b | bugfix: fix the filename of ssh tunnel |
++------------------------------------------+----------------------------------------------------------+
+| e70bf248daac03eee6b449cd1654d2ee6265dd8c | Use py34 instead of py35 |
++------------------------------------------+----------------------------------------------------------+
+| 2a60d460eaf018951456451077b7118b60219b32 | add INSPECTOR_TYPE and TEST_CASE to tox env |
++------------------------------------------+----------------------------------------------------------+
+| 2043ceeb08c1eca849daeb2b3696d385425ba061 | [consumer] fix default value for port number |
++------------------------------------------+----------------------------------------------------------+
+
+Releng changes
+
++------------------------------------------+-----------------------------------------------------------------------+
+| **commit-ID** | **Subject** |
++------------------------------------------+-----------------------------------------------------------------------+
+| c87309f5a75ccc5d595f708817b97793c24c4387 | Add Doctor maintenance job |
++------------------------------------------+-----------------------------------------------------------------------+
+| bd16a9756ffd0743e143f0f2f966da8dd666c7a3 | remove congress test in Daisy |
++------------------------------------------+-----------------------------------------------------------------------+
+| c47aaaa53c91aae93877f2532c72374beaa4eabe | remove fuel job in Doctor |
++------------------------------------------+-----------------------------------------------------------------------+
+| ab2fed2522eaf82ea7c63dd05008a37c56e825d0 | use 'workspace-cleanup' plugin in publisher |
++------------------------------------------+-----------------------------------------------------------------------+
+| 3aaed5cf40092744f1b87680b9205a2901baecf3 | clean the workspace in the publisher |
++------------------------------------------+-----------------------------------------------------------------------+
+| 50151eb3717edd4ddd996f3705fbe1732de7f3b7 | run tox with 'sudo' |
++------------------------------------------+-----------------------------------------------------------------------+
+| a3adc85ecb52f5d19ec4e9c49ca1ac35aa429ff9 | remove inspector variable form job template |
++------------------------------------------+-----------------------------------------------------------------------+
+| adfbaf2a3e8487e4c9152bf864a653a0425b8582 | run doctor tests with different inspectors in sequence |
++------------------------------------------+-----------------------------------------------------------------------+
+| 2e98e56224cd550cb3bf9798e420eece28139bd9 | add the ssh_key info if the key_file is exist |
++------------------------------------------+-----------------------------------------------------------------------+
+| c109c271018e9a85d94be1b9b468338d64589684 | prepare installer info for doctor test |
++------------------------------------------+-----------------------------------------------------------------------+
+| 57cbefc7160958eae1d49e4753779180a25864af | use py34 for tox |
++------------------------------------------+-----------------------------------------------------------------------+
+| 3547754e808a581b09c9d22e013a7d986d9f6cd1 | specify the cacert file when it exits |
++------------------------------------------+-----------------------------------------------------------------------+
+| ef4f36aa1c2ff0819d73cde44f84b99a42e15c7e | bugfix: wrong usage of '!include-raw' |
++------------------------------------------+-----------------------------------------------------------------------+
+| 0e0e0d4cb71fb27b1789a2bef2d3c4ff313e67ff | use tox instead of functest for doctor CI jobs |
++------------------------------------------+-----------------------------------------------------------------------+
+| 5b22f1b95feacaec0380f6a7543cbf510b628451 | pass value to parameters |
++------------------------------------------+-----------------------------------------------------------------------+
+| 44ab0cea07fa2a734c4f6b80776ad48fd006d1b8 | Doctor job bugfix: fix the scenario |
++------------------------------------------+-----------------------------------------------------------------------+
+| 17617f1c0a78c7bdad0d11d329a6c7e119cbbddd | bugfix: run doctor tests parallelly |
++------------------------------------------+-----------------------------------------------------------------------+
+| 811e4ef7f4c37b7bc246afc34ff880c014ecc05d | delete 'opnfv-build-ubuntu-defaults' parameters for doctor verify job |
++------------------------------------------+-----------------------------------------------------------------------+
+| 0705f31ab5bc54c073df120cbe0fe62cf10f9a81 | delete the 'node' parameter in 'doctor-slave-parameter' macro |
++------------------------------------------+-----------------------------------------------------------------------+
+| 304151b15f9d7241db8c5fea067cafe048287d84 | fix the default node label for doctor test |
++------------------------------------------+-----------------------------------------------------------------------+
+| a6963f92f015a33b44b27199886952205499b44c | Fix project name |
++------------------------------------------+-----------------------------------------------------------------------+
+| f122bfed998b3b0e0178106a7538377c609c6512 | add a default value for SSH_KEY |
++------------------------------------------+-----------------------------------------------------------------------+
+
+Version change
+^^^^^^^^^^^^^^
+
+Module version changes
+~~~~~~~~~~~~~~~~~~~~~~
+
+- OpenStack has changed from Pike-1 to Queens-1
+
+Document version changes
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+These documents have been updated in the Gambia release:
+
+- Testing document
+ docs/development/overview/testing.rst
+- Doctor scenario in functest
+ docs/development/overview/functest_scenario/doctor-scenario-in-functest.rst
+- Maintenance design guideline
+ docs/development/design/maintenance-design-guideline.rst
+
+Reason for version
+^^^^^^^^^^^^^^^^^^
+
+Documentation is updated due to the tox usage in testing and the addition of
+maintenance use case related documentation.
+
+Feature additions
+~~~~~~~~~~~~~~~~~
+
++--------------------+--------------------------------------------------------+
+| **JIRA REFERENCE** | **SLOGAN** |
++--------------------+--------------------------------------------------------+
+| DOCTOR-106 | Maintenance scenario |
++--------------------+--------------------------------------------------------+
+| DOCTOR-125 | Maintenance design document according to our test case |
++--------------------+--------------------------------------------------------+
+| DOCTOR-126 | Use Tox instead of Functest for doctor CI jobs |
++--------------------+--------------------------------------------------------+
+| DOCTOR-127 | Maintenance test POD |
++--------------------+--------------------------------------------------------+
+| DOCTOR-130 | Apex with containers |
++--------------------+--------------------------------------------------------+
+
+
+
+Deliverables
+------------
+
+
+Software deliverables
+=====================
+
+None
+
+Documentation deliverables
+==========================
+
+https://git.opnfv.org/doctor/tree/docs
+
+Known Limitations, Issues and Workarounds
+=========================================
+
+System Limitations
+^^^^^^^^^^^^^^^^^^
+
+Maintenance test case requirements:
+
+- Minimum number of nodes: 1 Controller, 3 Computes
+- Minimum number of VCPUs: 2 VCPUs for each compute
+
+Known issues
+^^^^^^^^^^^^
+
+None
+
+Workarounds
+^^^^^^^^^^^
+
+None
+
+Test Result
+===========
+
+Doctor CI results with TEST_CASE='fault_management' and INSPECTOR_TYPE=sample
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++--------------------------------------+--------------+
+| **TEST-SUITE** | **Results:** |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Apex' | SUCCESS |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Compass' | N/A |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Daisy' | SUCCESS |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Fuel' | No POD |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Joid' | N/A |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Local' | N/A |
++--------------------------------------+--------------+
+
+Doctor CI results with TEST_CASE='fault_management' and INSPECTOR_TYPE=congress
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++--------------------------------------+--------------+
+| **TEST-SUITE** | **Results:** |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Apex' | FAILED |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Compass' | N/A |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Daisy' | N/A |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Fuel' | No POD |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Joid' | N/A |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Local' | N/A |
++--------------------------------------+--------------+
+
+
+Doctor Functest results with TEST_CASE='fault_management'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++--------------------------------------+--------------+
+| **TEST-SUITE** | **Results:** |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Apex' | skipped |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Compass' | N/A |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Daisy' | skipped |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Fuel' | skipped |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Joid' | N/A |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Local' | N/A |
++--------------------------------------+--------------+
+
+Note: Functest for these installers currently either does not test features or
+skips running the project test cases.
+
+Doctor CI results with TEST_CASE='maintenance'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++--------------------------------------+--------------+
+| **TEST-SUITE** | **Results:** |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Apex' | SUCCESS |
++--------------------------------------+--------------+
+
+Doctor Functest results with TEST_CASE='maintenance'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+N/A - Needs a special target and currently there is only a sample implementation
+
+References
+==========
+
+For more information about the OPNFV Doctor latest work, please see:
+
+https://wiki.opnfv.org/display/doctor/Doctor+Home
diff --git a/docs/release/release-notes/releasenotes_iruya.rst b/docs/release/release-notes/releasenotes_iruya.rst
new file mode 100644
index 00000000..92775557
--- /dev/null
+++ b/docs/release/release-notes/releasenotes_iruya.rst
@@ -0,0 +1,129 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+
+This document provides the release notes for the Iruya version of Doctor.
+
+Important notes
+===============
+
+In the Iruya release there have not been many changes.
+
+All testing is now done with the Fuel installer. The maintenance use case is now
+only tested against the latest upstream Fenix. Only the sample inspector is
+tested, as Fuel does not support Vitrage or Congress.
+
+Summary
+=======
+
+The Iruya Doctor framework uses OpenStack Stein integrated into its test cases.
+
+Release Data
+============
+
+Doctor changes
+
+- Maintenance use case updated to support the latest version of Fenix running
+  in a container on the controller node
+- Maintenance use case now supports the Fuel installer
+- Doctor updated to use OpenStack Stein and Python 3.6 only
+- Testing only the sample inspector due to lacking installer support for
+  Vitrage and Congress
+
+Releng changes
+
+- Doctor testing runs with Python 3.6 and with the sample inspector
+- Doctor is only tested with the Fuel installer
+
+Version change
+^^^^^^^^^^^^^^
+
+Module version changes
+~~~~~~~~~~~~~~~~~~~~~~
+
+- OpenStack has changed from Rocky to Stein since the previous Hunter release.
+
+Document version changes
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+N/A
+
+Reason for version
+^^^^^^^^^^^^^^^^^^
+
+N/A
+
+Feature additions
+~~~~~~~~~~~~~~~~~
+
++--------------------+--------------------------------------------------------------+
+| **JIRA REFERENCE** | **SLOGAN** |
++--------------------+--------------------------------------------------------------+
+| DOCTOR-134 | Update Doctor maintenance use case to work with latest Fenix |
++--------------------+--------------------------------------------------------------+
+
+Deliverables
+------------
+
+Software deliverables
+=====================
+
+None
+
+Documentation deliverables
+==========================
+
+https://git.opnfv.org/doctor/tree/docs
+
+Known Limitations, Issues and Workarounds
+=========================================
+
+System Limitations
+^^^^^^^^^^^^^^^^^^
+
+Maintenance test case requirements:
+
+- Minimum number of nodes: 1 Controller, 3 Computes
+- Minimum number of VCPUs: 2 VCPUs for each compute
+
+Known issues
+^^^^^^^^^^^^
+
+None
+
+Workarounds
+^^^^^^^^^^^
+
+None
+
+Test Result
+===========
+
+Doctor CI results with TEST_CASE='fault_management' and INSPECTOR_TYPE=sample
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++--------------------------------------+--------------+
+| **TEST-SUITE** | **Results:** |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='fuel' | SUCCESS |
++--------------------------------------+--------------+
+
+Doctor CI results with TEST_CASE='maintenance' and INSPECTOR_TYPE=sample
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++--------------------------------------+--------------+
+| **TEST-SUITE** | **Results:** |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='fuel' | SUCCESS |
+| ADMIN_TOOL_TYPE='fenix' *) | |
++--------------------------------------+--------------+
+
+*) The sample implementation has not been updated according to the latest
+   upstream Fenix and is currently not being tested.
+
+References
+==========
+
+For more information about the OPNFV Doctor latest work, please see:
+
+https://wiki.opnfv.org/display/doctor/Doctor+Home
diff --git a/docs/development/overview/functest_scenario/doctor-scenario-in-functest.rst b/docs/release/scenarios/fault_management/fault_management.rst
index b3d73d5c..99371201 100644
--- a/docs/development/overview/functest_scenario/doctor-scenario-in-functest.rst
+++ b/docs/release/scenarios/fault_management/fault_management.rst
@@ -2,53 +2,19 @@
.. http://creativecommons.org/licenses/by/4.0
+Running test cases
+""""""""""""""""""
-Platform overview
-"""""""""""""""""
+Functest will call "doctor_tests/main.py" in Doctor to run the test job.
+Doctor testing can also be triggered by tox on the OPNFV installer jumphost, as
+sketched below. Tox is normally used for functional, module and coding style
+testing in Python projects.
-Doctor platform provides these features in `Danube Release <https://wiki.opnfv.org/display/SWREL/Danube>`_:
+Currently the 'MCP' and 'devstack' installers are supported.
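+
+As a rough sketch, mirroring the tox based testing described in the testing
+documentation, the fault management test can be triggered on the jumphost along
+these lines (the installer variables below are deployment specific):
+
+.. code-block:: bash
+
+    export INSTALLER_TYPE=${INSTALLER_TYPE}
+    export INSTALLER_IP=${INSTALLER_IP}
+    export TEST_CASE='fault_management'
+    sudo -E tox
+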
-* Immediate Notification
-* Consistent resource state awareness for compute host down
-* Valid compute host status given to VM owner
-These features enable high availability of Network Services on top of
-the virtualized infrastructure. Immediate notification allows VNF managers
-(VNFM) to process recovery actions promptly once a failure has occurred.
-
-Consistency of resource state is necessary to execute recovery actions
-properly in the VIM.
-
-Ability to query host status gives VM owner the possibility to get
-consistent state information through an API in case of a compute host
-fault.
-
-The Doctor platform consists of the following components:
-
-* OpenStack Compute (Nova)
-* OpenStack Telemetry (Ceilometer)
-* OpenStack Alarming (Aodh)
-* Doctor Inspector
-* Doctor Monitor
-
-.. note::
- Doctor Inspector and Monitor are sample implementations for reference.
-
-You can see an overview of the Doctor platform and how components interact in
-:numref:`figure-p1`.
-
-.. figure:: ./images/figure-p1.png
- :name: figure-p1
- :width: 100%
-
- Doctor platform and typical sequence
-
-Detailed information on the Doctor architecture can be found in the Doctor
-requirements documentation:
-http://artifacts.opnfv.org/doctor/docs/requirements/05-implementation.html
-
-Use case
-""""""""
+Fault management use case
+"""""""""""""""""""""""""
* A consumer of the NFVI wants to receive immediate notifications about faults
in the NFVI affecting the proper functioning of the virtual resources.
@@ -67,7 +33,8 @@ configuration.
Detailed workflow information is as follows:
* Consumer(VNFM): (step 0) creates resources (network, server/instance) and an
- event alarm on state down notification of that server/instance
+ event alarm on state down notification of that server/instance or Neutron
+ port.
* Monitor: (step 1) periodically checks nodes, such as ping from/to each
dplane nic to/from gw of node, (step 2) once it fails to send out event
@@ -75,29 +42,26 @@ Detailed workflow information is as follows:
* Inspector: when it receives an event, it will (step 3) mark the host down
("mark-host-down"), (step 4) map the PM to VM, and change the VM status to
- down
+  down. In the network failure case, the Neutron port is also changed to down.
-* Controller: (step 5) sends out instance update event to Ceilometer
+* Controller: (step 5) sends out an instance update event to Ceilometer. In the
+  network failure case, the Neutron port is also changed to down and the
+  corresponding event is sent to Ceilometer.
-* Notifier: (step 6) Ceilometer transforms and passes the event to Aodh,
- (step 7) Aodh will evaluate event with the registered alarm definitions,
+* Notifier: (step 6) Ceilometer transforms and passes the events to AODH,
+ (step 7) AODH will evaluate events with the registered alarm definitions,
then (step 8) it will fire the alarm to the "consumer" who owns the
instance
* Consumer(VNFM): (step 9) receives the event and (step 10) recreates a new
instance
-Test case
-"""""""""
-
-Functest will call the "run.sh" script in Doctor to run the test job.
+Fault management test case
+""""""""""""""""""""""""""
-Currently, only 'Apex' and 'local' installer are supported. The test also
-can run successfully in 'fuel' installer with the modification of some
-configurations of OpenStack in the script. But still need 'fuel' installer
-to support these configurations.
+Functest will call the 'doctor-test' command in Doctor to run the test job.
-The "run.sh" script will execute the following steps.
+The following steps are executed:
Firstly, get the installer ip according to the installer type. Then ssh to
the installer node to get the private key for accessing to the cloud. As
diff --git a/docs/release/scenarios/maintenance/images/Fault-management-design.png b/docs/release/scenarios/maintenance/images/Fault-management-design.png
new file mode 100644
index 00000000..6d98cdec
--- /dev/null
+++ b/docs/release/scenarios/maintenance/images/Fault-management-design.png
Binary files differ
diff --git a/docs/development/overview/functest_scenario/images/LICENSE b/docs/release/scenarios/maintenance/images/LICENSE
index 21a2d03d..21a2d03d 100644
--- a/docs/development/overview/functest_scenario/images/LICENSE
+++ b/docs/release/scenarios/maintenance/images/LICENSE
diff --git a/docs/release/scenarios/maintenance/images/Maintenance-design.png b/docs/release/scenarios/maintenance/images/Maintenance-design.png
new file mode 100644
index 00000000..8f21db6a
--- /dev/null
+++ b/docs/release/scenarios/maintenance/images/Maintenance-design.png
Binary files differ
diff --git a/docs/release/scenarios/maintenance/images/Maintenance-workflow.png b/docs/release/scenarios/maintenance/images/Maintenance-workflow.png
new file mode 100644
index 00000000..9b65fd59
--- /dev/null
+++ b/docs/release/scenarios/maintenance/images/Maintenance-workflow.png
Binary files differ
diff --git a/docs/release/scenarios/maintenance/maintenance.rst b/docs/release/scenarios/maintenance/maintenance.rst
new file mode 100644
index 00000000..ecfe76b1
--- /dev/null
+++ b/docs/release/scenarios/maintenance/maintenance.rst
@@ -0,0 +1,120 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+
+Maintenance use case
+""""""""""""""""""""
+
+* A consumer of the NFVI wants to interact with NFVI maintenance, upgrade,
+  scaling and graceful retirement. By receiving notifications about these NFVI
+  events and responding to them within a given time window, the consumer can
+  guarantee zero downtime for its service.
+
+The maintenance use case adds an `admin tool` and an `app manager` component to
+the Doctor platform. An overview of the maintenance components can be seen in
+:numref:`figure-p2`.
+
+.. figure:: ./images/Maintenance-design.png
+ :name: figure-p2
+ :width: 100%
+
+ Doctor platform components in maintenance use case
+
+In the maintenance use case, the `app manager` (VNFM) subscribes to maintenance
+notifications triggered by project specific alarms through AODH. This is how it
+learns about the different NFVI maintenance, upgrade and scaling operations that
+affect its instances. The `app manager` can perform the actions depicted in
+`green color` or tell the `admin tool` to perform the admin actions depicted in
+`orange color`.
+
+Any infrastructure component like the `Inspector` can subscribe to maintenance
+notifications triggered by host specific alarms through AODH. Subscribing to these
+notifications needs admin privileges and makes it possible to know when a host is
+out of use for maintenance and when it is taken back to production.
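+
+As an illustration, such an event alarm subscription could look roughly as
+follows (the alarm name and the consumer endpoint below are placeholders, not
+part of the Doctor test code):
+
+.. code-block:: bash
+
+    # project specific alarm consumed by the app manager (VNFM)
+    aodh alarm create --name maintenance_scheduled --type event \
+        --event-type maintenance.scheduled \
+        --alarm-action http://<app-manager-ip>:<port>/maintenance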
+
+Maintenance test case
+"""""""""""""""""""""
+
+The maintenance test case is currently run in our Apex CI and executed by tox.
+This is because of the special limitations mentioned below, and because we
+currently have only a sample implementation as a proof of concept while also
+supporting the unofficial OpenStack project Fenix. The environment variable
+TEST_CASE='maintenance' needs to be used when executing "doctor_tests/main.py",
+and ADMIN_TOOL_TYPE='fenix' if you want to test with Fenix instead of the sample
+implementation; an example invocation is sketched below the figure. The test case
+workflow can be seen in :numref:`figure-p3`.
+
+.. figure:: ./images/Maintenance-workflow.png
+ :name: figure-p3
+ :width: 100%
+
+ Maintenance test case workflow
+
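+A minimal sketch of running the maintenance test case with tox (the installer
+related values below are deployment specific assumptions):
+
+.. code-block:: bash
+
+    export TEST_CASE='maintenance'
+    # optionally test against Fenix instead of the sample implementation
+    export ADMIN_TOOL_TYPE='fenix'
+    export INSTALLER_TYPE='apex'
+    export INSTALLER_IP=<installer_ip>
+    sudo -E tox
+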
+In the test case all compute capacity will be consumed with project (VNF)
+instances. For redundant services on the instances and an empty compute node
+needed for maintenance, the test case needs at least 3 compute nodes in the
+system. There will be 2 instances on each compute node, so the minimum number of
+VCPUs is also 2. Regardless of how many compute nodes there are, the application
+will always have 2 redundant instances (ACT-STDBY) on different compute nodes,
+and the rest of the compute capacity will be filled with non-redundant instances.
+
+For each project specific maintenance message there is a time window for the
+`app manager` to take any needed action. This guarantees zero downtime for its
+service. All replies are sent back by calling the `admin tool` API given in the
+message.
+
+The following steps are executed:
+
+The infrastructure admin calls the `admin tool` API to trigger maintenance for
+the compute hosts having instances belonging to a VNF.
+
+A project specific `MAINTENANCE` notification is triggered to tell the
+`app manager` that its instances are going to be hit by infrastructure
+maintenance at a specific point in time. The `app manager` calls the
+`admin tool` API to answer back with `ACK_MAINTENANCE`.
+
+When the time comes to start the actual maintenance workflow in the `admin tool`,
+a `DOWN_SCALE` notification is triggered, as there is no empty compute node for
+maintenance (or compute upgrade). The project receives the corresponding alarm,
+scales down its instances and calls the `admin tool` API to answer back with
+`ACK_DOWN_SCALE`.
+
+As it might happen that instances are not scaled down (removed) from a single
+compute node, the `admin tool` might need to figure out which compute node should
+be made empty first and send `PREPARE_MAINTENANCE` to the project, telling which
+instance needs to be migrated to get the needed empty compute node. The
+`app manager` makes sure it is ready to migrate the instance and calls the
+`admin tool` API to answer back with `ACK_PREPARE_MAINTENANCE`. The `admin tool`
+will perform the migration and answer `ADMIN_ACTION_DONE`, so the `app manager`
+knows the instance can be used again.
+
+Next, :numref:`figure-p3` has a light blue section of actions to be done for each
+compute node. However, as we now have one empty compute node, we will
+maintain/upgrade that one first. So on the first round we can directly put the
+compute node into maintenance and send the admin level host specific
+`IN_MAINTENANCE` message. This is caught by the `Inspector`, so it knows the host
+is down for maintenance. The `Inspector` can now disable any automatic fault
+management actions for the host, as it may be down on purpose. After the
+`admin tool` has completed the maintenance/upgrade, a `MAINTENANCE_COMPLETE`
+message is sent to tell the host is back in production.
+
+On the next rounds we always have instances on the compute node, so we need a
+`PLANNED_MAINTENANCE` message to tell that those instances are now going to be
+hit by maintenance. When the `app manager` receives this message, it knows that
+the instances to be moved away from the compute node will now move to an already
+maintained/upgraded host. In the test case no upgrade is done on the application
+side to upgrade instances according to new infrastructure capabilities, but this
+could be done here, as this information is also passed in the message. This might
+be just upgrading some RPMs, but also totally re-instantiating an instance with a
+new flavor. Now, if the application runs the active side of a redundant instance
+on this compute node, a switch over will be done. After the `app manager` is ready
+it will call the `admin tool` API to answer back with `ACK_PLANNED_MAINTENANCE`.
+In the test case the answer is `migrate`, so the `admin tool` will migrate the
+instances and reply `ADMIN_ACTION_DONE`, and then the `app manager` knows the
+instances can be used again. Then we are ready to do the actual maintenance as
+previously, through the `IN_MAINTENANCE` and `MAINTENANCE_COMPLETE` steps.
+
+After all compute nodes are maintained, the `admin tool` can send
+`MAINTENANCE_COMPLETE` to tell that the maintenance/upgrade is now complete. For
+the `app manager` this means it can scale back to full capacity.
+
+There is currently a sample implementation of the VNFM and the test case. On the
+infrastructure side there is a sample implementation of the 'admin_tool', and
+there is also support for OpenStack Fenix, which extends the use case to support
+'ETSI FEAT03' for VNFM interaction and to optimize the whole infrastructure
+maintenance and upgrade.
diff --git a/docs/development/manuals/get-valid-server-state.rst b/docs/release/userguide/get-valid-server-state.rst
index 824ea3c2..824ea3c2 100644
--- a/docs/development/manuals/get-valid-server-state.rst
+++ b/docs/release/userguide/get-valid-server-state.rst
diff --git a/docs/release/userguide/index.rst b/docs/release/userguide/index.rst
index eee855dc..577072c7 100644
--- a/docs/release/userguide/index.rst
+++ b/docs/release/userguide/index.rst
@@ -11,3 +11,6 @@ Doctor User Guide
:maxdepth: 2
feature.userguide.rst
+ get-valid-server-state.rst
+ mark-host-down_manual.rst
+ monitors.rst
diff --git a/docs/development/manuals/mark-host-down_manual.rst b/docs/release/userguide/mark-host-down_manual.rst
index 3815205d..3815205d 100644
--- a/docs/development/manuals/mark-host-down_manual.rst
+++ b/docs/release/userguide/mark-host-down_manual.rst
diff --git a/docs/development/manuals/monitors.rst b/docs/release/userguide/monitors.rst
index 0d22b1de..eeb5e226 100644
--- a/docs/development/manuals/monitors.rst
+++ b/docs/release/userguide/monitors.rst
@@ -23,7 +23,8 @@ calculated by using the difference of time at which compute node sends notificat
control node and the time at which consumer is notified. The time on control and compute
node has to be synchronized for this reason. For further details on setting up collectd
on the compute node, use the following link:
-http://docs.opnfv.org/en/stable-danube/submodules/barometer/docs/release/userguide/feature.userguide.html#id18
+:doc:`<barometer:release/userguide/feature.userguide>`
+
Collectd monitors an interface managed by OVS. If the interface is not be assigned
an IP, the user has to provide the name of interface to be monitored. The command to
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 00000000..9fde2df2
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,2 @@
+lfdocs-conf
+sphinx_opnfv_theme
diff --git a/docs/testing/developer/index.rst b/docs/testing/developer/index.rst
new file mode 100644
index 00000000..dfbcfa74
--- /dev/null
+++ b/docs/testing/developer/index.rst
@@ -0,0 +1,13 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. SPDX-License-Identifier: CC-BY-4.0
+.. (c) Open Platform for NFV Project, Inc. and its contributors
+
+*********
+Developer
+*********
+
+.. toctree::
+ :numbered:
+ :maxdepth: 2
+
+ testing.rst
diff --git a/docs/development/overview/testing.rst b/docs/testing/developer/testing.rst
index 98be43e9..6a929130 100644
--- a/docs/development/overview/testing.rst
+++ b/docs/testing/developer/testing.rst
@@ -29,6 +29,28 @@ OpenStack services.
.. _OpenStackClient Configuration: https://docs.openstack.org/python-openstackclient/latest/configuration/index.html
+Doctor now supports different test cases, and for those you might want to
+export TEST_CASE with different values:
+
+.. code-block:: bash
+
+ #Fault management (default)
+ export TEST_CASE='fault_management'
+ #Maintenance (requires 3 compute nodes)
+ export TEST_CASE='maintenance'
+ #Run both tests cases
+ export TEST_CASE='all'
+
+ #Use Fenix in maintenance testing instead of sample admin_tool
+  #This is only for the 'maintenance' test case
+ export ADMIN_TOOL_TYPE='fenix'
+ export APP_MANAGER_TYPE='vnfm'
+
+  #Run on the installer jumphost: 'fuel' or 'apex'
+  #In multinode DevStack you run Doctor on the controller node
+  #with export APP_MANAGER_TYPE=vnfm
+ export INSTALLER_TYPE='fuel'
+
Run Python Test Script
~~~~~~~~~~~~~~~~~~~~~~
@@ -45,29 +67,16 @@ environment and then run the test.
.. _doctor.sample.conf: https://git.opnfv.org/doctor/tree/etc/doctor.sample.conf
-Run Functest Suite
-==================
-
-Functest supports Doctor testing by triggering the test script above in a
-Functest container. You can run the Doctor test with the following steps:
+In the OPNFV testing environment jumphost you can run Doctor testing as follows
+using tox:
.. code-block:: bash
- DOCKER_TAG=latest
- docker pull docker.io/opnfv/functest-features:${DOCKER_TAG}
- docker run --privileged=true -id \
- -e INSTALLER_TYPE=${INSTALLER_TYPE} \
- -e INSTALLER_IP=${INSTALLER_IP} \
- -e INSPECTOR_TYPE=sample \
- docker.io/opnfv/functest-features:${DOCKER_TAG} /bin/bash
- docker exec <container_id> functest testcase run doctor-notification
-
-See `Functest Userguide`_ for more information.
-
-.. _Functest Userguide: http://docs.opnfv.org/en/latest/submodules/functest/docs/testing/user/userguide/index.html
-
-For testing with stable version, change DOCKER_TAG to 'stable' or other release
-tag identifier.
-
-Tips
-====
+ source overcloudrc
+ export INSTALLER_IP=${INSTALLER_IP}
+ export INSTALLER_TYPE=${INSTALLER_TYPE}
+ git clone https://gerrit.opnfv.org/gerrit/doctor
+ cd doctor
+ sudo -E tox
+
+Note! In DevStack you run Doctor on the controller node.
diff --git a/docs/testing/index.rst b/docs/testing/index.rst
new file mode 100644
index 00000000..3fae9568
--- /dev/null
+++ b/docs/testing/index.rst
@@ -0,0 +1,15 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. SPDX-License-Identifier: CC-BY-4.0
+.. (c) Open Platform for NFV Project, Inc. and its contributors
+
+.. _testing:
+
+=======
+Testing
+=======
+
+.. toctree::
+ :maxdepth: 2
+
+ ./developer/index.rst
+ ./user/index.rst
diff --git a/docs/testing/user/index.rst b/docs/testing/user/index.rst
new file mode 100644
index 00000000..1be9c7eb
--- /dev/null
+++ b/docs/testing/user/index.rst
@@ -0,0 +1,13 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. SPDX-License-Identifier: CC-BY-4.0
+.. (c) Open Platform for NFV Project, Inc. and its contributors
+
+****
+User
+****
+
+.. toctree::
+ :numbered:
+ :maxdepth: 2
+
+ testing.rst
diff --git a/docs/testing/user/testing.rst b/docs/testing/user/testing.rst
new file mode 100644
index 00000000..6172d26a
--- /dev/null
+++ b/docs/testing/user/testing.rst
@@ -0,0 +1,30 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+Run Functest Suite (obsolete)
+=============================
+
+Functest supports Doctor testing by triggering the test script above in a
+Functest container. You can run the Doctor test with the following steps:
+
+.. code-block:: bash
+
+ DOCKER_TAG=latest
+ docker pull docker.io/opnfv/functest-features:${DOCKER_TAG}
+ docker run --privileged=true -id \
+ -e INSTALLER_TYPE=${INSTALLER_TYPE} \
+ -e INSTALLER_IP=${INSTALLER_IP} \
+ -e INSPECTOR_TYPE=sample \
+ docker.io/opnfv/functest-features:${DOCKER_TAG} /bin/bash
+ docker exec <container_id> functest testcase run doctor-notification
+
+See `Functest Userguide`_ for more information.
+
+.. _Functest Userguide: :doc:`<functest:testing/user/userguide>`
+
+
+For testing with stable version, change DOCKER_TAG to 'stable' or other release
+tag identifier.
+
+Tips
+====
diff --git a/doctor_tests/admin_tool/__init__.py b/doctor_tests/admin_tool/__init__.py
new file mode 100644
index 00000000..3417a334
--- /dev/null
+++ b/doctor_tests/admin_tool/__init__.py
@@ -0,0 +1,37 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+from oslo_config import cfg
+from oslo_utils import importutils
+import os
+
+OPTS = [
+ cfg.StrOpt('type',
+ default=os.environ.get('ADMIN_TOOL_TYPE', 'sample'),
+ choices=['sample', 'fenix'],
+ help='the component of doctor admin_tool',
+ required=True),
+ cfg.StrOpt('ip',
+ default='0.0.0.0',
+ help='the ip of admin_tool',
+ required=True),
+ cfg.IntOpt('port',
+               default=12347,
+ help='the port of doctor admin_tool',
+ required=True),
+]
+
+
+_admin_tool_name_class_mapping = {
+ 'sample': 'doctor_tests.admin_tool.sample.SampleAdminTool'
+}
+
+
+def get_admin_tool(trasport_url, conf, log):
+ admin_tool_class = _admin_tool_name_class_mapping.get(conf.admin_tool.type)
+ return importutils.import_object(admin_tool_class, trasport_url, conf, log)
diff --git a/doctor_tests/admin_tool/base.py b/doctor_tests/admin_tool/base.py
new file mode 100644
index 00000000..0f0b2dcd
--- /dev/null
+++ b/doctor_tests/admin_tool/base.py
@@ -0,0 +1,26 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import abc
+import six
+
+
+@six.add_metaclass(abc.ABCMeta)
+class BaseAdminTool(object):
+
+ def __init__(self, conf, log):
+ self.conf = conf
+ self.log = log
+
+ @abc.abstractmethod
+ def start(self):
+ pass
+
+ @abc.abstractmethod
+ def stop(self):
+ pass
diff --git a/doctor_tests/admin_tool/fenix/Dockerfile b/doctor_tests/admin_tool/fenix/Dockerfile
new file mode 100644
index 00000000..202380eb
--- /dev/null
+++ b/doctor_tests/admin_tool/fenix/Dockerfile
@@ -0,0 +1,34 @@
+FROM gliderlabs/alpine:3.6
+
+ARG BRANCH=master
+ARG OPENSTACK=master
+
+EXPOSE 12347
+
+RUN echo "Building Fenix container against OpenStack $OPENSTACK" && \
+ echo "Building Fenix with $BRANCH" && \
+ mkdir /etc/fenix && \
+ mkdir -p /var/tmp/fenix
+WORKDIR /var/tmp/fenix
+COPY fenix*.conf /etc/fenix/
+
+RUN apk --no-cache add ca-certificates && \
+ apk --no-cache add --update python3 sshpass py-pip git curl && \
+ apk --no-cache add --virtual .build-deps --update \
+ python3-dev build-base linux-headers libffi-dev \
+ openssl-dev libjpeg-turbo-dev && \
+ curl https://opendev.org/openstack/requirements/raw/branch/$OPENSTACK/upper-constraints.txt > upper-constraints.txt && \
+ if [ ! -e /usr/bin/pip ]; then ln -s pip3 /usr/bin/pip ; fi && \
+ if [[ ! -e /usr/bin/python ]]; then ln -sf /usr/bin/python3 /usr/bin/python; fi && \
+ pip3 install --upgrade pip && \
+ pip3 install alembic aodhclient decorator flask Flask-RESTful eventlet jsonschema \
+ keystoneauth1 keystonemiddleware python-novaclient oslo.config pecan \
+ oslo.db oslo.log oslo.messaging oslo.serialization oslo.service oslo_policy \
+ oslotest oslo.utils pbr pymysql six sqlalchemy -cupper-constraints.txt && \
+ git clone https://opendev.org/x/fenix -b $BRANCH /fenix && \
+ rm -fr /var/tmp/fenix
+COPY run /fenix
+COPY keystonercv3 /fenix
+WORKDIR /fenix
+RUN python3 setup.py install
+CMD ./run
diff --git a/doctor_tests/admin_tool/fenix/run b/doctor_tests/admin_tool/fenix/run
new file mode 100755
index 00000000..50ae68e7
--- /dev/null
+++ b/doctor_tests/admin_tool/fenix/run
@@ -0,0 +1,32 @@
+#!/bin/sh
+. keystonercv3
+
+# Start the first process
+nohup python3 /fenix/fenix/cmd/engine.py > /var/log/fenix-engine.log&
+status=$?
+if [ $status -ne 0 ]; then
+ echo "Failed to start engine.py: $status"
+ exit $status
+fi
+
+# Start the second process
+nohup python3 /fenix/fenix/cmd/api.py > /var/log/fenix-api.log&
+status=$?
+if [ $status -ne 0 ]; then
+ echo "Failed to start api.py: $status"
+ exit $status
+fi
+
+echo "started Fenix: engine and api"
+while sleep 60; do
+ ps aux |grep "cmd/engine.py" |grep -q -v grep
+ PROCESS_1_STATUS=$?
+ ps aux |grep "cmd/api.py" |grep -q -v grep
+ PROCESS_2_STATUS=$?
+ # If the greps above find anything, they exit with 0 status
+ # If they are not both 0, then something is wrong
+ if [ $PROCESS_1_STATUS -ne 0 -o $PROCESS_2_STATUS -ne 0 ]; then
+ echo "One of the processes has already exited."
+ exit 1
+ fi
+done
diff --git a/doctor_tests/admin_tool/sample.py b/doctor_tests/admin_tool/sample.py
new file mode 100644
index 00000000..a71f43a1
--- /dev/null
+++ b/doctor_tests/admin_tool/sample.py
@@ -0,0 +1,739 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import datetime
+from flask import Flask
+from flask import request
+import json
+from novaclient.exceptions import BadRequest
+import oslo_messaging as messaging
+import requests
+import time
+from threading import Thread
+from traceback import format_exc
+from uuid import uuid1 as generate_uuid
+
+from doctor_tests.admin_tool.base import BaseAdminTool
+from doctor_tests.identity_auth import get_identity_auth
+from doctor_tests.identity_auth import get_session
+from doctor_tests.os_clients import aodh_client
+from doctor_tests.os_clients import nova_client
+
+
+class SampleAdminTool(BaseAdminTool):
+
+ def __init__(self, trasport_url, conf, log):
+ super(SampleAdminTool, self).__init__(conf, log)
+ self.trasport_url = trasport_url
+ self.app = None
+
+ def start(self):
+ self.log.info('sample admin tool start......')
+ self.app = AdminTool(self.trasport_url, self.conf, self, self.log)
+ self.app.start()
+
+ def stop(self):
+ self.log.info('sample admin tool stop......')
+ if not self.app:
+ return
+ headers = {
+ 'Content-Type': 'application/json',
+ 'Accept': 'application/json',
+ }
+ url = 'http://%s:%d/shutdown'\
+ % (self.conf.admin_tool.ip,
+ self.conf.admin_tool.port)
+ requests.post(url, data='', headers=headers)
+
+
+class AdminMain(Thread):
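+    # One maintenance session handled in its own thread: run() drives the state
+    # machine and project/admin level notifications are sent via oslo.messaging.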
+
+ def __init__(self, trasport_url, session_id, data, parent, conf, log):
+ Thread.__init__(self)
+ self.session_id = session_id
+ self.parent = parent
+ self.log = log
+ self.conf = conf
+ self.url = 'http://%s:%s' % (conf.admin_tool.ip, conf.admin_tool.port)
+ self.projects_state = dict() # current state for each project
+ self.proj_server_actions = dict() # actions for each project server
+ self.projects_servers = dict() # servers processed in current state
+ self.maint_proj_servers = dict() # servers under whole maintenance
+ self.hosts = data['hosts']
+ self.maintenance_at = data['maintenance_at']
+ self.computes_disabled = list()
+ self.metadata = data['metadata']
+ self.auth = get_identity_auth(project=self.conf.doctor_project)
+ self.state = data['state']
+ self.aodh = aodh_client(self.conf.aodh_version,
+ get_session(auth=self.auth))
+ self.nova = nova_client(self.conf.nova_version,
+ get_session(auth=self.auth))
+ self.log.info('transport_url %s' % trasport_url)
+ transport = messaging.get_transport(self.conf, trasport_url)
+ self.notif_proj = messaging.Notifier(transport,
+ 'maintenance.planned',
+ driver='messaging',
+ topics=['notifications'])
+ self.notif_proj = self.notif_proj.prepare(publisher_id='admin_tool')
+ self.notif_admin = messaging.Notifier(transport,
+ 'maintenance.host',
+ driver='messaging',
+ topics=['notifications'])
+ self.notif_admin = self.notif_admin.prepare(publisher_id='admin_tool')
+ self.stopped = False
+ self.log.info('Admin tool session %s initialized' % self.session_id)
+
+ def cleanup(self):
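+        # Re-enable nova-compute on any hosts disabled during the session.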
+ for host in self.computes_disabled:
+ self.log.info('enable nova-compute on %s' % host)
+ self.nova.services.enable(host, 'nova-compute')
+
+ def _projects_not_in_wanted_states(self, wanted_states):
+ if len([v for v in self.projects_state.values()
+ if v not in wanted_states]):
+ return True
+ else:
+ return False
+
+ def projects_not_in_state(self, state):
+ if len([v for v in self.projects_state.values()
+ if v != state]):
+ return True
+ else:
+ return False
+
+ def wait_projects_state(self, wanted_states, wait_seconds):
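+        # Poll until every project has replied with one of the wanted states
+        # or the time window expires.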
+ retries = wait_seconds
+ while (retries > 0 and
+ self._projects_not_in_wanted_states(wanted_states)):
+ time.sleep(1)
+ retries = retries - 1
+ if self._projects_not_in_wanted_states(wanted_states):
+ self.log.error('Admin tool session %s: projects in invalid states '
+ '%s' % (self.session_id, self.projects_state))
+ return False
+ else:
+ self.log.info('all projects replied')
+ return True
+
+ def _project_notify(self, project_id, instance_ids, allowed_actions,
+ actions_at, state, metadata):
+ reply_url = '%s/maintenance/%s/%s' % (self.url, self.session_id,
+ project_id)
+
+ payload = dict(project_id=project_id,
+ instance_ids=instance_ids,
+ allowed_actions=allowed_actions,
+ state=state,
+ actions_at=actions_at,
+ session_id=self.session_id,
+ metadata=metadata,
+ reply_url=reply_url)
+
+ self.log.debug('Sending "maintenance.planned" to project: %s' %
+ payload)
+
+ self.notif_proj.info({'some': 'context'}, 'maintenance.scheduled',
+ payload)
+
+ def _admin_notify(self, project, host, state, session_id):
+ payload = dict(project_id=project, host=host, state=state,
+ session_id=session_id)
+
+ self.log.debug('Sending "maintenance.host": %s' % payload)
+
+ self.notif_admin.info({'some': 'context'}, 'maintenance.host', payload)
+
+ def in_scale(self):
+ for project in self.projects_servers:
+ self.log.info('SCALE_IN to project %s' % project)
+ self.log.debug('instance_ids %s' % self.projects_servers[project])
+ instance_ids = '%s/maintenance/%s/%s' % (self.url, self.session_id,
+ project)
+ allowed_actions = []
+ wait_seconds = 120
+ actions_at = (datetime.datetime.utcnow() +
+ datetime.timedelta(seconds=wait_seconds)
+ ).strftime('%Y-%m-%d %H:%M:%S')
+ state = self.state
+ metadata = self.metadata
+ self._project_notify(project, instance_ids,
+ allowed_actions, actions_at, state,
+ metadata)
+ allowed_states = ['ACK_SCALE_IN', 'NACK_SCALE_IN']
+ if not self.wait_projects_state(allowed_states, wait_seconds):
+ self.state = 'MAINTENANCE_FAILED'
+ if self.projects_not_in_state('ACK_SCALE_IN'):
+ self.log.error('%s: all states not ACK_SCALE_IN' %
+ self.session_id)
+ self.state = 'MAINTENANCE_FAILED'
+
+ def maintenance(self):
+ for project in self.projects_servers:
+ self.log.info('\nMAINTENANCE to project %s\n' % project)
+ self.log.debug('instance_ids %s' % self.projects_servers[project])
+ instance_ids = '%s/maintenance/%s/%s' % (self.url, self.session_id,
+ project)
+ allowed_actions = []
+ actions_at = self.maintenance_at
+ state = self.state
+ metadata = self.metadata
+ maint_at = self.str_to_datetime(self.maintenance_at)
+ td = maint_at - datetime.datetime.utcnow()
+ wait_seconds = int(td.total_seconds())
+ if wait_seconds < 10:
+ raise Exception('Admin tool session %s: No time for project to'
+ ' answer: %s' %
+ (self.session_id, wait_seconds))
+ self._project_notify(project, instance_ids,
+ allowed_actions, actions_at, state,
+ metadata)
+ allowed_states = ['ACK_MAINTENANCE', 'NACK_MAINTENANCE']
+ if not self.wait_projects_state(allowed_states, wait_seconds):
+ self.state = 'MAINTENANCE_FAILED'
+ if self.projects_not_in_state('ACK_MAINTENANCE'):
+ self.log.error('%s: all states not ACK_MAINTENANCE' %
+ self.session_id)
+ self.state = 'MAINTENANCE_FAILED'
+
+ def maintenance_complete(self):
+ for project in self.projects_servers:
+ self.log.info('MAINTENANCE_COMPLETE to project %s' % project)
+ instance_ids = '%s/maintenance/%s/%s' % (self.url, self.session_id,
+ project)
+ allowed_actions = []
+ wait_seconds = 120
+ actions_at = (datetime.datetime.utcnow() +
+ datetime.timedelta(seconds=wait_seconds)
+ ).strftime('%Y-%m-%d %H:%M:%S')
+ state = 'MAINTENANCE_COMPLETE'
+ metadata = self.metadata
+ self._project_notify(project, instance_ids,
+ allowed_actions, actions_at, state,
+ metadata)
+ allowed_states = ['ACK_MAINTENANCE_COMPLETE',
+ 'NACK_MAINTENANCE_COMPLETE']
+ if not self.wait_projects_state(allowed_states, wait_seconds):
+ self.state = 'MAINTENANCE_FAILED'
+ if self.projects_not_in_state('ACK_MAINTENANCE_COMPLETE'):
+ self.log.error('%s: all states not ACK_MAINTENANCE_COMPLETE' %
+ self.session_id)
+ self.state = 'MAINTENANCE_FAILED'
+
+ def need_in_scale(self, host_servers):
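+        # Each compute node is assumed to fit 2 instances; scaling in is needed
+        # unless there is room for at least 2 more instances in total.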
+ room_for_instances = 0
+ for host in host_servers:
+ instances = 0
+ for project in host_servers[host]:
+ for instance in host_servers[host][project]:
+ instances += 1
+ room_for_instances += (2 - instances)
+ self.log.info('there is room for %d instances' % room_for_instances)
+ if room_for_instances > 1:
+ return False
+ else:
+ return True
+
+ def find_host_to_be_empty(self, host_servers):
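+        # Pick a host with no HA (ACT-STDBY) instances and the fewest non-HA
+        # instances to be emptied for maintenance.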
+ host_to_be_empty = None
+ host_nonha_instances = 0
+ for host in host_servers:
+ ha_instances = 0
+ nonha_instances = 0
+ for project in host_servers[host]:
+ for instance in host_servers[host][project]:
+ if ('doctor_ha_app_' in
+ host_servers[host][project][instance]):
+ ha_instances += 1
+ else:
+ nonha_instances += 1
+ self.log.info('host %s has %d ha and %d non ha instances' %
+ (host, ha_instances, nonha_instances))
+ if ha_instances == 0:
+ if host_to_be_empty:
+ if nonha_instances < host_nonha_instances:
+ host_to_be_empty = host
+ host_nonha_instances = nonha_instances
+ else:
+ host_to_be_empty = host
+ host_nonha_instances = nonha_instances
+ self.log.info('host %s selected to be empty' % host_to_be_empty)
+ return host_to_be_empty
+
+ def make_compute_host_empty(self, host, projects_servers, statebase):
+ state = statebase
+ state_ack = 'ACK_%s' % statebase
+ state_nack = 'NACK_%s' % statebase
+ for project in projects_servers:
+ # self.projects_servers must have servers under action
+ self.projects_servers[project] = projects_servers[project].copy()
+ self.log.info('%s to project %s' % (state, project))
+ self.project_servers_log_info(project, projects_servers)
+ instance_ids = '%s/maintenance/%s/%s' % (self.url, self.session_id,
+ project)
+ allowed_actions = ['MIGRATE', 'LIVE_MIGRATE', 'OWN_ACTION']
+ wait_seconds = 120
+ actions_at = (datetime.datetime.utcnow() +
+ datetime.timedelta(seconds=wait_seconds)
+ ).strftime('%Y-%m-%d %H:%M:%S')
+ metadata = self.metadata
+ self._project_notify(project, instance_ids,
+ allowed_actions, actions_at, state,
+ metadata)
+ allowed_states = [state_ack, state_nack]
+ if not self.wait_projects_state(allowed_states, wait_seconds):
+ self.state = 'MAINTENANCE_FAILED'
+ elif self.projects_not_in_state(state_ack):
+ self.log.error('%s: all states not %s' %
+ (self.session_id, state_ack))
+ self.state = 'MAINTENANCE_FAILED'
+ else:
+ self.actions_to_have_empty_host(host)
+
+ def notify_action_done(self, project, instance_id):
+ instance_ids = instance_id
+ allowed_actions = []
+ actions_at = None
+ state = "INSTANCE_ACTION_DONE"
+ metadata = None
+ self._project_notify(project, instance_ids, allowed_actions,
+ actions_at, state, metadata)
+
+ def actions_to_have_empty_host(self, host):
+ retry = 0
+ while len(self.proj_server_actions) == 0:
+ time.sleep(2)
+ if retry == 10:
+ raise Exception('Admin tool session %s: project server actions'
+ ' not set' % self.session_id)
+ retry += 1
+ for project in self.proj_server_actions:
+ for server, action in self.proj_server_actions[project].items():
+ self.log.info('Action %s server %s: %s' % (action, server,
+ self.projects_servers[project][server]))
+ if action == 'MIGRATE':
+ self.migrate_server(server)
+ self.notify_action_done(project, server)
+ elif action == 'OWN_ACTION':
+ pass
+ else:
+ raise Exception('Admin tool session %s: server %s action '
+ '%s not supported' %
+ (self.session_id, server, action))
+ self.proj_server_actions = dict()
+ self._wait_host_empty(host)
+
+ def migrate_server(self, server_id):
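+        # Cold migrate the server and confirm the resize. BadRequest is retried,
+        # as the scheduler may still have a stale instance list for the host.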
+ server = self.nova.servers.get(server_id)
+ vm_state = server.__dict__.get('OS-EXT-STS:vm_state')
+ self.log.info('server %s state %s' % (server_id, vm_state))
+ last_vm_state = vm_state
+ retry_migrate = 5
+ while True:
+ try:
+ server.migrate()
+ time.sleep(5)
+ retries = 36
+ while vm_state != 'resized' and retries > 0:
+ # try to confirm within 3min
+ server = self.nova.servers.get(server_id)
+ vm_state = server.__dict__.get('OS-EXT-STS:vm_state')
+ if vm_state == 'resized':
+ server.confirm_resize()
+ self.log.info('server %s migration confirmed' %
+ server_id)
+ return
+ if last_vm_state != vm_state:
+ self.log.info('server %s state: %s' % (server_id,
+ vm_state))
+ if vm_state == 'error':
+ raise Exception('server %s migration failed, state: %s'
+ % (server_id, vm_state))
+ time.sleep(5)
+ retries = retries - 1
+ last_vm_state = vm_state
+ # Timout waiting state to change
+ break
+
+ except BadRequest:
+ if retry_migrate == 0:
+ raise Exception('server %s migrate failed' % server_id)
+ # Might take time for scheduler to sync inconsistent instance
+ # list for host
+ retry_time = 180 - (retry_migrate * 30)
+ self.log.info('server %s migrate failed, retry in %s sec'
+ % (server_id, retry_time))
+ time.sleep(retry_time)
+ except Exception as e:
+ self.log.error('server %s migration failed, Exception=%s' %
+ (server_id, e))
+ self.log.error(format_exc())
+ raise Exception('server %s migration failed, state: %s' %
+ (server_id, vm_state))
+ finally:
+ retry_migrate = retry_migrate - 1
+ raise Exception('server %s migration timeout, state: %s' %
+ (server_id, vm_state))
+
+ def _wait_host_empty(self, host):
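+        # Poll the hypervisor until it has no vCPUs reserved (up to ~4 minutes).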
+ hid = self.nova.hypervisors.search(host)[0].id
+ vcpus_used_last = 0
+ # wait 4min to get host empty
+ for j in range(48):
+ hvisor = self.nova.hypervisors.get(hid)
+ vcpus_used = hvisor.__getattr__('vcpus_used')
+ if vcpus_used > 0:
+ if vcpus_used_last == 0:
+ self.log.info('%s still has %d vcpus reserved. wait...'
+ % (host, vcpus_used))
+ elif vcpus_used != vcpus_used_last:
+ self.log.info('%s still has %d vcpus reserved. wait...'
+ % (host, vcpus_used))
+ vcpus_used_last = vcpus_used
+ time.sleep(5)
+ else:
+ self.log.info('%s empty' % host)
+ return
+ raise Exception('%s host not empty' % host)
+
+ def projects_listen_alarm(self, match_event):
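+        # Verify that every known project has an AODH event alarm for the given
+        # event type.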
+ match_projects = ([str(alarm['project_id']) for alarm in
+ self.aodh.alarm.list() if
+ str(alarm['event_rule']['event_type']) ==
+ match_event])
+ all_projects_match = True
+ for project in list(self.projects_state):
+ if project not in match_projects:
+ self.log.error('Admin tool session %s: project %s not '
+ 'listening to %s' %
+ (self.session_id, project, match_event))
+ all_projects_match = False
+ return all_projects_match
+
+ def project_servers_log_info(self, project, host_servers):
+ info = 'Project servers:\n'
+ for server in host_servers[project]:
+ info += (' %s: %s\n' %
+ (server, host_servers[project][server]))
+ self.log.info('%s' % info)
+
+ def servers_log_info(self, host_servers):
+ info = '\n'
+ for host in self.hosts:
+ info += '%s:\n' % host
+ if host in host_servers:
+ for project in host_servers[host]:
+ info += ' %s:\n' % project
+ for server in host_servers[host][project]:
+ info += (' %s: %s\n' %
+ (server, host_servers[host][project][server]))
+ self.log.info('%s' % info)
+
+ def update_server_info(self):
+ opts = {'all_tenants': True}
+ servers = self.nova.servers.list(search_opts=opts)
+ self.projects_servers = dict()
+ host_servers = dict()
+ for server in servers:
+ try:
+ host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
+ project = str(server.tenant_id)
+ server_name = str(server.name)
+ server_id = str(server.id)
+ except Exception:
+ raise Exception('can not get params from server=%s' %
+ server)
+ if host not in self.hosts:
+ continue
+ if host not in host_servers:
+ host_servers[host] = dict()
+ if project not in host_servers[host]:
+ host_servers[host][project] = dict()
+ if project not in self.projects_servers:
+ self.projects_servers[project] = dict()
+ if project not in self.projects_state:
+ self.projects_state[project] = None
+ host_servers[host][project][server_id] = server_name
+ self.projects_servers[project][server_id] = server_name
+ return host_servers
+
+ def str_to_datetime(self, dt_str):
+ mdate, mtime = dt_str.split()
+ year, month, day = map(int, mdate.split('-'))
+ hours, minutes, seconds = map(int, mtime.split(':'))
+ return datetime.datetime(year, month, day, hours, minutes, seconds)
+
+ def host_maintenance(self, host):
+ self.log.info('maintaining host %s' % host)
+ # no implementation to make real maintenance
+ time.sleep(5)
+
+ def run(self):
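+ # Maintenance session state machine:
+ # MAINTENANCE -> (SCALE_IN) -> PREPARE_MAINTENANCE ->
+ # PLANNED_MAINTENANCE -> PLANNED_MAINTENANCE_COMPLETE ->
+ # MAINTENANCE_DONE, or MAINTENANCE_FAILED on any error.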
+ while (self.state not in ['MAINTENANCE_DONE', 'MAINTENANCE_FAILED'] and
+ not self.stopped):
+ self.log.info('--==session %s: processing state %s==--' %
+ (self.session_id, self.state))
+ if self.state == 'MAINTENANCE':
+ host_servers = self.update_server_info()
+ self.servers_log_info(host_servers)
+
+ if not self.projects_listen_alarm('maintenance.scheduled'):
+ raise Exception('not all projects are listening to the '
+ 'maintenance alarm')
+ self.maintenance()
+ if self.state == 'MAINTENANCE_FAILED':
+ continue
+ maint_at = self.str_to_datetime(self.maintenance_at)
+ if maint_at > datetime.datetime.utcnow():
+ time_now = (datetime.datetime.utcnow().strftime(
+ '%Y-%m-%d %H:%M:%S'))
+ self.log.info('Time now: %s maintenance starts: %s....' %
+ (time_now, self.maintenance_at))
+ td = maint_at - datetime.datetime.utcnow()
+ time.sleep(td.total_seconds())
+ time_now = (datetime.datetime.utcnow().strftime(
+ '%Y-%m-%d %H:%M:%S'))
+ self.log.info('Time to start maintenance starts: %s' %
+ time_now)
+
+ # check if we have empty compute host
+ # True -> PLANNED_MAINTENANCE
+ # False -> check if we can migrate VMs to get empty host
+ # True -> PREPARE_MAINTENANCE
+ # False -> SCALE_IN
+ maintenance_empty_hosts = ([h for h in self.hosts if h not in
+ host_servers])
+
+ if len(maintenance_empty_hosts) == 0:
+ if self.need_in_scale(host_servers):
+ self.log.info('Need to down scale')
+ self.state = 'SCALE_IN'
+ else:
+ self.log.info('Free capacity, but need empty host')
+ self.state = 'PREPARE_MAINTENANCE'
+ else:
+ self.log.info('Empty host available for maintenance')
+ self.state = 'PLANNED_MAINTENANCE'
+ self.log.info('--==State change from MAINTENANCE to %s==--'
+ % self.state)
+ elif self.state == 'SCALE_IN':
+ # Test case is hard coded to have all compute capacity used
+ # We need to down scale to have one empty compute host
+ self.update_server_info()
+ self.in_scale()
+ if self.state == 'MAINTENANCE_FAILED':
+ continue
+ self.state = 'PREPARE_MAINTENANCE'
+ host_servers = self.update_server_info()
+ self.servers_log_info(host_servers)
+ self.log.info('--==State change from SCALE_IN to'
+ ' %s==--' % self.state)
+
+ elif self.state == 'PREPARE_MAINTENANCE':
+ # Down scaling might not have freed capacity on a single
+ # compute host, so we may still need to consolidate free
+ # capacity onto one compute host
+ self.maint_proj_servers = self.projects_servers.copy()
+ maintenance_empty_hosts = ([h for h in self.hosts if h not in
+ host_servers])
+ if len(maintenance_empty_hosts) == 0:
+ self.log.info('no empty hosts for maintenance')
+ if self.need_in_scale(host_servers):
+ raise Exception('Admin tool session %s: Not enough '
+ 'free capacity for maintenance' %
+ self.session_id)
+ host = self.find_host_to_be_empty(host_servers)
+ if host:
+ self.make_compute_host_empty(host, host_servers[host],
+ 'PREPARE_MAINTENANCE')
+ if self.state == 'MAINTENANCE_FAILED':
+ continue
+ else:
+ # We do not currently support another down scale if
+ # first was not enough
+ raise Exception('Admin tool session %s: No host '
+ 'candidate to be emptied' %
+ self.session_id)
+ else:
+ for host in maintenance_empty_hosts:
+ self.log.info('%s already empty '
+ 'for maintenance' % host)
+ self.state = 'PLANNED_MAINTENANCE'
+ host_servers = self.update_server_info()
+ self.servers_log_info(host_servers)
+ self.log.info('--==State change from PREPARE_MAINTENANCE to %s'
+ '==--' % self.state)
+ elif self.state == 'PLANNED_MAINTENANCE':
+ maintenance_hosts = list()
+ maintenance_empty_hosts = list()
+ # TODO: this should be the admin project; hack for now to make it work
+ admin_project = list(self.projects_state)[0]
+ for host in self.hosts:
+ self.log.info('disable nova-compute on host %s' % host)
+ self.nova.services.disable_log_reason(host, 'nova-compute',
+ 'maintenance')
+ self.computes_disabled.append(host)
+ if host in host_servers and len(host_servers[host]):
+ maintenance_hosts.append(host)
+ else:
+ maintenance_empty_hosts.append(host)
+ self.log.info('--==Start to maintain empty hosts==--\n%s' %
+ maintenance_empty_hosts)
+ self.update_server_info()
+ for host in maintenance_empty_hosts:
+ # the scheduler may lag behind, so check that the just
+ # down-scaled host is really empty
+ self._wait_host_empty(host)
+ self.log.info('IN_MAINTENANCE host %s' % host)
+ self._admin_notify(admin_project, host, 'IN_MAINTENANCE',
+ self.session_id)
+ self.host_maintenance(host)
+ self._admin_notify(admin_project, host,
+ 'MAINTENANCE_COMPLETE',
+ self.session_id)
+ self.nova.services.enable(host, 'nova-compute')
+ self.computes_disabled.remove(host)
+ self.log.info('MAINTENANCE_COMPLETE host %s' % host)
+ self.log.info('--==Start to maintain occupied hosts==--\n%s' %
+ maintenance_hosts)
+ for host in maintenance_hosts:
+ self.log.info('PLANNED_MAINTENANCE host %s' % host)
+ self.make_compute_host_empty(host, host_servers[host],
+ 'PLANNED_MAINTENANCE')
+ if self.state == 'MAINTENANCE_FAILED':
+ continue
+ self.log.info('IN_MAINTENANCE host %s' % host)
+ self._admin_notify(admin_project, host, 'IN_MAINTENANCE',
+ self.session_id)
+ self.host_maintenance(host)
+ self._admin_notify(admin_project, host,
+ 'MAINTENANCE_COMPLETE',
+ self.session_id)
+ self.nova.services.enable(host, 'nova-compute')
+ self.computes_disabled.remove(host)
+ self.log.info('MAINTENANCE_COMPLETE host %s' % host)
+ self.state = 'PLANNED_MAINTENANCE_COMPLETE'
+ host_servers = self.update_server_info()
+ self.servers_log_info(host_servers)
+ elif self.state == 'PLANNED_MAINTENANCE_COMPLETE':
+ self.log.info('Projects still need to scale back up to full '
+ 'capacity')
+ self.maintenance_complete()
+ if self.state == 'MAINTENANCE_FAILED':
+ continue
+ host_servers = self.update_server_info()
+ self.servers_log_info(host_servers)
+ self.state = 'MAINTENANCE_DONE'
+ else:
+ raise Exception('Admin tool session %s: session in invalid '
+ 'state %s' % (self.session_id, self.state))
+ self.log.info('--==Maintenance session %s: %s==--' %
+ (self.session_id, self.state))
+
+ def project_input(self, project_id, data):
+ self.log.debug('Admin tool session %s: project %s input' %
+ (self.session_id, project_id))
+ if 'instance_actions' in data:
+ self.proj_server_actions[project_id] = (
+ data['instance_actions'].copy())
+ self.projects_state[project_id] = data['state']
+
+ def project_get_instances(self, project_id):
+ ret = list(self.projects_servers[project_id])
+ self.log.debug('Admin tool session %s: project %s GET return: %s' %
+ (self.session_id, project_id, ret))
+ return ret
+
+ def stop(self):
+ self.stopped = True
+
+
+class AdminTool(Thread):
+
+ def __init__(self, transport_url, conf, admin_tool, log):
+ Thread.__init__(self)
+ self.admin_tool = admin_tool
+ self.log = log
+ self.conf = conf
+ self.maint_sessions = {}
+ self.projects = {}
+ self.maintenance_hosts = []
+ self.transport_url = transport_url
+
+ def run(self):
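+ # REST front end of the sample admin tool:
+ # POST /maintenance creates a session, GET/PUT/DELETE on
+ # /maintenance/<session_id>[/<project_id>] query, update and
+ # remove it, and POST /shutdown stops the Werkzeug server.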
+ app = Flask('admin_tool')
+
+ @app.route('/maintenance', methods=['POST'])
+ def admin_maintenance_api_post():
+ data = json.loads(request.data.decode('utf8'))
+ self.log.info('maintenance message: %s' % data)
+ session_id = str(generate_uuid())
+ self.log.info('creating session: %s' % session_id)
+ self.maint_sessions[session_id] = (
+ AdminMain(self.transport_url,
+ session_id,
+ data,
+ self,
+ self.conf,
+ self.log))
+ self.maint_sessions[session_id].start()
+ reply = json.dumps({'session_id': session_id,
+ 'state': 'ACK_%s' % data['state']})
+ self.log.debug('reply: %s' % reply)
+ return reply, 200, None
+
+ @app.route('/maintenance/<session_id>', methods=['GET'])
+ def admin_maintenance_api_get(session_id=None):
+ self.log.debug('Admin get maintenance')
+ reply = json.dumps({'state':
+ self.maint_sessions[session_id].state})
+ self.log.info('reply: %s' % reply)
+ return reply, 200, None
+
+ @app.route('/maintenance/<session_id>/<project_id>', methods=['PUT'])
+ def project_maintenance_api_put(session_id=None, project_id=None):
+ data = json.loads(request.data.decode('utf8'))
+ self.log.debug('%s project put: %s' % (project_id, data))
+ self.project_input(session_id, project_id, data)
+ return 'OK'
+
+ @app.route('/maintenance/<session_id>/<project_id>', methods=['GET'])
+ def project_maintenance_api_get(session_id=None, project_id=None):
+ self.log.debug('%s project get %s' % (project_id, session_id))
+ instances = self.project_get_instances(session_id, project_id)
+ reply = json.dumps({'instance_ids': instances})
+ self.log.debug('%s reply: %s' % (project_id, reply))
+ return reply, 200, None
+
+ @app.route('/maintenance/<session_id>', methods=['DELETE'])
+ def remove_session(session_id=None):
+ self.log.info('remove session %s'
+ % session_id)
+ self.maint_sessions[session_id].cleanup()
+ self.maint_sessions[session_id].stop()
+ del self.maint_sessions[session_id]
+ return 'OK'
+
+ @app.route('/shutdown', methods=['POST'])
+ def shutdown():
+ self.log.info('shutdown admin_tool server at %s' % time.time())
+ func = request.environ.get('werkzeug.server.shutdown')
+ if func is None:
+ raise RuntimeError('Not running with the Werkzeug Server')
+ func()
+ return 'admin_tool app shutting down...'
+
+ app.run(host=self.conf.admin_tool.ip, port=self.conf.admin_tool.port)
+
+ def project_input(self, session_id, project_id, data):
+ self.maint_sessions[session_id].project_input(project_id, data)
+
+ def project_get_instances(self, session_id, project_id):
+ return self.maint_sessions[session_id].project_get_instances(
+ project_id)
diff --git a/doctor_tests/app_manager/__init__.py b/doctor_tests/app_manager/__init__.py
new file mode 100644
index 00000000..c2f75918
--- /dev/null
+++ b/doctor_tests/app_manager/__init__.py
@@ -0,0 +1,40 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+from oslo_config import cfg
+from oslo_utils import importutils
+import os
+
+
+OPTS = [
+ cfg.StrOpt('type',
+ default=os.environ.get('APP_MANAGER_TYPE', 'sample'),
+ choices=['sample', 'vnfm'],
+ help='the component of doctor app manager',
+ required=True),
+ cfg.StrOpt('ip',
+ default='127.0.0.1',
+ help='the ip of app manager',
+ required=True),
+ cfg.IntOpt('port',
+ default=12348,
+ help='the port of doctor app manager',
+ required=True),
+]
+
+
+_app_manager_name_class_mapping = {
+ 'sample': 'doctor_tests.app_manager.sample.SampleAppManager',
+ 'vnfm': 'doctor_tests.app_manager.vnfm.VNFM',
+}
+
+
+def get_app_manager(stack, conf, log):
+ app_manager_class = (
+ _app_manager_name_class_mapping.get(conf.app_manager.type))
+ return importutils.import_object(app_manager_class, stack, conf, log)
diff --git a/doctor_tests/app_manager/base.py b/doctor_tests/app_manager/base.py
new file mode 100644
index 00000000..0d424083
--- /dev/null
+++ b/doctor_tests/app_manager/base.py
@@ -0,0 +1,26 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import abc
+import six
+
+
+@six.add_metaclass(abc.ABCMeta)
+class BaseAppManager(object):
+
+ def __init__(self, conf, log):
+ self.conf = conf
+ self.log = log
+
+ @abc.abstractmethod
+ def start(self):
+ pass
+
+ @abc.abstractmethod
+ def stop(self):
+ pass
diff --git a/doctor_tests/app_manager/sample.py b/doctor_tests/app_manager/sample.py
new file mode 100644
index 00000000..7ca35b97
--- /dev/null
+++ b/doctor_tests/app_manager/sample.py
@@ -0,0 +1,265 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+from flask import Flask
+from flask import request
+import json
+import yaml
+import time
+from threading import Thread
+import requests
+
+from doctor_tests.app_manager.base import BaseAppManager
+from doctor_tests.identity_auth import get_identity_auth
+from doctor_tests.identity_auth import get_session
+from doctor_tests.os_clients import neutron_client
+from doctor_tests.os_clients import nova_client
+
+
+class SampleAppManager(BaseAppManager):
+
+ def __init__(self, stack, conf, log):
+ super(SampleAppManager, self).__init__(conf, log)
+ self.stack = stack
+ self.app = None
+
+ def start(self):
+ self.log.info('sample app manager start......')
+ self.app = AppManager(self.stack, self.conf, self, self.log)
+ self.app.start()
+
+ def stop(self):
+ self.log.info('sample app manager stop......')
+ if not self.app:
+ return
+ headers = {
+ 'Content-Type': 'application/json',
+ 'Accept': 'application/json',
+ }
+ url = 'http://%s:%d/shutdown'\
+ % (self.conf.app_manager.ip,
+ self.conf.app_manager.port)
+ requests.post(url, data='', headers=headers)
+
+
+class AppManager(Thread):
+
+ def __init__(self, stack, conf, app_manager, log):
+ Thread.__init__(self)
+ self.stack = stack
+ self.conf = conf
+ self.port = self.conf.app_manager.port
+ self.app_manager = app_manager
+ self.log = log
+ self.instance_ids = None
+ self.auth = get_identity_auth(project=self.conf.doctor_project)
+ self.session = get_session(auth=self.auth)
+ self.nova = nova_client(self.conf.nova_version,
+ self.session)
+ self.neutron = neutron_client(session=self.session)
+ self.headers = {
+ 'Content-Type': 'application/json',
+ 'Accept': 'application/json'}
+ if self.conf.admin_tool.type == 'fenix':
+ self.headers['X-Auth-Token'] = self.session.get_token()
+ self.orig_number_of_instances = self.number_of_instances()
+ self.ha_instances = self.get_ha_instances()
+ self.floating_ip = None
+ self.active_instance_id = self.active_instance_id()
+
+ def active_instance_id(self):
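+ # The active HA instance is the one currently holding the
+ # floating IP.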
+ for instance in self.ha_instances:
+ network_interfaces = next(iter(instance.addresses.values()))
+ for network_interface in network_interfaces:
+ _type = network_interface.get('OS-EXT-IPS:type')
+ if _type == "floating":
+ if not self.floating_ip:
+ self.floating_ip = network_interface.get('addr')
+ self.log.debug('active_instance: %s %s' %
+ (instance.name, instance.id))
+ return instance.id
+ raise Exception("No active instance found")
+
+ def switch_over_ha_instance(self):
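+ # Move the floating IP to the standby HA instance via Neutron.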
+ for instance in self.ha_instances:
+ if instance.id != self.active_instance_id:
+ self.log.info('Switch over to: %s %s' % (instance.name,
+ instance.id))
+ # Deprecated, need to use neutron instead
+ # instance.add_floating_ip(self.floating_ip)
+ port = self.neutron.list_ports(device_id=instance.id)['ports'][0]['id'] # noqa
+ floating_id = self.neutron.list_floatingips(floating_ip_address=self.floating_ip)['floatingips'][0]['id'] # noqa
+ self.neutron.update_floatingip(floating_id, {'floatingip': {'port_id': port}}) # noqa
+ # Have to update ha_instances as floating_ip changed
+ self.ha_instances = self.get_ha_instances()
+ self.active_instance_id = instance.id
+ break
+
+ def get_instance_ids(self):
+ ret = list()
+ for instance in self.nova.servers.list(detailed=False):
+ ret.append(instance.id)
+ return ret
+
+ def get_ha_instances(self):
+ ha_instances = list()
+ for instance in self.nova.servers.list(detailed=True):
+ if "doctor_ha_app_" in instance.name:
+ ha_instances.append(instance)
+ self.log.debug('ha_instances: %s' % instance.name)
+ return ha_instances
+
+ def _alarm_data_decoder(self, data):
+ if "[" in data or "{" in data:
+ # string to list or dict removing unicode
+ data = yaml.load(data.replace("u'", "'"))
+ return data
+
+ def _alarm_traits_decoder(self, data):
+ return ({str(t[0]): self._alarm_data_decoder(str(t[2]))
+ for t in data['reason_data']['event']['traits']})
+
+ def get_session_instance_ids(self, url, session_id):
+ ret = requests.get(url, data=None, headers=self.headers)
+ if ret.status_code != 200:
+ raise Exception(ret.text)
+ self.log.info('get_instance_ids %s' % ret.json())
+ return ret.json()['instance_ids']
+
+ def scale_instances(self, number_of_instances):
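+ # Scale the non-HA instances up or down by updating the heat
+ # stack parameter and verify the resulting instance count.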
+ number_of_instances_before = self.number_of_instances()
+
+ parameters = self.stack.parameters
+ parameters['nonha_intances'] += number_of_instances
+ self.stack.update(self.stack.stack_name,
+ self.stack.stack_id,
+ self.stack.template,
+ parameters=parameters,
+ files=self.stack.files)
+
+ number_of_instances_after = self.number_of_instances()
+ if (number_of_instances_before + number_of_instances !=
+ number_of_instances_after):
+ self.log.error('scale_instances with: %d from: %d ends up to: %d'
+ % (number_of_instances, number_of_instances_before,
+ number_of_instances_after))
+ raise Exception('scale_instances failed')
+
+ self.log.info('scaled instances from %d to %d' %
+ (number_of_instances_before,
+ number_of_instances_after))
+
+ def number_of_instances(self):
+ return len(self.nova.servers.list(detailed=False))
+
+ def run(self):
+ app = Flask('app_manager')
+
+ @app.route('/maintenance', methods=['POST'])
+ def maintenance_alarm():
+ data = json.loads(request.data.decode('utf8'))
+ try:
+ payload = self._alarm_traits_decoder(data)
+ except Exception:
+ payload = ({t[0]: t[2] for t in
+ data['reason_data']['event']['traits']})
+ self.log.error('cannot parse alarm data: %s' % payload)
+ raise Exception('sample app manager cannot parse alarm. '
+ 'Possibly trait data over 256 chars')
+
+ self.log.info('sample app manager received data = %s' % payload)
+
+ state = payload['state']
+ reply_state = None
+ reply = dict()
+
+ self.log.info('sample app manager state: %s' % state)
+
+ if state == 'MAINTENANCE':
+ instance_ids = (self.get_session_instance_ids(
+ payload['instance_ids'],
+ payload['session_id']))
+ reply['instance_ids'] = instance_ids
+ reply_state = 'ACK_MAINTENANCE'
+
+ elif state == 'SCALE_IN':
+ # scale down 2 instances, which equals the VCPUs of a single
+ # compute node
+ self.scale_instances(-2)
+ reply['instance_ids'] = self.get_instance_ids()
+ reply_state = 'ACK_SCALE_IN'
+
+ elif state == 'MAINTENANCE_COMPLETE':
+ # possibly need to upscale
+ number_of_instances = self.number_of_instances()
+ if self.orig_number_of_instances > number_of_instances:
+ scale_instances = (self.orig_number_of_instances -
+ number_of_instances)
+ self.scale_instances(scale_instances)
+ reply_state = 'ACK_MAINTENANCE_COMPLETE'
+
+ elif state == 'PREPARE_MAINTENANCE':
+ if "MIGRATE" not in payload['allowed_actions']:
+ raise Exception('MIGRATE not supported')
+
+ instance_ids = (self.get_session_instance_ids(
+ payload['instance_ids'],
+ payload['session_id']))
+ self.log.info('sample app manager got instances: %s' %
+ instance_ids)
+ instance_actions = dict()
+ for instance_id in instance_ids:
+ instance_actions[instance_id] = "MIGRATE"
+ if instance_id == self.active_instance_id:
+ self.switch_over_ha_instance()
+ reply['instance_actions'] = instance_actions
+ reply_state = 'ACK_PREPARE_MAINTENANCE'
+
+ elif state == 'PLANNED_MAINTENANCE':
+ if "MIGRATE" not in payload['allowed_actions']:
+ raise Exception('MIGRATE not supported')
+
+ instance_ids = (self.get_session_instance_ids(
+ payload['instance_ids'],
+ payload['session_id']))
+ self.log.info('sample app manager got instances: %s' %
+ instance_ids)
+ instance_actions = dict()
+ for instance_id in instance_ids:
+ instance_actions[instance_id] = "MIGRATE"
+ if instance_id == self.active_instance_id:
+ self.switch_over_ha_instance()
+ reply['instance_actions'] = instance_actions
+ reply_state = 'ACK_PLANNED_MAINTENANCE'
+
+ elif state == 'INSTANCE_ACTION_DONE':
+ self.log.info('%s' % payload['instance_ids'])
+
+ else:
+ raise Exception('sample app manager received event with'
+ ' unknown state %s' % state)
+
+ if reply_state:
+ reply['session_id'] = payload['session_id']
+ reply['state'] = reply_state
+ url = payload['reply_url']
+ self.log.info('sample app manager reply: %s' % reply)
+ requests.put(url, data=json.dumps(reply), headers=self.headers)
+
+ return 'OK'
+
+ @app.route('/shutdown', methods=['POST'])
+ def shutdown():
+ self.log.info('shutdown app manager server at %s' % time.time())
+ func = request.environ.get('werkzeug.server.shutdown')
+ if func is None:
+ raise RuntimeError('Not running with the Werkzeug Server')
+ func()
+ return 'app manager shutting down...'
+
+ app.run(host="0.0.0.0", port=self.port)
diff --git a/doctor_tests/app_manager/vnfm.py b/doctor_tests/app_manager/vnfm.py
new file mode 100644
index 00000000..68fdbb88
--- /dev/null
+++ b/doctor_tests/app_manager/vnfm.py
@@ -0,0 +1,441 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+from flask import Flask
+from flask import request
+import json
+import requests
+from threading import Thread
+import time
+import uuid
+import yaml
+
+from doctor_tests.app_manager.base import BaseAppManager
+from doctor_tests.identity_auth import get_identity_auth
+from doctor_tests.identity_auth import get_session
+from doctor_tests.os_clients import neutron_client
+from doctor_tests.os_clients import nova_client
+from doctor_tests.os_clients import keystone_client
+
+
+class VNFM(BaseAppManager):
+
+ def __init__(self, stack, conf, log):
+ super(VNFM, self).__init__(conf, log)
+ self.stack = stack
+ self.app = None
+
+ def start(self):
+ self.log.info('VNFM start......')
+ self.app = VNFManager(self.stack, self.conf, self, self.log)
+ self.app.start()
+
+ def stop(self):
+ self.log.info('VNFM stop......')
+ if not self.app:
+ return
+ self.app.delete_constraints()
+ headers = {
+ 'Content-Type': 'application/json',
+ 'Accept': 'application/json',
+ }
+ url = 'http://%s:%d/shutdown'\
+ % (self.conf.app_manager.ip,
+ self.conf.app_manager.port)
+ requests.post(url, data='', headers=headers)
+
+
+class VNFManager(Thread):
+
+ def __init__(self, stack, conf, app_manager, log):
+ Thread.__init__(self)
+ self.stack = stack
+ self.conf = conf
+ self.port = self.conf.app_manager.port
+ self.app_manager = app_manager
+ self.log = log
+ self.instance_ids = None
+ self.auth = get_identity_auth(project=self.conf.doctor_project)
+ self.session = get_session(auth=self.auth)
+ self.keystone = keystone_client(
+ self.conf.keystone_version, self.session)
+ self.nova = nova_client(self.conf.nova_version,
+ self.session)
+ self.neutron = neutron_client(session=self.session)
+ self.headers = {
+ 'Content-Type': 'application/json',
+ 'Accept': 'application/json'}
+ if self.conf.admin_tool.type == 'fenix':
+ self.headers['X-Auth-Token'] = self.session.get_token()
+ self.orig_number_of_instances = self.number_of_instances()
+ # List of instances
+ self.ha_instances = []
+ self.nonha_instances = []
+ # Different instance_id specific constraints {instanse_id: {},...}
+ self.instance_constraints = None
+ # Update existing instances to instance lists
+ self.update_instances()
+ nonha_instances = len(self.nonha_instances)
+ if nonha_instances < 7:
+ self.scale = 2
+ self.max_impacted = 2
+ else:
+ self.scale = int((nonha_instances) / 2)
+ self.max_impacted = self.scale - 1
+ self.log.info('Init nonha_instances: %s scale: %s: max_impacted %s' %
+ (nonha_instances, self.scale, self.max_impacted))
+ # Different instance groups constraints dict
+ self.ha_group = None
+ self.nonha_group = None
+ # Floating IP used in HA instance
+ self.floating_ip = None
+ # VNF project_id
+ self.project_id = None
+ # HA instance_id that is active / has floating IP
+ self.active_instance_id = self.active_instance_id()
+
+ services = self.keystone.services.list()
+ for service in services:
+ if service.type == 'maintenance':
+ self.log.info('maintenance service: %s:%s type %s'
+ % (service.name, service.id, service.type))
+ maint_id = service.id
+ self.maint_endpoint = [ep.url for ep in self.keystone.endpoints.list()
+ if ep.service_id == maint_id and
+ ep.interface == 'public'][0]
+ self.log.info('maintenance endpoint: %s' % self.maint_endpoint)
+ self.update_constraints_lock = False
+ self.update_constraints()
+
+ def delete_remote_instance_constraints(self, instance_id):
+ url = "%s/instance/%s" % (self.maint_endpoint, instance_id)
+ self.log.info('DELETE: %s' % url)
+ ret = requests.delete(url, data=None, headers=self.headers)
+ if ret.status_code != 200 and ret.status_code != 204:
+ raise Exception(ret.text)
+
+ def update_remote_instance_constraints(self, instance):
+ url = "%s/instance/%s" % (self.maint_endpoint, instance["instance_id"])
+ self.log.info('PUT: %s' % url)
+ ret = requests.put(url, data=json.dumps(instance),
+ headers=self.headers)
+ if ret.status_code != 200 and ret.status_code != 204:
+ raise Exception(ret.text)
+
+ def delete_remote_group_constraints(self, instance_group):
+ url = "%s/instance_group/%s" % (self.maint_endpoint,
+ instance_group["group_id"])
+ self.log.info('DELETE: %s' % url)
+ ret = requests.delete(url, data=None, headers=self.headers)
+ if ret.status_code != 200 and ret.status_code != 204:
+ raise Exception(ret.text)
+
+ def update_remote_group_constraints(self, instance_group):
+ url = "%s/instance_group/%s" % (self.maint_endpoint,
+ instance_group["group_id"])
+ self.log.info('PUT: %s' % url)
+ ret = requests.put(url, data=json.dumps(instance_group),
+ headers=self.headers)
+ if ret.status_code != 200 and ret.status_code != 204:
+ raise Exception(ret.text)
+
+ def delete_constraints(self):
+ if self.conf.admin_tool.type == 'fenix':
+ self.headers['X-Auth-Token'] = self.session.get_token()
+ for instance_id in self.instance_constraints:
+ self.delete_remote_instance_constraints(instance_id)
+ self.delete_remote_group_constraints(self.nonha_group)
+ self.delete_remote_group_constraints(self.ha_group)
+
+ def update_constraints(self):
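+ # Push instance group and per-instance maintenance constraints
+ # to the maintenance (Fenix) endpoint. On later calls only
+ # added, modified or deleted instances are synchronized.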
+ while self.update_constraints_lock:
+ self.log.info('Waiting update_constraints_lock...')
+ time.sleep(1)
+ self.update_constraints_lock = True
+ self.log.info('Update constraints')
+ if self.project_id is None:
+ self.project_id = self.keystone.projects.list(
+ name=self.conf.doctor_project)[0].id
+ if self.nonha_group is None:
+ # Nova does not support grouping instances that do not belong to
+ # an anti-affinity server_group, but all instances still need to be grouped
+ self.nonha_group = {
+ "group_id": str(uuid.uuid4()),
+ "project_id": self.project_id,
+ "group_name": "doctor_nonha_app_group",
+ "anti_affinity_group": False,
+ "max_instances_per_host": 0,
+ "max_impacted_members": self.max_impacted,
+ "recovery_time": 2,
+ "resource_mitigation": True}
+ self.log.info('create doctor_nonha_app_group constraints: %s'
+ % self.nonha_group)
+ self.update_remote_group_constraints(self.nonha_group)
+ if self.ha_group is None:
+ group_id = [sg.id for sg in self.nova.server_groups.list()
+ if sg.name == "doctor_ha_app_group"][0]
+ self.ha_group = {
+ "group_id": group_id,
+ "project_id": self.project_id,
+ "group_name": "doctor_ha_app_group",
+ "anti_affinity_group": True,
+ "max_instances_per_host": 1,
+ "max_impacted_members": 1,
+ "recovery_time": 4,
+ "resource_mitigation": True}
+ self.log.info('create doctor_ha_app_group constraints: %s'
+ % self.ha_group)
+ self.update_remote_group_constraints(self.ha_group)
+ instance_constraints = {}
+ for ha_instance in self.ha_instances:
+ instance = {
+ "instance_id": ha_instance.id,
+ "project_id": self.project_id,
+ "group_id": self.ha_group["group_id"],
+ "instance_name": ha_instance.name,
+ "max_interruption_time": 120,
+ "migration_type": "MIGRATE",
+ "resource_mitigation": True,
+ "lead_time": 40}
+ self.log.info('create ha instance constraints: %s'
+ % instance)
+ instance_constraints[ha_instance.id] = instance
+ for nonha_instance in self.nonha_instances:
+ instance = {
+ "instance_id": nonha_instance.id,
+ "project_id": self.project_id,
+ "group_id": self.nonha_group["group_id"],
+ "instance_name": nonha_instance.name,
+ "max_interruption_time": 120,
+ "migration_type": "MIGRATE",
+ "resource_mitigation": True,
+ "lead_time": 40}
+ self.log.info('create nonha instance constraints: %s'
+ % instance)
+ instance_constraints[nonha_instance.id] = instance
+ if not self.instance_constraints:
+ # Initial instance constraints
+ self.log.info('create initial instances constraints...')
+ for instance in [instance_constraints[i] for i
+ in instance_constraints]:
+ self.update_remote_instance_constraints(instance)
+ self.instance_constraints = instance_constraints.copy()
+ else:
+ self.log.info('check instances constraints changes...')
+ added = [i for i in instance_constraints.keys()
+ if i not in self.instance_constraints]
+ deleted = [i for i in self.instance_constraints.keys()
+ if i not in instance_constraints]
+ modified = [i for i in instance_constraints.keys()
+ if (i not in added and i not in deleted and
+ instance_constraints[i] !=
+ self.instance_constraints[i])]
+ for instance_id in deleted:
+ self.delete_remote_instance_constraints(instance_id)
+ updated = added + modified
+ for instance in [instance_constraints[i] for i in updated]:
+ self.update_remote_instance_constraints(instance)
+ if updated or deleted:
+ # Some instance constraints have changed
+ self.instance_constraints = instance_constraints.copy()
+ self.update_constraints_lock = False
+
+ def active_instance_id(self):
+ # Need to retry as it takes time after the heat template completes
+ # before the floating IP is in place
+ retry = 5
+ while retry > 0:
+ for instance in self.ha_instances:
+ network_interfaces = next(iter(instance.addresses.values()))
+ for network_interface in network_interfaces:
+ _type = network_interface.get('OS-EXT-IPS:type')
+ if _type == "floating":
+ if not self.floating_ip:
+ self.floating_ip = network_interface.get('addr')
+ self.log.debug('active_instance: %s %s' %
+ (instance.name, instance.id))
+ return instance.id
+ time.sleep(2)
+ self.update_instances()
+ retry -= 1
+ raise Exception("No active instance found")
+
+ def switch_over_ha_instance(self):
+ for instance in self.ha_instances:
+ if instance.id != self.active_instance_id:
+ self.log.info('Switch over to: %s %s' % (instance.name,
+ instance.id))
+ # Deprecated, need to use neutron instead
+ # instance.add_floating_ip(self.floating_ip)
+ port = self.neutron.list_ports(device_id=instance.id)['ports'][0]['id'] # noqa
+ floating_id = self.neutron.list_floatingips(floating_ip_address=self.floating_ip)['floatingips'][0]['id'] # noqa
+ self.neutron.update_floatingip(floating_id, {'floatingip': {'port_id': port}}) # noqa
+ # Have to update ha_instances as floating_ip changed
+ self.update_instances()
+ self.active_instance_id = instance.id
+ break
+
+ def get_instance_ids(self):
+ ret = list()
+ for instance in self.nova.servers.list(detailed=False):
+ ret.append(instance.id)
+ return ret
+
+ def update_instances(self):
+ instances = self.nova.servers.list(detailed=True)
+ self.ha_instances = [i for i in instances
+ if "doctor_ha_app_" in i.name]
+ self.nonha_instances = [i for i in instances
+ if "doctor_nonha_app_" in i.name]
+
+ def _alarm_data_decoder(self, data):
+ if "[" in data or "{" in data:
+ # string to list or dict removing unicode
+ data = yaml.load(data.replace("u'", "'"))
+ return data
+
+ def _alarm_traits_decoder(self, data):
+ return ({str(t[0]): self._alarm_data_decoder(str(t[2]))
+ for t in data['reason_data']['event']['traits']})
+
+ def get_session_instance_ids(self, url, session_id):
+ ret = requests.get(url, data=None, headers=self.headers)
+ if ret.status_code != 200:
+ raise Exception(ret.text)
+ self.log.info('get_instance_ids %s' % ret.json())
+ return ret.json()['instance_ids']
+
+ def scale_instances(self, number_of_instances):
+ number_of_instances_before = self.number_of_instances()
+
+ parameters = self.stack.parameters
+ parameters['nonha_intances'] += number_of_instances
+ self.stack.update(self.stack.stack_name,
+ self.stack.stack_id,
+ self.stack.template,
+ parameters=parameters,
+ files=self.stack.files)
+
+ number_of_instances_after = self.number_of_instances()
+ if (number_of_instances_before + number_of_instances !=
+ number_of_instances_after):
+ self.log.error('scale_instances with: %d from: %d ends up to: %d'
+ % (number_of_instances, number_of_instances_before,
+ number_of_instances_after))
+ raise Exception('scale_instances failed')
+
+ self.log.info('scaled instances from %d to %d' %
+ (number_of_instances_before,
+ number_of_instances_after))
+
+ def number_of_instances(self):
+ return len(self.nova.servers.list(detailed=False))
+
+ def run(self):
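+ # Serve the /maintenance webhook: decode the AODH alarm payload
+ # and reply to each maintenance workflow state with the matching
+ # ACK/NACK and the requested instance actions.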
+ app = Flask('VNFM')
+
+ @app.route('/maintenance', methods=['POST'])
+ def maintenance_alarm():
+ data = json.loads(request.data.decode('utf8'))
+ try:
+ payload = self._alarm_traits_decoder(data)
+ except Exception:
+ payload = ({t[0]: t[2] for t in
+ data['reason_data']['event']['traits']})
+ self.log.error('cannot parse alarm data: %s' % payload)
+ raise Exception('VNFM cannot parse alarm. '
+ 'Possibly trait data over 256 chars')
+
+ self.log.info('VNFM received data = %s' % payload)
+
+ state = payload['state']
+ reply_state = None
+ reply = dict()
+
+ self.log.info('VNFM state: %s' % state)
+
+ if state == 'MAINTENANCE':
+ instance_ids = (self.get_session_instance_ids(
+ payload['instance_ids'],
+ payload['session_id']))
+ my_instance_ids = self.get_instance_ids()
+ invalid_instances = (
+ [instance_id for instance_id in instance_ids
+ if instance_id not in my_instance_ids])
+ if invalid_instances:
+ self.log.error('Invalid instances: %s' % invalid_instances)
+ reply_state = 'NACK_MAINTENANCE'
+ else:
+ reply_state = 'ACK_MAINTENANCE'
+
+ elif state == 'SCALE_IN':
+ # scale down "self.scale" instances, which equals the VCPUs of
+ # at least a single compute node
+ self.scale_instances(-self.scale)
+ reply_state = 'ACK_SCALE_IN'
+
+ elif state == 'MAINTENANCE_COMPLETE':
+ # possibly need to upscale
+ number_of_instances = self.number_of_instances()
+ if self.orig_number_of_instances > number_of_instances:
+ scale_instances = (self.orig_number_of_instances -
+ number_of_instances)
+ self.scale_instances(scale_instances)
+ reply_state = 'ACK_MAINTENANCE_COMPLETE'
+
+ elif state == 'PREPARE_MAINTENANCE':
+ # TBD from constraints
+ if "MIGRATE" not in payload['allowed_actions']:
+ raise Exception('MIGRATE not supported')
+ instance_ids = payload['instance_ids'][0]
+ self.log.info('VNFM got instance: %s' % instance_ids)
+ if instance_ids == self.active_instance_id:
+ self.switch_over_ha_instance()
+ # optional also in constraints
+ reply['instance_action'] = "MIGRATE"
+ reply_state = 'ACK_PREPARE_MAINTENANCE'
+
+ elif state == 'PLANNED_MAINTENANCE':
+ # TBD from constraints
+ if "MIGRATE" not in payload['allowed_actions']:
+ raise Exception('MIGRATE not supported')
+ instance_ids = payload['instance_ids'][0]
+ self.log.info('VNFM got instance: %s' % instance_ids)
+ if instance_ids == self.active_instance_id:
+ self.switch_over_ha_instance()
+ # optional also in constraints
+ reply['instance_action'] = "MIGRATE"
+ reply_state = 'ACK_PLANNED_MAINTENANCE'
+
+ elif state == 'INSTANCE_ACTION_DONE':
+ # TBD was action done in allowed window
+ self.log.info('%s' % payload['instance_ids'])
+ else:
+ raise Exception('VNFM received event with'
+ ' unknown state %s' % state)
+
+ if reply_state:
+ if self.conf.admin_tool.type == 'fenix':
+ self.headers['X-Auth-Token'] = self.session.get_token()
+ reply['state'] = reply_state
+ url = payload['reply_url']
+ self.log.info('VNFM reply: %s' % reply)
+ requests.put(url, data=json.dumps(reply), headers=self.headers)
+
+ return 'OK'
+
+ @app.route('/shutdown', methods=['POST'])
+ def shutdown():
+ self.log.info('shutdown VNFM server at %s' % time.time())
+ func = request.environ.get('werkzeug.server.shutdown')
+ if func is None:
+ raise RuntimeError('Not running with the Werkzeug Server')
+ func()
+ return 'VNFM shutting down...'
+
+ app.run(host="0.0.0.0", port=self.port)
diff --git a/doctor_tests/common/constants.py b/doctor_tests/common/constants.py
index 088ff633..201f3fc4 100644
--- a/doctor_tests/common/constants.py
+++ b/doctor_tests/common/constants.py
@@ -12,6 +12,10 @@ from collections import namedtuple
Host = namedtuple('Host', ['name', 'ip'])
+def is_fenix(conf):
+ return conf.admin_tool.type == 'fenix'
+
+
class Inspector(object):
CONGRESS = 'congress'
SAMPLE = 'sample'
diff --git a/doctor_tests/common/utils.py b/doctor_tests/common/utils.py
index 1a84c824..67ca4f4b 100644
--- a/doctor_tests/common/utils.py
+++ b/doctor_tests/common/utils.py
@@ -10,6 +10,7 @@ import json
import os
import paramiko
import re
+import subprocess
def load_json_file(full_path):
@@ -67,7 +68,7 @@ class SSHClient(object):
def __del__(self):
self.client.close()
- def ssh(self, command):
+ def ssh(self, command, raise_enabled=True):
if self.log:
self.log.info("Executing: %s" % command)
stdin, stdout, stderr = self.client.exec_command(command)
@@ -75,7 +76,7 @@ class SSHClient(object):
output = list()
for line in stdout.read().splitlines():
output.append(line.decode('utf-8'))
- if ret:
+ if ret and raise_enabled:
if self.log:
self.log.info("*** FAILED to run command %s (%s)"
% (command, ret))
@@ -97,6 +98,27 @@ class SSHClient(object):
ftp.close()
+class LocalSSH(object):
+
+ def __init__(self, log):
+ self.log = log
+ self.log.info('Init local ssh client')
+
+ def ssh(self, cmd):
+ ret = 0
+ output = "%s failed!!!" % cmd
+ try:
+ output = subprocess.check_output((cmd), shell=True,
+ universal_newlines=True)
+ except subprocess.CalledProcessError:
+ ret = 1
+ return ret, output
+
+ def scp(self, src_file, dst_file):
+ return subprocess.check_output("cp %s %s" % (src_file, dst_file),
+ shell=True)
+
+
def run_async(func):
from threading import Thread
from functools import wraps
diff --git a/doctor_tests/config.py b/doctor_tests/config.py
index dc05c0d8..cea1f0c9 100644
--- a/doctor_tests/config.py
+++ b/doctor_tests/config.py
@@ -11,6 +11,8 @@ import itertools
from oslo_config import cfg
from doctor_tests import alarm
+from doctor_tests import admin_tool
+from doctor_tests import app_manager
from doctor_tests import consumer
from doctor_tests import image
from doctor_tests import instance
@@ -30,6 +32,8 @@ def list_opts():
('monitor', monitor.OPTS),
('inspector', inspector.OPTS),
('consumer', consumer.OPTS),
+ ('admin_tool', admin_tool.OPTS),
+ ('app_manager', app_manager.OPTS),
('DEFAULT', itertools.chain(
os_clients.OPTS,
image.OPTS,
diff --git a/doctor_tests/consumer/__init__.py b/doctor_tests/consumer/__init__.py
index 2c66a547..e5a36506 100644
--- a/doctor_tests/consumer/__init__.py
+++ b/doctor_tests/consumer/__init__.py
@@ -21,7 +21,7 @@ OPTS = [
help='the ip of consumer',
required=True),
cfg.IntOpt('port',
- default='12346',
+ default=12346,
help='the port of doctor consumer',
required=True),
]
diff --git a/doctor_tests/image.py b/doctor_tests/image.py
index 9961b22d..50841ef6 100644
--- a/doctor_tests/image.py
+++ b/doctor_tests/image.py
@@ -7,7 +7,11 @@
# http://www.apache.org/licenses/LICENSE-2.0
##############################################################################
import os
-import urllib.request
+try:
+ from urllib.request import urlopen
+except Exception:
+ from urllib2 import urlopen
+
from oslo_config import cfg
@@ -46,11 +50,14 @@ class Image(object):
def create(self):
self.log.info('image create start......')
-
images = {image.name: image for image in self.glance.images.list()}
+ if self.conf.image_name == 'cirros':
+ cirros = [image for image in images if 'cirros' in image]
+ if cirros:
+ self.conf.image_name = cirros[0]
if self.conf.image_name not in images:
if not os.path.exists(self.conf.image_filename):
- resp = urllib.request.urlopen(self.conf.image_download_url)
+ resp = urlopen(self.conf.image_download_url)
with open(self.conf.image_filename, "wb") as file:
file.write(resp.read())
self.image = \
diff --git a/doctor_tests/inspector/__init__.py b/doctor_tests/inspector/__init__.py
index 31291baf..50365a61 100644
--- a/doctor_tests/inspector/__init__.py
+++ b/doctor_tests/inspector/__init__.py
@@ -42,6 +42,10 @@ _inspector_name_class_mapping = {
}
-def get_inspector(conf, log):
+def get_inspector(conf, log, transport_url=None):
inspector_class = _inspector_name_class_mapping[conf.inspector.type]
- return importutils.import_object(inspector_class, conf, log)
+ if conf.inspector.type == 'sample':
+ return importutils.import_object(inspector_class, conf, log,
+ transport_url)
+ else:
+ return importutils.import_object(inspector_class, conf, log)
diff --git a/doctor_tests/inspector/congress.py b/doctor_tests/inspector/congress.py
index fb747ec5..7f918fb2 100644
--- a/doctor_tests/inspector/congress.py
+++ b/doctor_tests/inspector/congress.py
@@ -31,6 +31,8 @@ class CongressInspector(BaseInspector):
def __init__(self, conf, log):
super(CongressInspector, self).__init__(conf, log)
+ self.is_create_doctor_datasource = False
+ self.doctor_datasource_id = None
self.auth = get_identity_auth()
self.congress = congress_client(get_session(auth=self.auth))
self._init_driver_and_ds()
@@ -48,12 +50,6 @@ class CongressInspector(BaseInspector):
'version < nova_api_min_version(%s)'
% self.nova_api_min_version)
- # create doctor datasource if it's not exist
- if self.doctor_datasource not in datasources:
- self.congress.create_datasource(
- body={'driver': self.doctor_driver,
- 'name': self.doctor_datasource})
-
# check whether doctor driver exist
drivers = \
{driver['id']: driver for driver in
@@ -61,6 +57,14 @@ class CongressInspector(BaseInspector):
if self.doctor_driver not in drivers:
raise Exception('Do not support doctor driver in congress')
+ # create doctor datasource if it's not exist
+ if self.doctor_datasource not in datasources:
+ response = self.congress.create_datasource(
+ body={'driver': self.doctor_driver,
+ 'name': self.doctor_datasource})
+ self.doctor_datasource_id = response['id']
+ self.is_create_doctor_datasource = True
+
self.policy_rules = \
{rule['name']: rule for rule in
self.congress.list_policy_rules(self.policy)['results']}
@@ -86,6 +90,9 @@ class CongressInspector(BaseInspector):
for rule_name in self.rules.keys():
self._del_rule(rule_name)
+ if self.is_create_doctor_datasource:
+ self.congress.delete_datasource(self.doctor_datasource_id)
+
def _add_rule(self, rule_name, rule):
if rule_name not in self.policy_rules:
self.congress.create_policy_rule(self.policy,
diff --git a/doctor_tests/inspector/sample.py b/doctor_tests/inspector/sample.py
index 7742373d..c44db95d 100644
--- a/doctor_tests/inspector/sample.py
+++ b/doctor_tests/inspector/sample.py
@@ -10,9 +10,11 @@ import collections
from flask import Flask
from flask import request
import json
+import oslo_messaging
import time
from threading import Thread
import requests
+import yaml
from doctor_tests.common import utils
from doctor_tests.identity_auth import get_identity_auth
@@ -25,7 +27,7 @@ from doctor_tests.inspector.base import BaseInspector
class SampleInspector(BaseInspector):
event_type = 'compute.host.down'
- def __init__(self, conf, log):
+ def __init__(self, conf, log, transport_url):
super(SampleInspector, self).__init__(conf, log)
self.inspector_url = self.get_inspector_url()
self.novaclients = list()
@@ -42,6 +44,17 @@ class SampleInspector(BaseInspector):
self.hostnames = list()
self.app = None
+ try:
+ transport = oslo_messaging.get_notification_transport(self.conf,
+ transport_url)
+ self.notif = oslo_messaging.Notifier(transport,
+ 'compute.instance.update',
+ driver='messaging',
+ topics=['notifications'])
+ self.notif = self.notif.prepare(publisher_id='sample')
+ except Exception:
+ self.notif = None
+
def _init_novaclients(self):
self.NUMBER_OF_CLIENTS = self.conf.instance_count
auth = get_identity_auth(project=self.conf.doctor_project)
@@ -53,13 +66,13 @@ class SampleInspector(BaseInspector):
def _init_servers_list(self):
self.servers.clear()
opts = {'all_tenants': True}
- servers = self.nova.servers.list(search_opts=opts)
+ servers = self.nova.servers.list(detailed=True, search_opts=opts)
for server in servers:
try:
host = server.__dict__.get('OS-EXT-SRV-ATTR:host')
self.servers[host].append(server)
self.log.debug('get hostname=%s from server=%s'
- % (host, server))
+ % (host, str(server.name)))
except Exception as e:
self.log.info('can not get hostname from server=%s, error=%s'
% (server, e))
@@ -96,15 +109,52 @@ class SampleInspector(BaseInspector):
event_type = event['type']
if event_type == self.event_type:
self.hostnames.append(hostname)
+ if self.notif is not None:
+ thr0 = self._send_notif(hostname)
thr1 = self._disable_compute_host(hostname)
thr2 = self._vms_reset_state('error', hostname)
if self.conf.inspector.update_neutron_port_dp_status:
thr3 = self._set_ports_data_plane_status('DOWN', hostname)
+ if self.notif is not None:
+ thr0.join()
thr1.join()
thr2.join()
if self.conf.inspector.update_neutron_port_dp_status:
thr3.join()
+ def _alarm_data_decoder(self, data):
+ if "[" in data or "{" in data:
+ # string to list or dict removing unicode
+ data = yaml.load(data.replace("u'", "'"))
+ return data
+
+ def _alarm_traits_decoder(self, data):
+ return ({str(t[0]): self._alarm_data_decoder(str(t[2]))
+ for t in data['reason_data']['event']['traits']})
+
+ def maintenance(self, data):
+ try:
+ payload = self._alarm_traits_decoder(data)
+ except Exception:
+ payload = ({t[0]: t[2] for t in
+ data['reason_data']['event']['traits']})
+ self.log.error('cannot parse alarm data: %s' % payload)
+ raise Exception('sample inspector cannot parse alarm. '
+ 'Possibly trait data over 256 chars')
+ self.log.info('sample inspector received data = %s' % payload)
+
+ state = payload['state']
+ host = payload['host']
+
+ if state == 'IN_MAINTENANCE':
+ self.log.info("sample inspector: disable %s automatic fault "
+ "management" % host)
+ elif state == 'MAINTENANCE_COMPLETE':
+ self.log.info("sample inspector: enable %s automatic fault "
+ "management" % host)
+ else:
+ raise Exception("sample inspector couldn't handle state: %s"
+ % state)
+
@utils.run_async
def _disable_compute_host(self, hostname):
self.nova.services.force_down(hostname, 'nova-compute', True)
@@ -122,8 +172,8 @@ class SampleInspector(BaseInspector):
nova.servers.reset_state(server, state)
vmdown_time = time.time()
self.vm_down_time = vmdown_time
- self.log.info('doctor mark vm(%s) error at %s'
- % (server, vmdown_time))
+ self.log.info('doctor mark vm(%s) %s at %s'
+ % (server, state, vmdown_time))
thrs = []
for nova, server in zip(self.novaclients, self.servers[hostname]):
@@ -133,6 +183,26 @@ class SampleInspector(BaseInspector):
t.join()
@utils.run_async
+ def _send_notif(self, hostname):
+
+ @utils.run_async
+ def _send_notif(server):
+ payload = dict(tenant_id=server.tenant_id,
+ instance_id=server.id,
+ state="error")
+ self.notif.info({'some': 'context'}, 'compute.instance.update',
+ payload)
+ self.log.info('doctor compute.instance.update vm(%s) error %s'
+ % (server, time.time()))
+
+ thrs = []
+ for server in self.servers[hostname]:
+ t = _send_notif(server)
+ thrs.append(t)
+ for t in thrs:
+ t.join()
+
+ @utils.run_async
def _set_ports_data_plane_status(self, status, hostname):
body = {'data_plane_status': status}
@@ -173,6 +243,11 @@ class InspectorApp(Thread):
self.inspector.handle_events(events)
return "OK"
+ @app.route('/maintenance', methods=['POST'])
+ def maintenance():
+ self.inspector.maintenance(request.json)
+ return "OK"
+
@app.route('/events/shutdown', methods=['POST'])
def shutdown():
self.log.info('shutdown inspector app server at %s' % time.time())
diff --git a/doctor_tests/installer/__init__.py b/doctor_tests/installer/__init__.py
index 31fce754..00a01667 100644
--- a/doctor_tests/installer/__init__.py
+++ b/doctor_tests/installer/__init__.py
@@ -13,25 +13,25 @@ from oslo_utils import importutils
OPTS = [
cfg.StrOpt('type',
- default=os.environ.get('INSTALLER_TYPE', 'local'),
- choices=['local', 'apex', 'daisy', 'fuel'],
+ default=os.environ.get('INSTALLER_TYPE', 'devstack'),
+ choices=['apex', 'daisy', 'fuel', 'devstack'],
help='the type of installer',
required=True),
cfg.StrOpt('ip',
default=os.environ.get('INSTALLER_IP', '127.0.0.1'),
help='the ip of installer'),
- cfg.StrOpt('username',
- default='root',
- help='the user name for login installer server',
- required=True),
+ cfg.StrOpt('key_file',
+ default=os.environ.get('SSH_KEY', None),
+ help='the key for user to login installer server',
+ required=False),
]
_installer_name_class_mapping = {
- 'local': 'doctor_tests.installer.local.LocalInstaller',
'apex': 'doctor_tests.installer.apex.ApexInstaller',
'daisy': 'doctor_tests.installer.daisy.DaisyInstaller',
- 'fuel': 'doctor_tests.installer.mcp.McpInstaller'
+ 'fuel': 'doctor_tests.installer.mcp.McpInstaller',
+ 'devstack': 'doctor_tests.installer.devstack.DevstackInstaller'
}
diff --git a/doctor_tests/installer/apex.py b/doctor_tests/installer/apex.py
index 1ce3eb65..3ec2100c 100644
--- a/doctor_tests/installer/apex.py
+++ b/doctor_tests/installer/apex.py
@@ -6,29 +6,45 @@
# which accompanies this distribution, and is available at
# http://www.apache.org/licenses/LICENSE-2.0
##############################################################################
+import time
+
+from doctor_tests.common.constants import Inspector
+from doctor_tests.common.constants import is_fenix
+from doctor_tests.common.utils import get_doctor_test_root_dir
from doctor_tests.common.utils import SSHClient
from doctor_tests.installer.base import BaseInstaller
class ApexInstaller(BaseInstaller):
node_user_name = 'heat-admin'
- cm_set_script = 'set_ceilometer.py'
- cm_restore_script = 'restore_ceilometer.py'
+ installer_username = 'stack'
+ cm_set_script = 'set_config.py'
+ nc_set_compute_script = 'set_compute_config.py'
+ cg_set_script = 'set_congress.py'
+ fe_set_script = 'set_fenix.sh'
+ cm_restore_script = 'restore_config.py'
+ nc_restore_compute_script = 'restore_compute_config.py'
+ cg_restore_script = 'restore_congress.py'
+ ac_restart_script = 'restart_aodh.py'
+ ac_restore_script = 'restore_aodh.py'
+ python = 'python'
def __init__(self, conf, log):
super(ApexInstaller, self).__init__(conf, log)
self.client = SSHClient(self.conf.installer.ip,
- self.conf.installer.username,
+ self.installer_username,
+ key_filename=self.conf.installer.key_file,
look_for_keys=True)
self.key_file = None
self.controllers = list()
- self.controller_clients = list()
+ self.computes = list()
def setup(self):
self.log.info('Setup Apex installer start......')
-
self.key_file = self.get_ssh_key_from_installer()
- self.controllers = self.get_controller_ips()
+ self._get_overcloud_conf()
+ if is_fenix(self.conf):
+ self._copy_overcloudrc_to_controllers()
self.create_flavor()
self.set_apply_patches()
self.setup_stunnel()
@@ -42,16 +58,28 @@ class ApexInstaller(BaseInstaller):
key_path = '/home/stack/.ssh/id_rsa'
return self._get_ssh_key(self.client, key_path)
- def get_controller_ips(self):
- self.log.info('Get controller ips from Apex installer......')
-
- command = "source stackrc; " \
- "nova list | grep ' overcloud-controller-[0-9] ' " \
- "| sed -e 's/^.*ctlplane=//' |awk '{print $1}'"
- controllers = self._run_cmd_remote(self.client, command)
- self.log.info('Get controller_ips:%s from Apex installer'
- % controllers)
- return controllers
+ def _copy_overcloudrc_to_controllers(self):
+ for ip in self.controllers:
+ cmd = "scp overcloudrc %s@%s:" % (self.node_user_name, ip)
+ self._run_cmd_remote(self.client, cmd)
+
+ def _get_overcloud_conf(self):
+ self.log.info('Get overcloud config details from Apex installer'
+ '......')
+
+ command = "source stackrc; nova list | grep ' overcloud-'"
+ raw_ips_list = self._run_cmd_remote(self.client, command)
+ for line in raw_ips_list:
+ ip = line.split('ctlplane=', 1)[1].split(" ", 1)[0]
+ if 'overcloud-controller-' in line:
+ self.controllers.append(ip)
+ elif 'overcloud-novacompute-' in line:
+ self.computes.append(ip)
+ command = "grep docker /home/stack/deploy_command"
+ self.use_containers = self._check_cmd_remote(self.client, command)
+ self.log.info('controller_ips:%s' % self.controllers)
+ self.log.info('compute_ips:%s' % self.computes)
+ self.log.info('use_containers:%s' % self.use_containers)
def get_host_ip_from_hostname(self, hostname):
self.log.info('Get host ip by hostname=%s from Apex installer......'
@@ -62,27 +90,121 @@ class ApexInstaller(BaseInstaller):
host_ips = self._run_cmd_remote(self.client, command)
return host_ips[0]
+ def _set_docker_restart_cmd(self, service):
+ # There can be multiple instances running so need to restart all
+ cmd = "for container in `sudo docker ps | grep "
+ cmd += service
+ cmd += " | awk '{print $1}'`; do sudo docker restart $container; \
+ done;"
+ return cmd
+
def set_apply_patches(self):
self.log.info('Set apply patches start......')
-
- restart_cm_cmd = 'sudo systemctl restart ' \
- 'openstack-ceilometer-notification.service'
+ fenix_files = None
+
+ set_scripts = [self.cm_set_script]
+
+ if self.use_containers:
+ restart_cmd = (self._set_docker_restart_cmd(
+ "ceilometer-notification"))
+ set_scripts.append(self.ac_restart_script)
+ else:
+ restart_cmd = 'sudo systemctl restart' \
+ ' openstack-ceilometer-notification.service'
+
+ if self.conf.test_case != 'fault_management':
+ if self.use_containers:
+ restart_cmd += self._set_docker_restart_cmd("nova-scheduler")
+ if is_fenix(self.conf):
+ set_scripts.append(self.fe_set_script)
+ testdir = get_doctor_test_root_dir()
+ fenix_files = ["Dockerfile", "run"]
+ else:
+ restart_cmd += ' openstack-nova-scheduler.service'
+ set_scripts.append(self.nc_set_compute_script)
+
+ if self.conf.inspector.type == Inspector.CONGRESS:
+ if self.use_containers:
+ restart_cmd += self._set_docker_restart_cmd("congress-server")
+ else:
+ restart_cmd += ' openstack-congress-server.service'
+ set_scripts.append(self.cg_set_script)
for node_ip in self.controllers:
client = SSHClient(node_ip, self.node_user_name,
key_filename=self.key_file)
- self.controller_clients.append(client)
+ if fenix_files is not None:
+ for fenix_file in fenix_files:
+ src_file = '{0}/{1}/{2}'.format(testdir,
+ 'admin_tool/fenix',
+ fenix_file)
+ client.scp(src_file, fenix_file)
self._run_apply_patches(client,
- restart_cm_cmd,
- self.cm_set_script)
+ restart_cmd,
+ set_scripts,
+ python=self.python)
+ time.sleep(5)
+
+ self.log.info('Set apply patches for compute nodes start......')
+
+ if self.conf.test_case != 'fault_management':
+ if self.use_containers:
+ restart_cmd = self._set_docker_restart_cmd("nova")
+ else:
+ restart_cmd = 'sudo systemctl restart' \
+ ' openstack-nova-compute.service'
+ for node_ip in self.computes:
+ client = SSHClient(node_ip, self.node_user_name,
+ key_filename=self.key_file)
+ self._run_apply_patches(client,
+ restart_cmd,
+ [self.nc_set_compute_script],
+ python=self.python)
+ time.sleep(5)
def restore_apply_patches(self):
self.log.info('restore apply patches start......')
- restart_cm_cmd = 'sudo systemctl restart ' \
- 'openstack-ceilometer-notification.service'
+ restore_scripts = [self.cm_restore_script]
+
+ if self.use_containers:
+ restart_cmd = (self._set_docker_restart_cmd(
+ "ceilometer-notification"))
+ restore_scripts.append(self.ac_restore_script)
+ else:
+ restart_cmd = 'sudo systemctl restart' \
+ ' openstack-ceilometer-notification.service'
+
+ if self.conf.test_case != 'fault_management':
+ if self.use_containers:
+ restart_cmd += self._set_docker_restart_cmd("nova-scheduler")
+ else:
+ restart_cmd += ' openstack-nova-scheduler.service'
+ restore_scripts.append(self.nc_restore_compute_script)
+
+ if self.conf.inspector.type == Inspector.CONGRESS:
+ if self.use_containers:
+ restart_cmd += self._set_docker_restart_cmd("congress-server")
+ else:
+ restart_cmd += ' openstack-congress-server.service'
+ restore_scripts.append(self.cg_restore_script)
- for client in self.controller_clients:
+ for node_ip in self.controllers:
+ client = SSHClient(node_ip, self.node_user_name,
+ key_filename=self.key_file)
self._run_apply_patches(client,
- restart_cm_cmd,
- self.cm_restore_script)
+ restart_cmd,
+ restore_scripts,
+ python=self.python)
+
+ if self.conf.test_case != 'fault_management':
+ if self.use_containers:
+ restart_cmd = self._set_docker_restart_cmd("nova-compute")
+ else:
+ restart_cmd = 'sudo systemctl restart' \
+ ' openstack-nova-compute.service'
+            for node_ip in self.computes:
+                client = SSHClient(node_ip, self.node_user_name,
+                                   key_filename=self.key_file)
+                self._run_apply_patches(
+                    client, restart_cmd,
+                    [self.nc_restore_compute_script],
+                    python=self.python)
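
For container-based deployments, _set_docker_restart_cmd() renders a shell loop that restarts every container whose `docker ps` line matches the given service name, and set_apply_patches() concatenates one such loop per service into a single restart command executed once per node. A minimal sketch of how those loops compose (service names here are only illustrative):

    # Sketch: compose the per-service docker restart loops the installer builds.
    def docker_restart_cmd(service):
        # mirrors _set_docker_restart_cmd(): restart all containers matching 'service'
        return ("for container in `sudo docker ps | grep " + service +
                " | awk '{print $1}'`; do sudo docker restart $container; done;")

    restart_cmd = docker_restart_cmd("ceilometer-notification")
    restart_cmd += docker_restart_cmd("nova-scheduler")
    print(restart_cmd)  # one command string, run once per controller over SSH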
diff --git a/doctor_tests/installer/base.py b/doctor_tests/installer/base.py
index 76bbeb1e..de4d2f2e 100644
--- a/doctor_tests/installer/base.py
+++ b/doctor_tests/installer/base.py
@@ -14,8 +14,9 @@ import pwd
import six
import stat
import subprocess
+import time
-from doctor_tests.common.utils import get_doctor_test_root_dir
+from doctor_tests.common import utils
from doctor_tests.identity_auth import get_session
from doctor_tests.os_clients import nova_client
@@ -26,6 +27,7 @@ class BaseInstaller(object):
self.conf = conf
self.log = log
self.servers = list()
+ self.use_containers = False
@abc.abstractproperty
def node_user_name(self):
@@ -58,22 +60,52 @@ class BaseInstaller(object):
def setup_stunnel(self):
self.log.info('Setup ssh stunnel in %s installer......'
% self.conf.installer.type)
+ tunnels = [self.conf.consumer.port]
+ if self.conf.test_case == 'maintenance':
+ tunnel_uptime = 1200
+ tunnels += [self.conf.app_manager.port, self.conf.inspector.port]
+ elif self.conf.test_case == 'all':
+ tunnel_uptime = 1800
+ tunnels += [self.conf.app_manager.port, self.conf.inspector.port]
+ else:
+ tunnel_uptime = 600
for node_ip in self.controllers:
+ for port in tunnels:
+ self.log.info('tunnel for port %s' % port)
+ cmd = ("ssh -o UserKnownHostsFile=/dev/null"
+ " -o StrictHostKeyChecking=no"
+ " -i %s %s@%s -R %s:localhost:%s"
+ " sleep %s > ssh_tunnel.%s.%s"
+ " 2>&1 < /dev/null "
+ % (self.key_file,
+ self.node_user_name,
+ node_ip,
+ port,
+ port,
+ tunnel_uptime,
+ node_ip,
+ port))
+ server = subprocess.Popen('exec ' + cmd, shell=True)
+ self.servers.append(server)
+ if self.conf.admin_tool.type == 'fenix':
+ port = self.conf.admin_tool.port
+ self.log.info('tunnel for port %s' % port)
cmd = ("ssh -o UserKnownHostsFile=/dev/null"
" -o StrictHostKeyChecking=no"
- " -i %s %s@%s -R %s:localhost:%s"
- " sleep 600 > ssh_tunnel.%s.log"
- " 2>&1 < /dev/null &"
+ " -i %s %s@%s -L %s:localhost:%s"
+ " sleep %s > ssh_tunnel.%s.%s"
+ " 2>&1 < /dev/null "
% (self.key_file,
self.node_user_name,
node_ip,
- self.conf.consumer.port,
- self.conf.consumer.port,
- node_ip))
- server = subprocess.Popen(cmd, shell=True)
+ port,
+ port,
+ tunnel_uptime,
+ node_ip,
+ port))
+ server = subprocess.Popen('exec ' + cmd, shell=True)
self.servers.append(server)
- server.communicate()
def _get_ssh_key(self, client, key_path):
self.log.info('Get SSH keys from %s installer......'
@@ -84,7 +116,8 @@ class BaseInstaller(object):
% self.conf.installer.type)
return self.key_file
- ssh_key = '{0}/{1}'.format(get_doctor_test_root_dir(), 'instack_key')
+ ssh_key = '{0}/{1}'.format(utils.get_doctor_test_root_dir(),
+ 'instack_key')
client.scp(key_path, ssh_key, method='get')
user = getpass.getuser()
uid = pwd.getpwnam(user).pw_uid
@@ -93,6 +126,10 @@ class BaseInstaller(object):
os.chmod(ssh_key, stat.S_IREAD)
return ssh_key
+ @abc.abstractmethod
+ def get_transport_url(self):
+ pass
+
def _run_cmd_remote(self, client, command):
self.log.info('Run command=%s in %s installer......'
% (command, self.conf.installer.type))
@@ -107,16 +144,48 @@ class BaseInstaller(object):
% (output, command, self.conf.installer.type))
return output
- def _run_apply_patches(self, client, restart_cmd, script_name):
- installer_dir = os.path.dirname(os.path.realpath(__file__))
- script_abs_path = '{0}/{1}/{2}'.format(installer_dir,
- 'common', script_name)
+ def _check_cmd_remote(self, client, command):
+ self.log.info('Check command=%s return in %s installer......'
+ % (command, self.conf.installer.type))
- client.scp(script_abs_path, script_name)
- cmd = 'sudo python %s' % script_name
- ret, output = client.ssh(cmd)
- if ret:
- raise Exception('Do the command in controller'
- ' node failed, ret=%s, cmd=%s, output=%s'
- % (ret, cmd, output))
- client.ssh(restart_cmd)
+ ret, output = client.ssh(command, raise_enabled=False)
+ self.log.info('return %s' % ret)
+        return ret == 0
+
+ @utils.run_async
+ def _run_apply_patches(self, client, restart_cmd, script_names,
+ python='python3'):
+ installer_dir = os.path.dirname(os.path.realpath(__file__))
+ if isinstance(script_names, list):
+ for script_name in script_names:
+ script_abs_path = '{0}/{1}/{2}'.format(installer_dir,
+ 'common', script_name)
+ if self.conf.installer.type == "devstack":
+ script_name = "/opt/stack/%s" % script_name
+                # the copy is retried once if the first scp attempt fails
+                try:
+                    client.scp(script_abs_path, script_name)
+                except Exception:
+                    client.scp(script_abs_path, script_name)
+                # run the script; retry once if the ssh call itself fails
+                try:
+ if ".py" in script_name:
+ cmd = 'sudo %s %s' % (python, script_name)
+ else:
+ cmd = 'sudo chmod 700 %s;sudo ./%s' % (script_name,
+ script_name)
+ ret, output = client.ssh(cmd)
+ self.log.info('Command %s output %s' % (cmd, output))
+ except Exception:
+ ret, output = client.ssh(cmd)
+ self.log.info('Command %s output %s' % (cmd, output))
+ if ret:
+                raise Exception('Command failed on remote node,'
+                                ' ret=%s, cmd=%s, output=%s'
+                                % (ret, cmd, output))
+ if 'nova' in restart_cmd or 'devstack@n-' in restart_cmd:
+ # Make sure scheduler has proper cpu_allocation_ratio
+ time.sleep(5)
+ client.ssh(restart_cmd)
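
_run_apply_patches() is now decorated with utils.run_async, so callers can either fire it and continue (as apex.py does) or keep the returned handle and join it (as mcp.py does later in this patch). The decorator itself lives in doctor_tests/common/utils.py and is not part of this diff; a minimal sketch of the shape assumed here, returning a started thread:

    import threading
    import time

    # Sketch only: assumed shape of utils.run_async; the real helper lives in
    # doctor_tests/common/utils.py and is not shown in this patch.
    def run_async(func):
        def wrapper(*args, **kwargs):
            thread = threading.Thread(target=func, args=args, kwargs=kwargs)
            thread.start()
            return thread  # callers may join() it, as mcp.py does
        return wrapper

    @run_async
    def apply_patch(node_ip):
        time.sleep(1)  # stands in for scp + remote script execution
        print('patched %s' % node_ip)

    threads = [apply_patch(ip) for ip in ('192.0.2.10', '192.0.2.11')]
    for thr in threads:
        thr.join()  # wait until every node has been patched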
diff --git a/doctor_tests/installer/common/congress.py b/doctor_tests/installer/common/congress.py
deleted file mode 100644
index cc58c390..00000000
--- a/doctor_tests/installer/common/congress.py
+++ /dev/null
@@ -1,51 +0,0 @@
-##############################################################################
-# Copyright (c) 2017 ZTE Corporation and others.
-#
-# All rights reserved. This program and the accompanying materials
-# are made available under the terms of the Apache License, Version 2.0
-# which accompanies this distribution, and is available at
-# http://www.apache.org/licenses/LICENSE-2.0
-##############################################################################
-
-
-def set_doctor_driver_conf(ssh_client, restart_cmd):
- cg_set_cmd = '''#!/bin/bash
-co_conf=/etc/congress/congress.conf
-co_conf_bak=/etc/congress/congress.conf.bak
-co_entry="congress.datasources.doctor_driver.DoctorDriver"
-if sudo grep -e "^drivers.*$co_entry" $co_conf; then
- echo "NOTE: congress is configured as we needed"
-else
- echo "modify the congress config"
- sudo cp $co_conf $co_conf_bak
- sudo sed -i -e "/^drivers/s/$/,$co_entry/" $co_conf
- %s
-fi
- ''' % (restart_cmd)
-
- ret, output = ssh_client.ssh(cg_set_cmd)
- if ret:
- raise Exception('Do the congress command in controller node failed...'
- 'ret=%s, cmd=%s, output=%s'
- % (ret, cg_set_cmd, output))
-
-
-def restore_doctor_driver_conf(ssh_client, restart_cmd):
- cg_restore_cmd = '''#!/bin/bash
-co_conf=/etc/congress/congress.conf
-co_conf_bak=/etc/congress/congress.conf.bak
-if [ -e $co_conf_bak ]; then
- echo "restore the congress config"
- sudo cp $co_conf_bak $co_conf
- sudo rm $co_conf_bak
- %s
-else
- echo "Do not need to restore the congress config"
-fi
- ''' % (restart_cmd)
-
- ret, output = ssh_client.ssh(cg_restore_cmd)
- if ret:
- raise Exception('Do the congress command in controller node failed...'
- 'ret=%s, cmd=%s, output=%s'
- % (ret, cg_restore_cmd, output))
diff --git a/doctor_tests/installer/common/restart_aodh.py b/doctor_tests/installer/common/restart_aodh.py
new file mode 100644
index 00000000..4473bdca
--- /dev/null
+++ b/doctor_tests/installer/common/restart_aodh.py
@@ -0,0 +1,42 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import socket
+import subprocess
+
+
+def restart_aodh_event_alarm():
+ # Restart aodh-evaluator docker with localhost as controller host ip
+ # This makes our alarm sending look the same as without container
+
+ orig_docker_id = subprocess.check_output("docker ps | grep aodh-evaluator "
+ "| awk '{print $1}'", shell=True)
+ get_docker_startup = (
+ 'docker inspect --format=\'{{range .Config.Env}} -e "{{.}}" {{end}} '
+ '{{range .Mounts}} -v {{.Source}}:{{.Destination}}{{if .Mode}}:'
+ '{{.Mode}}{{end}}{{end}} -ti {{.Config.Image}}\''
+ )
+ docker_start = subprocess.check_output("%s %s" % (get_docker_startup,
+ orig_docker_id), shell=True)
+ with open("orig_docker_id", 'w') as oid:
+ oid.write(orig_docker_id)
+ oid.close()
+ subprocess.check_output("docker stop %s" % orig_docker_id, shell=True)
+ ip = socket.gethostbyname(socket.gethostname())
+
+ ae_start = '-d --add-host="localhost:%s" %s' % (ip, docker_start)
+ subprocess.check_output("docker run %s" % ae_start, shell=True)
+ new_docker_id = subprocess.check_output("docker ps | grep aodh-evaluator "
+ " | awk '{print $1}'", shell=True)
+ if orig_docker_id == new_docker_id:
+ raise Exception("Docker ids matching!")
+ with open("new_docker_id", 'w') as nid:
+ nid.write(new_docker_id)
+ nid.close()
+
+restart_aodh_event_alarm()
diff --git a/doctor_tests/installer/common/restore_aodh.py b/doctor_tests/installer/common/restore_aodh.py
new file mode 100644
index 00000000..b55eae8d
--- /dev/null
+++ b/doctor_tests/installer/common/restore_aodh.py
@@ -0,0 +1,32 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import subprocess
+
+
+def restore_aodh_event_alarm():
+ # Remove modified docker and restore original
+ orig = "orig_docker_id"
+ new = "new_docker_id"
+ if os.path.isfile(orig):
+ with open("orig_docker_id", 'r') as oid:
+ orig_docker_id = oid.read()
+ oid.close()
+ if os.path.isfile(new):
+ with open("new_docker_id", 'r') as nid:
+ new_docker_id = nid.read()
+ nid.close()
+ subprocess.check_output("docker stop %s" % new_docker_id,
+ shell=True)
+ subprocess.check_output("docker rm %s" % new_docker_id, shell=True)
+ os.remove(new)
+ subprocess.check_output("docker start %s" % orig_docker_id, shell=True)
+ os.remove(orig)
+
+restore_aodh_event_alarm()
diff --git a/doctor_tests/installer/common/restore_ceilometer.py b/doctor_tests/installer/common/restore_ceilometer.py
deleted file mode 100644
index d25b9ede..00000000
--- a/doctor_tests/installer/common/restore_ceilometer.py
+++ /dev/null
@@ -1,27 +0,0 @@
-##############################################################################
-# Copyright (c) 2017 ZTE Corporation and others.
-#
-# All rights reserved. This program and the accompanying materials
-# are made available under the terms of the Apache License, Version 2.0
-# which accompanies this distribution, and is available at
-# http://www.apache.org/licenses/LICENSE-2.0
-##############################################################################
-import os
-import shutil
-
-ep_file = '/etc/ceilometer/event_pipeline.yaml'
-ep_file_bak = '/etc/ceilometer/event_pipeline.yaml.bak'
-
-
-def restore_ep_config():
-
- if not os.path.isfile(ep_file_bak):
- print('Bak_file:%s does not exist.' % ep_file_bak)
- else:
- print('restore')
- shutil.copyfile(ep_file_bak, ep_file)
- os.remove(ep_file_bak)
- return
-
-
-restore_ep_config()
diff --git a/doctor_tests/installer/common/restore_compute_config.py b/doctor_tests/installer/common/restore_compute_config.py
new file mode 100644
index 00000000..82e10a66
--- /dev/null
+++ b/doctor_tests/installer/common/restore_compute_config.py
@@ -0,0 +1,26 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import shutil
+
+
+def restore_cpu_allocation_ratio():
+ for nova_file_bak in ["/var/lib/config-data/puppet-generated/nova_libvirt/etc/nova/nova.bak", # noqa
+ "/var/lib/config-data/puppet-generated/nova/etc/nova/nova.bak", # noqa
+ "/etc/nova/nova.bak"]:
+ if os.path.isfile(nova_file_bak):
+ nova_file = nova_file_bak.replace(".bak", ".conf")
+ print('restoring nova.bak.')
+ shutil.copyfile(nova_file_bak, nova_file)
+ os.remove(nova_file_bak)
+ return
+ print('nova.bak does not exist.')
+ return
+
+restore_cpu_allocation_ratio()
diff --git a/doctor_tests/installer/common/restore_config.py b/doctor_tests/installer/common/restore_config.py
new file mode 100644
index 00000000..5cb83b27
--- /dev/null
+++ b/doctor_tests/installer/common/restore_config.py
@@ -0,0 +1,48 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import shutil
+
+
+cbase = "/var/lib/config-data/puppet-generated/ceilometer"
+if not os.path.isdir(cbase):
+ cbase = ""
+
+
+def restore_ep_config():
+ ep_file = cbase + '/etc/ceilometer/event_pipeline.yaml'
+ ep_file_bak = cbase + '/etc/ceilometer/event_pipeline.yaml.bak'
+
+ if not os.path.isfile(ep_file_bak):
+ print('Bak_file:%s does not exist.' % ep_file_bak)
+ else:
+ print('restore')
+ shutil.copyfile(ep_file_bak, ep_file)
+ os.remove(ep_file_bak)
+ return
+
+
+def restore_ed_config():
+ ed_file = cbase + '/etc/ceilometer/event_definitions.yaml'
+ ed_file_bak = cbase + '/etc/ceilometer/event_definitions.bak'
+
+ if not os.path.isfile(ed_file_bak):
+ print("Bak_file doesn't exist: %s." % ed_file_bak)
+ else:
+ print('restore: %s' % ed_file)
+ if os.stat(ed_file_bak).st_size == 0:
+ print('Bak_file empty, so removing also: %s' % ed_file)
+ os.remove(ed_file)
+ else:
+ shutil.copyfile(ed_file_bak, ed_file)
+ os.remove(ed_file_bak)
+ return
+
+restore_ep_config()
+restore_ed_config()
diff --git a/doctor_tests/installer/common/restore_congress.py b/doctor_tests/installer/common/restore_congress.py
new file mode 100644
index 00000000..576f1b16
--- /dev/null
+++ b/doctor_tests/installer/common/restore_congress.py
@@ -0,0 +1,29 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import shutil
+
+
+def restore_drivers_config():
+ co_base = "/var/lib/config-data/puppet-generated/congress"
+ if not os.path.isdir(co_base):
+ co_base = ""
+ co_conf = co_base + "/etc/congress/congress.conf"
+ co_conf_bak = co_base + "/etc/congress/congress.conf.bak"
+
+ if not os.path.isfile(co_conf_bak):
+ print('Bak_file:%s does not exist.' % co_conf_bak)
+ else:
+ print('restore: %s' % co_conf)
+ shutil.copyfile(co_conf_bak, co_conf)
+ os.remove(co_conf_bak)
+ return
+
+
+restore_drivers_config()
diff --git a/doctor_tests/installer/common/set_ceilometer.py b/doctor_tests/installer/common/set_ceilometer.py
deleted file mode 100644
index 4050aaef..00000000
--- a/doctor_tests/installer/common/set_ceilometer.py
+++ /dev/null
@@ -1,45 +0,0 @@
-##############################################################################
-# Copyright (c) 2017 ZTE Corporation and others.
-#
-# All rights reserved. This program and the accompanying materials
-# are made available under the terms of the Apache License, Version 2.0
-# which accompanies this distribution, and is available at
-# http://www.apache.org/licenses/LICENSE-2.0
-##############################################################################
-import os
-import shutil
-import yaml
-
-ep_file = '/etc/ceilometer/event_pipeline.yaml'
-ep_file_bak = '/etc/ceilometer/event_pipeline.yaml.bak'
-event_notifier_topic = 'notifier://?topic=alarm.all'
-
-
-def set_notifier_topic():
- config_modified = False
-
- if not os.path.isfile(ep_file):
- raise Exception("File doesn't exist: %s." % ep_file)
-
- with open(ep_file, 'r') as file:
- config = yaml.safe_load(file)
-
- sinks = config['sinks']
- for sink in sinks:
- if sink['name'] == 'event_sink':
- publishers = sink['publishers']
- if event_notifier_topic not in publishers:
- print('Add event notifier in ceilometer')
- publishers.append(event_notifier_topic)
- config_modified = True
- else:
- print('NOTE: event notifier is configured'
- 'in ceilometer as we needed')
-
- if config_modified:
- shutil.copyfile(ep_file, ep_file_bak)
- with open(ep_file, 'w+') as file:
- file.write(yaml.safe_dump(config))
-
-
-set_notifier_topic()
diff --git a/doctor_tests/installer/common/set_compute_config.py b/doctor_tests/installer/common/set_compute_config.py
new file mode 100644
index 00000000..615f1895
--- /dev/null
+++ b/doctor_tests/installer/common/set_compute_config.py
@@ -0,0 +1,53 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import shutil
+
+
+def set_cpu_allocation_ratio():
+ nova_file_bak = None
+ for nova_file in ["/var/lib/config-data/puppet-generated/nova_libvirt/etc/nova/nova.conf", # noqa
+ "/var/lib/config-data/puppet-generated/nova/etc/nova/nova.conf", # noqa
+ "/etc/nova/nova.conf"]:
+ if os.path.isfile(nova_file):
+ nova_file_bak = nova_file.replace(".conf", ".bak")
+ break
+
+ if nova_file_bak is None:
+ raise Exception("Could not find nova.conf")
+ # TODO (tojuvone): Unfortunately ConfigParser did not produce working conf
+ fcheck = open(nova_file)
+ found_list = ([ca for ca in fcheck.readlines() if "cpu_allocation_ratio"
+ in ca])
+ fcheck.close()
+ change = False
+ found = False
+ if found_list and len(found_list):
+ for car in found_list:
+ if car.startswith('#'):
+ continue
+ if car.startswith('cpu_allocation_ratio'):
+ found = True
+ if "1.0" not in car.split('=')[1]:
+ change = True
+ if not found or change:
+ # need to add or change
+ shutil.copyfile(nova_file, nova_file_bak)
+ fin = open(nova_file_bak)
+ fout = open(nova_file, "wt")
+ for line in fin:
+ if change and line.startswith("cpu_allocation_ratio"):
+ line = "cpu_allocation_ratio=1.0"
+ if not found and line.startswith("[DEFAULT]"):
+ line += "cpu_allocation_ratio=1.0\n"
+ fout.write(line)
+ fin.close()
+ fout.close()
+
+set_cpu_allocation_ratio()
diff --git a/doctor_tests/installer/common/set_config.py b/doctor_tests/installer/common/set_config.py
new file mode 100644
index 00000000..e66d4c2c
--- /dev/null
+++ b/doctor_tests/installer/common/set_config.py
@@ -0,0 +1,163 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import shutil
+import yaml
+
+
+cbase = "/var/lib/config-data/puppet-generated/ceilometer"
+if not os.path.isdir(cbase):
+ cbase = ""
+
+
+def set_notifier_topic():
+ ep_file = cbase + '/etc/ceilometer/event_pipeline.yaml'
+ ep_file_bak = cbase + '/etc/ceilometer/event_pipeline.yaml.bak'
+ event_notifier_topic = 'notifier://?topic=alarm.all'
+ config_modified = False
+
+ if not os.path.isfile(ep_file):
+ raise Exception("File doesn't exist: %s." % ep_file)
+
+ with open(ep_file, 'r') as file:
+ config = yaml.safe_load(file)
+
+ sinks = config['sinks']
+ for sink in sinks:
+ if sink['name'] == 'event_sink':
+ publishers = sink['publishers']
+ if event_notifier_topic not in publishers:
+ print('Add event notifier in ceilometer')
+ publishers.append(event_notifier_topic)
+ config_modified = True
+ else:
+                print('NOTE: event notifier is already configured'
+                      ' in ceilometer as needed')
+
+ if config_modified:
+ shutil.copyfile(ep_file, ep_file_bak)
+ with open(ep_file, 'w+') as file:
+ file.write(yaml.safe_dump(config))
+
+
+def set_event_definitions():
+ ed_file = cbase + '/etc/ceilometer/event_definitions.yaml'
+ ed_file_bak = cbase + '/etc/ceilometer/event_definitions.bak'
+ orig_ed_file_exist = True
+ modify_config = False
+
+ if not os.path.isfile(ed_file):
+ # Deployment did not modify file, so it did not exist
+ src_file = '/etc/ceilometer/event_definitions.yaml'
+ if not os.path.isfile(src_file):
+ config = []
+ orig_ed_file_exist = False
+ else:
+ shutil.copyfile('/etc/ceilometer/event_definitions.yaml', ed_file)
+ if orig_ed_file_exist:
+ with open(ed_file, 'r') as file:
+ config = yaml.safe_load(file)
+
+ et_list = [et['event_type'] for et in config]
+
+ if 'compute.instance.update' in et_list:
+        print('NOTE: compute.instance.update already configured')
+ else:
+ print('NOTE: add compute.instance.update to event_definitions.yaml')
+ modify_config = True
+ instance_update = {
+ 'event_type': 'compute.instance.update',
+ 'traits': {
+ 'deleted_at': {'fields': 'payload.deleted_at',
+ 'type': 'datetime'},
+ 'disk_gb': {'fields': 'payload.disk_gb',
+ 'type': 'int'},
+ 'display_name': {'fields': 'payload.display_name'},
+ 'ephemeral_gb': {'fields': 'payload.ephemeral_gb',
+ 'type': 'int'},
+ 'host': {'fields': 'publisher_id.`split(., 1, 1)`'},
+ 'instance_id': {'fields': 'payload.instance_id'},
+ 'instance_type': {'fields': 'payload.instance_type'},
+ 'instance_type_id': {'fields': 'payload.instance_type_id',
+ 'type': 'int'},
+ 'launched_at': {'fields': 'payload.launched_at',
+ 'type': 'datetime'},
+ 'memory_mb': {'fields': 'payload.memory_mb',
+ 'type': 'int'},
+ 'old_state': {'fields': 'payload.old_state'},
+ 'os_architecture': {
+ 'fields':
+ "payload.image_meta.'org.openstack__1__architecture'"},
+ 'os_distro': {
+ 'fields':
+ "payload.image_meta.'org.openstack__1__os_distro'"},
+ 'os_version': {
+ 'fields':
+ "payload.image_meta.'org.openstack__1__os_version'"},
+ 'resource_id': {'fields': 'payload.instance_id'},
+ 'root_gb': {'fields': 'payload.root_gb',
+ 'type': 'int'},
+ 'service': {'fields': 'publisher_id.`split(., 0, -1)`'},
+ 'state': {'fields': 'payload.state'},
+ 'tenant_id': {'fields': 'payload.tenant_id'},
+ 'user_id': {'fields': 'payload.user_id'},
+ 'vcpus': {'fields': 'payload.vcpus', 'type': 'int'}
+ }
+ }
+ config.append(instance_update)
+
+ if 'maintenance.scheduled' in et_list:
+        print('NOTE: maintenance.scheduled already configured')
+ else:
+ print('NOTE: add maintenance.scheduled to event_definitions.yaml')
+ modify_config = True
+ mscheduled = {
+ 'event_type': 'maintenance.scheduled',
+ 'traits': {
+ 'allowed_actions': {'fields': 'payload.allowed_actions'},
+ 'instance_ids': {'fields': 'payload.instance_ids'},
+ 'reply_url': {'fields': 'payload.reply_url'},
+ 'actions_at': {'fields': 'payload.actions_at',
+ 'type': 'datetime'},
+ 'reply_at': {'fields': 'payload.reply_at', 'type': 'datetime'},
+ 'state': {'fields': 'payload.state'},
+ 'session_id': {'fields': 'payload.session_id'},
+ 'project_id': {'fields': 'payload.project_id'},
+ 'metadata': {'fields': 'payload.metadata'}
+ }
+ }
+ config.append(mscheduled)
+
+ if 'maintenance.host' in et_list:
+        print('NOTE: maintenance.host already configured')
+ else:
+ print('NOTE: add maintenance.host to event_definitions.yaml')
+ modify_config = True
+ mhost = {
+ 'event_type': 'maintenance.host',
+ 'traits': {
+ 'host': {'fields': 'payload.host'},
+ 'project_id': {'fields': 'payload.project_id'},
+ 'state': {'fields': 'payload.state'},
+ 'session_id': {'fields': 'payload.session_id'}
+ }
+ }
+ config.append(mhost)
+
+ if modify_config:
+ if orig_ed_file_exist:
+ shutil.copyfile(ed_file, ed_file_bak)
+ else:
+ with open(ed_file_bak, 'w+') as file:
+ file.close()
+ with open(ed_file, 'w+') as file:
+ file.write(yaml.safe_dump(config))
+
+set_notifier_topic()
+set_event_definitions()
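
set_event_definitions() appends plain dictionaries and rewrites the whole file with yaml.safe_dump, so the entries that end up in event_definitions.yaml can be previewed locally. A small sketch that dumps the maintenance.host definition added above:

    import yaml

    # Sketch: preview the YAML that set_event_definitions() appends for
    # the maintenance.host event type.
    mhost = {
        'event_type': 'maintenance.host',
        'traits': {
            'host': {'fields': 'payload.host'},
            'project_id': {'fields': 'payload.project_id'},
            'state': {'fields': 'payload.state'},
            'session_id': {'fields': 'payload.session_id'}
        }
    }
    print(yaml.safe_dump([mhost], default_flow_style=False))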
diff --git a/doctor_tests/installer/common/set_congress.py b/doctor_tests/installer/common/set_congress.py
new file mode 100644
index 00000000..7961df32
--- /dev/null
+++ b/doctor_tests/installer/common/set_congress.py
@@ -0,0 +1,39 @@
+##############################################################################
+# Copyright (c) 2018 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+from six.moves import configparser
+import os
+import shutil
+
+
+def set_drivers_config():
+ co_base = "/var/lib/config-data/puppet-generated/congress"
+ if not os.path.isdir(co_base):
+ co_base = ""
+ co_conf = co_base + "/etc/congress/congress.conf"
+ co_conf_bak = co_base + "/etc/congress/congress.conf.bak"
+ doctor_driver = "congress.datasources.doctor_driver.DoctorDriver"
+ config_modified = False
+
+ config = configparser.ConfigParser()
+ config.read(co_conf)
+ drivers = config.get('DEFAULT', 'drivers')
+
+ if doctor_driver not in drivers:
+ config_modified = True
+ drivers += ',' + doctor_driver
+
+ config.set('DEFAULT', 'drivers', drivers)
+
+ if config_modified:
+ shutil.copyfile(co_conf, co_conf_bak)
+ with open(co_conf, 'w') as configfile:
+ config.write(configfile)
+
+
+set_drivers_config()
diff --git a/doctor_tests/installer/common/set_fenix.sh b/doctor_tests/installer/common/set_fenix.sh
new file mode 100644
index 00000000..bd1eae47
--- /dev/null
+++ b/doctor_tests/installer/common/set_fenix.sh
@@ -0,0 +1,106 @@
+#!/usr/bin/env bash
+
+##############################################################################
+# Copyright (c) 2019 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+
+# Config files
+docker -v >/dev/null || {
+echo "Fenix needs docker to be installed..."
+ver=`grep "UBUNTU_CODENAME" /etc/os-release | cut -d '=' -f 2`
+curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
+add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $ver stable"
+apt install apt-transport-https ca-certificates curl software-properties-common
+apt update
+apt-cache policy docker-ce
+apt-get install -y docker-ce docker-ce-cli containerd.io
+dpkg -r --force-depends golang-docker-credential-helpers
+}
+
+docker ps | grep fenix -q && {
+REMOTE=`git ls-remote https://opendev.org/x/fenix HEAD | awk '{ print $1}'`
+LOCAL=`docker exec -t fenix git rev-parse @`
+if [[ "$LOCAL" =~ "$REMOTE" ]]; then
+    # The strings differ in their trailing characters, so they cannot be
+    # compared for exact equality
+ echo "Fenix start: Already running latest $LOCAL equals $REMOTE"
+ exit 0
+else
+ echo "Fenix container needs to be recreated $LOCAL not $REMOTE"
+ # Remove previous container
+ for img in `docker image list | grep "^fenix" | awk '{print $1}'`; do
+ for dock in `docker ps --all -f "ancestor=$img" | grep "$img" | awk '{print $1}'`; do
+ docker stop $dock; docker rm $dock;
+ done;
+ docker image rm $img;
+ done
+fi
+} || echo "Fenix container needs to be created..."
+
+cp /root/keystonercv3 .
+
+transport=`grep -m1 "^transport" /etc/nova/nova.conf`
+. keystonercv3
+
+echo "[DEFAULT]" > fenix.conf
+echo "port = 12347" >> fenix.conf
+echo $transport >> fenix.conf
+
+echo "[database]" >> fenix.conf
+MYSQLIP=`grep -m1 "^connection" /etc/nova/nova.conf | sed -e "s/.*@//;s/\/.*//"`
+echo "connection = mysql+pymysql://fenix:fenix@$MYSQLIP/fenix" >> fenix.conf
+
+echo "[service_user]" >> fenix.conf
+echo "os_auth_url = $OS_AUTH_URL" >> fenix.conf
+echo "os_username = $OS_USERNAME" >> fenix.conf
+echo "os_password = $OS_PASSWORD" >> fenix.conf
+echo "os_user_domain_name = $OS_USER_DOMAIN_NAME" >> fenix.conf
+echo "os_project_name = $OS_PROJECT_NAME" >> fenix.conf
+echo "os_project_domain_name = $OS_PROJECT_DOMAIN_NAME" >> fenix.conf
+
+echo "[DEFAULT]" > fenix-api.conf
+echo "port = 12347" >> fenix-api.conf
+echo $transport >> fenix-api.conf
+
+echo "[keystone_authtoken]" >> fenix-api.conf
+echo "auth_url = $OS_AUTH_URL" >> fenix-api.conf
+echo "auth_type = password" >> fenix-api.conf
+echo "project_domain_name = $OS_PROJECT_DOMAIN_NAME" >> fenix-api.conf
+echo "project_name = $OS_PROJECT_NAME" >> fenix-api.conf
+echo "user_domain_name = $OS_PROJECT_DOMAIN_NAME" >> fenix-api.conf
+echo "password = $OS_PASSWORD" >> fenix-api.conf
+echo "username = $OS_USERNAME" >> fenix-api.conf
+echo "cafile = /opt/stack/data/ca-bundle.pem" >> fenix-api.conf
+
+openstack service list | grep -q maintenance || {
+openstack service create --name fenix --enable maintenance
+openstack endpoint create --region $OS_REGION_NAME --enable fenix public http://localhost:12347/v1
+}
+
+# Mysql pw
+# MYSQLPW=`cat /var/lib/config-data/mysql/etc/puppet/hieradata/service_configs.json | grep mysql | grep root_password | awk -F": " '{print $2}' | awk -F"\"" '{print $2}'`
+MYSQLPW=root
+
+# Fenix DB
+[ `mysql -uroot -p$MYSQLPW -e "SELECT host, user FROM mysql.user;" | grep fenix | wc -l` -eq 0 ] && {
+ mysql -uroot -p$MYSQLPW -hlocalhost -e "CREATE USER 'fenix'@'localhost' IDENTIFIED BY 'fenix';"
+ mysql -uroot -p$MYSQLPW -hlocalhost -e "GRANT ALL PRIVILEGES ON fenix.* TO 'fenix'@'' identified by 'fenix';FLUSH PRIVILEGES;"
+}
+mysql -ufenix -pfenix -hlocalhost -e "DROP DATABASE IF EXISTS fenix;"
+mysql -ufenix -pfenix -hlocalhost -e "CREATE DATABASE fenix CHARACTER SET utf8;"
+
+# Build Fenix container and run it
+chmod 700 run
+docker build --build-arg OPENSTACK=master --build-arg BRANCH=master --network host $PWD -t fenix | tail -1
+docker run --network host -d --name fenix -p 12347:12347 -ti fenix
+if [ $? -eq 0 ]; then
+ echo "Fenix start: OK"
+else
+ echo "Fenix start: FAILED"
+fi
+# To debug check log from fenix container
+# docker exec -ti fenix tail -f /var/log/fenix-engine.log
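
The script's own liveness check is "docker ps | grep fenix -q"; the same check can be repeated from Python after set_apply_patches() returns, for example to let a test run assert that the Fenix container actually came up (a sketch, not part of the patch):

    import subprocess

    # Sketch: repeat the script's "docker ps | grep fenix" check from Python.
    def fenix_container_running():
        out = subprocess.check_output("docker ps | grep fenix | wc -l",
                                      shell=True)
        return int(out.strip()) > 0

    print('Fenix start: OK' if fenix_container_running()
          else 'Fenix start: FAILED')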
diff --git a/doctor_tests/installer/common/vitrage.py b/doctor_tests/installer/common/vitrage.py
index 30a73f5d..801adff5 100644
--- a/doctor_tests/installer/common/vitrage.py
+++ b/doctor_tests/installer/common/vitrage.py
@@ -9,8 +9,11 @@
import os
+vi_base = "/var/lib/config-data/puppet-generated/vitrage"
+if not os.path.isdir(vi_base):
+ vi_base = ""
vitrage_template_file = \
- '/etc/vitrage/templates/vitrage_host_down_scenarios.yaml'
+ vi_base + '/etc/vitrage/templates/vitrage_host_down_scenarios.yaml'
template = """
metadata:
diff --git a/doctor_tests/installer/daisy.py b/doctor_tests/installer/daisy.py
index 52ccb7c8..e4499d9c 100644
--- a/doctor_tests/installer/daisy.py
+++ b/doctor_tests/installer/daisy.py
@@ -12,11 +12,12 @@ from doctor_tests.installer.base import BaseInstaller
class DaisyInstaller(BaseInstaller):
node_user_name = 'root'
+ installer_username = 'root'
def __init__(self, conf, log):
super(DaisyInstaller, self).__init__(conf, log)
self.client = SSHClient(self.conf.installer.ip,
- self.conf.installer.username,
+ self.installer_username,
password='r00tme')
self.key_file = None
self.controllers = list()
diff --git a/doctor_tests/installer/devstack.py b/doctor_tests/installer/devstack.py
new file mode 100644
index 00000000..02f3601a
--- /dev/null
+++ b/doctor_tests/installer/devstack.py
@@ -0,0 +1,151 @@
+##############################################################################
+# Copyright (c) 2019 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import socket
+import time
+
+from doctor_tests.common.utils import SSHClient
+from doctor_tests.common.utils import LocalSSH
+from doctor_tests.identity_auth import get_session
+from doctor_tests.installer.base import BaseInstaller
+from doctor_tests.os_clients import nova_client
+
+
+class DevstackInstaller(BaseInstaller):
+ node_user_name = None
+ cm_set_script = 'set_config.py'
+ nc_set_compute_script = 'set_compute_config.py'
+ cm_restore_script = 'restore_config.py'
+ nc_restore_compute_script = 'restore_compute_config.py'
+ ac_restart_script = 'restart_aodh.py'
+ ac_restore_script = 'restore_aodh.py'
+ python = 'python'
+
+ def __init__(self, conf, log):
+ super(DevstackInstaller, self).__init__(conf, log)
+        # Run Doctor under the user's home directory; sudo hides other env
+        # params that need to be used
+ home, self.node_user_name = (iter(os.environ.get('VIRTUAL_ENV')
+ .split('/', 3)[1:3]))
+ # Migration needs to work so ssh should have proper key defined
+ self.key_file = '/%s/%s/.ssh/id_rsa' % (home, self.node_user_name)
+ self.log.info('ssh uses: %s and %s' % (self.node_user_name,
+ self.key_file))
+ self.controllers = ([ip for ip in
+ socket.gethostbyname_ex(socket.gethostname())[2]
+ if not ip.startswith('127.')] or
+ [[(s.connect(('8.8.8.8', 53)),
+ s.getsockname()[0], s.close())
+ for s in [socket.socket(socket.AF_INET,
+ socket.SOCK_DGRAM)]][0][1]])
+ conf.admin_tool.ip = self.controllers[0]
+ self.computes = list()
+ self.nova = nova_client(conf.nova_version, get_session())
+
+ def setup(self):
+ self.log.info('Setup Devstack installer start......')
+ self._get_devstack_conf()
+ self.create_flavor()
+ self.set_apply_patches()
+
+ def cleanup(self):
+ self.restore_apply_patches()
+
+ def get_ssh_key_from_installer(self):
+ return self.key_file
+
+ def get_transport_url(self):
+ client = LocalSSH(self.log)
+ cmd = 'sudo grep -m1 "^transport_url" /etc/nova/nova.conf'
+ ret, url = client.ssh(cmd)
+ url = url.split("= ", 1)[1][:-1]
+ self.log.info('get_transport_url %s' % url)
+ return url
+
+ def get_host_ip_from_hostname(self, hostname):
+ return [hvisor.__getattr__('host_ip') for hvisor in self.hvisors
+ if hvisor.__getattr__('hypervisor_hostname') == hostname][0]
+
+ def _get_devstack_conf(self):
+ self.log.info('Get devstack config details for Devstack installer'
+ '......')
+ self.hvisors = self.nova.hypervisors.list(detailed=True)
+ self.log.info('checking hypervisors.......')
+ self.computes = [hvisor.__getattr__('host_ip') for hvisor in
+ self.hvisors]
+ self.use_containers = False
+ self.log.info('controller_ips:%s' % self.controllers)
+ self.log.info('compute_ips:%s' % self.computes)
+ self.log.info('use_containers:%s' % self.use_containers)
+
+ def _set_docker_restart_cmd(self, service):
+ # There can be multiple instances running so need to restart all
+ cmd = "for container in `sudo docker ps | grep "
+ cmd += service
+ cmd += " | awk '{print $1}'`; do sudo docker restart $container; \
+ done;"
+ return cmd
+
+ def set_apply_patches(self):
+ self.log.info('Set apply patches start......')
+
+ set_scripts = [self.cm_set_script]
+
+ restart_cmd = 'sudo systemctl restart' \
+ ' devstack@ceilometer-anotification.service'
+
+ client = LocalSSH(self.log)
+ self._run_apply_patches(client,
+ restart_cmd,
+ set_scripts,
+ python=self.python)
+ time.sleep(7)
+
+        self.log.info('Set apply patches for compute nodes......')
+
+ if self.conf.test_case != 'fault_management':
+ restart_cmd = 'sudo systemctl restart' \
+ ' devstack@n-cpu.service'
+ for node_ip in self.computes:
+ client = SSHClient(node_ip, self.node_user_name,
+ key_filename=self.key_file)
+ self._run_apply_patches(client,
+ restart_cmd,
+ [self.nc_set_compute_script],
+ python=self.python)
+ time.sleep(7)
+
+ def restore_apply_patches(self):
+ self.log.info('restore apply patches start......')
+
+ restore_scripts = [self.cm_restore_script]
+
+ restart_cmd = 'sudo systemctl restart' \
+ ' devstack@ceilometer-anotification.service'
+
+ if self.conf.test_case != 'fault_management':
+ restart_cmd += ' devstack@n-sch.service'
+ restore_scripts.append(self.nc_restore_compute_script)
+
+ client = LocalSSH(self.log)
+ self._run_apply_patches(client,
+ restart_cmd,
+ restore_scripts,
+ python=self.python)
+
+ if self.conf.test_case != 'fault_management':
+
+ restart_cmd = 'sudo systemctl restart' \
+ ' devstack@n-cpu.service'
+ for node_ip in self.computes:
+ client = SSHClient(node_ip, self.node_user_name,
+ key_filename=self.key_file)
+ self._run_apply_patches(
+ client, restart_cmd,
+ [self.nc_restore_compute_script],
+ python=self.python)
diff --git a/doctor_tests/installer/local.py b/doctor_tests/installer/local.py
deleted file mode 100644
index fee14f33..00000000
--- a/doctor_tests/installer/local.py
+++ /dev/null
@@ -1,118 +0,0 @@
-##############################################################################
-# Copyright (c) 2017 ZTE Corporation and others.
-#
-# All rights reserved. This program and the accompanying materials
-# are made available under the terms of the Apache License, Version 2.0
-# which accompanies this distribution, and is available at
-# http://www.apache.org/licenses/LICENSE-2.0
-##############################################################################
-import os
-import shutil
-import subprocess
-
-from doctor_tests.installer.base import BaseInstaller
-from doctor_tests.installer.common.vitrage import \
- set_vitrage_host_down_template
-from doctor_tests.common.constants import Inspector
-from doctor_tests.common.utils import load_json_file
-from doctor_tests.common.utils import write_json_file
-
-
-class LocalInstaller(BaseInstaller):
- node_user_name = 'root'
-
- nova_policy_file = '/etc/nova/policy.json'
- nova_policy_file_backup = '%s%s' % (nova_policy_file, '.bak')
-
- def __init__(self, conf, log):
- super(LocalInstaller, self).__init__(conf, log)
- self.policy_modified = False
- self.add_policy_file = False
-
- def setup(self):
- self.get_ssh_key_from_installer()
- self.set_apply_patches()
-
- def cleanup(self):
- self.restore_apply_patches()
-
- def get_ssh_key_from_installer(self):
- self.log.info('Assuming SSH keys already exchanged with computer'
- 'for local installer type')
- return None
-
- def get_host_ip_from_hostname(self, hostname):
- self.log.info('Get host ip from host name in local installer......')
-
- cmd = "getent hosts %s | awk '{ print $1 }'" % (hostname)
- server = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
- stdout, stderr = server.communicate()
- host_ip = stdout.strip().decode("utf-8")
-
- self.log.info('Get host_ip:%s from host_name:%s in local installer'
- % (host_ip, hostname))
- return host_ip
-
- def set_apply_patches(self):
- self._set_nova_policy()
- if self.conf.inspector.type == Inspector.VITRAGE:
- set_vitrage_host_down_template()
- os.system('sudo systemctl restart devstack@vitrage-graph.service')
-
- def restore_apply_patches(self):
- self._restore_nova_policy()
-
- def _set_nova_policy(self):
- host_status_policy = 'os_compute_api:servers:show:host_status'
- host_status_rule = 'rule:admin_or_owner'
- policy_data = {
- 'context_is_admin': 'role:admin',
- 'owner': 'user_id:%(user_id)s',
- 'admin_or_owner': 'rule:context_is_admin or rule:owner',
- host_status_policy: host_status_rule
- }
-
- if os.path.isfile(self.nova_policy_file):
- data = load_json_file(self.nova_policy_file)
- if host_status_policy in data:
- rule_origion = data[host_status_policy]
- if host_status_rule == rule_origion:
- self.log.info('Do not need to modify nova policy.')
- self.policy_modified = False
- else:
- # update the host_status_policy
- data[host_status_policy] = host_status_rule
- self.policy_modified = True
- else:
- # add the host_status_policy, if the admin_or_owner is not
- # defined, add it also
- for policy, rule in policy_data.items():
- if policy not in data:
- data[policy] = rule
- self.policy_modified = True
- if self.policy_modified:
- self.log.info('Nova policy is Modified.')
- shutil.copyfile(self.nova_policy_file,
- self.nova_policy_file_backup)
- else:
- # file does not exit, create a new one and add the policy
- self.log.info('Nova policy file not exist. Creating a new one')
- data = policy_data
- self.add_policy_file = True
-
- if self.policy_modified or self.add_policy_file:
- write_json_file(self.nova_policy_file, data)
- os.system('sudo systemctl restart devstack@n-api.service')
-
- def _restore_nova_policy(self):
- if self.policy_modified:
- shutil.copyfile(self.nova_policy_file_backup,
- self.nova_policy_file)
- os.remove(self.nova_policy_file_backup)
- elif self.add_policy_file:
- os.remove(self.nova_policy_file)
-
- if self.add_policy_file or self.policy_modified:
- os.system('sudo systemctl restart devstack@n-api.service')
- self.add_policy_file = False
- self.policy_modified = False
diff --git a/doctor_tests/installer/mcp.py b/doctor_tests/installer/mcp.py
index 8ba9f000..7659c9e2 100644
--- a/doctor_tests/installer/mcp.py
+++ b/doctor_tests/installer/mcp.py
@@ -1,5 +1,5 @@
##############################################################################
-# Copyright (c) 2018 ZTE Corporation and others.
+# Copyright (c) 2019 ZTE Corporation and others.
#
# All rights reserved. This program and the accompanying materials
# are made available under the terms of the Apache License, Version 2.0
@@ -7,58 +7,117 @@
# http://www.apache.org/licenses/LICENSE-2.0
##############################################################################
from os.path import isfile
+import re
+import time
+from doctor_tests.common.constants import is_fenix
+from doctor_tests.common.utils import get_doctor_test_root_dir
from doctor_tests.common.utils import SSHClient
from doctor_tests.installer.base import BaseInstaller
class McpInstaller(BaseInstaller):
node_user_name = 'ubuntu'
- cm_set_script = 'set_ceilometer.py'
- cm_restore_script = 'restore_ceilometer.py'
+
+ cm_set_script = 'set_config.py'
+ nc_set_compute_script = 'set_compute_config.py'
+ fe_set_script = 'set_fenix.sh'
+ cm_restore_script = 'restore_config.py'
+ nc_restore_compute_script = 'restore_compute_config.py'
+ ac_restart_script = 'restart_aodh.py'
+ ac_restore_script = 'restore_aodh.py'
+ python = 'python3'
def __init__(self, conf, log):
super(McpInstaller, self).__init__(conf, log)
self.key_file = self.get_ssh_key_from_installer()
self.client = SSHClient(self.conf.installer.ip,
self.node_user_name,
- key_filename=self.key_file)
+ key_filename=self.key_file,
+ look_for_keys=True)
self.controllers = list()
self.controller_clients = list()
+ self.computes = list()
def setup(self):
self.log.info('Setup MCP installer start......')
-
- self.controllers = self.get_controller_ips()
+ self.get_node_ips()
self.create_flavor()
- self.set_apply_patches()
+ if is_fenix(self.conf):
+ self.set_apply_patches()
self.setup_stunnel()
def cleanup(self):
- self.restore_apply_patches()
+ if is_fenix(self.conf):
+ self.restore_apply_patches()
for server in self.servers:
server.terminate()
def get_ssh_key_from_installer(self):
self.log.info('Get SSH keys from MCP......')
- # Assuming mcp.rsa is already mapped to functest container
- # if not, only the test runs on jumphost can get the ssh_key
- # default in path /var/lib/opnfv/mcp.rsa
+ # Default in path /var/lib/opnfv/mcp.rsa
ssh_key = '/root/.ssh/id_rsa'
mcp_key = '/var/lib/opnfv/mcp.rsa'
- return ssh_key if isfile(ssh_key) else mcp_key
-
- def get_controller_ips(self):
- self.log.info('Get controller ips from Mcp installer......')
-
- command = "sudo salt --out yaml 'ctl*' " \
- "pillar.get _param:openstack_control_address |" \
- "awk '{print $2}'"
- controllers = self._run_cmd_remote(self.client, command)
- self.log.info('Get controller_ips:%s from Mcp installer'
- % controllers)
- return controllers
+ return mcp_key if isfile(mcp_key) else ssh_key
+
+ def get_transport_url(self):
+ client = SSHClient(self.controllers[0], self.node_user_name,
+ key_filename=self.key_file)
+ try:
+ cmd = 'sudo grep -m1 "^transport_url" /etc/nova/nova.conf'
+ ret, url = client.ssh(cmd)
+
+ if ret:
+ raise Exception('Exec command to get transport from '
+ 'controller(%s) in MCP installer failed, '
+ 'ret=%s, output=%s'
+ % (self.controllers[0], ret, url))
+ elif self.controllers[0] not in url:
+ # need to use ip instead of hostname
+ url = (re.sub("@.*:", "@%s:" % self.controllers[0],
+ url[0].split("=", 1)[1]))
+ except Exception:
+ cmd = 'grep -i "^rabbit" /etc/nova/nova.conf'
+ ret, lines = client.ssh(cmd)
+ if ret:
+ raise Exception('Exec command to get transport from '
+ 'controller(%s) in MCP installer failed, '
+ 'ret=%s, output=%s'
+                                % (self.controllers[0], ret, lines))
+ else:
+ for line in lines.split('\n'):
+ if line.startswith("rabbit_userid"):
+ rabbit_userid = line.split("=")
+ if line.startswith("rabbit_port"):
+ rabbit_port = line.split("=")
+ if line.startswith("rabbit_password"):
+ rabbit_password = line.split("=")
+ url = "rabbit://%s:%s@%s:%s/?ssl=0" % (rabbit_userid,
+ rabbit_password,
+ self.controllers[0],
+ rabbit_port)
+ self.log.info('get_transport_url %s' % url)
+ return url
+
+ def _copy_overcloudrc_to_controllers(self):
+ for ip in self.controllers:
+ cmd = "scp overcloudrc %s@%s:" % (self.node_user_name, ip)
+ self._run_cmd_remote(self.client, cmd)
+
+ def get_node_ips(self):
+ self.log.info('Get node ips from Mcp installer......')
+
+ command = 'sudo salt "*" --out yaml pillar.get _param:single_address'
+ node_details = self._run_cmd_remote(self.client, command)
+
+ self.controllers = [line.split()[1] for line in node_details
+ if line.startswith("ctl")]
+ self.computes = [line.split()[1] for line in node_details
+ if line.startswith("cmp")]
+
+ self.log.info('controller_ips:%s' % self.controllers)
+ self.log.info('compute_ips:%s' % self.computes)
def get_host_ip_from_hostname(self, hostname):
command = "sudo salt --out yaml '%s*' " \
@@ -69,21 +128,80 @@ class McpInstaller(BaseInstaller):
def set_apply_patches(self):
self.log.info('Set apply patches start......')
+ fenix_files = None
+ set_scripts = [self.cm_set_script]
+ thrs = []
+
+ restart_cmd = 'sudo systemctl restart' \
+ ' ceilometer-agent-notification.service'
+
+ if self.conf.test_case != 'fault_management':
+ if is_fenix(self.conf):
+ set_scripts.append(self.fe_set_script)
+ testdir = get_doctor_test_root_dir()
+ fenix_files = ["Dockerfile", "run"]
+ restart_cmd += ' nova-scheduler.service'
+ set_scripts.append(self.nc_set_compute_script)
- restart_cm_cmd = 'sudo service ceilometer-agent-notification restart'
for node_ip in self.controllers:
client = SSHClient(node_ip, self.node_user_name,
key_filename=self.key_file)
- self.controller_clients.append(client)
- self._run_apply_patches(client,
- restart_cm_cmd,
- self.cm_set_script)
+ if fenix_files is not None:
+ for fenix_file in fenix_files:
+ src_file = '{0}/{1}/{2}'.format(testdir,
+ 'admin_tool/fenix',
+ fenix_file)
+ client.scp(src_file, fenix_file)
+ thrs.append(self._run_apply_patches(client,
+ restart_cmd,
+ set_scripts,
+ python=self.python))
+ time.sleep(5)
+
+        self.log.info('Set apply patches for compute nodes......')
+
+ if self.conf.test_case != 'fault_management':
+ restart_cmd = 'sudo systemctl restart nova-compute.service'
+ for node_ip in self.computes:
+ client = SSHClient(node_ip, self.node_user_name,
+ key_filename=self.key_file)
+ thrs.append(self._run_apply_patches(
+ client,
+ restart_cmd,
+ [self.nc_set_compute_script],
+ python=self.python))
+ time.sleep(5)
+        # If the Fenix container is built, it needs to be ready before
+        # continuing
+ for thr in thrs:
+ thr.join()
def restore_apply_patches(self):
self.log.info('restore apply patches start......')
- restart_cm_cmd = 'sudo service ceilometer-agent-notification restart'
- for client in self.controller_clients:
+ restore_scripts = [self.cm_restore_script]
+
+ restore_scripts.append(self.ac_restore_script)
+ restart_cmd = 'sudo systemctl restart' \
+ ' ceilometer-agent-notification.service'
+
+ if self.conf.test_case != 'fault_management':
+ restart_cmd += ' nova-scheduler.service'
+ restore_scripts.append(self.nc_restore_compute_script)
+
+ for node_ip in self.controllers:
+ client = SSHClient(node_ip, self.node_user_name,
+ key_filename=self.key_file)
self._run_apply_patches(client,
- restart_cm_cmd,
- self.cm_restore_script)
+ restart_cmd,
+ restore_scripts,
+ python=self.python)
+
+ if self.conf.test_case != 'fault_management':
+ restart_cmd = 'sudo systemctl restart nova-compute.service'
+ for node_ip in self.computes:
+ client = SSHClient(node_ip, self.node_user_name,
+ key_filename=self.key_file)
+ self._run_apply_patches(
+ client, restart_cmd,
+ [self.nc_restore_compute_script],
+ python=self.python)
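
get_transport_url() reads transport_url from nova.conf on the first controller and, when the URL carries a hostname rather than an address, rewrites the host part to the controller IP with a regex. The substitution in isolation (the sample URL and IP are illustrative only):

    import re

    # Sketch: the hostname-to-IP rewrite done in McpInstaller.get_transport_url().
    controller_ip = '172.16.10.101'  # illustrative controller address
    line = 'transport_url=rabbit://openstack:secret@ctl01:5672/'
    url = line.split("=", 1)[1]
    if controller_ip not in url:
        # replace whatever sits between '@' and the port ':' with the IP
        url = re.sub("@.*:", "@%s:" % controller_ip, url)
    print(url)  # rabbit://openstack:secret@172.16.10.101:5672/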
diff --git a/doctor_tests/main.py b/doctor_tests/main.py
index 61facb61..7573faec 100644
--- a/doctor_tests/main.py
+++ b/doctor_tests/main.py
@@ -1,5 +1,5 @@
##############################################################################
-# Copyright (c) 2017 ZTE Corporation and others.
+# Copyright (c) 2019 ZTE Corporation and others.
#
# All rights reserved. This program and the accompanying materials
# are made available under the terms of the Apache License, Version 2.0
@@ -10,6 +10,7 @@ import os
from os.path import isfile, join
import sys
import time
+from traceback import format_exc
from doctor_tests import config
from doctor_tests.identity_auth import get_identity_auth
@@ -17,8 +18,9 @@ from doctor_tests.identity_auth import get_session
from doctor_tests.image import Image
from doctor_tests.installer import get_installer
import doctor_tests.logger as doctor_log
-from doctor_tests.os_clients import nova_client
from doctor_tests.scenario.fault_management import FaultManagement
+from doctor_tests.os_clients import nova_client
+from doctor_tests.scenario.maintenance import Maintenance
from doctor_tests.user import User
@@ -41,7 +43,6 @@ class DoctorTest(object):
def setup(self):
# prepare the cloud env
self.installer.setup()
-
# preparing VM image...
self.image.create()
@@ -49,37 +50,50 @@ class DoctorTest(object):
self.user.create()
def test_fault_management(self):
- try:
- LOG.info('doctor fault management test starting.......')
-
- self.fault_management = \
- FaultManagement(self.conf, self.installer, self.user, LOG)
-
- # prepare test env
- self.fault_management.setup()
-
- # wait for aodh alarms are updated in caches for event evaluator,
- # sleep time should be larger than event_alarm_cache_ttl
- # (default 60)
- # (tojuvone) Fraser currently needs 120
- time.sleep(120)
-
- # injecting host failure...
- # NOTE (umar) add INTERFACE_NAME logic to host injection
- self.fault_management.start()
- time.sleep(10)
-
- # verify the test results
- # NOTE (umar) copy remote monitor.log file when monitor=collectd
- self.fault_management.check_host_status('down')
- self.fault_management.check_notification_time()
-
- except Exception as e:
- LOG.error('doctor fault management test failed, '
- 'Exception=%s' % e)
- sys.exit(1)
- finally:
- self.fault_management.cleanup()
+ retry = 2
+ # Retry once if notified_time is None
+ while retry > 0:
+ try:
+ self.fault_management = None
+ LOG.info('doctor fault management test starting.......')
+ transport_url = self.installer.get_transport_url()
+ self.fault_management = \
+ FaultManagement(self.conf, self.installer, self.user, LOG,
+ transport_url)
+
+ # prepare test env
+ self.fault_management.setup()
+
+                # wait for aodh alarms to be updated in the event evaluator
+                # caches; sleep time should be larger than
+                # event_alarm_cache_ttl (default 60)
+ # (tojuvone) Fraser currently needs 120
+ time.sleep(120)
+
+ # injecting host failure...
+ # NOTE (umar) add INTERFACE_NAME logic to host injection
+ self.fault_management.start()
+ time.sleep(30)
+
+ # verify the test results
+ # NOTE (umar) copy remote monitor.log file when
+ # monitor=collectd
+ self.fault_management.check_host_status('down')
+ self.fault_management.check_notification_time()
+ retry = 0
+
+ except Exception as e:
+ LOG.error('doctor fault management test failed, '
+ 'Exception=%s' % e)
+ if 'notified_time=None' in str(e):
+ retry -= 1
+ LOG.info('doctor fault management retry')
+ continue
+ LOG.error(format_exc())
+ sys.exit(1)
+ finally:
+ if self.fault_management is not None:
+ self.fault_management.cleanup()
def _amount_compute_nodes(self):
services = self.nova.services.list(binary='nova-compute')
@@ -92,20 +106,44 @@ class DoctorTest(object):
LOG.info('not enough compute nodes, skipping doctor '
'maintenance test')
return
+ elif self.conf.installer.type not in ['apex', 'fuel', 'devstack']:
+ LOG.info('not supported installer, skipping doctor '
+ 'maintenance test')
+ return
try:
+ maintenance = None
LOG.info('doctor maintenance test starting.......')
- # TODO (tojuvone) test setup and actual test
+            transport_url = self.installer.get_transport_url()
+            maintenance = Maintenance(transport_url, self.conf, LOG)
+ maintenance.setup_maintenance(self.user)
+
+            # wait for aodh alarms to be updated in the event evaluator
+            # caches; sleep time should be larger than
+            # event_alarm_cache_ttl (default 60)
+ LOG.info('wait aodh for 120s.......')
+ time.sleep(120)
+
+ session_id = maintenance.start_maintenance()
+ maintenance.wait_maintenance_complete(session_id)
+
+ LOG.info('doctor maintenance complete.......')
+
except Exception as e:
LOG.error('doctor maintenance test failed, Exception=%s' % e)
+ LOG.error(format_exc())
sys.exit(1)
- # TODO (tojuvone) finally: test case specific cleanup
+ finally:
+ if maintenance is not None:
+ maintenance.cleanup_maintenance()
def run(self):
"""run doctor tests"""
try:
LOG.info('doctor test starting.......')
+
# prepare common test env
self.setup()
+
if self.conf.test_case == 'all':
self.test_fault_management()
self.test_maintenance()
@@ -119,6 +157,7 @@ class DoctorTest(object):
% function)
except Exception as e:
LOG.error('doctor test failed, Exception=%s' % e)
+ LOG.error(format_exc())
sys.exit(1)
finally:
self.cleanup()
diff --git a/doctor_tests/maintenance_hot_tpl.yaml b/doctor_tests/maintenance_hot_tpl.yaml
new file mode 100644
index 00000000..e2e47023
--- /dev/null
+++ b/doctor_tests/maintenance_hot_tpl.yaml
@@ -0,0 +1,119 @@
+---
+heat_template_version: 2017-02-24
+description: Doctor Maintenance test case
+
+parameters:
+ ext_net:
+ type: string
+ default: external
+ flavor_vcpus:
+ type: number
+ default: 24
+ maint_image:
+ type: string
+ default: cirros
+  ha_instances:
+ type: number
+ default: 2
+  nonha_instances:
+ type: number
+ default: 4
+ app_manager_alarm_url:
+ type: string
+ default: http://0.0.0.0:12348/maintenance
+  inspector_alarm_url:
+ type: string
+ default: http://0.0.0.0:12345/maintenance
+
+
+resources:
+ int_net:
+ type: OS::Neutron::Net
+
+ int_subnet:
+ type: OS::Neutron::Subnet
+ properties:
+ network_id: {get_resource: int_net}
+ cidr: "9.9.9.0/24"
+ dns_nameservers: ["8.8.8.8"]
+ ip_version: 4
+
+ int_router:
+ type: OS::Neutron::Router
+ properties:
+ external_gateway_info: {network: {get_param: ext_net}}
+
+ int_interface:
+ type: OS::Neutron::RouterInterface
+ properties:
+ router_id: {get_resource: int_router}
+ subnet: {get_resource: int_subnet}
+
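+  # flavor is sized by the test so that 2 instances fill a compute node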
+ maint_instance_flavor:
+ type: OS::Nova::Flavor
+ properties:
+ name: doctor_maint_flavor
+ ram: 512
+ vcpus: {get_param: flavor_vcpus}
+ disk: 1
+
+ ha_app_svrgrp:
+ type: OS::Nova::ServerGroup
+ properties:
+ name: doctor_ha_app_group
+ policies: ['anti-affinity']
+
+ floating_ip:
+ type: OS::Nova::FloatingIP
+ properties:
+ pool: {get_param: ext_net}
+
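+  # ACT-STDBY application servers, spread across computes by anti-affinity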
+ multi_ha_instances:
+ type: OS::Heat::ResourceGroup
+ properties:
+      count: {get_param: ha_instances}
+ resource_def:
+ type: OS::Nova::Server
+ properties:
+ name: doctor_ha_app_%index%
+ flavor: {get_resource: maint_instance_flavor}
+ image: {get_param: maint_image}
+ networks:
+ - network: {get_resource: int_net}
+ scheduler_hints:
+ group: {get_resource: ha_app_svrgrp}
+
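+  # non-HA application servers that fill the remaining compute capacity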
+ multi_nonha_instances:
+ type: OS::Heat::ResourceGroup
+ properties:
+      count: {get_param: nonha_instances}
+ resource_def:
+ type: OS::Nova::Server
+ properties:
+ name: doctor_nonha_app_%index%
+ flavor: {get_resource: maint_instance_flavor}
+ image: {get_param: maint_image}
+ networks:
+ - network: {get_resource: int_net}
+
+ association:
+ type: OS::Nova::FloatingIPAssociation
+ properties:
+ floating_ip: {get_resource: floating_ip}
+ server_id: {get_attr: [multi_ha_instances, resource.0]}
+
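+  # Aodh event alarms: maintenance.scheduled events go to the app manager
+  # and maintenance.host events to the inspector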
+ app_manager_alarm:
+ type: OS::Aodh::EventAlarm
+ properties:
+ alarm_actions:
+ - {get_param: app_manager_alarm_url}
+ event_type: "maintenance.scheduled"
+ repeat_actions: true
+
+  inspector_alarm:
+ type: OS::Aodh::EventAlarm
+ properties:
+ alarm_actions:
+        - {get_param: inspector_alarm_url}
+ event_type: "maintenance.host"
+ repeat_actions: true
diff --git a/doctor_tests/os_clients.py b/doctor_tests/os_clients.py
index 640281df..7ab4e9b4 100644
--- a/doctor_tests/os_clients.py
+++ b/doctor_tests/os_clients.py
@@ -11,6 +11,7 @@ from oslo_config import cfg
import aodhclient.client as aodhclient
from congressclient.v1 import client as congressclient
import glanceclient.client as glanceclient
+import heatclient.client as heatclient
from keystoneclient import client as ks_client
from neutronclient.v2_0 import client as neutronclient
import novaclient.client as novaclient
@@ -23,6 +24,7 @@ OPTS = [
cfg.StrOpt('aodh_version', default='2', help='aodh version'),
cfg.StrOpt('vitrage_version', default='1', help='vitrage version'),
cfg.StrOpt('keystone_version', default='v3', help='keystone version'),
+ cfg.StrOpt('heat_version', default='1', help='heat version'),
]
@@ -31,6 +33,11 @@ def glance_client(version, session):
session=session)
+def heat_client(version, session):
+ return heatclient.Client(version=version,
+ session=session)
+
+
def keystone_client(version, session):
return ks_client.Client(version=version,
session=session)
diff --git a/doctor_tests/scenario/fault_management.py b/doctor_tests/scenario/fault_management.py
index b1fe8099..0271dffe 100644
--- a/doctor_tests/scenario/fault_management.py
+++ b/doctor_tests/scenario/fault_management.py
@@ -32,7 +32,7 @@ dev=$(sudo ip a | awk '/ {compute_ip}\//{{print $NF}}')
sleep 1
sudo ip link set $dev down
echo "doctor set link down at" $(date "+%s.%N")
-sleep 10
+sleep 30
sudo ip link set $dev up
sleep 1
"""
@@ -40,7 +40,7 @@ sleep 1
class FaultManagement(object):
- def __init__(self, conf, installer, user, log):
+ def __init__(self, conf, installer, user, log, transport_url):
self.conf = conf
self.log = log
self.user = user
@@ -55,7 +55,7 @@ class FaultManagement(object):
self.network = Network(self.conf, log)
self.instance = Instance(self.conf, log)
self.alarm = Alarm(self.conf, log)
- self.inspector = get_inspector(self.conf, log)
+ self.inspector = get_inspector(self.conf, log, transport_url)
self.monitor = get_monitor(self.conf,
self.inspector.get_inspector_url(),
log)
@@ -111,7 +111,10 @@ class FaultManagement(object):
server = servers.get(vm_name)
if not server:
raise Exception('Can not find instance: vm_name(%s)' % vm_name)
- host_name = server.__dict__.get('OS-EXT-SRV-ATTR:hypervisor_hostname')
+        # use the short hostname; the domain part is mapped to the cell
+ hostname = \
+ server.__dict__.get('OS-EXT-SRV-ATTR:hypervisor_hostname')
+ host_name = hostname.split('.')[0]
host_ip = self.installer.get_host_ip_from_hostname(host_name)
self.log.info('Get host info(name:%s, ip:%s) which vm(%s) launched at'
@@ -180,10 +183,17 @@ class FaultManagement(object):
notification_time = \
self.consumer.notified_time - \
self.monitor.detected_time
+
+ self.log.info('doctor fault management notification_time=%s'
+ % notification_time)
+
if notification_time < 1 and notification_time > 0:
- self.log.info('doctor fault management test successfully,'
- 'notification_time=%s' % notification_time)
+            self.log.info('doctor fault management test succeeded')
else:
+ if self.conf.profiler_type:
+ self.log.info('run doctor fault management profile.......')
+ self.run_profiler()
+
raise Exception('doctor fault management test failed, '
'notification_time=%s' % notification_time)
@@ -202,6 +212,10 @@ class FaultManagement(object):
detected = self.monitor.detected_time
notified = self.consumer.notified_time
+ if None in [vmdown, hostdown, detected, notified]:
+            self.log.info('one of the profiler timestamps is None, skipping')
+ return
+
# TODO(yujunz) check the actual delay to verify time sync status
# expected ~1s delay from $trigger to $linkdown
relative_start = linkdown
diff --git a/doctor_tests/scenario/maintenance.py b/doctor_tests/scenario/maintenance.py
new file mode 100644
index 00000000..e6cdcccd
--- /dev/null
+++ b/doctor_tests/scenario/maintenance.py
@@ -0,0 +1,250 @@
+##############################################################################
+# Copyright (c) 2019 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import datetime
+import json
+import requests
+import time
+
+from doctor_tests.admin_tool import get_admin_tool
+from doctor_tests.app_manager import get_app_manager
+from doctor_tests.common.utils import get_doctor_test_root_dir
+from doctor_tests.identity_auth import get_identity_auth
+from doctor_tests.identity_auth import get_session
+from doctor_tests.inspector import get_inspector
+from doctor_tests.os_clients import keystone_client
+from doctor_tests.os_clients import neutron_client
+from doctor_tests.os_clients import nova_client
+from doctor_tests.stack import Stack
+
+
+class Maintenance(object):
+
+    def __init__(self, transport_url, conf, log):
+ self.conf = conf
+ self.log = log
+ self.admin_session = get_session()
+ self.keystone = keystone_client(
+ self.conf.keystone_version, get_session())
+ self.nova = nova_client(conf.nova_version, get_session())
+ auth = get_identity_auth(project=self.conf.doctor_project)
+ self.neutron = neutron_client(get_session(auth=auth))
+ self.stack = Stack(self.conf, self.log)
+ if self.conf.installer.type == "devstack":
+            self.endpoint_ip = \
+                transport_url.split("@", 1)[1].split(":", 1)[0]
+ else:
+ self.endpoint_ip = self.conf.admin_tool.ip
+ self.endpoint = "http://%s:12347/" % self.endpoint_ip
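+        # the sample admin tool serves /maintenance, Fenix /v1/maintenance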
+ if self.conf.admin_tool.type == 'sample':
+            self.admin_tool = get_admin_tool(transport_url, self.conf,
+                                             self.log)
+ self.endpoint += 'maintenance'
+ else:
+ self.endpoint += 'v1/maintenance'
+ self.app_manager = get_app_manager(self.stack, self.conf, self.log)
+        self.inspector = get_inspector(self.conf, self.log, transport_url)
+
+ def get_external_network(self):
+ ext_net = None
+ networks = self.neutron.list_networks()['networks']
+ for network in networks:
+ if network['router:external']:
+ ext_net = network['name']
+ break
+ if ext_net is None:
+ raise Exception("external network not defined")
+ return ext_net
+
+ def setup_maintenance(self, user):
+        # each hypervisor needs to have the same amount of vcpus and
+        # those vcpus need to be free before the test
+ hvisors = self.nova.hypervisors.list(detailed=True)
+ prev_vcpus = 0
+ prev_hostname = ''
+ self.log.info('checking hypervisors.......')
+ for hvisor in hvisors:
+ vcpus = hvisor.__getattr__('vcpus')
+ vcpus_used = hvisor.__getattr__('vcpus_used')
+ hostname = hvisor.__getattr__('hypervisor_hostname')
+ if vcpus < 2:
+ raise Exception('not enough vcpus (%d) on %s' %
+ (vcpus, hostname))
+ if vcpus_used > 0:
+ if self.conf.test_case == 'all':
+ # VCPU might not yet be free after fault_management test
+ self.log.info('%d vcpus used on %s, retry...'
+ % (vcpus_used, hostname))
+ time.sleep(15)
+ hvisor = self.nova.hypervisors.get(hvisor.id)
+ vcpus_used = hvisor.__getattr__('vcpus_used')
+ if vcpus_used > 0:
+ raise Exception('%d vcpus used on %s'
+ % (vcpus_used, hostname))
+ if prev_vcpus != 0 and prev_vcpus != vcpus:
+                raise Exception('%d vcpus on %s does not match '
+                                '%d on %s'
+ % (vcpus, hostname,
+ prev_vcpus, prev_hostname))
+ prev_vcpus = vcpus
+ prev_hostname = hostname
+
+        # maintenance flavor sized so that 2 instances fill a whole node
+ flavor_vcpus = int(vcpus / 2)
+ compute_nodes = len(hvisors)
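+        # 2 ACT-STDBY instances plus enough non-HA instances to fill the
+        # remaining capacity (2 instances per compute node)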
+ amount_actstdby_instances = 2
+ amount_noredundancy_instances = 2 * compute_nodes - 2
+ self.log.info('testing %d computes with %d vcpus each'
+ % (compute_nodes, vcpus))
+ self.log.info('testing %d actstdby and %d noredundancy instances'
+ % (amount_actstdby_instances,
+ amount_noredundancy_instances))
+ max_instances = (amount_actstdby_instances +
+ amount_noredundancy_instances)
+ max_cores = compute_nodes * vcpus
+
+ user.update_quota(max_instances, max_cores)
+
+ test_dir = get_doctor_test_root_dir()
+ template_file = '{0}/{1}'.format(test_dir, 'maintenance_hot_tpl.yaml')
+ files, template = self.stack.get_hot_tpl(template_file)
+
+ ext_net = self.get_external_network()
+
+ parameters = {'ext_net': ext_net,
+ 'flavor_vcpus': flavor_vcpus,
+ 'maint_image': self.conf.image_name,
+                      'nonha_instances': amount_noredundancy_instances,
+                      'ha_instances': amount_actstdby_instances}
+
+ self.log.info('creating maintenance stack.......')
+ self.log.info('parameters: %s' % parameters)
+
+ self.stack.create('doctor_test_maintenance',
+ template,
+ parameters=parameters,
+ files=files)
+
+ if self.conf.admin_tool.type == 'sample':
+ self.admin_tool.start()
+ else:
+            # TBD: for now we expect Fenix to be running on
+            # self.conf.admin_tool.port
+ pass
+        # start the inspector before the app_manager, as the floating ip
+        # might be assigned late
+ self.inspector.start()
+ self.app_manager.start()
+
+ def start_maintenance(self):
+ self.log.info('start maintenance.......')
+ hvisors = self.nova.hypervisors.list(detailed=True)
+ maintenance_hosts = list()
+ for hvisor in hvisors:
+ hostname = hvisor.__getattr__('hypervisor_hostname')
+ maintenance_hosts.append(hostname)
+ url = self.endpoint
+ headers = {
+ 'Content-Type': 'application/json',
+ 'Accept': 'application/json'}
+ if self.conf.admin_tool.type == 'fenix':
+ headers['X-Auth-Token'] = self.admin_session.get_token()
+ self.log.info('url %s headers %s' % (url, headers))
+ retries = 12
+ ret = None
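+        # poll the admin tool for up to ~120s (12 retries, 10s apart)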
+ while retries > 0:
+            # let's start maintenance 30sec from now, so projects will have
+            # time to ACK it before that
+ maintenance_at = (datetime.datetime.utcnow() +
+ datetime.timedelta(seconds=30)
+ ).strftime('%Y-%m-%d %H:%M:%S')
+
+ data = {'state': 'MAINTENANCE',
+ 'maintenance_at': maintenance_at,
+ 'metadata': {'openstack_version': 'Train'}}
+
+ if self.conf.app_manager.type == 'vnfm':
+ data['workflow'] = 'vnf'
+ else:
+ data['workflow'] = 'default'
+
+ if self.conf.admin_tool.type == 'sample':
+ data['hosts'] = maintenance_hosts
+ else:
+ data['hosts'] = []
+ try:
+ ret = requests.post(url, data=json.dumps(data),
+ headers=headers)
+ except Exception:
+                if retries == 1:
+ raise Exception('admin tool did not respond in 120s')
+ else:
+ self.log.info('admin tool not ready, retry in 10s')
+ retries = retries - 1
+ time.sleep(10)
+ continue
+ break
+ if not ret:
+ raise Exception("admin tool did not respond")
+ if ret.status_code != 200:
+ raise Exception(ret.text)
+ return ret.json()['session_id']
+
+ def remove_maintenance_session(self, session_id):
+ self.log.info('remove maintenance session %s.......' % session_id)
+
+ url = ('%s/%s' % (self.endpoint, session_id))
+
+ headers = {
+ 'Content-Type': 'application/json',
+ 'Accept': 'application/json'}
+
+ if self.conf.admin_tool.type == 'fenix':
+ headers['X-Auth-Token'] = self.admin_session.get_token()
+
+ ret = requests.delete(url, data=None, headers=headers)
+ if ret.status_code != 200:
+ raise Exception(ret.text)
+
+ def get_maintenance_state(self, session_id):
+
+ url = ('%s/%s' % (self.endpoint, session_id))
+
+ headers = {
+ 'Content-Type': 'application/json',
+ 'Accept': 'application/json'}
+
+ if self.conf.admin_tool.type == 'fenix':
+ headers['X-Auth-Token'] = self.admin_session.get_token()
+
+ ret = requests.get(url, data=None, headers=headers)
+ if ret.status_code != 200:
+ raise Exception(ret.text)
+ return ret.json()['state']
+
+ def wait_maintenance_complete(self, session_id):
+ retries = 90
+ state = None
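+        # the maintenance workflow takes several minutes; wait 5min before
+        # polling every 10s (20min timeout in total)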
+ time.sleep(300)
+ while (state not in ['MAINTENANCE_DONE', 'MAINTENANCE_FAILED'] and
+ retries > 0):
+ time.sleep(10)
+ state = self.get_maintenance_state(session_id)
+ retries = retries - 1
+ self.remove_maintenance_session(session_id)
+ self.log.info('maintenance %s ended with state %s' %
+ (session_id, state))
+ if state == 'MAINTENANCE_FAILED':
+ raise Exception('maintenance %s failed' % session_id)
+ elif retries == 0:
+ raise Exception('maintenance %s not completed within 20min' %
+ session_id)
+
+ def cleanup_maintenance(self):
+ if self.conf.admin_tool.type == 'sample':
+ self.admin_tool.stop()
+ self.app_manager.stop()
+ self.inspector.stop()
+ self.log.info('stack delete start.......')
+ self.stack.delete()
diff --git a/doctor_tests/stack.py b/doctor_tests/stack.py
new file mode 100644
index 00000000..8a921beb
--- /dev/null
+++ b/doctor_tests/stack.py
@@ -0,0 +1,118 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import time
+
+from heatclient.common.template_utils import get_template_contents
+from heatclient import exc as heat_exception
+
+from doctor_tests.identity_auth import get_identity_auth
+from doctor_tests.identity_auth import get_session
+from doctor_tests.os_clients import heat_client
+
+
+class Stack(object):
+
+ def __init__(self, conf, log):
+ self.conf = conf
+ self.log = log
+ auth = get_identity_auth(project=self.conf.doctor_project)
+ self.heat = heat_client(self.conf.heat_version,
+ get_session(auth=auth))
+ self.stack_name = None
+ self.stack_id = None
+ self.template = None
+ self.parameters = {}
+ self.files = {}
+
+    # standard yaml.load will not work for a HOT template because the
+    # date in heat_template_version is not parsed as a string
+ def get_hot_tpl(self, template_file):
+ if not os.path.isfile(template_file):
+ raise Exception('File(%s) does not exist' % template_file)
+ return get_template_contents(template_file=template_file)
+
+ def _wait_stack_action_complete(self, action):
+ action_in_progress = '%s_IN_PROGRESS' % action
+ action_complete = '%s_COMPLETE' % action
+ action_failed = '%s_FAILED' % action
+
+ status = action_in_progress
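+        # poll every 2s, up to 160 times (~5min)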
+ stack_retries = 160
+ while status == action_in_progress and stack_retries > 0:
+ time.sleep(2)
+ try:
+ stack = self.heat.stacks.get(self.stack_name)
+            except heat_exception.HTTPNotFound:
+ if action == 'DELETE':
+                    # the stack may disappear before we ever see its status
+ status = action_complete
+ break
+ else:
+ raise Exception('unable to get stack')
+ status = stack.stack_status
+ stack_retries = stack_retries - 1
+ if stack_retries == 0 and status != action_complete:
+ raise Exception("stack %s not completed within 5min, status:"
+ " %s" % (action, status))
+ elif status == action_complete:
+ self.log.info('stack %s %s' % (self.stack_name, status))
+ elif status == action_failed:
+ raise Exception("stack %s failed" % action)
+ else:
+ self.log.error('stack %s %s' % (self.stack_name, status))
+ raise Exception("stack %s unknown result" % action)
+
+ def wait_stack_delete(self):
+ self._wait_stack_action_complete('DELETE')
+
+ def wait_stack_create(self):
+ self._wait_stack_action_complete('CREATE')
+
+ def wait_stack_update(self):
+ self._wait_stack_action_complete('UPDATE')
+
+ def create(self, stack_name, template, parameters={}, files={}):
+ self.stack_name = stack_name
+ self.template = template
+ self.parameters = parameters
+ self.files = files
+ stack = self.heat.stacks.create(stack_name=self.stack_name,
+ files=files,
+ template=template,
+ parameters=parameters)
+ self.stack_id = stack['stack']['id']
+ try:
+ self.wait_stack_create()
+ except Exception:
+            # stack creation might not always succeed on the first try
+ self.log.info('retry creating maintenance stack.......')
+ self.delete()
+ time.sleep(5)
+ stack = self.heat.stacks.create(stack_name=self.stack_name,
+ files=files,
+ template=template,
+ parameters=parameters)
+ self.stack_id = stack['stack']['id']
+ self.wait_stack_create()
+
+ def update(self, stack_name, stack_id, template, parameters={}, files={}):
+ self.heat.stacks.update(stack_name=stack_name,
+ stack_id=stack_id,
+ files=files,
+ template=template,
+ parameters=parameters)
+ self.wait_stack_update()
+
+ def delete(self):
+ if self.stack_id is not None:
+ self.heat.stacks.delete(self.stack_name)
+ self.wait_stack_delete()
+ else:
+ self.log.info('no stack to delete')
diff --git a/doctor_tests/user.py b/doctor_tests/user.py
index fee3e1fb..2cd9757f 100644
--- a/doctor_tests/user.py
+++ b/doctor_tests/user.py
@@ -8,12 +8,12 @@
##############################################################################
import os
+from keystoneclient import exceptions as ks_exceptions
from oslo_config import cfg
from doctor_tests.identity_auth import get_session
from doctor_tests.os_clients import keystone_client
from doctor_tests.os_clients import nova_client
-from keystoneclient import exceptions as ks_exceptions
OPTS = [
@@ -53,10 +53,11 @@ class User(object):
def __init__(self, conf, log):
self.conf = conf
self.log = log
+ self.def_quota = None
+ self.restore_def_quota = False
self.keystone = keystone_client(
self.conf.keystone_version, get_session())
- self.nova = \
- nova_client(conf.nova_version, get_session())
+ self.nova = nova_client(conf.nova_version, get_session())
self.users = {}
self.projects = {}
self.roles = {}
@@ -83,10 +84,9 @@ class User(object):
domain=self.conf.doctor_domain_id)}
if self.conf.doctor_project not in self.projects:
self.log.info('create project......')
- test_project = \
- self.keystone.projects.create(
- self.conf.doctor_project,
- self.conf.doctor_domain_id)
+ test_project = self.keystone.projects.create(
+ self.conf.doctor_project,
+ self.conf.doctor_domain_id)
self.projects[test_project.name] = test_project
else:
self.log.info('project %s already created......'
@@ -129,7 +129,6 @@ class User(object):
def _add_user_role_in_project(self, is_admin=False):
"""add test user with test role in test project"""
-
project = self.projects.get(self.conf.doctor_project)
user_name = 'admin' if is_admin else self.conf.doctor_user
@@ -151,6 +150,13 @@ class User(object):
self.keystone.roles.grant(role, user=user, project=project)
roles_for_user[role_name] = role
+ def _restore_default_quota(self):
+ if self.def_quota is not None and self.restore_def_quota:
+ self.log.info('restore default quota......')
+ self.nova.quota_classes.update('default',
+ instances=self.def_quota.instances,
+ cores=self.def_quota.cores)
+
def delete(self):
"""delete the test user, project and role"""
self.log.info('user delete start......')
@@ -159,6 +165,8 @@ class User(object):
user = self.users.get(self.conf.doctor_user)
role = self.roles.get(self.conf.doctor_role)
+ self._restore_default_quota()
+
if project:
if 'admin' in self.roles_for_admin:
self.keystone.roles.revoke(
@@ -177,23 +185,45 @@ class User(object):
self.keystone.projects.delete(project)
self.log.info('user delete end......')
- def update_quota(self):
- self.log.info('user quota update start......')
+ def update_quota(self, instances=None, cores=None):
+ self.log.info('quota update start......')
project = self.projects.get(self.conf.doctor_project)
+
user = self.users.get(self.conf.doctor_user)
+ if instances is not None:
+ quota_instances = instances
+ else:
+ quota_instances = self.conf.quota_instances
+ if cores is not None:
+ quota_cores = cores
+ else:
+ quota_cores = self.conf.quota_cores
+
if project and user:
+            # the default quota class needs to be at least as large as the
+            # doctor_user quota
+ self.log.info('default quota update start......')
+
+ self.def_quota = self.nova.quota_classes.get('default')
+ if quota_instances > self.def_quota.instances:
+ self.restore_def_quota = True
+ self.nova.quota_classes.update('default',
+ instances=quota_instances)
+ if quota_cores > self.def_quota.cores:
+ self.restore_def_quota = True
+ self.nova.quota_classes.update('default',
+ cores=quota_cores)
+ self.log.info('user quota update start......')
self.quota = self.nova.quotas.get(project.id,
user_id=user.id)
- if self.conf.quota_instances > self.quota.instances:
- self.nova.quotas.update(
- project.id,
- instances=self.conf.quota_instances,
- user_id=user.id)
- if self.conf.quota_cores > self.quota.cores:
+ if quota_instances > self.quota.instances:
+ self.nova.quotas.update(project.id,
+ instances=quota_instances,
+ user_id=user.id)
+ if quota_cores > self.quota.cores:
self.nova.quotas.update(project.id,
- cores=self.conf.quota_cores,
+ cores=quota_cores,
user_id=user.id)
- self.log.info('user quota update end......')
else:
raise Exception('No project or role for update quota')
+ self.log.info('quota update end......')
diff --git a/requirements.txt b/requirements.txt
index b60878fc..1eab2a04 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,19 +1,22 @@
# The order of packages is significant, because pip processes them in the order
# of appearance. Changing the order has an impact on the overall integration
# process, which may cause wedges in the gate later.
-Flask!=0.11,<1.0,>=0.10 # BSD
-paramiko>=2.0.0 # LGPLv2.1+
+Flask!=0.11 # BSD
+paramiko # LGPLv2.1+
scp
-requests>=2.14.2 # Apache-2.0
-oslo.config>=5.2.0 # Apache-2.0
-python-openstackclient>=3.12.0 # Apache-2.0
-python-ceilometerclient>=2.5.0 # Apache-2.0
-aodhclient>=0.9.0 # Apache-2.0
-python-keystoneclient>=3.8.0 # Apache-2.0
-python-neutronclient>=6.7.0 # Apache-2.0
-python-novaclient>=9.1.0 # Apache-2.0
-python-congressclient<2000,>=1.9.0 # Apache-2.0
-python-glanceclient>=2.8.0 # Apache-2.0
-python-vitrageclient>=2.0.0 # Apache-2.0
-virtualenv>=14.0.6 # MIT
+requests # Apache-2.0
+oslo.config!=4.3.0,!=4.4.0 # Apache-2.0
+python-openstackclient # Apache-2.0
+oslo.messaging # Apache-2.0
+oslo.versionedobjects # Apache-2.0
+python-ceilometerclient # Apache-2.0
+aodhclient # Apache-2.0
+python-keystoneclient!=2.1.0 # Apache-2.0
+python-neutronclient # Apache-2.0
+python-novaclient # Apache-2.0
+python-congressclient<2000 # Apache-2.0
+python-glanceclient # Apache-2.0
+python-vitrageclient # Apache-2.0
+virtualenv # MIT
+python-heatclient # Apache-2.0
flake8<2.6.0,>=2.5.4 # MIT
diff --git a/tox.ini b/tox.ini
index 53efbfe3..2937c329 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,12 +1,12 @@
[tox]
minversion = 2.3.1
-envlist = py35, pep8
+envlist = py36,pep8,docs,docs-linkcheck
skipsdist = True
[testenv]
usedevelop = True
-install_command = pip install \
- -chttps://git.openstack.org/cgit/openstack/requirements/plain/upper-constraints.txt?h=stable/pike \
+install_command = pip3 install \
+ -chttps://git.openstack.org/cgit/openstack/requirements/plain/upper-constraints.txt?h=stable/stein \
{opts} {packages}
setenv = VIRTUAL_ENV={envdir}
deps = -r{toxinidir}/requirements.txt
@@ -28,9 +28,14 @@ passenv =
CI_DEBUG
INSTALLER_TYPE
INSTALLER_IP
- PROFILER_TYPE
+ INSPECTOR_TYPE
+ ADMIN_TOOL_TYPE
+ TEST_CASE
+ SSH_KEY
+ APP_MANAGER_TYPE
changedir = {toxinidir}/doctor_tests
commands = doctor-test
+ /usr/bin/find {toxinidir} -type f -name '*.py[co]' -delete -o -type d -name __pycache__ -delete
[testenv:pep8]
changedir = {toxinidir}
@@ -47,3 +52,16 @@ enable-extensions=H106,H203
builtins = _
filename = *.py,app.wsgi
exclude=.venv,.git,.tox,dist,doc,*lib/python*,*egg,build,tests
+
+[testenv:docs]
+changedir = {toxinidir}
+deps = -rdocs/requirements.txt
+commands =
+ sphinx-build -b html -n -d {envtmpdir}/doctrees ./docs/ {toxinidir}/docs/_build/html
+ echo "Generated docs available in {toxinidir}/docs/_build/html"
+whitelist_externals = echo
+
+[testenv:docs-linkcheck]
+changedir = {toxinidir}
+deps = -rdocs/requirements.txt
+commands = sphinx-build -b linkcheck -d {envtmpdir}/doctrees ./docs/ {toxinidir}/docs/_build/linkcheck