diff options
46 files changed, 1318 insertions, 475 deletions
diff --git a/docs/conf.py b/docs/conf.py index eb12e74b..3c9978bb 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1 +1,2 @@ from docs_conf.conf import * # noqa: F401,F403 +master_doc = 'index' diff --git a/docs/development/index.rst b/docs/development/index.rst index 2dc16a82..a7d2817b 100644 --- a/docs/development/index.rst +++ b/docs/development/index.rst @@ -2,18 +2,18 @@ .. http://creativecommons.org/licenses/by/4.0 .. (c) 2016 OPNFV. +.. _development: -====== -Doctor -====== +=========== +Development +=========== .. toctree:: :maxdepth: 2 - ./design/index.rst - ./requirements/index.rst - ./manuals/index.rst - ./overview/functest_scenario/index.rst + ./design/index + ./overview/index + ./requirements/index Indices ======= diff --git a/docs/development/overview/index.rst b/docs/development/overview/index.rst index 956e73e3..f6d78d57 100644 --- a/docs/development/overview/index.rst +++ b/docs/development/overview/index.rst @@ -3,11 +3,12 @@ .. _doctor-overview: -************************ -Doctor Development Guide -************************ +******** +Overview +******** .. toctree:: :maxdepth: 2 + overview.rst testing.rst diff --git a/docs/development/overview/overview.rst b/docs/development/overview/overview.rst new file mode 100644 index 00000000..21f5439e --- /dev/null +++ b/docs/development/overview/overview.rst @@ -0,0 +1,52 @@ +.. This work is licensed under a Creative Commons Attribution 4.0 International License. +.. http://creativecommons.org/licenses/by/4.0 + +Platform overview +""""""""""""""""" + +The Doctor platform provides these features since the `Danube Release <https://wiki.opnfv.org/display/SWREL/Danube>`_: + +* Immediate Notification +* Consistent resource state awareness for compute host down +* Valid compute host status given to VM owner + +These features enable high availability of Network Services on top of +the virtualized infrastructure. Immediate notification allows VNF managers +(VNFM) to process recovery actions promptly once a failure has occurred. +The same framework can also be utilized to give the VNFM awareness of +infrastructure maintenance. + +Consistency of resource state is necessary to execute recovery actions +properly in the VIM. + +The ability to query host status gives the VM owner the possibility to get +consistent state information through an API in case of a compute host +fault. + +The Doctor platform consists of the following components: + +* OpenStack Compute (Nova) +* OpenStack Networking (Neutron) +* OpenStack Telemetry (Ceilometer) +* OpenStack Alarming (AODH) +* Doctor Sample Inspector, OpenStack Congress or OpenStack Vitrage +* Doctor Sample Monitor or any monitor supported by Congress or Vitrage + +.. note:: + The Doctor Sample Monitor is used in Doctor testing. However, a real + implementation such as Vitrage supports several other monitors. + +You can see an overview of the Doctor platform and how components interact in +:numref:`figure-p1`. + + +The maintenance use case provides these features since the `Iruya Release <https://wiki.opnfv.org/display/SWREL/Iruya>`_: + +* Infrastructure maintenance and upgrade workflow +* Interaction between VNFM and infrastructure workflow + +Since the `Jerma Release <https://wiki.opnfv.org/display/SWREL/Jerma>`_, the maintenance +use case also supports the 'ETSI FEAT03' implementation to have the infrastructure +maintenance and upgrade fully optimized while keeping zero impact on VNF +service.
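A minimal sketch of how the VM owner can read this consistent host state, assuming the Nova host_status field available from compute API microversion 2.16 and a hypothetical server name:

.. code-block:: bash

    # Show the server together with its host_status field (e.g. UP, DOWN or UNKNOWN);
    # needs compute API microversion 2.16 or later and a policy that exposes host_status.
    openstack --os-compute-api-version 2.16 server show doctor_vm -c name -c status -c host_status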
+ diff --git a/docs/development/requirements/index.rst b/docs/development/requirements/index.rst index fceaebf0..ccc35cb8 100644 --- a/docs/development/requirements/index.rst +++ b/docs/development/requirements/index.rst @@ -3,9 +3,9 @@ .. _doctor-requirements: -**************************************** -Doctor: Fault Management and Maintenance -**************************************** +********************************************** +Requirements: Fault Management and Maintenance +********************************************** :Project: Doctor, https://wiki.opnfv.org/doctor :Editors: Ashiq Khan (NTT DOCOMO), Gerald Kunzmann (NTT DOCOMO) diff --git a/docs/index.rst b/docs/index.rst index 4dedb98d..b8e8bfd0 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -12,6 +12,6 @@ Fault Management and Maintenance (Doctor) :numbered: :maxdepth: 2 - release/index development/index - + release/index + testing/index diff --git a/docs/release/configguide/feature.configuration.rst b/docs/release/configguide/feature.configuration.rst index 64928eea..8fbff50e 100644 --- a/docs/release/configguide/feature.configuration.rst +++ b/docs/release/configguide/feature.configuration.rst @@ -159,3 +159,57 @@ You can configure the Sample Monitor as follows (Example for Apex deployment): "http://127.0.0.1:$INSPECTOR_PORT/events" > monitor.log 2>&1 & **Collectd Monitor** + +OpenStack components +==================== + +In OPNFV and with Doctor testing you can have all OpenStack components configured +as needed. Here is sample of the needed configuration modifications. + +Ceilometer +---------- + +/etc/ceilometer/event_definitions.yaml: +# Maintenance use case needs new alarm definitions to be added +- event_type: maintenance.scheduled + traits: + actions_at: + fields: payload.maintenance_at + type: datetime + allowed_actions: + fields: payload.allowed_actions + host_id: + fields: payload.host_id + instances: + fields: payload.instances + metadata: + fields: payload.metadata + project_id: + fields: payload.project_id + reply_url: + fields: payload.reply_url + session_id: + fields: payload.session_id + state: + fields: payload.state +- event_type: maintenance.host + traits: + host: + fields: payload.host + project_id: + fields: payload.project_id + session_id: + fields: payload.session_id + state: + fields: payload.state + +/etc/ceilometer/event_pipeline.yaml: +# Maintenance and Fault management both needs these to be added + - notifier:// + - notifier://?topic=alarm.all + +Nova +---- + +/etc/nova/nova.conf +cpu_allocation_ratio=1.0 diff --git a/docs/release/configguide/index.rst b/docs/release/configguide/index.rst index b1e7c33d..c2331115 100644 --- a/docs/release/configguide/index.rst +++ b/docs/release/configguide/index.rst @@ -3,9 +3,9 @@ .. _doctor-configguide: -************************* -Doctor Installation Guide -************************* +************************** +Doctor Configuration Guide +************************** .. toctree:: :maxdepth: 2 diff --git a/docs/release/index.rst b/docs/release/index.rst index 8a1bf405..67eb4c5f 100644 --- a/docs/release/index.rst +++ b/docs/release/index.rst @@ -2,14 +2,18 @@ .. http://creativecommons.org/licenses/by/4.0 .. (c) 2017 OPNFV. +.. _release: -====== -Doctor -====== +======= +Release +======= .. 
toctree:: :maxdepth: 2 + ./configguide/index.rst ./installation/index.rst + ./release-notes/index.rst + ./scenarios/fault_management/fault_management.rst + ./scenarios/maintenance/maintenance.rst ./userguide/index.rst - diff --git a/docs/development/manuals/index.rst b/docs/release/installation/index.rst index f705f94a..f6527e5d 100644 --- a/docs/development/manuals/index.rst +++ b/docs/release/installation/index.rst @@ -1,13 +1,13 @@ .. This work is licensed under a Creative Commons Attribution 4.0 International License. .. http://creativecommons.org/licenses/by/4.0 -.. _doctor-manuals: +.. _doctor-configguide: -******* -Manuals -******* +************************* +Doctor Installation Guide +************************* .. toctree:: + :maxdepth: 2 -.. include:: mark-host-down_manual.rst -.. include:: get-valid-server-state.rst + installation.rst diff --git a/docs/release/installation/installation.rst b/docs/release/installation/installation.rst new file mode 100644 index 00000000..564f19fd --- /dev/null +++ b/docs/release/installation/installation.rst @@ -0,0 +1,44 @@ +.. This work is licensed under a Creative Commons Attribution 4.0 International License. +.. http://creativecommons.org/licenses/by/4.0 + +Doctor Installation +==================== + +You can clone the Doctor project on the OPNFV installer jumphost or, if you are not +in an OPNFV environment, you can clone Doctor on the DevStack controller node: + +git clone https://gerrit.opnfv.org/gerrit/doctor + +On the DevStack controller, here is a sample of what Doctor testing +will require for sample fault management testing and for maintenance +testing using Fenix: + +.. code-block:: bash + + git clone https://github.com/openstack/devstack -b stable/train + +.. code-block:: bash + + cd devstack vi local.conf + +.. code-block:: bash + + [[local|localrc]] + GIT_BASE=https://git.openstack.org + HOST_IP=<host_ip> + ADMIN_PASSWORD=admin + DATABASE_PASSWORD=admin + RABBIT_PASSWORD=admin + SERVICE_PASSWORD=admin + LOGFILE=/opt/stack/stack.sh.log + + PUBLIC_INTERFACE=eth0 + + CEILOMETER_EVENT_ALARM=True + + ENABLED_SERVICES=key,rabbit,mysql,fenix-engine,fenix-api,aodh-evaluator,aodh-notifier,aodh-api + + enable_plugin ceilometer https://git.openstack.org/openstack/ceilometer stable/train + enable_plugin aodh https://git.openstack.org/openstack/aodh stable/train + enable_plugin gnocchi https://github.com/openstack/gnocchi + enable_plugin fenix https://opendev.org/x/fenix master diff --git a/docs/release/release-notes/release-notes.rst b/docs/release/release-notes/release-notes.rst index 92775557..b525335e 100644 --- a/docs/release/release-notes/release-notes.rst +++ b/docs/release/release-notes/release-notes.rst @@ -7,33 +7,41 @@ This document provides the release notes for Iruya version of Doctor. Important notes =============== -In Iruya release there has not been many changes. - -All testing is now being made with Fuel installer. Maintenance use case -is now only tested against latest upstream Fenix. Only sample inspector is -tested as Fuel do not support Vitrage or Congress. +The Jerma release has mainly been about finalizing maintenance use case testing, +supporting the ETSI FEAT03 defined interaction between VNFM and infrastructure. +This is mainly to have infrastructure maintenance and upgrade operations +optimized to be as fast as possible while keeping VNFs on top with zero impact +on their service. + +Furthermore, this is the final release of Doctor, and deeper testing is +moving to upstream projects such as Fenix for the maintenance use case.
Also in +this release we have made sure that all Doctor testing and any deeper testing +with the upstream projects can be done in DevStack. This also makes DevStack +the most important installer. Summary ======= -Iruya Doctor framework uses OpenStack Stein integrated into its test cases. +Jerma Doctor framework uses OpenStack Train integrated into its test cases. Release Data ============ Doctor changes -- Maintenance use case updated to support latest version of Fenix running - in container on controller node -- Maintenance use case now support Fuel installer -- Doctor updated to use OpenStack Stein and only python 3.6 -- Testing only sample inspector as lacking installer support for - Vitrage and Congress +- Maintenance use case updated to support the latest version of Fenix. +- Maintenance use case now supports ETSI FEAT03 optimization with Fenix. +- Doctor testing is now preferably done in a DevStack environment + where one can easily select an OpenStack release from Rocky to Ussuri to + test Doctor functionality. The latest OPNFV Fuel can also be used for the + OpenStack version it supports. -Releng changes +Doctor CI -- Doctor testing running with python 3.6 and with sample inspector -- Doctor is only tested with Fuel installer +- Doctor is tested with the Fuel installer. +- Fault management use case is tested with the sample inspector. +- Maintenance use case is tested with the sample implementation and against + the latest Fenix version. This includes the new ETSI FEAT03 optimization. Version change ^^^^^^^^^^^^^^ @@ -41,12 +49,13 @@ Version change Module version changes ~~~~~~~~~~~~~~~~~~~~~~ -- OpenStack has changed from Rocky to Stein since previous Hunter release. +- OpenStack has changed from Stein to Train since the previous Iruya release. Document version changes ~~~~~~~~~~~~~~~~~~~~~~~~ -N/A +All documentation is updated to the OPNFV unified format according to the +documentation guidelines. Small updates in many documents. Reason for version ^^^^^^^^^^^^^^^^^^ @@ -56,11 +65,14 @@ N/A Feature additions ~~~~~~~~~~~~~~~~~ -+--------------------+--------------------------------------------------------------+ -| **JIRA REFERENCE** | **SLOGAN** | -+--------------------+--------------------------------------------------------------+ -| DOCTOR-134 | Update Doctor maintenance use case to work with latest Fenix | -+--------------------+--------------------------------------------------------------+ ++--------------------+--------------------------------------------+ +| **JIRA REFERENCE** | **SLOGAN** | ++--------------------+--------------------------------------------+ +| DOCTOR-137 | VNFM maintenance with ETSI changes | ++--------------------+--------------------------------------------+ +| DOCTOR-136 | DevStack support | ++--------------------+--------------------------------------------+ + Deliverables ------------ @@ -127,3 +139,8 @@ References For more information about the OPNFV Doctor latest work, please see: https://wiki.opnfv.org/display/doctor/Doctor+Home + +Further information about the ETSI FEAT03 optimization can be found in the Fenix +documentation: + +https://fenix.readthedocs.io/en/latest diff --git a/docs/release/release-notes/releasenotes_iruya.rst b/docs/release/release-notes/releasenotes_iruya.rst new file mode 100644 index 00000000..92775557 --- /dev/null +++ b/docs/release/release-notes/releasenotes_iruya.rst @@ -0,0 +1,129 @@ +.. This work is licensed under a Creative Commons Attribution 4.0 International License. +.. http://creativecommons.org/licenses/by/4.0 + + +This document provides the release notes for Iruya version of Doctor.
+ +Important notes +=============== + +In Iruya release there has not been many changes. + +All testing is now being made with Fuel installer. Maintenance use case +is now only tested against latest upstream Fenix. Only sample inspector is +tested as Fuel do not support Vitrage or Congress. + +Summary +======= + +Iruya Doctor framework uses OpenStack Stein integrated into its test cases. + +Release Data +============ + +Doctor changes + +- Maintenance use case updated to support latest version of Fenix running + in container on controller node +- Maintenance use case now support Fuel installer +- Doctor updated to use OpenStack Stein and only python 3.6 +- Testing only sample inspector as lacking installer support for + Vitrage and Congress + +Releng changes + +- Doctor testing running with python 3.6 and with sample inspector +- Doctor is only tested with Fuel installer + +Version change +^^^^^^^^^^^^^^ + +Module version changes +~~~~~~~~~~~~~~~~~~~~~~ + +- OpenStack has changed from Rocky to Stein since previous Hunter release. + +Document version changes +~~~~~~~~~~~~~~~~~~~~~~~~ + +N/A + +Reason for version +^^^^^^^^^^^^^^^^^^ + +N/A + +Feature additions +~~~~~~~~~~~~~~~~~ + ++--------------------+--------------------------------------------------------------+ +| **JIRA REFERENCE** | **SLOGAN** | ++--------------------+--------------------------------------------------------------+ +| DOCTOR-134 | Update Doctor maintenance use case to work with latest Fenix | ++--------------------+--------------------------------------------------------------+ + +Deliverables +------------ + +Software deliverables +===================== + +None + +Documentation deliverables +========================== + +https://git.opnfv.org/doctor/tree/docs + +Known Limitations, Issues and Workarounds +========================================= + +System Limitations +^^^^^^^^^^^^^^^^^^ + +Maintenance test case requirements: + +- Minimum number of nodes: 1 Controller, 3 Computes +- Min number of VCPUs: 2 VCPUs for each compute + +Known issues +^^^^^^^^^^^^ + +None + +Workarounds +^^^^^^^^^^^ + +None + +Test Result +=========== + +Doctor CI results with TEST_CASE='fault_management' and INSPECTOR_TYPE=sample +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ++--------------------------------------+--------------+ +| **TEST-SUITE** | **Results:** | ++--------------------------------------+--------------+ +| INSTALLER_TYPE='fuel' | SUCCESS | ++--------------------------------------+--------------+ + +Doctor CI results with TEST_CASE='maintenance' and INSPECTOR_TYPE=sample +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ++--------------------------------------+--------------+ +| **TEST-SUITE** | **Results:** | ++--------------------------------------+--------------+ +| INSTALLER_TYPE='fuel' | SUCCESS | +| ADMIN_TOOL_TYPE='fenix' *) | | ++--------------------------------------+--------------+ + +*) Sample implementation not updated according to latest upstream Fenix + and is currently not being tested. + +References +========== + +For more information about the OPNFV Doctor latest work, please see: + +https://wiki.opnfv.org/display/doctor/Doctor+Home diff --git a/docs/release/scenarios/fault_management/fault_management.rst b/docs/release/scenarios/fault_management/fault_management.rst new file mode 100644 index 00000000..99371201 --- /dev/null +++ b/docs/release/scenarios/fault_management/fault_management.rst @@ -0,0 +1,90 @@ +.. 
This work is licensed under a Creative Commons Attribution 4.0 International License. +.. http://creativecommons.org/licenses/by/4.0 + + +Running test cases +"""""""""""""""""" + +Functest will call the "doctor_tests/main.py" in Doctor to run the test job. +Doctor testing can also be triggered by tox on OPNFV installer jumphost. Tox +is normally used for functional, module and coding style testing in Python +project. + +Currently 'MCP' and 'devstack' installer are supported. + + +Fault management use case +""""""""""""""""""""""""" + +* A consumer of the NFVI wants to receive immediate notifications about faults + in the NFVI affecting the proper functioning of the virtual resources. + Therefore, such faults have to be detected as quickly as possible, and, when + a critical error is observed, the affected consumer is immediately informed + about the fault and can switch over to the STBY configuration. + +The faults to be monitored (and at which detection rate) will be configured by +the consumer. Once a fault is detected, the Inspector in the Doctor +architecture will check the resource map maintained by the Controller, to find +out which virtual resources are affected and then update the resources state. +The Notifier will receive the failure event requests sent from the Controller, +and notify the consumer(s) of the affected resources according to the alarm +configuration. + +Detailed workflow information is as follows: + +* Consumer(VNFM): (step 0) creates resources (network, server/instance) and an + event alarm on state down notification of that server/instance or Neutron + port. + +* Monitor: (step 1) periodically checks nodes, such as ping from/to each + dplane nic to/from gw of node, (step 2) once it fails to send out event + with "raw" fault event information to Inspector + +* Inspector: when it receives an event, it will (step 3) mark the host down + ("mark-host-down"), (step 4) map the PM to VM, and change the VM status to + down. In network failure case, also Neutron port is changed to down. + +* Controller: (step 5) sends out instance update event to Ceilometer. In network + failure case, also Neutron port is changed to down and corresponding event is + sent to Ceilometer. + +* Notifier: (step 6) Ceilometer transforms and passes the events to AODH, + (step 7) AODH will evaluate events with the registered alarm definitions, + then (step 8) it will fire the alarm to the "consumer" who owns the + instance + +* Consumer(VNFM): (step 9) receives the event and (step 10) recreates a new + instance + +Fault management test case +"""""""""""""""""""""""""" + +Functest will call the 'doctor-test' command in Doctor to run the test job. + +The following steps are executed: + +Firstly, get the installer ip according to the installer type. Then ssh to +the installer node to get the private key for accessing to the cloud. As +'fuel' installer, ssh to the controller node to modify nova and ceilometer +configurations. + +Secondly, prepare image for booting VM, then create a test project and test +user (both default to doctor) for the Doctor tests. + +Thirdly, boot a VM under the doctor project and check the VM status to verify +that the VM is launched completely. Then get the compute host info where the VM +is launched to verify connectivity to the target compute host. Get the consumer +ip according to the route to compute ip and create an alarm event in Ceilometer +using the consumer ip. 
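A minimal sketch of such an event alarm, assuming the aodh CLI and illustrative values for the alarm name, query and consumer URL (the Doctor test code may use different values):

.. code-block:: bash

    # Event alarm that fires when the instance state changes to error;
    # the alarm action points to the consumer started by the Doctor tests.
    aodh alarm create --type event --name doctor_alarm \
        --event-type "compute.instance.update" \
        --query "traits.state=string::error" \
        --alarm-action "http://<consumer_ip>:<consumer_port>/failure"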
+ +Fourthly, the Doctor components are started, and, based on the above preparation, +a failure is injected to the system, i.e. the network of compute host is +disabled for 3 minutes. To ensure the host is down, the status of the host +will be checked. + +Finally, the notification time, i.e. the time between the execution of step 2 +(Monitor detects failure) and step 9 (Consumer receives failure notification) +is calculated. + +According to the Doctor requirements, the Doctor test is successful if the +notification time is below 1 second. diff --git a/docs/development/overview/functest_scenario/images/Fault-management-design.png b/docs/release/scenarios/maintenance/images/Fault-management-design.png Binary files differindex 6d98cdec..6d98cdec 100644 --- a/docs/development/overview/functest_scenario/images/Fault-management-design.png +++ b/docs/release/scenarios/maintenance/images/Fault-management-design.png diff --git a/docs/development/overview/functest_scenario/images/LICENSE b/docs/release/scenarios/maintenance/images/LICENSE index 21a2d03d..21a2d03d 100644 --- a/docs/development/overview/functest_scenario/images/LICENSE +++ b/docs/release/scenarios/maintenance/images/LICENSE diff --git a/docs/development/overview/functest_scenario/images/Maintenance-design.png b/docs/release/scenarios/maintenance/images/Maintenance-design.png Binary files differindex 8f21db6a..8f21db6a 100644 --- a/docs/development/overview/functest_scenario/images/Maintenance-design.png +++ b/docs/release/scenarios/maintenance/images/Maintenance-design.png diff --git a/docs/development/overview/functest_scenario/images/Maintenance-workflow.png b/docs/release/scenarios/maintenance/images/Maintenance-workflow.png Binary files differindex 9b65fd59..9b65fd59 100644 --- a/docs/development/overview/functest_scenario/images/Maintenance-workflow.png +++ b/docs/release/scenarios/maintenance/images/Maintenance-workflow.png diff --git a/docs/development/overview/functest_scenario/doctor-scenario-in-functest.rst b/docs/release/scenarios/maintenance/maintenance.rst index 4505dd8f..ecfe76b1 100644 --- a/docs/development/overview/functest_scenario/doctor-scenario-in-functest.rst +++ b/docs/release/scenarios/maintenance/maintenance.rst @@ -2,142 +2,6 @@ .. http://creativecommons.org/licenses/by/4.0 - -Platform overview -""""""""""""""""" - -Doctor platform provides these features since `Danube Release <https://wiki.opnfv.org/display/SWREL/Danube>`_: - -* Immediate Notification -* Consistent resource state awareness for compute host down -* Valid compute host status given to VM owner - -These features enable high availability of Network Services on top of -the virtualized infrastructure. Immediate notification allows VNF managers -(VNFM) to process recovery actions promptly once a failure has occurred. -Same framework can also be utilized to have VNFM awareness about -infrastructure maintenance. - -Consistency of resource state is necessary to execute recovery actions -properly in the VIM. - -Ability to query host status gives VM owner the possibility to get -consistent state information through an API in case of a compute host -fault. - -The Doctor platform consists of the following components: - -* OpenStack Compute (Nova) -* OpenStack Networking (Neutron) -* OpenStack Telemetry (Ceilometer) -* OpenStack Alarming (AODH) -* Doctor Sample Inspector, OpenStack Congress or OpenStack Vitrage -* Doctor Sample Monitor or any monitor supported by Congress or Vitrage - -.. note:: - Doctor Sample Monitor is used in Doctor testing. 
However in real - implementation like Vitrage, there are several other monitors supported. - -You can see an overview of the Doctor platform and how components interact in -:numref:`figure-p1`. - -.. figure:: ./images/Fault-management-design.png - :name: figure-p1 - :width: 100% - - Doctor platform and typical sequence - -Detailed information on the Doctor architecture can be found in the Doctor -requirements documentation: -http://artifacts.opnfv.org/doctor/docs/requirements/05-implementation.html - -Running test cases -"""""""""""""""""" - -Functest will call the "doctor_tests/main.py" in Doctor to run the test job. -Doctor testing can also be triggered by tox on OPNFV installer jumphost. Tox -is normally used for functional, module and coding style testing in Python -project. - -Currently, 'Apex', 'MCP' and 'local' installer are supported. - - -Fault management use case -""""""""""""""""""""""""" - -* A consumer of the NFVI wants to receive immediate notifications about faults - in the NFVI affecting the proper functioning of the virtual resources. - Therefore, such faults have to be detected as quickly as possible, and, when - a critical error is observed, the affected consumer is immediately informed - about the fault and can switch over to the STBY configuration. - -The faults to be monitored (and at which detection rate) will be configured by -the consumer. Once a fault is detected, the Inspector in the Doctor -architecture will check the resource map maintained by the Controller, to find -out which virtual resources are affected and then update the resources state. -The Notifier will receive the failure event requests sent from the Controller, -and notify the consumer(s) of the affected resources according to the alarm -configuration. - -Detailed workflow information is as follows: - -* Consumer(VNFM): (step 0) creates resources (network, server/instance) and an - event alarm on state down notification of that server/instance or Neutron - port. - -* Monitor: (step 1) periodically checks nodes, such as ping from/to each - dplane nic to/from gw of node, (step 2) once it fails to send out event - with "raw" fault event information to Inspector - -* Inspector: when it receives an event, it will (step 3) mark the host down - ("mark-host-down"), (step 4) map the PM to VM, and change the VM status to - down. In network failure case, also Neutron port is changed to down. - -* Controller: (step 5) sends out instance update event to Ceilometer. In network - failure case, also Neutron port is changed to down and corresponding event is - sent to Ceilometer. - -* Notifier: (step 6) Ceilometer transforms and passes the events to AODH, - (step 7) AODH will evaluate events with the registered alarm definitions, - then (step 8) it will fire the alarm to the "consumer" who owns the - instance - -* Consumer(VNFM): (step 9) receives the event and (step 10) recreates a new - instance - -Fault management test case -"""""""""""""""""""""""""" - -Functest will call the 'doctor-test' command in Doctor to run the test job. - -The following steps are executed: - -Firstly, get the installer ip according to the installer type. Then ssh to -the installer node to get the private key for accessing to the cloud. As -'fuel' installer, ssh to the controller node to modify nova and ceilometer -configurations. - -Secondly, prepare image for booting VM, then create a test project and test -user (both default to doctor) for the Doctor tests. 
- -Thirdly, boot a VM under the doctor project and check the VM status to verify -that the VM is launched completely. Then get the compute host info where the VM -is launched to verify connectivity to the target compute host. Get the consumer -ip according to the route to compute ip and create an alarm event in Ceilometer -using the consumer ip. - -Fourthly, the Doctor components are started, and, based on the above preparation, -a failure is injected to the system, i.e. the network of compute host is -disabled for 3 minutes. To ensure the host is down, the status of the host -will be checked. - -Finally, the notification time, i.e. the time between the execution of step 2 -(Monitor detects failure) and step 9 (Consumer receives failure notification) -is calculated. - -According to the Doctor requirements, the Doctor test is successful if the -notification time is below 1 second. - Maintenance use case """""""""""""""""""" @@ -249,7 +113,8 @@ After all computes are maintained, `admin tool` can send `MAINTENANCE_COMPLETE` to tell maintenance/upgrade is now complete. For `app manager` this means he can scale back to full capacity. -This is the current sample implementation and test case. Real life -implementation is started in OpenStack Fenix project and there we should -eventually address requirements more deeply and update the test case with Fenix -implementation. +There is currently a sample implementation of the VNFM and a test case. On the +infrastructure side there is a sample implementation of the 'admin_tool', and +there is also support for OpenStack Fenix, which extends the use case to +support 'ETSI FEAT03' for VNFM interaction and to optimize the whole +infrastructure maintenance and upgrade. diff --git a/docs/development/manuals/get-valid-server-state.rst b/docs/release/userguide/get-valid-server-state.rst index 824ea3c2..824ea3c2 100644 --- a/docs/development/manuals/get-valid-server-state.rst +++ b/docs/release/userguide/get-valid-server-state.rst diff --git a/docs/release/userguide/index.rst b/docs/release/userguide/index.rst index eee855dc..577072c7 100644 --- a/docs/release/userguide/index.rst +++ b/docs/release/userguide/index.rst @@ -11,3 +11,6 @@ Doctor User Guide :maxdepth: 2 feature.userguide.rst + get-valid-server-state.rst + mark-host-down_manual.rst + monitors.rst diff --git a/docs/development/manuals/mark-host-down_manual.rst b/docs/release/userguide/mark-host-down_manual.rst index 3815205d..3815205d 100644 --- a/docs/development/manuals/mark-host-down_manual.rst +++ b/docs/release/userguide/mark-host-down_manual.rst diff --git a/docs/development/manuals/monitors.rst b/docs/release/userguide/monitors.rst index eeb5e226..eeb5e226 100644 --- a/docs/development/manuals/monitors.rst +++ b/docs/release/userguide/monitors.rst diff --git a/docs/testing/developer/index.rst b/docs/testing/developer/index.rst new file mode 100644 index 00000000..dfbcfa74 --- /dev/null +++ b/docs/testing/developer/index.rst @@ -0,0 +1,13 @@ +.. This work is licensed under a Creative Commons Attribution 4.0 International License. +.. SPDX-License-Identifier: CC-BY-4.0 +.. (c) Open Platform for NFV Project, Inc. and its contributors + +********* +Developer +********* + +..
toctree:: + :numbered: + :maxdepth: 2 + + testing.rst diff --git a/docs/development/overview/testing.rst b/docs/testing/developer/testing.rst index 663d4c3f..6a929130 100644 --- a/docs/development/overview/testing.rst +++ b/docs/testing/developer/testing.rst @@ -38,11 +38,19 @@ export TEST_CASE with different values: export TEST_CASE='fault_management' #Maintenance (requires 3 compute nodes) export TEST_CASE='maintenance' - #Use Fenix in maintenance testing instead of sample admin_tool - export ADMIN_TOOL_TYPE='fenix' #Run both tests cases export TEST_CASE='all' + #Use Fenix in maintenance testing instead of sample admin_tool + #This is only for 'mainteanance' test case + export ADMIN_TOOL_TYPE='fenix' + export APP_MANAGER_TYPE='vnfm' + + #Run in different installer jumphost 'fuel' or 'apex' + #In multinode DevStack you run Doctor in controller node + #with value export APP_MANAGER_TYPE=vnfm + export INSTALLER_TYPE='fuel' + Run Python Test Script ~~~~~~~~~~~~~~~~~~~~~~ @@ -59,7 +67,8 @@ environment and then run the test. .. _doctor.sample.conf: https://git.opnfv.org/doctor/tree/etc/doctor.sample.conf -In OPNFV Apex jumphost you can run Doctor testing as follows using tox: +In OPNFV testing environment jumphost you can run Doctor testing as follows +using tox: .. code-block:: bash @@ -69,31 +78,5 @@ In OPNFV Apex jumphost you can run Doctor testing as follows using tox: git clone https://gerrit.opnfv.org/gerrit/doctor cd doctor sudo -E tox - -Run Functest Suite -================== - -Functest supports Doctor testing by triggering the test script above in a -Functest container. You can run the Doctor test with the following steps: - -.. code-block:: bash - - DOCKER_TAG=latest - docker pull docker.io/opnfv/functest-features:${DOCKER_TAG} - docker run --privileged=true -id \ - -e INSTALLER_TYPE=${INSTALLER_TYPE} \ - -e INSTALLER_IP=${INSTALLER_IP} \ - -e INSPECTOR_TYPE=sample \ - docker.io/opnfv/functest-features:${DOCKER_TAG} /bin/bash - docker exec <container_id> functest testcase run doctor-notification - -See `Functest Userguide`_ for more information. - -.. _Functest Userguide: :doc:`<functest:testing/user/userguide>` - - -For testing with stable version, change DOCKER_TAG to 'stable' or other release -tag identifier. - -Tips -==== + +Note! In DevStack you run Doctor in controller node. diff --git a/docs/testing/index.rst b/docs/testing/index.rst new file mode 100644 index 00000000..3fae9568 --- /dev/null +++ b/docs/testing/index.rst @@ -0,0 +1,15 @@ +.. This work is licensed under a Creative Commons Attribution 4.0 International License. +.. SPDX-License-Identifier: CC-BY-4.0 +.. (c) Open Platform for NFV Project, Inc. and its contributors + +.. _testing: + +======= +Testing +======= + +.. toctree:: + :maxdepth: 2 + + ./developer/index.rst + ./user/index.rst diff --git a/docs/testing/user/index.rst b/docs/testing/user/index.rst new file mode 100644 index 00000000..1be9c7eb --- /dev/null +++ b/docs/testing/user/index.rst @@ -0,0 +1,13 @@ +.. This work is licensed under a Creative Commons Attribution 4.0 International License. +.. SPDX-License-Identifier: CC-BY-4.0 +.. (c) Open Platform for NFV Project, Inc. and its contributors + +**** +User +**** + +.. toctree:: + :numbered: + :maxdepth: 2 + + testing.rst diff --git a/docs/testing/user/testing.rst b/docs/testing/user/testing.rst new file mode 100644 index 00000000..6172d26a --- /dev/null +++ b/docs/testing/user/testing.rst @@ -0,0 +1,30 @@ +.. This work is licensed under a Creative Commons Attribution 4.0 International License. +.. 
http://creativecommons.org/licenses/by/4.0 + +Run Functest Suite (obsolete) +============================= + +Functest supports Doctor testing by triggering the test script above in a +Functest container. You can run the Doctor test with the following steps: + +.. code-block:: bash + + DOCKER_TAG=latest + docker pull docker.io/opnfv/functest-features:${DOCKER_TAG} + docker run --privileged=true -id \ + -e INSTALLER_TYPE=${INSTALLER_TYPE} \ + -e INSTALLER_IP=${INSTALLER_IP} \ + -e INSPECTOR_TYPE=sample \ + docker.io/opnfv/functest-features:${DOCKER_TAG} /bin/bash + docker exec <container_id> functest testcase run doctor-notification + +See `Functest Userguide`_ for more information. + +.. _Functest Userguide: :doc:`<functest:testing/user/userguide>` + + +For testing with stable version, change DOCKER_TAG to 'stable' or other release +tag identifier. + +Tips +==== diff --git a/doctor_tests/admin_tool/fenix/Dockerfile b/doctor_tests/admin_tool/fenix/Dockerfile index 5804b207..202380eb 100644 --- a/doctor_tests/admin_tool/fenix/Dockerfile +++ b/doctor_tests/admin_tool/fenix/Dockerfile @@ -21,7 +21,7 @@ RUN apk --no-cache add ca-certificates && \ if [ ! -e /usr/bin/pip ]; then ln -s pip3 /usr/bin/pip ; fi && \ if [[ ! -e /usr/bin/python ]]; then ln -sf /usr/bin/python3 /usr/bin/python; fi && \ pip3 install --upgrade pip && \ - pip3 install alembic aodhclient decorator flask Flask-RESTful eventlet \ + pip3 install alembic aodhclient decorator flask Flask-RESTful eventlet jsonschema \ keystoneauth1 keystonemiddleware python-novaclient oslo.config pecan \ oslo.db oslo.log oslo.messaging oslo.serialization oslo.service oslo_policy \ oslotest oslo.utils pbr pymysql six sqlalchemy -cupper-constraints.txt && \ diff --git a/doctor_tests/app_manager/__init__.py b/doctor_tests/app_manager/__init__.py index 717d6587..c2f75918 100644 --- a/doctor_tests/app_manager/__init__.py +++ b/doctor_tests/app_manager/__init__.py @@ -8,12 +8,13 @@ ############################################################################## from oslo_config import cfg from oslo_utils import importutils +import os OPTS = [ cfg.StrOpt('type', - default='sample', - choices=['sample'], + default=os.environ.get('APP_MANAGER_TYPE', 'sample'), + choices=['sample', 'vnfm'], help='the component of doctor app manager', required=True), cfg.StrOpt('ip', @@ -28,7 +29,8 @@ OPTS = [ _app_manager_name_class_mapping = { - 'sample': 'doctor_tests.app_manager.sample.SampleAppManager' + 'sample': 'doctor_tests.app_manager.sample.SampleAppManager', + 'vnfm': 'doctor_tests.app_manager.vnfm.VNFM', } diff --git a/doctor_tests/app_manager/sample.py b/doctor_tests/app_manager/sample.py index 94049aa2..7ca35b97 100644 --- a/doctor_tests/app_manager/sample.py +++ b/doctor_tests/app_manager/sample.py @@ -165,7 +165,7 @@ class AppManager(Thread): data = json.loads(request.data.decode('utf8')) try: payload = self._alarm_traits_decoder(data) - except: + except Exception: payload = ({t[0]: t[2] for t in data['reason_data']['event']['traits']}) self.log.error('cannot parse alarm data: %s' % payload) diff --git a/doctor_tests/app_manager/vnfm.py b/doctor_tests/app_manager/vnfm.py new file mode 100644 index 00000000..68fdbb88 --- /dev/null +++ b/doctor_tests/app_manager/vnfm.py @@ -0,0 +1,441 @@ +############################################################################## +# Copyright (c) 2018 Nokia Corporation and others. +# +# All rights reserved. 
This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +############################################################################## +from flask import Flask +from flask import request +import json +import requests +from threading import Thread +import time +import uuid +import yaml + +from doctor_tests.app_manager.base import BaseAppManager +from doctor_tests.identity_auth import get_identity_auth +from doctor_tests.identity_auth import get_session +from doctor_tests.os_clients import neutron_client +from doctor_tests.os_clients import nova_client +from doctor_tests.os_clients import keystone_client + + +class VNFM(BaseAppManager): + + def __init__(self, stack, conf, log): + super(VNFM, self).__init__(conf, log) + self.stack = stack + self.app = None + + def start(self): + self.log.info('VNFM start......') + self.app = VNFManager(self.stack, self.conf, self, self.log) + self.app.start() + + def stop(self): + self.log.info('VNFM stop......') + if not self.app: + return + self.app.delete_constraints() + headers = { + 'Content-Type': 'application/json', + 'Accept': 'application/json', + } + url = 'http://%s:%d/shutdown'\ + % (self.conf.app_manager.ip, + self.conf.app_manager.port) + requests.post(url, data='', headers=headers) + + +class VNFManager(Thread): + + def __init__(self, stack, conf, app_manager, log): + Thread.__init__(self) + self.stack = stack + self.conf = conf + self.port = self.conf.app_manager.port + self.app_manager = app_manager + self.log = log + self.intance_ids = None + self.auth = get_identity_auth(project=self.conf.doctor_project) + self.session = get_session(auth=self.auth) + self.keystone = keystone_client( + self.conf.keystone_version, self.session) + self.nova = nova_client(self.conf.nova_version, + self.session) + self.neutron = neutron_client(session=self.session) + self.headers = { + 'Content-Type': 'application/json', + 'Accept': 'application/json'} + if self.conf.admin_tool.type == 'fenix': + self.headers['X-Auth-Token'] = self.session.get_token() + self.orig_number_of_instances = self.number_of_instances() + # List of instances + self.ha_instances = [] + self.nonha_instances = [] + # Different instance_id specific constraints {instanse_id: {},...} + self.instance_constraints = None + # Update existing instances to instance lists + self.update_instances() + nonha_instances = len(self.nonha_instances) + if nonha_instances < 7: + self.scale = 2 + self.max_impacted = 2 + else: + self.scale = int((nonha_instances) / 2) + self.max_impacted = self.scale - 1 + self.log.info('Init nonha_instances: %s scale: %s: max_impacted %s' % + (nonha_instances, self.scale, self.max_impacted)) + # Different instance groups constraints dict + self.ha_group = None + self.nonha_group = None + # Floating IP used in HA instance + self.floating_ip = None + # VNF project_id + self.project_id = None + # HA instance_id that is active / has floating IP + self.active_instance_id = self.active_instance_id() + + services = self.keystone.services.list() + for service in services: + if service.type == 'maintenance': + self.log.info('maintenance service: %s:%s type %s' + % (service.name, service.id, service.type)) + maint_id = service.id + self.maint_endpoint = [ep.url for ep in self.keystone.endpoints.list() + if ep.service_id == maint_id and + ep.interface == 'public'][0] + self.log.info('maintenance endpoint: %s' % 
self.maint_endpoint) + self.update_constraints_lock = False + self.update_constraints() + + def delete_remote_instance_constraints(self, instance_id): + url = "%s/instance/%s" % (self.maint_endpoint, instance_id) + self.log.info('DELETE: %s' % url) + ret = requests.delete(url, data=None, headers=self.headers) + if ret.status_code != 200 and ret.status_code != 204: + raise Exception(ret.text) + + def update_remote_instance_constraints(self, instance): + url = "%s/instance/%s" % (self.maint_endpoint, instance["instance_id"]) + self.log.info('PUT: %s' % url) + ret = requests.put(url, data=json.dumps(instance), + headers=self.headers) + if ret.status_code != 200 and ret.status_code != 204: + raise Exception(ret.text) + + def delete_remote_group_constraints(self, instance_group): + url = "%s/instance_group/%s" % (self.maint_endpoint, + instance_group["group_id"]) + self.log.info('DELETE: %s' % url) + ret = requests.delete(url, data=None, headers=self.headers) + if ret.status_code != 200 and ret.status_code != 204: + raise Exception(ret.text) + + def update_remote_group_constraints(self, instance_group): + url = "%s/instance_group/%s" % (self.maint_endpoint, + instance_group["group_id"]) + self.log.info('PUT: %s' % url) + ret = requests.put(url, data=json.dumps(instance_group), + headers=self.headers) + if ret.status_code != 200 and ret.status_code != 204: + raise Exception(ret.text) + + def delete_constraints(self): + if self.conf.admin_tool.type == 'fenix': + self.headers['X-Auth-Token'] = self.session.get_token() + for instance_id in self.instance_constraints: + self.delete_remote_instance_constraints(instance_id) + self.delete_remote_group_constraints(self.nonha_group) + self.delete_remote_group_constraints(self.ha_group) + + def update_constraints(self): + while self.update_constraints_lock: + self.log.info('Waiting update_constraints_lock...') + time.sleep(1) + self.update_constraints_lock = True + self.log.info('Update constraints') + if self.project_id is None: + self.project_id = self.keystone.projects.list( + name=self.conf.doctor_project)[0].id + if self.nonha_group is None: + # Nova does not support groupping instances that do not belong to + # anti-affinity server_groups. 
Anyhow all instances need groupping + self.nonha_group = { + "group_id": str(uuid.uuid4()), + "project_id": self.project_id, + "group_name": "doctor_nonha_app_group", + "anti_affinity_group": False, + "max_instances_per_host": 0, + "max_impacted_members": self.max_impacted, + "recovery_time": 2, + "resource_mitigation": True} + self.log.info('create doctor_nonha_app_group constraints: %s' + % self.nonha_group) + self.update_remote_group_constraints(self.nonha_group) + if self.ha_group is None: + group_id = [sg.id for sg in self.nova.server_groups.list() + if sg.name == "doctor_ha_app_group"][0] + self.ha_group = { + "group_id": group_id, + "project_id": self.project_id, + "group_name": "doctor_ha_app_group", + "anti_affinity_group": True, + "max_instances_per_host": 1, + "max_impacted_members": 1, + "recovery_time": 4, + "resource_mitigation": True} + self.log.info('create doctor_ha_app_group constraints: %s' + % self.ha_group) + self.update_remote_group_constraints(self.ha_group) + instance_constraints = {} + for ha_instance in self.ha_instances: + instance = { + "instance_id": ha_instance.id, + "project_id": self.project_id, + "group_id": self.ha_group["group_id"], + "instance_name": ha_instance.name, + "max_interruption_time": 120, + "migration_type": "MIGRATE", + "resource_mitigation": True, + "lead_time": 40} + self.log.info('create ha instance constraints: %s' + % instance) + instance_constraints[ha_instance.id] = instance + for nonha_instance in self.nonha_instances: + instance = { + "instance_id": nonha_instance.id, + "project_id": self.project_id, + "group_id": self.nonha_group["group_id"], + "instance_name": nonha_instance.name, + "max_interruption_time": 120, + "migration_type": "MIGRATE", + "resource_mitigation": True, + "lead_time": 40} + self.log.info('create nonha instance constraints: %s' + % instance) + instance_constraints[nonha_instance.id] = instance + if not self.instance_constraints: + # Initial instance constraints + self.log.info('create initial instances constraints...') + for instance in [instance_constraints[i] for i + in instance_constraints]: + self.update_remote_instance_constraints(instance) + self.instance_constraints = instance_constraints.copy() + else: + self.log.info('check instances constraints changes...') + added = [i for i in instance_constraints.keys() + if i not in self.instance_constraints] + deleted = [i for i in self.instance_constraints.keys() + if i not in instance_constraints] + modified = [i for i in instance_constraints.keys() + if (i not in added and i not in deleted and + instance_constraints[i] != + self.instance_constraints[i])] + for instance_id in deleted: + self.delete_remote_instance_constraints(instance_id) + updated = added + modified + for instance in [instance_constraints[i] for i in updated]: + self.update_remote_instance_constraints(instance) + if updated or deleted: + # Some instance constraints have changed + self.instance_constraints = instance_constraints.copy() + self.update_constraints_lock = False + + def active_instance_id(self): + # Need rertry as it takes time after heat template done before + # Floating IP in place + retry = 5 + while retry > 0: + for instance in self.ha_instances: + network_interfaces = next(iter(instance.addresses.values())) + for network_interface in network_interfaces: + _type = network_interface.get('OS-EXT-IPS:type') + if _type == "floating": + if not self.floating_ip: + self.floating_ip = network_interface.get('addr') + self.log.debug('active_instance: %s %s' % + (instance.name, instance.id)) 
+ return instance.id + time.sleep(2) + self.update_instances() + retry -= 1 + raise Exception("No active instance found") + + def switch_over_ha_instance(self): + for instance in self.ha_instances: + if instance.id != self.active_instance_id: + self.log.info('Switch over to: %s %s' % (instance.name, + instance.id)) + # Deprecated, need to use neutron instead + # instance.add_floating_ip(self.floating_ip) + port = self.neutron.list_ports(device_id=instance.id)['ports'][0]['id'] # noqa + floating_id = self.neutron.list_floatingips(floating_ip_address=self.floating_ip)['floatingips'][0]['id'] # noqa + self.neutron.update_floatingip(floating_id, {'floatingip': {'port_id': port}}) # noqa + # Have to update ha_instances as floating_ip changed + self.update_instances() + self.active_instance_id = instance.id + break + + def get_instance_ids(self): + ret = list() + for instance in self.nova.servers.list(detailed=False): + ret.append(instance.id) + return ret + + def update_instances(self): + instances = self.nova.servers.list(detailed=True) + self.ha_instances = [i for i in instances + if "doctor_ha_app_" in i.name] + self.nonha_instances = [i for i in instances + if "doctor_nonha_app_" in i.name] + + def _alarm_data_decoder(self, data): + if "[" in data or "{" in data: + # string to list or dict removing unicode + data = yaml.load(data.replace("u'", "'")) + return data + + def _alarm_traits_decoder(self, data): + return ({str(t[0]): self._alarm_data_decoder(str(t[2])) + for t in data['reason_data']['event']['traits']}) + + def get_session_instance_ids(self, url, session_id): + ret = requests.get(url, data=None, headers=self.headers) + if ret.status_code != 200: + raise Exception(ret.text) + self.log.info('get_instance_ids %s' % ret.json()) + return ret.json()['instance_ids'] + + def scale_instances(self, number_of_instances): + number_of_instances_before = self.number_of_instances() + + parameters = self.stack.parameters + parameters['nonha_intances'] += number_of_instances + self.stack.update(self.stack.stack_name, + self.stack.stack_id, + self.stack.template, + parameters=parameters, + files=self.stack.files) + + number_of_instances_after = self.number_of_instances() + if (number_of_instances_before + number_of_instances != + number_of_instances_after): + self.log.error('scale_instances with: %d from: %d ends up to: %d' + % (number_of_instances, number_of_instances_before, + number_of_instances_after)) + raise Exception('scale_instances failed') + + self.log.info('scaled instances from %d to %d' % + (number_of_instances_before, + number_of_instances_after)) + + def number_of_instances(self): + return len(self.nova.servers.list(detailed=False)) + + def run(self): + app = Flask('VNFM') + + @app.route('/maintenance', methods=['POST']) + def maintenance_alarm(): + data = json.loads(request.data.decode('utf8')) + try: + payload = self._alarm_traits_decoder(data) + except Exception: + payload = ({t[0]: t[2] for t in + data['reason_data']['event']['traits']}) + self.log.error('cannot parse alarm data: %s' % payload) + raise Exception('VNFM cannot parse alarm.' 
+ 'Possibly trait data over 256 char') + + self.log.info('VNFM received data = %s' % payload) + + state = payload['state'] + reply_state = None + reply = dict() + + self.log.info('VNFM state: %s' % state) + + if state == 'MAINTENANCE': + instance_ids = (self.get_session_instance_ids( + payload['instance_ids'], + payload['session_id'])) + my_instance_ids = self.get_instance_ids() + invalid_instances = ( + [instance_id for instance_id in instance_ids + if instance_id not in my_instance_ids]) + if invalid_instances: + self.log.error('Invalid instances: %s' % invalid_instances) + reply_state = 'NACK_MAINTENANCE' + else: + reply_state = 'ACK_MAINTENANCE' + + elif state == 'SCALE_IN': + # scale down "self.scale" instances that is VCPUS equaling + # at least a single compute node + self.scale_instances(-self.scale) + reply_state = 'ACK_SCALE_IN' + + elif state == 'MAINTENANCE_COMPLETE': + # possibly need to upscale + number_of_instances = self.number_of_instances() + if self.orig_number_of_instances > number_of_instances: + scale_instances = (self.orig_number_of_instances - + number_of_instances) + self.scale_instances(scale_instances) + reply_state = 'ACK_MAINTENANCE_COMPLETE' + + elif state == 'PREPARE_MAINTENANCE': + # TBD from contraints + if "MIGRATE" not in payload['allowed_actions']: + raise Exception('MIGRATE not supported') + instance_ids = payload['instance_ids'][0] + self.log.info('VNFM got instance: %s' % instance_ids) + if instance_ids == self.active_instance_id: + self.switch_over_ha_instance() + # optional also in contraints + reply['instance_action'] = "MIGRATE" + reply_state = 'ACK_PREPARE_MAINTENANCE' + + elif state == 'PLANNED_MAINTENANCE': + # TBD from contraints + if "MIGRATE" not in payload['allowed_actions']: + raise Exception('MIGRATE not supported') + instance_ids = payload['instance_ids'][0] + self.log.info('VNFM got instance: %s' % instance_ids) + if instance_ids == self.active_instance_id: + self.switch_over_ha_instance() + # optional also in contraints + reply['instance_action'] = "MIGRATE" + reply_state = 'ACK_PLANNED_MAINTENANCE' + + elif state == 'INSTANCE_ACTION_DONE': + # TBD was action done in allowed window + self.log.info('%s' % payload['instance_ids']) + else: + raise Exception('VNFM received event with' + ' unknown state %s' % state) + + if reply_state: + if self.conf.admin_tool.type == 'fenix': + self.headers['X-Auth-Token'] = self.session.get_token() + reply['state'] = reply_state + url = payload['reply_url'] + self.log.info('VNFM reply: %s' % reply) + requests.put(url, data=json.dumps(reply), headers=self.headers) + + return 'OK' + + @app.route('/shutdown', methods=['POST']) + def shutdown(): + self.log.info('shutdown VNFM server at %s' % time.time()) + func = request.environ.get('werkzeug.server.shutdown') + if func is None: + raise RuntimeError('Not running with the Werkzeug Server') + func() + return 'VNFM shutting down...' + + app.run(host="0.0.0.0", port=self.port) diff --git a/doctor_tests/common/utils.py b/doctor_tests/common/utils.py index 1a8840dd..67ca4f4b 100644 --- a/doctor_tests/common/utils.py +++ b/doctor_tests/common/utils.py @@ -10,6 +10,7 @@ import json import os import paramiko import re +import subprocess def load_json_file(full_path): @@ -97,6 +98,27 @@ class SSHClient(object): ftp.close() +class LocalSSH(object): + + def __init__(self, log): + self.log = log + self.log.info('Init local ssh client') + + def ssh(self, cmd): + ret = 0 + output = "%s failed!!!" 
% cmd + try: + output = subprocess.check_output((cmd), shell=True, + universal_newlines=True) + except subprocess.CalledProcessError: + ret = 1 + return ret, output + + def scp(self, src_file, dst_file): + return subprocess.check_output("cp %s %s" % (src_file, dst_file), + shell=True) + + def run_async(func): from threading import Thread from functools import wraps diff --git a/doctor_tests/image.py b/doctor_tests/image.py index 9961b22d..50841ef6 100644 --- a/doctor_tests/image.py +++ b/doctor_tests/image.py @@ -7,7 +7,11 @@ # http://www.apache.org/licenses/LICENSE-2.0 ############################################################################## import os -import urllib.request +try: + from urllib.request import urlopen +except Exception: + from urllib2 import urlopen + from oslo_config import cfg @@ -46,11 +50,14 @@ class Image(object): def create(self): self.log.info('image create start......') - images = {image.name: image for image in self.glance.images.list()} + if self.conf.image_name == 'cirros': + cirros = [image for image in images if 'cirros' in image] + if cirros: + self.conf.image_name = cirros[0] if self.conf.image_name not in images: if not os.path.exists(self.conf.image_filename): - resp = urllib.request.urlopen(self.conf.image_download_url) + resp = urlopen(self.conf.image_download_url) with open(self.conf.image_filename, "wb") as file: file.write(resp.read()) self.image = \ diff --git a/doctor_tests/inspector/sample.py b/doctor_tests/inspector/sample.py index 70156b20..c44db95d 100644 --- a/doctor_tests/inspector/sample.py +++ b/doctor_tests/inspector/sample.py @@ -52,7 +52,7 @@ class SampleInspector(BaseInspector): driver='messaging', topics=['notifications']) self.notif = self.notif.prepare(publisher_id='sample') - except: + except Exception: self.notif = None def _init_novaclients(self): @@ -135,7 +135,7 @@ class SampleInspector(BaseInspector): def maintenance(self, data): try: payload = self._alarm_traits_decoder(data) - except: + except Exception: payload = ({t[0]: t[2] for t in data['reason_data']['event']['traits']}) self.log.error('cannot parse alarm data: %s' % payload) diff --git a/doctor_tests/installer/__init__.py b/doctor_tests/installer/__init__.py index 2b9ad83d..00a01667 100644 --- a/doctor_tests/installer/__init__.py +++ b/doctor_tests/installer/__init__.py @@ -13,8 +13,8 @@ from oslo_utils import importutils OPTS = [ cfg.StrOpt('type', - default=os.environ.get('INSTALLER_TYPE', 'local'), - choices=['local', 'apex', 'daisy', 'fuel'], + default=os.environ.get('INSTALLER_TYPE', 'devstack'), + choices=['apex', 'daisy', 'fuel', 'devstack'], help='the type of installer', required=True), cfg.StrOpt('ip', @@ -28,10 +28,10 @@ OPTS = [ _installer_name_class_mapping = { - 'local': 'doctor_tests.installer.local.LocalInstaller', 'apex': 'doctor_tests.installer.apex.ApexInstaller', 'daisy': 'doctor_tests.installer.daisy.DaisyInstaller', - 'fuel': 'doctor_tests.installer.mcp.McpInstaller' + 'fuel': 'doctor_tests.installer.mcp.McpInstaller', + 'devstack': 'doctor_tests.installer.devstack.DevstackInstaller' } diff --git a/doctor_tests/installer/base.py b/doctor_tests/installer/base.py index b2270654..de4d2f2e 100644 --- a/doctor_tests/installer/base.py +++ b/doctor_tests/installer/base.py @@ -11,7 +11,6 @@ import getpass import grp import os import pwd -import re import six import stat import subprocess @@ -127,47 +126,9 @@ class BaseInstaller(object): os.chmod(ssh_key, stat.S_IREAD) return ssh_key + @abc.abstractmethod def get_transport_url(self): - client = 
utils.SSHClient(self.controllers[0], self.node_user_name, - key_filename=self.key_file) - if self.use_containers: - ncbase = "/var/lib/config-data/puppet-generated/nova" - else: - ncbase = "" - try: - cmd = 'sudo grep "^transport_url" %s/etc/nova/nova.conf' % ncbase - ret, url = client.ssh(cmd) - if ret: - raise Exception('Exec command to get transport from ' - 'controller(%s) failed, ' - 'ret=%s, output=%s' - % (self.controllers[0], ret, url)) - elif self.controllers[0] not in url: - # need to use ip instead of hostname - ret = (re.sub("@.*:", "@%s:" % self.controllers[0], - url[0].split("=", 1)[1])) - except: - cmd = 'grep -i "^rabbit" %s/etc/nova/nova.conf' % ncbase - ret, lines = client.ssh(cmd) - if ret: - raise Exception('Exec command to get transport from ' - 'controller(%s) in Apex installer failed, ' - 'ret=%s, output=%s' - % (self.controllers[0], ret, url)) - else: - for line in lines.split('\n'): - if line.startswith("rabbit_userid"): - rabbit_userid = line.split("=") - if line.startswith("rabbit_port"): - rabbit_port = line.split("=") - if line.startswith("rabbit_password"): - rabbit_password = line.split("=") - ret = "rabbit://%s:%s@%s:%s/?ssl=0" % (rabbit_userid, - rabbit_password, - self.controllers[0], - rabbit_port) - self.log.debug('get_transport_url %s' % ret) - return ret + pass def _run_cmd_remote(self, client, command): self.log.info('Run command=%s in %s installer......' @@ -199,14 +160,15 @@ class BaseInstaller(object): def _run_apply_patches(self, client, restart_cmd, script_names, python='python3'): installer_dir = os.path.dirname(os.path.realpath(__file__)) - if isinstance(script_names, list): for script_name in script_names: script_abs_path = '{0}/{1}/{2}'.format(installer_dir, 'common', script_name) + if self.conf.installer.type == "devstack": + script_name = "/opt/stack/%s" % script_name try: client.scp(script_abs_path, script_name) - except: + except Exception: client.scp(script_abs_path, script_name) try: if ".py" in script_name: @@ -216,14 +178,14 @@ class BaseInstaller(object): script_name) ret, output = client.ssh(cmd) self.log.info('Command %s output %s' % (cmd, output)) - except: + except Exception: ret, output = client.ssh(cmd) - + self.log.info('Command %s output %s' % (cmd, output)) if ret: raise Exception('Do the command in remote' ' node failed, ret=%s, cmd=%s, output=%s' % (ret, cmd, output)) - if 'nova' in restart_cmd: + if 'nova' in restart_cmd or 'devstack@n-' in restart_cmd: # Make sure scheduler has proper cpu_allocation_ratio time.sleep(5) client.ssh(restart_cmd) diff --git a/doctor_tests/installer/common/set_compute_config.py b/doctor_tests/installer/common/set_compute_config.py index 76ac649b..615f1895 100644 --- a/doctor_tests/installer/common/set_compute_config.py +++ b/doctor_tests/installer/common/set_compute_config.py @@ -26,9 +26,9 @@ def set_cpu_allocation_ratio(): found_list = ([ca for ca in fcheck.readlines() if "cpu_allocation_ratio" in ca]) fcheck.close() + change = False + found = False if found_list and len(found_list): - change = False - found = False for car in found_list: if car.startswith('#'): continue diff --git a/doctor_tests/installer/common/set_fenix.sh b/doctor_tests/installer/common/set_fenix.sh index aac376cd..bd1eae47 100644 --- a/doctor_tests/installer/common/set_fenix.sh +++ b/doctor_tests/installer/common/set_fenix.sh @@ -22,14 +22,15 @@ apt-get install -y docker-ce docker-ce-cli containerd.io dpkg -r --force-depends golang-docker-credential-helpers } -docker ps | grep fenix >/dev/null && { -REMOTE=`docker exec 
-ti fenix git rev-parse origin/master` -LOCAL=`docker exec -ti fenix git rev-parse @` -if [ $LOCAL = $REMOTE ]; then - echo "Fenix start: Already running latest" +docker ps | grep fenix -q && { +REMOTE=`git ls-remote https://opendev.org/x/fenix HEAD | awk '{ print $1}'` +LOCAL=`docker exec -t fenix git rev-parse @` +if [[ "$LOCAL" =~ "$REMOTE" ]]; then + # Difference in above string ending marks, so cannot compare equal + echo "Fenix start: Already running latest $LOCAL equals $REMOTE" exit 0 else - echo "Fenix container needs to be recreated..." + echo "Fenix container needs to be recreated $LOCAL not $REMOTE" # Remove previous container for img in `docker image list | grep "^fenix" | awk '{print $1}'`; do for dock in `docker ps --all -f "ancestor=$img" | grep "$img" | awk '{print $1}'`; do @@ -75,6 +76,11 @@ echo "password = $OS_PASSWORD" >> fenix-api.conf echo "username = $OS_USERNAME" >> fenix-api.conf echo "cafile = /opt/stack/data/ca-bundle.pem" >> fenix-api.conf +openstack service list | grep -q maintenance || { +openstack service create --name fenix --enable maintenance +openstack endpoint create --region $OS_REGION_NAME --enable fenix public http://localhost:12347/v1 +} + # Mysql pw # MYSQLPW=`cat /var/lib/config-data/mysql/etc/puppet/hieradata/service_configs.json | grep mysql | grep root_password | awk -F": " '{print $2}' | awk -F"\"" '{print $2}'` MYSQLPW=root diff --git a/doctor_tests/installer/devstack.py b/doctor_tests/installer/devstack.py new file mode 100644 index 00000000..02f3601a --- /dev/null +++ b/doctor_tests/installer/devstack.py @@ -0,0 +1,151 @@ +############################################################################## +# Copyright (c) 2019 Nokia Corporation and others. +# +# All rights reserved. This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +############################################################################## +import os +import socket +import time + +from doctor_tests.common.utils import SSHClient +from doctor_tests.common.utils import LocalSSH +from doctor_tests.identity_auth import get_session +from doctor_tests.installer.base import BaseInstaller +from doctor_tests.os_clients import nova_client + + +class DevstackInstaller(BaseInstaller): + node_user_name = None + cm_set_script = 'set_config.py' + nc_set_compute_script = 'set_compute_config.py' + cm_restore_script = 'restore_config.py' + nc_restore_compute_script = 'restore_compute_config.py' + ac_restart_script = 'restart_aodh.py' + ac_restore_script = 'restore_aodh.py' + python = 'python' + + def __init__(self, conf, log): + super(DevstackInstaller, self).__init__(conf, log) + # Run Doctor under users home. 
sudo hides other env param to be used + home, self.node_user_name = (iter(os.environ.get('VIRTUAL_ENV') + .split('/', 3)[1:3])) + # Migration needs to work so ssh should have proper key defined + self.key_file = '/%s/%s/.ssh/id_rsa' % (home, self.node_user_name) + self.log.info('ssh uses: %s and %s' % (self.node_user_name, + self.key_file)) + self.controllers = ([ip for ip in + socket.gethostbyname_ex(socket.gethostname())[2] + if not ip.startswith('127.')] or + [[(s.connect(('8.8.8.8', 53)), + s.getsockname()[0], s.close()) + for s in [socket.socket(socket.AF_INET, + socket.SOCK_DGRAM)]][0][1]]) + conf.admin_tool.ip = self.controllers[0] + self.computes = list() + self.nova = nova_client(conf.nova_version, get_session()) + + def setup(self): + self.log.info('Setup Devstack installer start......') + self._get_devstack_conf() + self.create_flavor() + self.set_apply_patches() + + def cleanup(self): + self.restore_apply_patches() + + def get_ssh_key_from_installer(self): + return self.key_file + + def get_transport_url(self): + client = LocalSSH(self.log) + cmd = 'sudo grep -m1 "^transport_url" /etc/nova/nova.conf' + ret, url = client.ssh(cmd) + url = url.split("= ", 1)[1][:-1] + self.log.info('get_transport_url %s' % url) + return url + + def get_host_ip_from_hostname(self, hostname): + return [hvisor.__getattr__('host_ip') for hvisor in self.hvisors + if hvisor.__getattr__('hypervisor_hostname') == hostname][0] + + def _get_devstack_conf(self): + self.log.info('Get devstack config details for Devstack installer' + '......') + self.hvisors = self.nova.hypervisors.list(detailed=True) + self.log.info('checking hypervisors.......') + self.computes = [hvisor.__getattr__('host_ip') for hvisor in + self.hvisors] + self.use_containers = False + self.log.info('controller_ips:%s' % self.controllers) + self.log.info('compute_ips:%s' % self.computes) + self.log.info('use_containers:%s' % self.use_containers) + + def _set_docker_restart_cmd(self, service): + # There can be multiple instances running so need to restart all + cmd = "for container in `sudo docker ps | grep " + cmd += service + cmd += " | awk '{print $1}'`; do sudo docker restart $container; \ + done;" + return cmd + + def set_apply_patches(self): + self.log.info('Set apply patches start......') + + set_scripts = [self.cm_set_script] + + restart_cmd = 'sudo systemctl restart' \ + ' devstack@ceilometer-anotification.service' + + client = LocalSSH(self.log) + self._run_apply_patches(client, + restart_cmd, + set_scripts, + python=self.python) + time.sleep(7) + + self.log.info('Set apply patches start......') + + if self.conf.test_case != 'fault_management': + restart_cmd = 'sudo systemctl restart' \ + ' devstack@n-cpu.service' + for node_ip in self.computes: + client = SSHClient(node_ip, self.node_user_name, + key_filename=self.key_file) + self._run_apply_patches(client, + restart_cmd, + [self.nc_set_compute_script], + python=self.python) + time.sleep(7) + + def restore_apply_patches(self): + self.log.info('restore apply patches start......') + + restore_scripts = [self.cm_restore_script] + + restart_cmd = 'sudo systemctl restart' \ + ' devstack@ceilometer-anotification.service' + + if self.conf.test_case != 'fault_management': + restart_cmd += ' devstack@n-sch.service' + restore_scripts.append(self.nc_restore_compute_script) + + client = LocalSSH(self.log) + self._run_apply_patches(client, + restart_cmd, + restore_scripts, + python=self.python) + + if self.conf.test_case != 'fault_management': + + restart_cmd = 'sudo systemctl restart' \ + ' 
devstack@n-cpu.service' + for node_ip in self.computes: + client = SSHClient(node_ip, self.node_user_name, + key_filename=self.key_file) + self._run_apply_patches( + client, restart_cmd, + [self.nc_restore_compute_script], + python=self.python) diff --git a/doctor_tests/installer/local.py b/doctor_tests/installer/local.py deleted file mode 100644 index fee14f33..00000000 --- a/doctor_tests/installer/local.py +++ /dev/null @@ -1,118 +0,0 @@ -############################################################################## -# Copyright (c) 2017 ZTE Corporation and others. -# -# All rights reserved. This program and the accompanying materials -# are made available under the terms of the Apache License, Version 2.0 -# which accompanies this distribution, and is available at -# http://www.apache.org/licenses/LICENSE-2.0 -############################################################################## -import os -import shutil -import subprocess - -from doctor_tests.installer.base import BaseInstaller -from doctor_tests.installer.common.vitrage import \ - set_vitrage_host_down_template -from doctor_tests.common.constants import Inspector -from doctor_tests.common.utils import load_json_file -from doctor_tests.common.utils import write_json_file - - -class LocalInstaller(BaseInstaller): - node_user_name = 'root' - - nova_policy_file = '/etc/nova/policy.json' - nova_policy_file_backup = '%s%s' % (nova_policy_file, '.bak') - - def __init__(self, conf, log): - super(LocalInstaller, self).__init__(conf, log) - self.policy_modified = False - self.add_policy_file = False - - def setup(self): - self.get_ssh_key_from_installer() - self.set_apply_patches() - - def cleanup(self): - self.restore_apply_patches() - - def get_ssh_key_from_installer(self): - self.log.info('Assuming SSH keys already exchanged with computer' - 'for local installer type') - return None - - def get_host_ip_from_hostname(self, hostname): - self.log.info('Get host ip from host name in local installer......') - - cmd = "getent hosts %s | awk '{ print $1 }'" % (hostname) - server = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True) - stdout, stderr = server.communicate() - host_ip = stdout.strip().decode("utf-8") - - self.log.info('Get host_ip:%s from host_name:%s in local installer' - % (host_ip, hostname)) - return host_ip - - def set_apply_patches(self): - self._set_nova_policy() - if self.conf.inspector.type == Inspector.VITRAGE: - set_vitrage_host_down_template() - os.system('sudo systemctl restart devstack@vitrage-graph.service') - - def restore_apply_patches(self): - self._restore_nova_policy() - - def _set_nova_policy(self): - host_status_policy = 'os_compute_api:servers:show:host_status' - host_status_rule = 'rule:admin_or_owner' - policy_data = { - 'context_is_admin': 'role:admin', - 'owner': 'user_id:%(user_id)s', - 'admin_or_owner': 'rule:context_is_admin or rule:owner', - host_status_policy: host_status_rule - } - - if os.path.isfile(self.nova_policy_file): - data = load_json_file(self.nova_policy_file) - if host_status_policy in data: - rule_origion = data[host_status_policy] - if host_status_rule == rule_origion: - self.log.info('Do not need to modify nova policy.') - self.policy_modified = False - else: - # update the host_status_policy - data[host_status_policy] = host_status_rule - self.policy_modified = True - else: - # add the host_status_policy, if the admin_or_owner is not - # defined, add it also - for policy, rule in policy_data.items(): - if policy not in data: - data[policy] = rule - self.policy_modified = 
True - if self.policy_modified: - self.log.info('Nova policy is Modified.') - shutil.copyfile(self.nova_policy_file, - self.nova_policy_file_backup) - else: - # file does not exit, create a new one and add the policy - self.log.info('Nova policy file not exist. Creating a new one') - data = policy_data - self.add_policy_file = True - - if self.policy_modified or self.add_policy_file: - write_json_file(self.nova_policy_file, data) - os.system('sudo systemctl restart devstack@n-api.service') - - def _restore_nova_policy(self): - if self.policy_modified: - shutil.copyfile(self.nova_policy_file_backup, - self.nova_policy_file) - os.remove(self.nova_policy_file_backup) - elif self.add_policy_file: - os.remove(self.nova_policy_file) - - if self.add_policy_file or self.policy_modified: - os.system('sudo systemctl restart devstack@n-api.service') - self.add_policy_file = False - self.policy_modified = False diff --git a/doctor_tests/installer/mcp.py b/doctor_tests/installer/mcp.py index 65c8ed70..7659c9e2 100644 --- a/doctor_tests/installer/mcp.py +++ b/doctor_tests/installer/mcp.py @@ -7,6 +7,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 ############################################################################## from os.path import isfile +import re import time from doctor_tests.common.constants import is_fenix @@ -60,6 +61,45 @@ class McpInstaller(BaseInstaller): mcp_key = '/var/lib/opnfv/mcp.rsa' return mcp_key if isfile(mcp_key) else ssh_key + def get_transport_url(self): + client = SSHClient(self.controllers[0], self.node_user_name, + key_filename=self.key_file) + try: + cmd = 'sudo grep -m1 "^transport_url" /etc/nova/nova.conf' + ret, url = client.ssh(cmd) + + if ret: + raise Exception('Exec command to get transport from ' + 'controller(%s) in MCP installer failed, ' + 'ret=%s, output=%s' + % (self.controllers[0], ret, url)) + elif self.controllers[0] not in url: + # need to use ip instead of hostname + url = (re.sub("@.*:", "@%s:" % self.controllers[0], + url[0].split("=", 1)[1])) + except Exception: + cmd = 'grep -i "^rabbit" /etc/nova/nova.conf' + ret, lines = client.ssh(cmd) + if ret: + raise Exception('Exec command to get transport from ' + 'controller(%s) in MCP installer failed, ' + 'ret=%s, output=%s' + % (self.controllers[0], ret, url)) + else: + for line in lines.split('\n'): + if line.startswith("rabbit_userid"): + rabbit_userid = line.split("=") + if line.startswith("rabbit_port"): + rabbit_port = line.split("=") + if line.startswith("rabbit_password"): + rabbit_password = line.split("=") + url = "rabbit://%s:%s@%s:%s/?ssl=0" % (rabbit_userid, + rabbit_password, + self.controllers[0], + rabbit_port) + self.log.info('get_transport_url %s' % url) + return url + def _copy_overcloudrc_to_controllers(self): for ip in self.controllers: cmd = "scp overcloudrc %s@%s:" % (self.node_user_name, ip) @@ -89,8 +129,8 @@ class McpInstaller(BaseInstaller): def set_apply_patches(self): self.log.info('Set apply patches start......') fenix_files = None - set_scripts = [self.cm_set_script] + thrs = [] restart_cmd = 'sudo systemctl restart' \ ' ceilometer-agent-notification.service' @@ -112,10 +152,10 @@ class McpInstaller(BaseInstaller): 'admin_tool/fenix', fenix_file) client.scp(src_file, fenix_file) - self._run_apply_patches(client, - restart_cmd, - set_scripts, - python=self.python) + thrs.append(self._run_apply_patches(client, + restart_cmd, + set_scripts, + python=self.python)) time.sleep(5) self.log.info('Set apply patches start......') @@ -125,11 +165,15 @@ class 
McpInstaller(BaseInstaller): for node_ip in self.computes: client = SSHClient(node_ip, self.node_user_name, key_filename=self.key_file) - self._run_apply_patches(client, - restart_cmd, - [self.nc_set_compute_script], - python=self.python) + thrs.append(self._run_apply_patches( + client, + restart_cmd, + [self.nc_set_compute_script], + python=self.python)) time.sleep(5) + # If Fenix container ir build, it needs to be ready before continue + for thr in thrs: + thr.join() def restore_apply_patches(self): self.log.info('restore apply patches start......') diff --git a/doctor_tests/main.py b/doctor_tests/main.py index cdb4af55..7573faec 100644 --- a/doctor_tests/main.py +++ b/doctor_tests/main.py @@ -43,7 +43,6 @@ class DoctorTest(object): def setup(self): # prepare the cloud env self.installer.setup() - # preparing VM image... self.image.create() @@ -51,39 +50,50 @@ class DoctorTest(object): self.user.create() def test_fault_management(self): - try: - LOG.info('doctor fault management test starting.......') - transport_url = self.installer.get_transport_url() - self.fault_management = \ - FaultManagement(self.conf, self.installer, self.user, LOG, - transport_url) - - # prepare test env - self.fault_management.setup() - - # wait for aodh alarms are updated in caches for event evaluator, - # sleep time should be larger than event_alarm_cache_ttl - # (default 60) - # (tojuvone) Fraser currently needs 120 - time.sleep(120) - - # injecting host failure... - # NOTE (umar) add INTERFACE_NAME logic to host injection - self.fault_management.start() - time.sleep(30) - - # verify the test results - # NOTE (umar) copy remote monitor.log file when monitor=collectd - self.fault_management.check_host_status('down') - self.fault_management.check_notification_time() - - except Exception as e: - LOG.error('doctor fault management test failed, ' - 'Exception=%s' % e) - LOG.error(format_exc()) - sys.exit(1) - finally: - self.fault_management.cleanup() + retry = 2 + # Retry once if notified_time is None + while retry > 0: + try: + self.fault_management = None + LOG.info('doctor fault management test starting.......') + transport_url = self.installer.get_transport_url() + self.fault_management = \ + FaultManagement(self.conf, self.installer, self.user, LOG, + transport_url) + + # prepare test env + self.fault_management.setup() + + # wait for aodh alarms are updated in caches for event + # evaluator,sleep time should be larger than + # event_alarm_cache_ttl (default 60) + # (tojuvone) Fraser currently needs 120 + time.sleep(120) + + # injecting host failure... 
+ # NOTE (umar) add INTERFACE_NAME logic to host injection + self.fault_management.start() + time.sleep(30) + + # verify the test results + # NOTE (umar) copy remote monitor.log file when + # monitor=collectd + self.fault_management.check_host_status('down') + self.fault_management.check_notification_time() + retry = 0 + + except Exception as e: + LOG.error('doctor fault management test failed, ' + 'Exception=%s' % e) + if 'notified_time=None' in str(e): + retry -= 1 + LOG.info('doctor fault management retry') + continue + LOG.error(format_exc()) + sys.exit(1) + finally: + if self.fault_management is not None: + self.fault_management.cleanup() def _amount_compute_nodes(self): services = self.nova.services.list(binary='nova-compute') @@ -96,11 +106,12 @@ class DoctorTest(object): LOG.info('not enough compute nodes, skipping doctor ' 'maintenance test') return - elif self.conf.installer.type not in ['apex', 'fuel']: + elif self.conf.installer.type not in ['apex', 'fuel', 'devstack']: LOG.info('not supported installer, skipping doctor ' 'maintenance test') return try: + maintenance = None LOG.info('doctor maintenance test starting.......') trasport_url = self.installer.get_transport_url() maintenance = Maintenance(trasport_url, self.conf, LOG) @@ -122,7 +133,8 @@ class DoctorTest(object): LOG.error(format_exc()) sys.exit(1) finally: - maintenance.cleanup_maintenance() + if maintenance is not None: + maintenance.cleanup_maintenance() def run(self): """run doctor tests""" diff --git a/doctor_tests/scenario/maintenance.py b/doctor_tests/scenario/maintenance.py index 2e40529f..e6cdcccd 100644 --- a/doctor_tests/scenario/maintenance.py +++ b/doctor_tests/scenario/maintenance.py @@ -35,11 +35,16 @@ class Maintenance(object): auth = get_identity_auth(project=self.conf.doctor_project) self.neutron = neutron_client(get_session(auth=auth)) self.stack = Stack(self.conf, self.log) + if self.conf.installer.type == "devstack": + self.endpoint_ip = trasport_url.split("@", 1)[1].split(":", 1)[0] + else: + self.endpoint_ip = self.conf.admin_tool.ip + self.endpoint = "http://%s:12347/" % self.endpoint_ip if self.conf.admin_tool.type == 'sample': self.admin_tool = get_admin_tool(trasport_url, self.conf, self.log) - self.endpoint = 'maintenance' + self.endpoint += 'maintenance' else: - self.endpoint = 'v1/maintenance' + self.endpoint += 'v1/maintenance' self.app_manager = get_app_manager(self.stack, self.conf, self.log) self.inspector = get_inspector(self.conf, self.log, trasport_url) @@ -128,8 +133,9 @@ class Maintenance(object): else: # TBD Now we expect Fenix is running in self.conf.admin_tool.port pass - self.app_manager.start() + # Inspector before app_manager, as floating ip might come late self.inspector.start() + self.app_manager.start() def start_maintenance(self): self.log.info('start maintenance.......') @@ -138,17 +144,13 @@ class Maintenance(object): for hvisor in hvisors: hostname = hvisor.__getattr__('hypervisor_hostname') maintenance_hosts.append(hostname) - - url = ('http://%s:%s/%s' % - (self.conf.admin_tool.ip, - self.conf.admin_tool.port, - self.endpoint)) + url = self.endpoint headers = { 'Content-Type': 'application/json', 'Accept': 'application/json'} if self.conf.admin_tool.type == 'fenix': headers['X-Auth-Token'] = self.admin_session.get_token() - self.log.info('headers %s' % headers) + self.log.info('url %s headers %s' % (url, headers)) retries = 12 ret = None while retries > 0: @@ -160,8 +162,12 @@ class Maintenance(object): data = {'state': 'MAINTENANCE', 'maintenance_at': 
maintenance_at, - 'metadata': {'openstack_version': 'Rocky'}, - 'workflow': 'default'} + 'metadata': {'openstack_version': 'Train'}} + + if self.conf.app_manager.type == 'vnfm': + data['workflow'] = 'vnf' + else: + data['workflow'] = 'default' if self.conf.admin_tool.type == 'sample': data['hosts'] = maintenance_hosts @@ -170,7 +176,7 @@ class Maintenance(object): try: ret = requests.post(url, data=json.dumps(data), headers=headers) - except: + except Exception: if retries == 0: raise Exception('admin tool did not respond in 120s') else: @@ -187,11 +193,8 @@ class Maintenance(object): def remove_maintenance_session(self, session_id): self.log.info('remove maintenance session %s.......' % session_id) - url = ('http://%s:%s/%s/%s' % - (self.conf.admin_tool.ip, - self.conf.admin_tool.port, - self.endpoint, - session_id)) + + url = ('%s/%s' % (self.endpoint, session_id)) headers = { 'Content-Type': 'application/json', @@ -205,11 +208,8 @@ class Maintenance(object): raise Exception(ret.text) def get_maintenance_state(self, session_id): - url = ('http://%s:%s/%s/%s' % - (self.conf.admin_tool.ip, - self.conf.admin_tool.port, - self.endpoint, - session_id)) + + url = ('%s/%s' % (self.endpoint, session_id)) headers = { 'Content-Type': 'application/json', diff --git a/doctor_tests/user.py b/doctor_tests/user.py index 29aa004b..2cd9757f 100644 --- a/doctor_tests/user.py +++ b/doctor_tests/user.py @@ -129,7 +129,6 @@ class User(object): def _add_user_role_in_project(self, is_admin=False): """add test user with test role in test project""" - project = self.projects.get(self.conf.doctor_project) user_name = 'admin' if is_admin else self.conf.doctor_user @@ -32,6 +32,7 @@ passenv = ADMIN_TOOL_TYPE TEST_CASE SSH_KEY + APP_MANAGER_TYPE changedir = {toxinidir}/doctor_tests commands = doctor-test /usr/bin/find {toxinidir} -type f -name '*.py[co]' -delete -o -type d -name __pycache__ -delete |
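
Note for readers following the devstack changes above: with the new DevstackInstaller the maintenance scenario no longer builds its admin-tool URL from conf.admin_tool.ip and conf.admin_tool.port; on devstack it derives the controller address from Nova's transport_url and targets the Fenix endpoint that set_fenix.sh registers on port 12347. The standalone sketch below (the helper name and the sample AMQP URL are illustrative, not part of the patch) shows that resolution logic under those assumptions.

    # Illustrative only -- not part of the patch. Mirrors the endpoint
    # derivation added to doctor_tests/scenario/maintenance.py and the
    # port 12347 endpoint created by set_fenix.sh. The sample transport
    # URL below is hypothetical.

    def maintenance_endpoint(transport_url, installer_type, admin_tool_type,
                             admin_tool_ip=None):
        """Build the maintenance API base URL used by the test scenario."""
        if installer_type == 'devstack':
            # On devstack the admin tool runs on the controller itself, so
            # the host is taken from nova's transport_url, e.g.
            # rabbit://user:secret@192.0.2.10:5672/ -> 192.0.2.10
            endpoint_ip = transport_url.split("@", 1)[1].split(":", 1)[0]
        else:
            # Other installers keep using the configured admin tool address
            endpoint_ip = admin_tool_ip
        endpoint = "http://%s:12347/" % endpoint_ip
        if admin_tool_type == 'sample':
            endpoint += 'maintenance'
        else:
            # Fenix exposes the versioned path
            endpoint += 'v1/maintenance'
        return endpoint


    if __name__ == '__main__':
        url = maintenance_endpoint(
            'rabbit://stackrabbit:secret@192.0.2.10:5672/',
            'devstack', 'fenix')
        print(url)  # -> http://192.0.2.10:12347/v1/maintenance

In a devstack run the installer type is taken from INSTALLER_TYPE (now defaulting to 'devstack'), and tox additionally passes APP_MANAGER_TYPE through so the 'vnfm' workflow variant added in start_maintenance can be selected.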