diff options
62 files changed, 2490 insertions, 1013 deletions
@@ -1,27 +0,0 @@ -Project: Fault Management project (doctor) -Project Creation Date: December 2, 2014 -Project Category: Requirement -Lifecycle State: Mature -Primary Contact: Tomi Juvonen (tomi.juvonen@nokia.com) -Project Lead: Tomi Juvonen (tomi.juvonen@nokia.com) -Jira Project Name: Fault Management project -Jira Project Prefix: DOCTOR -Mailing list tag: [doctor] -IRC: Server:freenode.net Channel:#opnfv-doctor -Repository: doctor - -Committers: -Ashiq Khan (NTT DOCOMO, khan@nttdocomo.com) -Bertrand Souville (NTT DOCOMO, souville@docomolab-euro.com) -Dong Wenjuan (ZTE, dong.wenjuan@zte.com.cn) -Gerald Kunzmann (NTT DOCOMO, kunzmann@docomolab-euro.com) -Ryota Mibu (NEC, r-mibu@cq.jp.nec.com) -Serge Manning (Sprint, Serge.Manning@sprint.com) -Tomi Juvonen (Nokia, tomi.juvonen@nokia.com) - -Link to TSC approval of the project: http://meetbot.opnfv.org/meetings/opnfv-meeting/2014/opnfv-meeting.2014-12-02-14.58.html -Link(s) to approval of committer update: -http://lists.opnfv.org/pipermail/opnfv-tsc/2015-June/000905.html -http://lists.opnfv.org/pipermail/opnfv-tech-discuss/2015-June/003165.html -http://lists.opnfv.org/pipermail/opnfv-tech-discuss/2016-June/011245.html -http://lists.opnfv.org/pipermail/opnfv-tech-discuss/2016-July/011771.html @@ -34,31 +34,12 @@ repositories: - 'doctor' committers: - <<: *opnfv_doctor_ptl - - name: 'Ashiq Khan' - email: 'khan@nttdocomo.com' - company: 'NTT DOCOMO' - id: 'ashiq.khan' - - name: 'Serge Manning' - email: 'serge.manning@sprint.com' - company: 'Sprint' - id: 'sergem913' - - name: 'Gerald Kunzmann' - email: 'kunzmann@docomolab-euro.com' - company: 'DOCOMO Euro-Labs' - id: 'kunzmann' - name: 'wenjuan dong' email: 'dong.wenjuan@zte.com.cn' company: 'ZTE' id: 'dongwenjuan' - - name: 'Bertrand Souville' - email: 'souville@docomolab-euro.com' - company: 'DOCOMO Euro-Labs' - id: 'bertys' - - name: 'Ryota Mibu' - email: 'r-mibu@cq.jp.nec.com' - company: 'NEC' - id: 'r-mibu' tsc: + # yamllint disable rule:line-length approval: 'http//meetbot.opnfv.org/meetings/opnfv-meeting/2014/opnfv-meeting.2014-12-02-14.58.html' changes: - type: 'removal' @@ -100,3 +81,6 @@ tsc: - type: 'removal' name: 'Peter Lee' link: 'https://lists.opnfv.org/pipermail/opnfv-tsc/2018-March/004190.html' + - type: 'removal' + name: 'Bertrand Souville' + link: 'https://lists.opnfv.org/g/opnfv-tech-discuss/message/22344' diff --git a/devstack/README.rst b/devstack/README.rst index 91e8abfe..aaa18a7f 100644 --- a/devstack/README.rst +++ b/devstack/README.rst @@ -18,7 +18,9 @@ OPNFV Doctor in DevStack. enable_plugin osprofiler https://git.openstack.org/openstack/osprofiler enable_plugin doctor https://git.opnfv.org/doctor -to the ``[[local|localrc]]`` section. +to the ``[[local|localrc]]`` section. Or, you can copy the local.conf.sample:: + + cp /<path-to-doctor>/devstack/local.conf.sample ${DEVSTACK_DIR}/local.conf .. note:: The order of enabling plugins matters. diff --git a/devstack/local.conf.sample b/devstack/local.conf.sample new file mode 100644 index 00000000..2967714a --- /dev/null +++ b/devstack/local.conf.sample @@ -0,0 +1,120 @@ +# Sample ``local.conf`` for user-configurable variables in ``stack.sh`` + +# NOTE: Copy this file to the root DevStack directory for it to work properly. + +# ``local.conf`` is a user-maintained settings file that is sourced from ``stackrc``. +# This gives it the ability to override any variables set in ``stackrc``. 
+# Also, most of the settings in ``stack.sh`` are written to only be set if no +# value has already been set; this lets ``local.conf`` effectively override the +# default values. + +# This is a collection of some of the settings we have found to be useful +# in our DevStack development environments. Additional settings are described +# in https://docs.openstack.org/devstack/latest/configuration.html#local-conf +# These should be considered as samples and are unsupported DevStack code. + +# The ``localrc`` section replaces the old ``localrc`` configuration file. +# Note that if ``localrc`` is present it will be used in favor of this section. +[[local|localrc]] + +# Minimal Contents +# ---------------- + +# While ``stack.sh`` is happy to run without ``localrc``, devlife is better when +# there are a few minimal variables set: + +# If the ``*_PASSWORD`` variables are not set here you will be prompted to enter +# values for them by ``stack.sh``and they will be added to ``local.conf``. +ADMIN_PASSWORD=devstack +DATABASE_PASSWORD=$ADMIN_PASSWORD +RABBIT_PASSWORD=$ADMIN_PASSWORD +SERVICE_PASSWORD=$ADMIN_PASSWORD + +# ``HOST_IP`` and ``HOST_IPV6`` should be set manually for best results if +# the NIC configuration of the host is unusual, i.e. ``eth1`` has the default +# route but ``eth0`` is the public interface. They are auto-detected in +# ``stack.sh`` but often is indeterminate on later runs due to the IP moving +# from an Ethernet interface to a bridge on the host. Setting it here also +# makes it available for ``openrc`` to include when setting ``OS_AUTH_URL``. +# Neither is set by default. +HOST_IP=127.0.0.1 +#HOST_IPV6=2001:db8::7 + + +# Logging +# ------- + +# By default ``stack.sh`` output only goes to the terminal where it runs. It can +# be configured to additionally log to a file by setting ``LOGFILE`` to the full +# path of the destination log file. A timestamp will be appended to the given name. +LOGFILE=$DEST/logs/stack.sh.log + +# Old log files are automatically removed after 7 days to keep things neat. Change +# the number of days by setting ``LOGDAYS``. +LOGDAYS=2 + +# Nova logs will be colorized if ``SYSLOG`` is not set; turn this off by setting +# ``LOG_COLOR`` false. +#LOG_COLOR=False + + +# Using milestone-proposed branches +# --------------------------------- + +# Uncomment these to grab the milestone-proposed branches from the +# repos: +#CINDER_BRANCH=milestone-proposed +#GLANCE_BRANCH=milestone-proposed +#HORIZON_BRANCH=milestone-proposed +#KEYSTONE_BRANCH=milestone-proposed +#KEYSTONECLIENT_BRANCH=milestone-proposed +#NOVA_BRANCH=milestone-proposed +#NOVACLIENT_BRANCH=milestone-proposed +#NEUTRON_BRANCH=milestone-proposed +#SWIFT_BRANCH=milestone-proposed + +# Using git versions of clients +# ----------------------------- +# By default clients are installed from pip. See LIBS_FROM_GIT in +# stackrc for details on getting clients from specific branches or +# revisions. e.g. +# LIBS_FROM_GIT="python-ironicclient" +# IRONICCLIENT_BRANCH=refs/changes/44/2.../1 + +# Swift +# ----- + +# Swift is now used as the back-end for the S3-like object store. Setting the +# hash value is required and you will be prompted for it if Swift is enabled +# so just set it to something already: +SWIFT_HASH=66a3d6b56c1f479c8b4e70ab5c2000f5 + +# For development purposes the default of 3 replicas is usually not required. 
+# Set this to 1 to save some resources: +SWIFT_REPLICAS=1 + +# The data for Swift is stored by default in (``$DEST/data/swift``), +# or (``$DATA_DIR/swift``) if ``DATA_DIR`` has been set, and can be +# moved by setting ``SWIFT_DATA_DIR``. The directory will be created +# if it does not exist. +SWIFT_DATA_DIR=$DEST/data + +# OPNFV Doctor +# ------------ + +# Enable the required plugins +# The order of enabling plugins matters +enable_plugin aodh http://git.openstack.org/openstack/aodh +enable_plugin panko https://git.openstack.org/openstack/panko +enable_plugin ceilometer https://git.openstack.org/openstack/ceilometer +enable_plugin osprofiler https://git.openstack.org/openstack/osprofiler +enable_plugin doctor https://git.opnfv.org/doctor + +# To enable Python 3 +# USE_PYTHON3=True + +# To enable Congress as Doctor Inspector +# enable_plugin congress https://git.openstack.org/openstack/congress + +# To enable Neutron port data plane status +# Q_ML2_PLUGIN_EXT_DRIVERS=data_plane_status diff --git a/docs/conf.py b/docs/conf.py index eb12e74b..3c9978bb 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1 +1,2 @@ from docs_conf.conf import * # noqa: F401,F403 +master_doc = 'index' diff --git a/docs/development/index.rst b/docs/development/index.rst index 2dc16a82..a7d2817b 100644 --- a/docs/development/index.rst +++ b/docs/development/index.rst @@ -2,18 +2,18 @@ .. http://creativecommons.org/licenses/by/4.0 .. (c) 2016 OPNFV. +.. _development: -====== -Doctor -====== +=========== +Development +=========== .. toctree:: :maxdepth: 2 - ./design/index.rst - ./requirements/index.rst - ./manuals/index.rst - ./overview/functest_scenario/index.rst + ./design/index + ./overview/index + ./requirements/index Indices ======= diff --git a/docs/development/overview/functest_scenario/doctor-scenario-in-functest.rst b/docs/development/overview/functest_scenario/doctor-scenario-in-functest.rst deleted file mode 100644 index 9f92b5bf..00000000 --- a/docs/development/overview/functest_scenario/doctor-scenario-in-functest.rst +++ /dev/null @@ -1,253 +0,0 @@ -.. This work is licensed under a Creative Commons Attribution 4.0 International License. -.. http://creativecommons.org/licenses/by/4.0 - - - -Platform overview -""""""""""""""""" - -Doctor platform provides these features since `Danube Release <https://wiki.opnfv.org/display/SWREL/Danube>`_: - -* Immediate Notification -* Consistent resource state awareness for compute host down -* Valid compute host status given to VM owner - -These features enable high availability of Network Services on top of -the virtualized infrastructure. Immediate notification allows VNF managers -(VNFM) to process recovery actions promptly once a failure has occurred. -Same framework can also be utilized to have VNFM awareness about -infrastructure maintenance. - -Consistency of resource state is necessary to execute recovery actions -properly in the VIM. - -Ability to query host status gives VM owner the possibility to get -consistent state information through an API in case of a compute host -fault. - -The Doctor platform consists of the following components: - -* OpenStack Compute (Nova) -* OpenStack Networking (Neutron) -* OpenStack Telemetry (Ceilometer) -* OpenStack Alarming (AODH) -* Doctor Sample Inspector, OpenStack Congress or OpenStack Vitrage -* Doctor Sample Monitor or any monitor supported by Congress or Vitrage - -.. note:: - Doctor Sample Monitor is used in Doctor testing. However in real - implementation like Vitrage, there are several other monitors supported. 
- -You can see an overview of the Doctor platform and how components interact in -:numref:`figure-p1`. - -.. figure:: ./images/Fault-management-design.png - :name: figure-p1 - :width: 100% - - Doctor platform and typical sequence - -Detailed information on the Doctor architecture can be found in the Doctor -requirements documentation: -http://artifacts.opnfv.org/doctor/docs/requirements/05-implementation.html - -Running test cases -"""""""""""""""""" - -Functest will call the "doctor_tests/main.py" in Doctor to run the test job. -Doctor testing can also be triggered by tox on OPNFV installer jumphost. Tox -is normally used for functional, module and coding style testing in Python -project. - -Currently, 'Apex', 'Daisy', 'Fuel' and 'local' installer are supported. - - -Fault management use case -""""""""""""""""""""""""" - -* A consumer of the NFVI wants to receive immediate notifications about faults - in the NFVI affecting the proper functioning of the virtual resources. - Therefore, such faults have to be detected as quickly as possible, and, when - a critical error is observed, the affected consumer is immediately informed - about the fault and can switch over to the STBY configuration. - -The faults to be monitored (and at which detection rate) will be configured by -the consumer. Once a fault is detected, the Inspector in the Doctor -architecture will check the resource map maintained by the Controller, to find -out which virtual resources are affected and then update the resources state. -The Notifier will receive the failure event requests sent from the Controller, -and notify the consumer(s) of the affected resources according to the alarm -configuration. - -Detailed workflow information is as follows: - -* Consumer(VNFM): (step 0) creates resources (network, server/instance) and an - event alarm on state down notification of that server/instance or Neutron - port. - -* Monitor: (step 1) periodically checks nodes, such as ping from/to each - dplane nic to/from gw of node, (step 2) once it fails to send out event - with "raw" fault event information to Inspector - -* Inspector: when it receives an event, it will (step 3) mark the host down - ("mark-host-down"), (step 4) map the PM to VM, and change the VM status to - down. In network failure case, also Neutron port is changed to down. - -* Controller: (step 5) sends out instance update event to Ceilometer. In network - failure case, also Neutron port is changed to down and corresponding event is - sent to Ceilometer. - -* Notifier: (step 6) Ceilometer transforms and passes the events to AODH, - (step 7) AODH will evaluate events with the registered alarm definitions, - then (step 8) it will fire the alarm to the "consumer" who owns the - instance - -* Consumer(VNFM): (step 9) receives the event and (step 10) recreates a new - instance - -Fault management test case -"""""""""""""""""""""""""" - -Functest will call the 'doctor-test' command in Doctor to run the test job. - -The following steps are executed: - -Firstly, get the installer ip according to the installer type. Then ssh to -the installer node to get the private key for accessing to the cloud. As -'fuel' installer, ssh to the controller node to modify nova and ceilometer -configurations. - -Secondly, prepare image for booting VM, then create a test project and test -user (both default to doctor) for the Doctor tests. - -Thirdly, boot a VM under the doctor project and check the VM status to verify -that the VM is launched completely. 
Then get the compute host info where the VM -is launched to verify connectivity to the target compute host. Get the consumer -ip according to the route to compute ip and create an alarm event in Ceilometer -using the consumer ip. - -Fourthly, the Doctor components are started, and, based on the above preparation, -a failure is injected to the system, i.e. the network of compute host is -disabled for 3 minutes. To ensure the host is down, the status of the host -will be checked. - -Finally, the notification time, i.e. the time between the execution of step 2 -(Monitor detects failure) and step 9 (Consumer receives failure notification) -is calculated. - -According to the Doctor requirements, the Doctor test is successful if the -notification time is below 1 second. - -Maintenance use case -"""""""""""""""""""" - -* A consumer of the NFVI wants to interact with NFVI maintenance, upgrade, - scaling and to have graceful retirement. Receiving notifications over these - NFVI events and responding to those within given time window, consumer can - guarantee zero downtime to his service. - -The maintenance use case adds the Doctor platform an `admin tool` and an -`app manager` component. Overview of maintenance components can be seen in -:numref:`figure-p2`. - -.. figure:: ./images/Maintenance-design.png - :name: figure-p2 - :width: 100% - - Doctor platform components in maintenance use case - -In maintenance use case, `app manager` (VNFM) will subscribe to maintenance -notifications triggered by project specific alarms through AODH. This is the way -it gets to know different NFVI maintenance, upgrade and scaling operations that -effect to its instances. The `app manager` can do actions depicted in `green -color` or tell `admin tool` to do admin actions depicted in `orange color` - -Any infrastructure component like `Inspector` can subscribe to maintenance -notifications triggered by host specific alarms through AODH. Subscribing to the -notifications needs admin privileges and can tell when a host is out of use as -in maintenance and when it is taken back to production. - -Maintenance test case -""""""""""""""""""""" - -Maintenance test case is currently running in our Apex CI and executed by tox. -This is because the special limitation mentioned below and also the fact we -currently have only sample implementation as a proof of concept. Environmental -variable TEST_CASE='maintenance' needs to be used when executing -"doctor_tests/main.py". Test case workflow can be seen in :numref:`figure-p3`. - -.. figure:: ./images/Maintenance-workflow.png - :name: figure-p3 - :width: 100% - - Maintenance test case workflow - -In test case all compute capacity will be consumed with project (VNF) instances. -For redundant services on instances and an empty compute needed for maintenance, -test case will need at least 3 compute nodes in system. There will be 2 -instances on each compute, so minimum number of VCPUs is also 2. Depending on -how many compute nodes there is application will always have 2 redundant -instances (ACT-STDBY) on different compute nodes and rest of the compute -capacity will be filled with non-redundant instances. - -For each project specific maintenance message there is a time window for -`app manager` to make any needed action. This will guarantee zero -down time for his service. All replies back are done by calling `admin tool` API -given in the message. 
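As an illustrative aside to the paragraph above: each project-specific notification carries a `reply_url` (see the Ceilometer `maintenance.scheduled` event traits added elsewhere in this change), and the reply names the session and the answer. A minimal sketch of such a reply in bash follows; the endpoint value, the HTTP method and the exact payload shape are assumptions for illustration, not the sample `admin tool`'s confirmed API:

.. code-block:: bash

    # REPLY_URL and SESSION_ID come from the received alarm payload
    # (payload.reply_url / payload.session_id traits); the values below
    # are placeholders only.
    REPLY_URL='http://192.0.2.10:12347/maintenance'
    SESSION_ID='example-session-id'

    # Acknowledge a project-specific MAINTENANCE notification.
    curl -X PUT "$REPLY_URL" \
         -H 'Content-Type: application/json' \
         -d "{\"session_id\": \"$SESSION_ID\", \"state\": \"ACK_MAINTENANCE\"}"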
- -The following steps are executed: - -Infrastructure admin will call `admin tool` API to trigger maintenance for -compute hosts having instances belonging to a VNF. - -Project specific `MAINTENANCE` notification is triggered to tell `app manager` -that his instances are going to hit by infrastructure maintenance at a specific -point in time. `app manager` will call `admin tool` API to answer back -`ACK_MAINTENANCE`. - -When the time comes to start the actual maintenance workflow in `admin tool`, -a `DOWN_SCALE` notification is triggered as there is no empty compute node for -maintenance (or compute upgrade). Project receives corresponding alarm and scales -down instances and call `admin tool` API to answer back `ACK_DOWN_SCALE`. - -As it might happen instances are not scaled down (removed) from a single -compute node, `admin tool` might need to figure out what compute node should be -made empty first and send `PREPARE_MAINTENANCE` to project telling which instance -needs to be migrated to have the needed empty compute. `app manager` makes sure -he is ready to migrate instance and call `admin tool` API to answer back -`ACK_PREPARE_MAINTENANCE`. `admin tool` will make the migration and answer -`ADMIN_ACTION_DONE`, so `app manager` knows instance can be again used. - -:numref:`figure-p3` has next a light blue section of actions to be done for each -compute. However as we now have one empty compute, we will maintain/upgrade that -first. So on first round, we can straight put compute in maintenance and send -admin level host specific `IN_MAINTENANCE` message. This is caught by `Inspector` -to know host is down for maintenance. `Inspector` can now disable any automatic -fault management actions for the host as it can be down for a purpose. After -`admin tool` has completed maintenance/upgrade `MAINTENANCE_COMPLETE` message -is sent to tell host is back in production. - -Next rounds we always have instances on compute, so we need to have -`PLANNED_MAINTANANCE` message to tell that those instances are now going to hit -by maintenance. When `app manager` now receives this message, he knows instances -to be moved away from compute will now move to already maintained/upgraded host. -In test case no upgrade is done on application side to upgrade instances -according to new infrastructure capabilities, but this could be done here as -this information is also passed in the message. This might be just upgrading -some RPMs, but also totally re-instantiating instance with a new flavor. Now if -application runs an active side of a redundant instance on this compute, -a switch over will be done. After `app manager` is ready he will call -`admin tool` API to answer back `ACK_PLANNED_MAINTENANCE`. In test case the -answer is `migrate`, so `admin tool` will migrate instances and reply -`ADMIN_ACTION_DONE` and then `app manager` knows instances can be again used. -Then we are ready to make the actual maintenance as previously trough -`IN_MAINTENANCE` and `MAINTENANCE_COMPLETE` steps. - -After all computes are maintained, `admin tool` can send `MAINTENANCE_COMPLETE` -to tell maintenance/upgrade is now complete. For `app manager` this means he -can scale back to full capacity. - -This is the current sample implementation and test case. Real life -implementation is started in OpenStack Fenix project and there we should -eventually address requirements more deeply and update the test case with Fenix -implementation. 
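For quick reference before the next file: the deleted scenario text above notes that Functest calls ``doctor_tests/main.py`` and that Doctor testing can also be triggered by tox on the installer jumphost. A minimal sketch of such a direct run, using the environment variables referenced throughout these documents (the values shown are examples, and the exact tox environment selection is not confirmed here):

.. code-block:: bash

    git clone https://gerrit.opnfv.org/gerrit/doctor
    cd doctor

    # Values as discussed above: installers 'apex', 'daisy', 'fuel' or 'local';
    # inspectors 'sample', 'congress' or 'vitrage';
    # test cases 'fault_management' or 'maintenance'.
    export INSTALLER_TYPE=local
    export INSPECTOR_TYPE=sample
    export TEST_CASE=fault_management

    tox    # or invoke doctor_tests/main.py directly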
diff --git a/docs/development/overview/index.rst b/docs/development/overview/index.rst
index 956e73e3..f6d78d57 100644
--- a/docs/development/overview/index.rst
+++ b/docs/development/overview/index.rst
@@ -3,11 +3,12 @@
 .. _doctor-overview:

-************************
-Doctor Development Guide
-************************
+********
+Overview
+********

 .. toctree::
    :maxdepth: 2

+   overview.rst
    testing.rst

diff --git a/docs/development/overview/overview.rst b/docs/development/overview/overview.rst
new file mode 100644
index 00000000..21f5439e
--- /dev/null
+++ b/docs/development/overview/overview.rst
@@ -0,0 +1,52 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+Platform overview
+"""""""""""""""""
+
+The Doctor platform provides these features since the
+`Danube Release <https://wiki.opnfv.org/display/SWREL/Danube>`_:
+
+* Immediate Notification
+* Consistent resource state awareness for compute host down
+* Valid compute host status given to VM owner
+
+These features enable high availability of Network Services on top of
+the virtualized infrastructure. Immediate notification allows VNF managers
+(VNFM) to process recovery actions promptly once a failure has occurred.
+The same framework can also be utilized to make the VNFM aware of
+infrastructure maintenance.
+
+Consistency of resource state is necessary to execute recovery actions
+properly in the VIM.
+
+The ability to query host status gives the VM owner the possibility to get
+consistent state information through an API in case of a compute host
+fault.
+
+The Doctor platform consists of the following components:
+
+* OpenStack Compute (Nova)
+* OpenStack Networking (Neutron)
+* OpenStack Telemetry (Ceilometer)
+* OpenStack Alarming (AODH)
+* Doctor Sample Inspector, OpenStack Congress or OpenStack Vitrage
+* Doctor Sample Monitor or any monitor supported by Congress or Vitrage
+
+.. note::
+    The Doctor Sample Monitor is used in Doctor testing. However, a real
+    implementation like Vitrage supports several other monitors.
+
+You can see an overview of the Doctor platform and how components interact in
+:numref:`figure-p1`.
+
+
+The maintenance use case provides these features since the
+`Iruya Release <https://wiki.opnfv.org/display/SWREL/Iruya>`_:
+
+* Infrastructure maintenance and upgrade workflow
+* Interaction between VNFM and infrastructure workflow
+
+Since the `Jerma Release <https://wiki.opnfv.org/display/SWREL/Jerma>`_, the
+maintenance use case also supports the 'ETSI FEAT03' implementation, keeping
+infrastructure maintenance and upgrade fully optimized with zero impact on VNF
+service.
+
diff --git a/docs/development/requirements/index.rst b/docs/development/requirements/index.rst
index fceaebf0..ccc35cb8 100644
--- a/docs/development/requirements/index.rst
+++ b/docs/development/requirements/index.rst
@@ -3,9 +3,9 @@
 ..
_doctor-requirements:

-****************************************
-Doctor: Fault Management and Maintenance
-****************************************
+**********************************************
+Requirements: Fault Management and Maintenance
+**********************************************

 :Project: Doctor, https://wiki.opnfv.org/doctor
 :Editors: Ashiq Khan (NTT DOCOMO), Gerald Kunzmann (NTT DOCOMO)

diff --git a/docs/index.rst b/docs/index.rst
index 4dedb98d..b8e8bfd0 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -12,6 +12,6 @@ Fault Management and Maintenance (Doctor)
    :numbered:
    :maxdepth: 2

-   release/index
    development/index
-
+   release/index
+   testing/index

diff --git a/docs/release/configguide/feature.configuration.rst b/docs/release/configguide/feature.configuration.rst
index 64928eea..8fbff50e 100644
--- a/docs/release/configguide/feature.configuration.rst
+++ b/docs/release/configguide/feature.configuration.rst
@@ -159,3 +159,57 @@ You can configure the Sample Monitor as follows (Example for Apex deployment):
     "http://127.0.0.1:$INSPECTOR_PORT/events" > monitor.log 2>&1 &

 **Collectd Monitor**
+
+OpenStack components
+====================
+
+In OPNFV, and for Doctor testing, you can have all OpenStack components
+configured as needed. Here is a sample of the needed configuration
+modifications.
+
+Ceilometer
+----------
+
+/etc/ceilometer/event_definitions.yaml:
+# The maintenance use case needs these new alarm definitions to be added
+- event_type: maintenance.scheduled
+  traits:
+    actions_at:
+      fields: payload.maintenance_at
+      type: datetime
+    allowed_actions:
+      fields: payload.allowed_actions
+    host_id:
+      fields: payload.host_id
+    instances:
+      fields: payload.instances
+    metadata:
+      fields: payload.metadata
+    project_id:
+      fields: payload.project_id
+    reply_url:
+      fields: payload.reply_url
+    session_id:
+      fields: payload.session_id
+    state:
+      fields: payload.state
+- event_type: maintenance.host
+  traits:
+    host:
+      fields: payload.host
+    project_id:
+      fields: payload.project_id
+    session_id:
+      fields: payload.session_id
+    state:
+      fields: payload.state
+
+/etc/ceilometer/event_pipeline.yaml:
+# Maintenance and Fault management both need these to be added
+  - notifier://
+  - notifier://?topic=alarm.all
+
+Nova
+----
+
+/etc/nova/nova.conf
+cpu_allocation_ratio=1.0

diff --git a/docs/release/configguide/index.rst b/docs/release/configguide/index.rst
index b1e7c33d..c2331115 100644
--- a/docs/release/configguide/index.rst
+++ b/docs/release/configguide/index.rst
@@ -3,9 +3,9 @@

 .. _doctor-configguide:

-*************************
-Doctor Installation Guide
-*************************
+**************************
+Doctor Configuration Guide
+**************************

 .. toctree::
    :maxdepth: 2

diff --git a/docs/release/index.rst b/docs/release/index.rst
index 8a1bf405..67eb4c5f 100644
--- a/docs/release/index.rst
+++ b/docs/release/index.rst
@@ -2,14 +2,18 @@
 .. http://creativecommons.org/licenses/by/4.0
 .. (c) 2017 OPNFV.

+.. _release:

-======
-Doctor
-======
+=======
+Release
+=======

 .. toctree::
    :maxdepth: 2

+   ./configguide/index.rst
    ./installation/index.rst
+   ./release-notes/index.rst
+   ./scenarios/fault_management/fault_management.rst
+   ./scenarios/maintenance/maintenance.rst
    ./userguide/index.rst
-

diff --git a/docs/development/manuals/index.rst b/docs/release/installation/index.rst
index f705f94a..f6527e5d 100644
--- a/docs/development/manuals/index.rst
+++ b/docs/release/installation/index.rst
@@ -1,13 +1,13 @@
 ..
This work is licensed under a Creative Commons Attribution 4.0 International License.
 .. http://creativecommons.org/licenses/by/4.0

-.. _doctor-manuals:
+.. _doctor-installation:

-*******
-Manuals
-*******
+*************************
+Doctor Installation Guide
+*************************

 .. toctree::
+   :maxdepth: 2

-.. include:: mark-host-down_manual.rst
-.. include:: get-valid-server-state.rst
+   installation.rst

diff --git a/docs/release/installation/installation.rst b/docs/release/installation/installation.rst
new file mode 100644
index 00000000..564f19fd
--- /dev/null
+++ b/docs/release/installation/installation.rst
@@ -0,0 +1,44 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+Doctor Installation
+===================
+
+You can clone the doctor project on an OPNFV installer jumphost or, if you
+are not in an OPNFV environment, on a DevStack controller node
+
+git clone https://gerrit.opnfv.org/gerrit/doctor
+
+On a DevStack controller, here is a sample that includes what Doctor testing
+will require for sample fault management testing and for maintenance
+testing using Fenix
+
+.. code-block:: bash
+
+    git clone https://github.com/openstack/devstack -b stable/train
+
+.. code-block:: bash
+
+    cd devstack
+    vi local.conf
+
+.. code-block:: bash
+
+    [[local|localrc]]
+    GIT_BASE=https://git.openstack.org
+    HOST_IP=<host_ip>
+    ADMIN_PASSWORD=admin
+    DATABASE_PASSWORD=admin
+    RABBIT_PASSWORD=admin
+    SERVICE_PASSWORD=admin
+    LOGFILE=/opt/stack/stack.sh.log
+
+    PUBLIC_INTERFACE=eth0
+
+    CEILOMETER_EVENT_ALARM=True
+
+    ENABLED_SERVICES=key,rabbit,mysql,fenix-engine,fenix-api,aodh-evaluator,aodh-notifier,aodh-api
+
+    enable_plugin ceilometer https://git.openstack.org/openstack/ceilometer stable/train
+    enable_plugin aodh https://git.openstack.org/openstack/aodh stable/train
+    enable_plugin gnocchi https://github.com/openstack/gnocchi
+    enable_plugin fenix https://opendev.org/x/fenix master

diff --git a/docs/release/release-notes/release-notes.rst b/docs/release/release-notes/release-notes.rst
index 142bfacf..b525335e 100644
--- a/docs/release/release-notes/release-notes.rst
+++ b/docs/release/release-notes/release-notes.rst
@@ -2,140 +2,46 @@
 .. http://creativecommons.org/licenses/by/4.0


-This document provides the release notes for Gambia of Doctor.
+This document provides the release notes for the Jerma version of Doctor.

 Important notes
 ===============

-In Gambia release, Doctor has been working with our second use case over
-maintenance. Design guideline is now done and test case exists with sample
-maintenance workflow code implemented in Doctor. Work has also started to have
-the real implementation done in the OpenStack Fenix project
-https://wiki.openstack.org/wiki/Fenix.
-
-Doctor CI testing has now moved to use tox on jumphots instead of running test
-through features container. Also in Apex we use OpenStack services running in
-containers. Functest daily testing supports Doctor fault management test case
-for Apex, Daisy and Fuel installers. This testing is done through features
-container.
-
-In this release, Doctor has not been working with the fault management use case as
-the basic framework has been already done. However, we might need to get back to
-it later to better meet the tough industry requirements as well as requirements
-from edge, containers and 5G.
+The Jerma release has mainly been about finalizing the maintenance use case
+testing that supports the ETSI FEAT03 defined interaction between VNFM and
+infrastructure. This is mainly to have infrastructure maintenance and upgrade
+operations optimized to be as fast as possible, while keeping zero impact on
+the VNF service on top.
+Furthermore, this is the final release of Doctor; deeper testing is moving
+to upstream projects like Fenix for the maintenance use case. Also, in this
+release we have made sure that all Doctor testing, and any deeper testing
+with the upstream projects, can be done in DevStack. This also makes DevStack
+the most important installer.

 Summary
 =======

-Gambia Doctor framework uses OpenStack Queens integrated into its test cases.
-Compared to the previous release, the Heat project is also being used in the
-maintenance test case.
+The Jerma Doctor framework uses OpenStack Train integrated into its test cases.

 Release Data
 ============

 Doctor changes

-+------------------------------------------+----------------------------------------------------------+
-| **commit-ID** | **Subject** |
-+------------------------------------------+----------------------------------------------------------+
-| 5b3f5937e7b861fca46b2a6b2d6708866b800f95 | fix building docs |
-+------------------------------------------+----------------------------------------------------------+
-| 2ca5924081ce4784f599437707bd32807aa155ce | Fix SSH client connection reset |
-+------------------------------------------+----------------------------------------------------------+
-| baac6579556f8216b36db0d0f87f9c2d4f8b4ef5 | Support Apex with services in containers |
-+------------------------------------------+----------------------------------------------------------+
-| 23bf63c4616040cb0d69cd26238af2a4a7c00a90 | fix the username to login undercloud in Apex |
-+------------------------------------------+----------------------------------------------------------+
-| 61eb3927ada784cc3dffb5ddd17f66e47871f708 | Local Documentation Builds |
-+------------------------------------------+----------------------------------------------------------+
-| 0f1dd4314b9e0247d9af7af6df2410462423aeca | Updated from global requirements |
-+------------------------------------------+----------------------------------------------------------+
-| 2d4a9f0c0a93797da6534583f6e74553a4b634be | Fix links to remove references to submodules |
-+------------------------------------------+----------------------------------------------------------+
-| 3ddc2392b0ed364eede49ff006d64df3ea456350 | Gambia release notes |
-+------------------------------------------+----------------------------------------------------------+
-| 825a0a0dd5e8028129b782ed21c549586257b1c5 | delete doctor datasource in congress when cleanup |
-+------------------------------------------+----------------------------------------------------------+
-| fcf53129ab2b18b84571faff13d7cb118b3a41b3 | run profile even the notification time is larger than 1S |
-+------------------------------------------+----------------------------------------------------------+
-| 495965d0336d42fc36494c81fd15cee2f34c96e9 | Update and add test case |
-+------------------------------------------+----------------------------------------------------------+
-| da25598a6a31abe0579ffed12d1719e5ff75f9a7 | bugfix: add doctor datasource in congress |
-+------------------------------------------+----------------------------------------------------------+
-| f9e1e3b1ae4be80bc2dc61d9c4213c81c091ea72 | Update the
maintenance design document | -+------------------------------------------+----------------------------------------------------------+ -| 4639f15e6db2f1480b41f6fbfd11d70312d4e421 | Add maintenance test code | -+------------------------------------------+----------------------------------------------------------+ -| b54cbc5dd2d32fcb27238680b4657ed384d021c5 | Add setup and cleanup for maintenance test | -+------------------------------------------+----------------------------------------------------------+ -| b2bb504032ac81a2ed3f404113b097d9ce3d7f14 | bugfix: kill the stunnel when cleanup | -+------------------------------------------+----------------------------------------------------------+ -| eaeb3c0f9dc9e6645a159d0a78b9fc181fce53d4 | add ssh_keyfile for connect to installer in Apex | -+------------------------------------------+----------------------------------------------------------+ -| dcbe7bf1c26052b0e95d209254e7273aa1eaace1 | Add tox and test case to testing document | -+------------------------------------------+----------------------------------------------------------+ -| 0f607cb5efd91ee497346b7f792dfa844d15595c | enlarge the time of link down | -+------------------------------------------+----------------------------------------------------------+ -| 1351038a65739b8d799820de515178326ad05f7b | bugfix: fix the filename of ssh tunnel | -+------------------------------------------+----------------------------------------------------------+ -| e70bf248daac03eee6b449cd1654d2ee6265dd8c | Use py34 instead of py35 | -+------------------------------------------+----------------------------------------------------------+ -| 2a60d460eaf018951456451077b7118b60219b32 | add INSPECTOR_TYPE and TEST_CASE to tox env | -+------------------------------------------+----------------------------------------------------------+ -| 2043ceeb08c1eca849daeb2b3696d385425ba061 | [consumer] fix default value for port number | -+------------------------------------------+----------------------------------------------------------+ - -Releng changes - -+------------------------------------------+-----------------------------------------------------------------------+ -| **commit-ID** | **Subject** | -+------------------------------------------+-----------------------------------------------------------------------+ -| c87309f5a75ccc5d595f708817b97793c24c4387 | Add Doctor maintenance job | -+------------------------------------------+-----------------------------------------------------------------------+ -| bd16a9756ffd0743e143f0f2f966da8dd666c7a3 | remove congress test in Daisy | -+------------------------------------------+-----------------------------------------------------------------------+ -| c47aaaa53c91aae93877f2532c72374beaa4eabe | remove fuel job in Doctor | -+------------------------------------------+-----------------------------------------------------------------------+ -| ab2fed2522eaf82ea7c63dd05008a37c56e825d0 | use 'workspace-cleanup' plugin in publisher | -+------------------------------------------+-----------------------------------------------------------------------+ -| 3aaed5cf40092744f1b87680b9205a2901baecf3 | clean the workspace in the publisher | -+------------------------------------------+-----------------------------------------------------------------------+ -| 50151eb3717edd4ddd996f3705fbe1732de7f3b7 | run tox with 'sudo' | -+------------------------------------------+-----------------------------------------------------------------------+ -| 
a3adc85ecb52f5d19ec4e9c49ca1ac35aa429ff9 | remove inspector variable form job template | -+------------------------------------------+-----------------------------------------------------------------------+ -| adfbaf2a3e8487e4c9152bf864a653a0425b8582 | run doctor tests with different inspectors in sequence | -+------------------------------------------+-----------------------------------------------------------------------+ -| 2e98e56224cd550cb3bf9798e420eece28139bd9 | add the ssh_key info if the key_file is exist | -+------------------------------------------+-----------------------------------------------------------------------+ -| c109c271018e9a85d94be1b9b468338d64589684 | prepare installer info for doctor test | -+------------------------------------------+-----------------------------------------------------------------------+ -| 57cbefc7160958eae1d49e4753779180a25864af | use py34 for tox | -+------------------------------------------+-----------------------------------------------------------------------+ -| 3547754e808a581b09c9d22e013a7d986d9f6cd1 | specify the cacert file when it exits | -+------------------------------------------+-----------------------------------------------------------------------+ -| ef4f36aa1c2ff0819d73cde44f84b99a42e15c7e | bugfix: wrong usage of '!include-raw' | -+------------------------------------------+-----------------------------------------------------------------------+ -| 0e0e0d4cb71fb27b1789a2bef2d3c4ff313e67ff | use tox instead of functest for doctor CI jobs | -+------------------------------------------+-----------------------------------------------------------------------+ -| 5b22f1b95feacaec0380f6a7543cbf510b628451 | pass value to parameters | -+------------------------------------------+-----------------------------------------------------------------------+ -| 44ab0cea07fa2a734c4f6b80776ad48fd006d1b8 | Doctor job bugfix: fix the scenario | -+------------------------------------------+-----------------------------------------------------------------------+ -| 17617f1c0a78c7bdad0d11d329a6c7e119cbbddd | bugfix: run doctor tests parallelly | -+------------------------------------------+-----------------------------------------------------------------------+ -| 811e4ef7f4c37b7bc246afc34ff880c014ecc05d | delete 'opnfv-build-ubuntu-defaults' parameters for doctor verify job | -+------------------------------------------+-----------------------------------------------------------------------+ -| 0705f31ab5bc54c073df120cbe0fe62cf10f9a81 | delete the 'node' parameter in 'doctor-slave-parameter' macro | -+------------------------------------------+-----------------------------------------------------------------------+ -| 304151b15f9d7241db8c5fea067cafe048287d84 | fix the default node label for doctor test | -+------------------------------------------+-----------------------------------------------------------------------+ -| a6963f92f015a33b44b27199886952205499b44c | Fix project name | -+------------------------------------------+-----------------------------------------------------------------------+ -| f122bfed998b3b0e0178106a7538377c609c6512 | add a default value for SSH_KEY | -+------------------------------------------+-----------------------------------------------------------------------+ +- Maintenance use case updated to support latest version of Fenix. +- Maintenance use case now supports ETSI FEAT03 optimization with Fenix. 
+- Doctor testing is now preferred to be done in a DevStack environment, where
+  one can easily select an OpenStack release from Rocky to Ussuri to test
+  Doctor functionality. The latest OPNFV Fuel can also be used for the
+  OpenStack version it supports.
+
+Doctor CI
+
+- Doctor is tested with the Fuel installer.
+- The fault management use case is tested with the sample inspector.
+- The maintenance use case is tested with the sample implementation and
+  towards the latest Fenix version. This includes the new ETSI FEAT03
+  optimization.

 Version change
 ^^^^^^^^^^^^^^

@@ -143,49 +49,34 @@ Version change
 Module version changes
 ~~~~~~~~~~~~~~~~~~~~~~

-- OpenStack has changed from Pike-1 to Queens-1
+- OpenStack has changed to Train

 Document version changes
 ~~~~~~~~~~~~~~~~~~~~~~~~

-These documents have been updated in Gambia release
-
-- Testing document
-  docs/development/overview/testing.rst
-- Doctor scenario in functest
-  docs/development/overview/functest_scenario/doctor-scenario-in-functest.rst
-- Maintenance design guideline
-  docs/development/design/maintenance-design-guideline.rst
+All documentation is updated to the OPNFV unified format according to the
+documentation guidelines. Small updates in many documents.

 Reason for version
 ^^^^^^^^^^^^^^^^^^

-Documentation is updated due to tox usage in testing and adding maintenance
-use case related documentation.
+N/A

 Feature additions
 ~~~~~~~~~~~~~~~~~

-+--------------------+--------------------------------------------------------+
-| **JIRA REFERENCE** | **SLOGAN** |
-+--------------------+--------------------------------------------------------+
-| DOCTOR-106 | Maintenance scenario |
-+--------------------+--------------------------------------------------------+
-| DOCTOR-125 | Maintenance design document according to our test case |
-+--------------------+--------------------------------------------------------+
-| DOCTOR-126 | Use Tox instead of Functest for doctor CI jobs |
-+--------------------+--------------------------------------------------------+
-| DOCTOR-127 | Maintenance test POD |
-+--------------------+--------------------------------------------------------+
-| DOCTOR-130 | Apex with containers |
-+--------------------+--------------------------------------------------------+
-
++--------------------+--------------------------------------------+
+| **JIRA REFERENCE** | **SLOGAN** |
++--------------------+--------------------------------------------+
+| DOCTOR-137 | VNFM maintenance with ETSI changes |
++--------------------+--------------------------------------------+
+| DOCTOR-136 | DevStack support |
++--------------------+--------------------------------------------+

 Deliverables
 ------------

-
 Software deliverables
 =====================

@@ -226,74 +117,21 @@ Doctor CI results with TEST_CASE='fault_management' and INSPECTOR_TYPE=sample

 +--------------------------------------+--------------+
 | **TEST-SUITE** | **Results:** |
 +--------------------------------------+--------------+
-| INSTALLER_TYPE='Apex' | SUCCESS |
-+--------------------------------------+--------------+
-| INSTALLER_TYPE='Compass' | N/A |
-+--------------------------------------+--------------+
-| INSTALLER_TYPE='Daisy' | SUCCESS |
-+--------------------------------------+--------------+
-| INSTALLER_TYPE='Fuel' | No POD |
-+--------------------------------------+--------------+
-| INSTALLER_TYPE='Joid' | N/A |
-+--------------------------------------+--------------+
-| INSTALLER_TYPE='Local' | N/A |
+| INSTALLER_TYPE='fuel' | SUCCESS |
 +--------------------------------------+--------------+

-Doctor CI results with TEST_CASE='fault_management' and INSPECTOR_TYPE=congress
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Doctor CI results with TEST_CASE='maintenance' and INSPECTOR_TYPE=sample
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 +--------------------------------------+--------------+
 | **TEST-SUITE** | **Results:** |
 +--------------------------------------+--------------+
-| INSTALLER_TYPE='Apex' | FAILED |
-+--------------------------------------+--------------+
-| INSTALLER_TYPE='Compass' | N/A |
-+--------------------------------------+--------------+
-| INSTALLER_TYPE='Daisy' | N/A |
-+--------------------------------------+--------------+
-| INSTALLER_TYPE='Fuel' | No POD |
+| INSTALLER_TYPE='fuel' | SUCCESS |
+| ADMIN_TOOL_TYPE='fenix' *) | |
 +--------------------------------------+--------------+
-| INSTALLER_TYPE='Joid' | N/A |
-+--------------------------------------+--------------+
-| INSTALLER_TYPE='Local' | N/A |
-+--------------------------------------+--------------+
-
-Doctor Functest results with TEST_CASE='fault_management'
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-+--------------------------------------+--------------+
-| **TEST-SUITE** | **Results:** |
-+--------------------------------------+--------------+
-| INSTALLER_TYPE='Apex' | skipped |
-+--------------------------------------+--------------+
-| INSTALLER_TYPE='Compass' | N/A |
-+--------------------------------------+--------------+
-| INSTALLER_TYPE='Daisy' | skipped |
-+--------------------------------------+--------------+
-| INSTALLER_TYPE='Fuel' | skipped |
-+--------------------------------------+--------------+
-| INSTALLER_TYPE='Joid' | N/A |
-+--------------------------------------+--------------+
-| INSTALLER_TYPE='Local' | N/A |
-+--------------------------------------+--------------+
-
-Note: Installer Functest does not currently test features or skips running the
-project test cases
-
-Doctor CI results with TEST_CASE='maintenance'
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-+--------------------------------------+--------------+
-| **TEST-SUITE** | **Results:** |
-+--------------------------------------+--------------+
-| INSTALLER_TYPE='Apex' | SUCCESS |
-+--------------------------------------+--------------+
-
-Doctor Functest results with TEST_CASE='maintenance'
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-N/A - Needs special target and currently there is only sample implementation
+*) The sample implementation has not been updated for the latest upstream
+   Fenix and is currently not being tested.

 References
 ==========

@@ -301,3 +139,8 @@ References
 For more information about the OPNFV Doctor latest work, please see:

 https://wiki.opnfv.org/display/doctor/Doctor+Home
+
+Further information about the ETSI FEAT03 optimization can be found in the
+Fenix documentation:
+
+https://fenix.readthedocs.io/en/latest

diff --git a/docs/release/release-notes/releasenotes_gambia.rst b/docs/release/release-notes/releasenotes_gambia.rst
new file mode 100644
index 00000000..142bfacf
--- /dev/null
+++ b/docs/release/release-notes/releasenotes_gambia.rst
@@ -0,0 +1,303 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+
+This document provides the release notes for Gambia of Doctor.
+
+Important notes
+===============
+
+In Gambia release, Doctor has been working with our second use case over
+maintenance.
Design guideline is now done and test case exists with sample +maintenance workflow code implemented in Doctor. Work has also started to have +the real implementation done in the OpenStack Fenix project +https://wiki.openstack.org/wiki/Fenix. + +Doctor CI testing has now moved to use tox on jumphots instead of running test +through features container. Also in Apex we use OpenStack services running in +containers. Functest daily testing supports Doctor fault management test case +for Apex, Daisy and Fuel installers. This testing is done through features +container. + +In this release, Doctor has not been working with the fault management use case as +the basic framework has been already done. However, we might need to get back to +it later to better meet the tough industry requirements as well as requirements +from edge, containers and 5G. + + +Summary +======= + +Gambia Doctor framework uses OpenStack Queens integrated into its test cases. +Compared to the previous release, the Heat project is also being used in the +maintenance test case. + +Release Data +============ + +Doctor changes + ++------------------------------------------+----------------------------------------------------------+ +| **commit-ID** | **Subject** | ++------------------------------------------+----------------------------------------------------------+ +| 5b3f5937e7b861fca46b2a6b2d6708866b800f95 | fix building docs | ++------------------------------------------+----------------------------------------------------------+ +| 2ca5924081ce4784f599437707bd32807aa155ce | Fix SSH client connection reset | ++------------------------------------------+----------------------------------------------------------+ +| baac6579556f8216b36db0d0f87f9c2d4f8b4ef5 | Support Apex with services in containers | ++------------------------------------------+----------------------------------------------------------+ +| 23bf63c4616040cb0d69cd26238af2a4a7c00a90 | fix the username to login undercloud in Apex | ++------------------------------------------+----------------------------------------------------------+ +| 61eb3927ada784cc3dffb5ddd17f66e47871f708 | Local Documentation Builds | ++------------------------------------------+----------------------------------------------------------+ +| 0f1dd4314b9e0247d9af7af6df2410462423aeca | Updated from global requirements | ++------------------------------------------+----------------------------------------------------------+ +| 2d4a9f0c0a93797da6534583f6e74553a4b634be | Fix links to remove references to submodules | ++------------------------------------------+----------------------------------------------------------+ +| 3ddc2392b0ed364eede49ff006d64df3ea456350 | Gambia release notes | ++------------------------------------------+----------------------------------------------------------+ +| 825a0a0dd5e8028129b782ed21c549586257b1c5 | delete doctor datasource in congress when cleanup | ++------------------------------------------+----------------------------------------------------------+ +| fcf53129ab2b18b84571faff13d7cb118b3a41b3 | run profile even the notification time is larger than 1S | ++------------------------------------------+----------------------------------------------------------+ +| 495965d0336d42fc36494c81fd15cee2f34c96e9 | Update and add test case | ++------------------------------------------+----------------------------------------------------------+ +| da25598a6a31abe0579ffed12d1719e5ff75f9a7 | bugfix: add doctor datasource in congress | 
++------------------------------------------+----------------------------------------------------------+ +| f9e1e3b1ae4be80bc2dc61d9c4213c81c091ea72 | Update the maintenance design document | ++------------------------------------------+----------------------------------------------------------+ +| 4639f15e6db2f1480b41f6fbfd11d70312d4e421 | Add maintenance test code | ++------------------------------------------+----------------------------------------------------------+ +| b54cbc5dd2d32fcb27238680b4657ed384d021c5 | Add setup and cleanup for maintenance test | ++------------------------------------------+----------------------------------------------------------+ +| b2bb504032ac81a2ed3f404113b097d9ce3d7f14 | bugfix: kill the stunnel when cleanup | ++------------------------------------------+----------------------------------------------------------+ +| eaeb3c0f9dc9e6645a159d0a78b9fc181fce53d4 | add ssh_keyfile for connect to installer in Apex | ++------------------------------------------+----------------------------------------------------------+ +| dcbe7bf1c26052b0e95d209254e7273aa1eaace1 | Add tox and test case to testing document | ++------------------------------------------+----------------------------------------------------------+ +| 0f607cb5efd91ee497346b7f792dfa844d15595c | enlarge the time of link down | ++------------------------------------------+----------------------------------------------------------+ +| 1351038a65739b8d799820de515178326ad05f7b | bugfix: fix the filename of ssh tunnel | ++------------------------------------------+----------------------------------------------------------+ +| e70bf248daac03eee6b449cd1654d2ee6265dd8c | Use py34 instead of py35 | ++------------------------------------------+----------------------------------------------------------+ +| 2a60d460eaf018951456451077b7118b60219b32 | add INSPECTOR_TYPE and TEST_CASE to tox env | ++------------------------------------------+----------------------------------------------------------+ +| 2043ceeb08c1eca849daeb2b3696d385425ba061 | [consumer] fix default value for port number | ++------------------------------------------+----------------------------------------------------------+ + +Releng changes + ++------------------------------------------+-----------------------------------------------------------------------+ +| **commit-ID** | **Subject** | ++------------------------------------------+-----------------------------------------------------------------------+ +| c87309f5a75ccc5d595f708817b97793c24c4387 | Add Doctor maintenance job | ++------------------------------------------+-----------------------------------------------------------------------+ +| bd16a9756ffd0743e143f0f2f966da8dd666c7a3 | remove congress test in Daisy | ++------------------------------------------+-----------------------------------------------------------------------+ +| c47aaaa53c91aae93877f2532c72374beaa4eabe | remove fuel job in Doctor | ++------------------------------------------+-----------------------------------------------------------------------+ +| ab2fed2522eaf82ea7c63dd05008a37c56e825d0 | use 'workspace-cleanup' plugin in publisher | ++------------------------------------------+-----------------------------------------------------------------------+ +| 3aaed5cf40092744f1b87680b9205a2901baecf3 | clean the workspace in the publisher | ++------------------------------------------+-----------------------------------------------------------------------+ +| 50151eb3717edd4ddd996f3705fbe1732de7f3b7 | run tox with 
'sudo' | ++------------------------------------------+-----------------------------------------------------------------------+ +| a3adc85ecb52f5d19ec4e9c49ca1ac35aa429ff9 | remove inspector variable form job template | ++------------------------------------------+-----------------------------------------------------------------------+ +| adfbaf2a3e8487e4c9152bf864a653a0425b8582 | run doctor tests with different inspectors in sequence | ++------------------------------------------+-----------------------------------------------------------------------+ +| 2e98e56224cd550cb3bf9798e420eece28139bd9 | add the ssh_key info if the key_file is exist | ++------------------------------------------+-----------------------------------------------------------------------+ +| c109c271018e9a85d94be1b9b468338d64589684 | prepare installer info for doctor test | ++------------------------------------------+-----------------------------------------------------------------------+ +| 57cbefc7160958eae1d49e4753779180a25864af | use py34 for tox | ++------------------------------------------+-----------------------------------------------------------------------+ +| 3547754e808a581b09c9d22e013a7d986d9f6cd1 | specify the cacert file when it exits | ++------------------------------------------+-----------------------------------------------------------------------+ +| ef4f36aa1c2ff0819d73cde44f84b99a42e15c7e | bugfix: wrong usage of '!include-raw' | ++------------------------------------------+-----------------------------------------------------------------------+ +| 0e0e0d4cb71fb27b1789a2bef2d3c4ff313e67ff | use tox instead of functest for doctor CI jobs | ++------------------------------------------+-----------------------------------------------------------------------+ +| 5b22f1b95feacaec0380f6a7543cbf510b628451 | pass value to parameters | ++------------------------------------------+-----------------------------------------------------------------------+ +| 44ab0cea07fa2a734c4f6b80776ad48fd006d1b8 | Doctor job bugfix: fix the scenario | ++------------------------------------------+-----------------------------------------------------------------------+ +| 17617f1c0a78c7bdad0d11d329a6c7e119cbbddd | bugfix: run doctor tests parallelly | ++------------------------------------------+-----------------------------------------------------------------------+ +| 811e4ef7f4c37b7bc246afc34ff880c014ecc05d | delete 'opnfv-build-ubuntu-defaults' parameters for doctor verify job | ++------------------------------------------+-----------------------------------------------------------------------+ +| 0705f31ab5bc54c073df120cbe0fe62cf10f9a81 | delete the 'node' parameter in 'doctor-slave-parameter' macro | ++------------------------------------------+-----------------------------------------------------------------------+ +| 304151b15f9d7241db8c5fea067cafe048287d84 | fix the default node label for doctor test | ++------------------------------------------+-----------------------------------------------------------------------+ +| a6963f92f015a33b44b27199886952205499b44c | Fix project name | ++------------------------------------------+-----------------------------------------------------------------------+ +| f122bfed998b3b0e0178106a7538377c609c6512 | add a default value for SSH_KEY | ++------------------------------------------+-----------------------------------------------------------------------+ + +Version change +^^^^^^^^^^^^^^ + +Module version changes +~~~~~~~~~~~~~~~~~~~~~~ + +- OpenStack has changed from Pike-1 to 
Queens-1
+
+Document version changes
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+These documents have been updated in the Gambia release:
+
+- Testing document
+  docs/development/overview/testing.rst
+- Doctor scenario in functest
+  docs/development/overview/functest_scenario/doctor-scenario-in-functest.rst
+- Maintenance design guideline
+  docs/development/design/maintenance-design-guideline.rst
+
+Reason for version
+^^^^^^^^^^^^^^^^^^
+
+Documentation is updated due to the use of tox in testing and the addition of
+maintenance use case related documentation.
+
+Feature additions
+~~~~~~~~~~~~~~~~~
+
++--------------------+--------------------------------------------------------+
+| **JIRA REFERENCE** | **SLOGAN** |
++--------------------+--------------------------------------------------------+
+| DOCTOR-106 | Maintenance scenario |
++--------------------+--------------------------------------------------------+
+| DOCTOR-125 | Maintenance design document according to our test case |
++--------------------+--------------------------------------------------------+
+| DOCTOR-126 | Use Tox instead of Functest for doctor CI jobs |
++--------------------+--------------------------------------------------------+
+| DOCTOR-127 | Maintenance test POD |
++--------------------+--------------------------------------------------------+
+| DOCTOR-130 | Apex with containers |
++--------------------+--------------------------------------------------------+
+
+
+
+Deliverables
+------------
+
+
+Software deliverables
+=====================
+
+None
+
+Documentation deliverables
+==========================
+
+https://git.opnfv.org/doctor/tree/docs
+
+Known Limitations, Issues and Workarounds
+=========================================
+
+System Limitations
+^^^^^^^^^^^^^^^^^^
+
+Maintenance test case requirements:
+
+- Minimum number of nodes: 1 Controller, 3 Computes
+- Min number of VCPUs: 2 VCPUs for each compute
+
+Known issues
+^^^^^^^^^^^^
+
+None
+
+Workarounds
+^^^^^^^^^^^
+
+None
+
+Test Result
+===========
+
+Doctor CI results with TEST_CASE='fault_management' and INSPECTOR_TYPE=sample
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++--------------------------------------+--------------+
+| **TEST-SUITE** | **Results:** |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Apex' | SUCCESS |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Compass' | N/A |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Daisy' | SUCCESS |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Fuel' | No POD |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Joid' | N/A |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Local' | N/A |
++--------------------------------------+--------------+
+
+Doctor CI results with TEST_CASE='fault_management' and INSPECTOR_TYPE=congress
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++--------------------------------------+--------------+
+| **TEST-SUITE** | **Results:** |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Apex' | FAILED |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Compass' | N/A |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Daisy' | N/A |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Fuel' | No POD |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Joid' | N/A |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Local' | N/A |
++--------------------------------------+--------------+
+
+
+Doctor Functest results with TEST_CASE='fault_management'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++--------------------------------------+--------------+
+| **TEST-SUITE** | **Results:** |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Apex' | skipped |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Compass' | N/A |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Daisy' | skipped |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Fuel' | skipped |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Joid' | N/A |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Local' | N/A |
++--------------------------------------+--------------+
+
+Note: Installer Functest currently either does not test feature projects or
+skips running the project test cases.
+
+Doctor CI results with TEST_CASE='maintenance'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++--------------------------------------+--------------+
+| **TEST-SUITE** | **Results:** |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='Apex' | SUCCESS |
++--------------------------------------+--------------+
+
+Doctor Functest results with TEST_CASE='maintenance'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+N/A - Needs a special target; currently there is only a sample implementation
+
+References
+==========
+
+For more information about the latest OPNFV Doctor work, please see:
+
+https://wiki.opnfv.org/display/doctor/Doctor+Home
diff --git a/docs/release/release-notes/releasenotes_iruya.rst b/docs/release/release-notes/releasenotes_iruya.rst
new file mode 100644
index 00000000..92775557
--- /dev/null
+++ b/docs/release/release-notes/releasenotes_iruya.rst
@@ -0,0 +1,129 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+
+This document provides the release notes for the Iruya version of Doctor.
+
+Important notes
+===============
+
+There have not been many changes in the Iruya release.
+
+All testing is now done with the Fuel installer. The maintenance use case
+is now only tested against the latest upstream Fenix. Only the sample
+inspector is tested, as Fuel does not support Vitrage or Congress.
+
+Summary
+=======
+
+The Iruya Doctor framework uses OpenStack Stein in its test cases.
+
+Release Data
+============
+
+Doctor changes
+
+- Maintenance use case updated to support the latest version of Fenix running
+  in a container on the controller node
+- Maintenance use case now supports the Fuel installer
+- Doctor updated to use OpenStack Stein and Python 3.6 only
+- Only the sample inspector is tested, as installer support for
+  Vitrage and Congress is lacking
+
+Releng changes
+
+- Doctor testing runs with Python 3.6 and the sample inspector
+- Doctor is only tested with the Fuel installer
+
+Version change
+^^^^^^^^^^^^^^
+
+Module version changes
+~~~~~~~~~~~~~~~~~~~~~~
+
+- OpenStack has changed from Rocky to Stein since the previous Hunter release.
+
+Document version changes
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+N/A
+
+Reason for version
+^^^^^^^^^^^^^^^^^^
+
+N/A
+
+Feature additions
+~~~~~~~~~~~~~~~~~
+
++--------------------+--------------------------------------------------------------+
+| **JIRA REFERENCE** | **SLOGAN** |
++--------------------+--------------------------------------------------------------+
+| DOCTOR-134 | Update Doctor maintenance use case to work with latest Fenix |
++--------------------+--------------------------------------------------------------+
+
+Deliverables
+------------
+
+Software deliverables
+=====================
+
+None
+
+Documentation deliverables
+==========================
+
+https://git.opnfv.org/doctor/tree/docs
+
+Known Limitations, Issues and Workarounds
+=========================================
+
+System Limitations
+^^^^^^^^^^^^^^^^^^
+
+Maintenance test case requirements:
+
+- Minimum number of nodes: 1 Controller, 3 Computes
+- Min number of VCPUs: 2 VCPUs for each compute
+
+Known issues
+^^^^^^^^^^^^
+
+None
+
+Workarounds
+^^^^^^^^^^^
+
+None
+
+Test Result
+===========
+
+Doctor CI results with TEST_CASE='fault_management' and INSPECTOR_TYPE=sample
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++--------------------------------------+--------------+
+| **TEST-SUITE** | **Results:** |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='fuel' | SUCCESS |
++--------------------------------------+--------------+
+
+Doctor CI results with TEST_CASE='maintenance' and INSPECTOR_TYPE=sample
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++--------------------------------------+--------------+
+| **TEST-SUITE** | **Results:** |
++--------------------------------------+--------------+
+| INSTALLER_TYPE='fuel' | SUCCESS |
+| ADMIN_TOOL_TYPE='fenix' *) | |
++--------------------------------------+--------------+
+
+*) The sample implementation has not been updated for the latest upstream
+   Fenix and is currently not tested.
+
+References
+==========
+
+For more information about the latest OPNFV Doctor work, please see:
+
+https://wiki.opnfv.org/display/doctor/Doctor+Home
diff --git a/docs/release/scenarios/fault_management/fault_management.rst b/docs/release/scenarios/fault_management/fault_management.rst
new file mode 100644
index 00000000..99371201
--- /dev/null
+++ b/docs/release/scenarios/fault_management/fault_management.rst
@@ -0,0 +1,90 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+
+Running test cases
+""""""""""""""""""
+
+Functest will call "doctor_tests/main.py" in Doctor to run the test job.
+Doctor testing can also be triggered by tox on the OPNFV installer jumphost.
+Tox is normally used for functional, module and coding style testing in
+Python projects.
+
+Currently the 'MCP' and 'devstack' installers are supported.
+
+
+Fault management use case
+"""""""""""""""""""""""""
+
+* A consumer of the NFVI wants to receive immediate notifications about faults
+  in the NFVI affecting the proper functioning of the virtual resources.
+  Therefore, such faults have to be detected as quickly as possible, and, when
+  a critical error is observed, the affected consumer is immediately informed
+  about the fault and can switch over to the STBY configuration.
+
+The faults to be monitored (and at which detection rate) will be configured by
+the consumer. Once a fault is detected, the Inspector in the Doctor
+architecture will check the resource map maintained by the Controller, to find
+out which virtual resources are affected, and then update the resource states.
+The Notifier will receive the failure event requests sent from the Controller,
+and notify the consumer(s) of the affected resources according to the alarm
+configuration.
+
+Detailed workflow information is as follows:
+
+* Consumer(VNFM): (step 0) creates resources (network, server/instance) and an
+  event alarm on the state down notification of that server/instance or
+  Neutron port, as sketched after this list.
+
+* Monitor: (step 1) periodically checks nodes, e.g. by pinging from/to each
+  data-plane NIC to/from the gateway of the node, and (step 2) once a check
+  fails, sends an event with the "raw" fault information to the Inspector.
+
+* Inspector: when it receives an event, it will (step 3) mark the host down
+  ("mark-host-down"), (step 4) map the physical host to the affected VMs and
+  change the VM status to down. In the network failure case, the Neutron port
+  is also set to down.
+
+* Controller: (step 5) sends out an instance update event to Ceilometer. In
+  the network failure case, the Neutron port is also set to down and a
+  corresponding event is sent to Ceilometer.
+
+* Notifier: (step 6) Ceilometer transforms and passes the events to AODH,
+  (step 7) AODH evaluates the events against the registered alarm definitions,
+  and then (step 8) fires the alarm to the "consumer" who owns the instance.
+
+* Consumer(VNFM): (step 9) receives the event and (step 10) recreates a new
+  instance.
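+
+The event alarm in step 0 is created through the AODH API. Below is a minimal
+sketch using python-aodhclient; the credentials, consumer URL and instance
+UUID are illustrative placeholders, and the exact alarm attributes used by
+the Doctor test code may differ.
+
+.. code-block:: python
+
+    from aodhclient import client as aodhclient
+    from keystoneauth1.identity import v3
+    from keystoneauth1 import session
+
+    # Hypothetical credentials, for illustration only.
+    auth = v3.Password(auth_url='http://keystone:5000/v3',
+                       username='doctor', password='doctor',
+                       project_name='doctor',
+                       user_domain_id='default', project_domain_id='default')
+    sess = session.Session(auth=auth)
+    aodh = aodhclient.Client('2', sess)
+
+    # Event alarm: POST to the consumer URL when the given VM goes to
+    # error state (compute.instance.update event).
+    aodh.alarm.create({
+        'name': 'doctor_alarm_vm0',
+        'type': 'event',
+        'alarm_actions': ['http://<consumer-ip>:12346/failure'],
+        'enabled': True,
+        'repeat_actions': False,
+        'event_rule': {
+            'event_type': 'compute.instance.update',
+            'query': [{'field': 'traits.instance_id', 'type': 'string',
+                       'op': 'eq', 'value': '<vm-uuid>'},
+                      {'field': 'traits.state', 'type': 'string',
+                       'op': 'eq', 'value': 'error'}]}})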
+
+Fault management test case
+""""""""""""""""""""""""""
+
+Functest will call "doctor_tests/main.py" in Doctor to run the test job.
+
+The following steps are executed:
+
+Firstly, get the installer IP according to the installer type, and SSH to the
+installer node to get the private key for accessing the cloud. For the 'fuel'
+installer, also SSH to the controller node to modify the Nova and Ceilometer
+configurations.
+
+Secondly, prepare an image for booting a VM, then create a test project and a
+test user (both default to doctor) for the Doctor tests.
+
+Thirdly, boot a VM under the doctor project and check the VM status to verify
+that the VM is launched completely. Then get the info of the compute host
+where the VM is launched, to verify connectivity to the target compute host.
+Get the consumer IP according to the route to the compute IP, and create an
+event alarm in AODH using the consumer IP.
+
+Fourthly, the Doctor components are started and, based on the above
+preparation, a failure is injected into the system, i.e. the network of the
+compute host is disabled for 3 minutes. To ensure the host is down, the
+status of the host is checked.
+
+Finally, the notification time, i.e. the time between the execution of step 2
+(Monitor detects the failure) and step 9 (Consumer receives the failure
+notification), is calculated.
+
+According to the Doctor requirements, the Doctor test is successful if the
+notification time is below 1 second.
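+
+The "mark-host-down" call in step 3 of the workflow above uses the Nova
+os-services force-down API (compute API microversion 2.11 or later). Below is
+a minimal sketch using python-novaclient; the host name is an illustrative
+placeholder.
+
+.. code-block:: python
+
+    from novaclient import client as novaclient
+
+    # 'sess' is an authenticated keystoneauth1 session with admin rights,
+    # e.g. the one created in the previous sketch.
+    nova = novaclient.Client('2.11', session=sess)
+
+    # Tell Nova the compute host is down, so that its state does not
+    # depend on the slow expiry of the service heartbeat.
+    nova.services.force_down('<compute-hostname>', 'nova-compute', True)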
diff --git a/docs/development/overview/functest_scenario/images/Fault-management-design.png b/docs/release/scenarios/maintenance/images/Fault-management-design.png
Binary files differ
index 6d98cdec..6d98cdec 100644
--- a/docs/development/overview/functest_scenario/images/Fault-management-design.png
+++ b/docs/release/scenarios/maintenance/images/Fault-management-design.png
diff --git a/docs/development/overview/functest_scenario/images/LICENSE b/docs/release/scenarios/maintenance/images/LICENSE
index 21a2d03d..21a2d03d 100644
--- a/docs/development/overview/functest_scenario/images/LICENSE
+++ b/docs/release/scenarios/maintenance/images/LICENSE
diff --git a/docs/development/overview/functest_scenario/images/Maintenance-design.png b/docs/release/scenarios/maintenance/images/Maintenance-design.png
Binary files differ
index 8f21db6a..8f21db6a 100644
--- a/docs/development/overview/functest_scenario/images/Maintenance-design.png
+++ b/docs/release/scenarios/maintenance/images/Maintenance-design.png
diff --git a/docs/development/overview/functest_scenario/images/Maintenance-workflow.png b/docs/release/scenarios/maintenance/images/Maintenance-workflow.png
Binary files differ
index 9b65fd59..9b65fd59 100644
--- a/docs/development/overview/functest_scenario/images/Maintenance-workflow.png
+++ b/docs/release/scenarios/maintenance/images/Maintenance-workflow.png
diff --git a/docs/release/scenarios/maintenance/maintenance.rst b/docs/release/scenarios/maintenance/maintenance.rst
new file mode 100644
index 00000000..ecfe76b1
--- /dev/null
+++ b/docs/release/scenarios/maintenance/maintenance.rst
@@ -0,0 +1,120 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+
+Maintenance use case
+""""""""""""""""""""
+
+* A consumer of the NFVI wants to interact with NFVI maintenance, upgrade and
+  scaling operations, and to have graceful retirement. By receiving
+  notifications of these NFVI events and responding to them within a given
+  time window, the consumer can guarantee zero downtime for its service.
+
+The maintenance use case adds an `admin tool` and an `app manager` component
+to the Doctor platform. An overview of the maintenance components can be seen
+in :numref:`figure-p2`.
+
+.. figure:: ./images/Maintenance-design.png
+    :name: figure-p2
+    :width: 100%
+
+    Doctor platform components in maintenance use case
+
+In the maintenance use case, the `app manager` (VNFM) subscribes to
+maintenance notifications triggered by project specific alarms through AODH.
+This is how it learns about the different NFVI maintenance, upgrade and
+scaling operations that affect its instances. The `app manager` can either
+perform the actions depicted in `green color` itself, or tell the
+`admin tool` to perform the admin actions depicted in `orange color`.
+
+Any infrastructure component, like the `Inspector`, can subscribe to
+maintenance notifications triggered by host specific alarms through AODH.
+Subscribing to these notifications requires admin privileges; they tell when
+a host is out of use for maintenance and when it is taken back to production.
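+
+Both subscriptions amount to creating an AODH event alarm on the maintenance
+event types. Below is a minimal sketch of the project-level subscription,
+reusing the `aodh` client from the earlier sketch; the event type
+'maintenance.scheduled' and the app manager URL follow the sample
+implementation, but are assumptions here.
+
+.. code-block:: python
+
+    # Route project specific maintenance notifications to the app
+    # manager's HTTP endpoint (the port is a placeholder).
+    aodh.alarm.create({
+        'name': 'doctor_maintenance_alarm',
+        'type': 'event',
+        'alarm_actions': ['http://<app-manager-ip>:12348/maintenance'],
+        'enabled': True,
+        'repeat_actions': True,
+        'event_rule': {'event_type': 'maintenance.scheduled'}})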
+
+Maintenance test case
+"""""""""""""""""""""
+
+The maintenance test case currently runs in our Apex CI and is executed by
+tox. This is because of the special limitations mentioned below, and because
+we currently have only a sample implementation as a proof of concept; in
+addition, we support the unofficial OpenStack project Fenix. The environment
+variable TEST_CASE='maintenance' needs to be used when executing
+"doctor_tests/main.py", and ADMIN_TOOL_TYPE='fenix' if you want to test with
+Fenix instead of the sample implementation. The test case workflow can be
+seen in :numref:`figure-p3`.
+
+.. figure:: ./images/Maintenance-workflow.png
+    :name: figure-p3
+    :width: 100%
+
+    Maintenance test case workflow
+
+In the test case, all compute capacity is consumed by project (VNF)
+instances. Since the instances host redundant services and an empty compute
+node is needed for maintenance, the test case needs at least 3 compute nodes
+in the system. There will be 2 instances on each compute node, so the minimum
+number of VCPUs per compute node is also 2. Regardless of how many compute
+nodes there are, the application always has 2 redundant instances (ACT-STDBY)
+on different compute nodes, and the rest of the compute capacity is filled
+with non-redundant instances.
+
+For each project specific maintenance message there is a time window for the
+`app manager` to perform any needed actions. This guarantees zero downtime
+for its service. All replies back are made by calling the `admin tool` API
+given in the message.
+
+The following steps are executed:
+
+The infrastructure admin calls the `admin tool` API to trigger maintenance
+for the compute hosts that have instances belonging to a VNF.
+
+A project specific `MAINTENANCE` notification is triggered to tell the
+`app manager` that its instances are going to be affected by infrastructure
+maintenance at a specific point in time. The `app manager` calls the
+`admin tool` API to answer back `ACK_MAINTENANCE`.
+
+When the time comes to start the actual maintenance workflow in the
+`admin tool`, a `SCALE_IN` notification is triggered, as there is no empty
+compute node for maintenance (or compute upgrade). The project receives the
+corresponding alarm, scales in its instances and calls the `admin tool` API
+to answer back `ACK_SCALE_IN`.
+
+As the scaled-in instances may not all be removed from a single compute node,
+the `admin tool` may need to figure out which compute node should be made
+empty first, and send `PREPARE_MAINTENANCE` to the project, telling which
+instance needs to be migrated to obtain the needed empty compute node. The
+`app manager` makes sure it is ready to migrate the instance and calls the
+`admin tool` API to answer back `ACK_PREPARE_MAINTENANCE`. The `admin tool`
+performs the migration and answers `ADMIN_ACTION_DONE`, so the `app manager`
+knows the instance can be used again.
+
+:numref:`figure-p3` next shows a light blue section of actions to be done for
+each compute node. However, as we now have one empty compute node, we
+maintain/upgrade it first. So on the first round we can directly put the
+compute node into maintenance and send the admin level, host specific
+`IN_MAINTENANCE` message. This is caught by the `Inspector`, which then knows
+the host is down for maintenance and can disable any automatic fault
+management actions for the host, as it is down on purpose. After the
+`admin tool` has completed the maintenance/upgrade, a `MAINTENANCE_COMPLETE`
+message is sent to tell the host is back in production.
+
+On the following rounds there are always instances on the compute node, so a
+`PLANNED_MAINTENANCE` message is needed to tell that those instances are now
+going to be affected by maintenance. When the `app manager` receives this
+message, it knows that the instances to be moved away from the compute node
+will land on an already maintained/upgraded host. In the test case no
+application level upgrade is done to adapt the instances to the new
+infrastructure capabilities, but it could be done here, as this information
+is also passed in the message. It might mean just upgrading some RPMs, but
+also completely re-instantiating the instance with a new flavor. An example
+of such a project notification is sketched below.
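+
+The sketch follows the payload built by the sample `admin tool` in
+`_project_notify`; the URLs, UUIDs, timestamp and metadata contents are
+illustrative placeholders.
+
+.. code-block:: python
+
+    # Illustrative project specific PLANNED_MAINTENANCE notification.
+    payload = {
+        'project_id': '<project-uuid>',
+        # URL to GET the instance ids this session/project is about
+        'instance_ids': 'http://<admin-tool>:12347/maintenance/<session>/<project>',
+        'allowed_actions': ['MIGRATE', 'LIVE_MIGRATE', 'OWN_ACTION'],
+        'actions_at': '2018-11-21T08:22:00',  # start of the time window
+        'state': 'PLANNED_MAINTENANCE',
+        # API to call for the ACK/NACK reply, e.g. by PUTting
+        # {'state': 'ACK_PLANNED_MAINTENANCE', 'instance_action': 'MIGRATE'}
+        'reply_url': 'http://<admin-tool>:12347/maintenance/<session>/<project>',
+        'metadata': {'openstack_version': '<version>'}
+    }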
+
+If the application runs the active side of a redundant instance pair on this
+compute node, a switch over is made. After the `app manager` is ready, it
+calls the `admin tool` API to answer back `ACK_PLANNED_MAINTENANCE`. In the
+test case the answer is `migrate`, so the `admin tool` migrates the instances
+and replies `ADMIN_ACTION_DONE`, and the `app manager` then knows the
+instances can be used again. Then we are ready to do the actual maintenance,
+as previously, through the `IN_MAINTENANCE` and `MAINTENANCE_COMPLETE` steps.
+
+After all compute nodes are maintained, the `admin tool` sends
+`MAINTENANCE_COMPLETE` to tell that the maintenance/upgrade is now complete.
+For the `app manager` this means it can scale back to full capacity.
+
+There are currently sample implementations of the VNFM and the test case. On
+the infrastructure side there is a sample implementation of the 'admin_tool',
+and there is also support for the OpenStack project Fenix, which extends the
+use case to support 'ETSI FEAT03' for VNFM interaction and to optimize the
+whole infrastructure maintenance and upgrade.
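+
+From the infrastructure admin's point of view, the workflow above reduces to
+a small REST exchange against the `admin tool` (the Fenix container exposes
+port 12347). Below is a rough sketch; the request fields follow the sample
+implementation and are assumptions, not a stable API.
+
+.. code-block:: python
+
+    import datetime
+    import json
+    import requests
+
+    base = 'http://<admin-tool-ip>:12347'  # placeholder endpoint
+    headers = {'Content-Type': 'application/json',
+               'Accept': 'application/json'}
+
+    # Create a maintenance session; the reply carries the session id.
+    at = (datetime.datetime.utcnow() +
+          datetime.timedelta(minutes=1)).strftime('%Y-%m-%d %H:%M:%S')
+    data = {'state': 'MAINTENANCE', 'maintenance_at': at,
+            'metadata': {'openstack_version': '<version>'}}
+    ret = requests.post('%s/maintenance' % base, data=json.dumps(data),
+                        headers=headers)
+    session_id = ret.json()['session_id']
+
+    # Poll the session; the sample tool ends in MAINTENANCE_DONE or
+    # MAINTENANCE_FAILED, after which the session can be deleted.
+    state = requests.get('%s/maintenance/%s' % (base, session_id),
+                         headers=headers).json()['state']
+    if state in ('MAINTENANCE_DONE', 'MAINTENANCE_FAILED'):
+        requests.delete('%s/maintenance/%s' % (base, session_id),
+                        headers=headers)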
diff --git a/docs/development/manuals/get-valid-server-state.rst b/docs/release/userguide/get-valid-server-state.rst
index 824ea3c2..824ea3c2 100644
--- a/docs/development/manuals/get-valid-server-state.rst
+++ b/docs/release/userguide/get-valid-server-state.rst
diff --git a/docs/release/userguide/index.rst b/docs/release/userguide/index.rst
index eee855dc..577072c7 100644
--- a/docs/release/userguide/index.rst
+++ b/docs/release/userguide/index.rst
@@ -11,3 +11,6 @@ Doctor User Guide
    :maxdepth: 2

    feature.userguide.rst
+   get-valid-server-state.rst
+   mark-host-down_manual.rst
+   monitors.rst
diff --git a/docs/development/manuals/mark-host-down_manual.rst b/docs/release/userguide/mark-host-down_manual.rst
index 3815205d..3815205d 100644
--- a/docs/development/manuals/mark-host-down_manual.rst
+++ b/docs/release/userguide/mark-host-down_manual.rst
diff --git a/docs/development/manuals/monitors.rst b/docs/release/userguide/monitors.rst
index eeb5e226..eeb5e226 100644
--- a/docs/development/manuals/monitors.rst
+++ b/docs/release/userguide/monitors.rst
diff --git a/docs/testing/developer/index.rst b/docs/testing/developer/index.rst
new file mode 100644
index 00000000..dfbcfa74
--- /dev/null
+++ b/docs/testing/developer/index.rst
@@ -0,0 +1,13 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. SPDX-License-Identifier: CC-BY-4.0
+.. (c) Open Platform for NFV Project, Inc. and its contributors
+
+*********
+Developer
+*********
+
+.. toctree::
+   :numbered:
+   :maxdepth: 2
+
+   testing.rst
diff --git a/docs/development/overview/testing.rst b/docs/testing/developer/testing.rst
index a4b9ffa6..6a929130 100644
--- a/docs/development/overview/testing.rst
+++ b/docs/testing/developer/testing.rst
@@ -41,6 +41,16 @@ export TEST_CASE with different values:
     #Run both tests cases
     export TEST_CASE='all'

+    #Use Fenix in maintenance testing instead of the sample admin_tool.
+    #This is only for the 'maintenance' test case.
+    export ADMIN_TOOL_TYPE='fenix'
+    export APP_MANAGER_TYPE='vnfm'
+
+    #Run in a different installer jumphost: 'fuel' or 'apex'.
+    #In multinode DevStack you run Doctor in the controller node,
+    #with APP_MANAGER_TYPE=vnfm as above.
+    export INSTALLER_TYPE='fuel'
+
 Run Python Test Script
 ~~~~~~~~~~~~~~~~~~~~~~
@@ -57,42 +67,16 @@ environment and then run the test.

 .. _doctor.sample.conf: https://git.opnfv.org/doctor/tree/etc/doctor.sample.conf

-In OPNFV Apex jumphost you can run Doctor testing as follows using tox:
+In an OPNFV testing environment jumphost you can run Doctor testing as
+follows using tox:

 .. code-block:: bash

-    #Before Gambia: overcloudrc.v3
     source overcloudrc
     export INSTALLER_IP=${INSTALLER_IP}
     export INSTALLER_TYPE=${INSTALLER_TYPE}
     git clone https://gerrit.opnfv.org/gerrit/doctor
     cd doctor
     sudo -E tox
-
-Run Functest Suite
-==================
-
-Functest supports Doctor testing by triggering the test script above in a
-Functest container. You can run the Doctor test with the following steps:
-
-.. code-block:: bash
-
-    DOCKER_TAG=latest
-    docker pull docker.io/opnfv/functest-features:${DOCKER_TAG}
-    docker run --privileged=true -id \
-        -e INSTALLER_TYPE=${INSTALLER_TYPE} \
-        -e INSTALLER_IP=${INSTALLER_IP} \
-        -e INSPECTOR_TYPE=sample \
-        docker.io/opnfv/functest-features:${DOCKER_TAG} /bin/bash
-    docker exec <container_id> functest testcase run doctor-notification
-
-See `Functest Userguide`_ for more information.
-
-.. _Functest Userguide: :doc:`<functest:testing/user/userguide>`
-
-
-For testing with stable version, change DOCKER_TAG to 'stable' or other release
-tag identifier.
-
-Tips
-====
+
+Note! In DevStack you run Doctor in the controller node.
diff --git a/docs/testing/index.rst b/docs/testing/index.rst
new file mode 100644
index 00000000..3fae9568
--- /dev/null
+++ b/docs/testing/index.rst
@@ -0,0 +1,15 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. SPDX-License-Identifier: CC-BY-4.0
+.. (c) Open Platform for NFV Project, Inc. and its contributors
+
+.. _testing:
+
+=======
+Testing
+=======
+
+.. toctree::
+   :maxdepth: 2
+
+   ./developer/index.rst
+   ./user/index.rst
diff --git a/docs/testing/user/index.rst b/docs/testing/user/index.rst
new file mode 100644
index 00000000..1be9c7eb
--- /dev/null
+++ b/docs/testing/user/index.rst
@@ -0,0 +1,13 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. SPDX-License-Identifier: CC-BY-4.0
+.. (c) Open Platform for NFV Project, Inc. and its contributors
+
+****
+User
+****
+
+.. toctree::
+   :numbered:
+   :maxdepth: 2
+
+   testing.rst
diff --git a/docs/testing/user/testing.rst b/docs/testing/user/testing.rst
new file mode 100644
index 00000000..6172d26a
--- /dev/null
+++ b/docs/testing/user/testing.rst
@@ -0,0 +1,30 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+Run Functest Suite (obsolete)
+=============================
+
+Functest supports Doctor testing by triggering the Doctor test script in a
+Functest container. You can run the Doctor test with the following steps:
+
+.. code-block:: bash
+
+    DOCKER_TAG=latest
+    docker pull docker.io/opnfv/functest-features:${DOCKER_TAG}
+    docker run --privileged=true -id \
+        -e INSTALLER_TYPE=${INSTALLER_TYPE} \
+        -e INSTALLER_IP=${INSTALLER_IP} \
+        -e INSPECTOR_TYPE=sample \
+        docker.io/opnfv/functest-features:${DOCKER_TAG} /bin/bash
+    docker exec <container_id> functest testcase run doctor-notification
+
+See `Functest Userguide`_ for more information.
+
+.. _Functest Userguide: :doc:`<functest:testing/user/userguide>`
+
+
+For testing with a stable version, change DOCKER_TAG to 'stable' or another
+release tag identifier.
+ +Tips +==== diff --git a/doctor_tests/admin_tool/__init__.py b/doctor_tests/admin_tool/__init__.py index e8b12817..3417a334 100644 --- a/doctor_tests/admin_tool/__init__.py +++ b/doctor_tests/admin_tool/__init__.py @@ -8,16 +8,16 @@ ############################################################################## from oslo_config import cfg from oslo_utils import importutils - +import os OPTS = [ cfg.StrOpt('type', - default='sample', - choices=['sample'], + default=os.environ.get('ADMIN_TOOL_TYPE', 'sample'), + choices=['sample', 'fenix'], help='the component of doctor admin_tool', required=True), cfg.StrOpt('ip', - default='127.0.0.1', + default='0.0.0.0', help='the ip of admin_tool', required=True), cfg.IntOpt('port', diff --git a/doctor_tests/admin_tool/fenix/Dockerfile b/doctor_tests/admin_tool/fenix/Dockerfile new file mode 100644 index 00000000..202380eb --- /dev/null +++ b/doctor_tests/admin_tool/fenix/Dockerfile @@ -0,0 +1,34 @@ +FROM gliderlabs/alpine:3.6 + +ARG BRANCH=master +ARG OPENSTACK=master + +EXPOSE 12347 + +RUN echo "Building Fenix container against OpenStack $OPENSTACK" && \ + echo "Building Fenix with $BRANCH" && \ + mkdir /etc/fenix && \ + mkdir -p /var/tmp/fenix +WORKDIR /var/tmp/fenix +COPY fenix*.conf /etc/fenix/ + +RUN apk --no-cache add ca-certificates && \ + apk --no-cache add --update python3 sshpass py-pip git curl && \ + apk --no-cache add --virtual .build-deps --update \ + python3-dev build-base linux-headers libffi-dev \ + openssl-dev libjpeg-turbo-dev && \ + curl https://opendev.org/openstack/requirements/raw/branch/$OPENSTACK/upper-constraints.txt > upper-constraints.txt && \ + if [ ! -e /usr/bin/pip ]; then ln -s pip3 /usr/bin/pip ; fi && \ + if [[ ! -e /usr/bin/python ]]; then ln -sf /usr/bin/python3 /usr/bin/python; fi && \ + pip3 install --upgrade pip && \ + pip3 install alembic aodhclient decorator flask Flask-RESTful eventlet jsonschema \ + keystoneauth1 keystonemiddleware python-novaclient oslo.config pecan \ + oslo.db oslo.log oslo.messaging oslo.serialization oslo.service oslo_policy \ + oslotest oslo.utils pbr pymysql six sqlalchemy -cupper-constraints.txt && \ + git clone https://opendev.org/x/fenix -b $BRANCH /fenix && \ + rm -fr /var/tmp/fenix +COPY run /fenix +COPY keystonercv3 /fenix +WORKDIR /fenix +RUN python3 setup.py install +CMD ./run diff --git a/doctor_tests/admin_tool/fenix/run b/doctor_tests/admin_tool/fenix/run new file mode 100755 index 00000000..50ae68e7 --- /dev/null +++ b/doctor_tests/admin_tool/fenix/run @@ -0,0 +1,32 @@ +#!/bin/sh +. keystonercv3 + +# Start the first process +nohup python3 /fenix/fenix/cmd/engine.py > /var/log/fenix-engine.log& +status=$? +if [ $status -ne 0 ]; then + echo "Failed to start engine.py: $status" + exit $status +fi + +# Start the second process +nohup python3 /fenix/fenix/cmd/api.py > /var/log/fenix-api.log& +status=$? +if [ $status -ne 0 ]; then + echo "Failed to start api.py: $status" + exit $status +fi + +echo "started Fenix: engine and api" +while sleep 60; do + ps aux |grep "cmd/engine.py" |grep -q -v grep + PROCESS_1_STATUS=$? + ps aux |grep "cmd/api.py" |grep -q -v grep + PROCESS_2_STATUS=$? + # If the greps above find anything, they exit with 0 status + # If they are not both 0, then something is wrong + if [ $PROCESS_1_STATUS -ne 0 -o $PROCESS_2_STATUS -ne 0 ]; then + echo "One of the processes has already exited." 
+ exit 1 + fi +done diff --git a/doctor_tests/admin_tool/sample.py b/doctor_tests/admin_tool/sample.py index 892a4c83..a71f43a1 100644 --- a/doctor_tests/admin_tool/sample.py +++ b/doctor_tests/admin_tool/sample.py @@ -59,7 +59,7 @@ class AdminMain(Thread): self.parent = parent self.log = log self.conf = conf - self.url = 'http://0.0.0.0:%s' % conf.admin_tool.port + self.url = 'http://%s:%s' % (conf.admin_tool.ip, conf.admin_tool.port) self.projects_state = dict() # current state for each project self.proj_server_actions = dict() # actions for each project server self.projects_servers = dict() # servers processed in current state @@ -86,6 +86,7 @@ class AdminMain(Thread): driver='messaging', topics=['notifications']) self.notif_admin = self.notif_admin.prepare(publisher_id='admin_tool') + self.stopped = False self.log.info('Admin tool session %s initialized' % self.session_id) def cleanup(self): @@ -116,14 +117,15 @@ class AdminMain(Thread): if self._projects_not_in_wanted_states(wanted_states): self.log.error('Admin tool session %s: projects in invalid states ' '%s' % (self.session_id, self.projects_state)) - raise Exception('Admin tool session %s: not all projects in states' - ' %s' % (self.session_id, wanted_states)) + return False else: self.log.info('all projects replied') + return True def _project_notify(self, project_id, instance_ids, allowed_actions, actions_at, state, metadata): - reply_url = '%s/%s/maintenance' % (self.url, project_id) + reply_url = '%s/maintenance/%s/%s' % (self.url, self.session_id, + project_id) payload = dict(project_id=project_id, instance_ids=instance_ids, @@ -148,11 +150,12 @@ class AdminMain(Thread): self.notif_admin.info({'some': 'context'}, 'maintenance.host', payload) - def down_scale(self): + def in_scale(self): for project in self.projects_servers: - self.log.info('DOWN_SCALE to project %s' % project) + self.log.info('SCALE_IN to project %s' % project) self.log.debug('instance_ids %s' % self.projects_servers[project]) - instance_ids = '%s/%s/maintenance' % (self.url, project) + instance_ids = '%s/maintenance/%s/%s' % (self.url, self.session_id, + project) allowed_actions = [] wait_seconds = 120 actions_at = (datetime.datetime.utcnow() + @@ -163,18 +166,20 @@ class AdminMain(Thread): self._project_notify(project, instance_ids, allowed_actions, actions_at, state, metadata) - allowed_states = ['ACK_DOWN_SCALE', 'NACK_DOWN_SCALE'] - self.wait_projects_state(allowed_states, wait_seconds) - if self.projects_not_in_state('ACK_DOWN_SCALE'): - raise Exception('Admin tool session %s: all states not ' - 'ACK_DOWN_SCALE %s' % - (self.session_id, self.projects_state)) + allowed_states = ['ACK_SCALE_IN', 'NACK_SCALE_IN'] + if not self.wait_projects_state(allowed_states, wait_seconds): + self.state = 'MAINTENANCE_FAILED' + if self.projects_not_in_state('ACK_SCALE_IN'): + self.log.error('%s: all states not ACK_SCALE_IN' % + self.session_id) + self.state = 'MAINTENANCE_FAILED' def maintenance(self): for project in self.projects_servers: self.log.info('\nMAINTENANCE to project %s\n' % project) self.log.debug('instance_ids %s' % self.projects_servers[project]) - instance_ids = '%s/%s/maintenance' % (self.url, project) + instance_ids = '%s/maintenance/%s/%s' % (self.url, self.session_id, + project) allowed_actions = [] actions_at = self.maintenance_at state = self.state @@ -190,16 +195,18 @@ class AdminMain(Thread): allowed_actions, actions_at, state, metadata) allowed_states = ['ACK_MAINTENANCE', 'NACK_MAINTENANCE'] - self.wait_projects_state(allowed_states, 
wait_seconds) + if not self.wait_projects_state(allowed_states, wait_seconds): + self.state = 'MAINTENANCE_FAILED' if self.projects_not_in_state('ACK_MAINTENANCE'): - raise Exception('Admin tool session %s: all states not ' - 'ACK_MAINTENANCE %s' % - (self.session_id, self.projects_state)) + self.log.error('%s: all states not ACK_MAINTENANCE' % + self.session_id) + self.state = 'MAINTENANCE_FAILED' def maintenance_complete(self): for project in self.projects_servers: self.log.info('MAINTENANCE_COMPLETE to project %s' % project) - instance_ids = '%s/%s/maintenance' % (self.url, project) + instance_ids = '%s/maintenance/%s/%s' % (self.url, self.session_id, + project) allowed_actions = [] wait_seconds = 120 actions_at = (datetime.datetime.utcnow() + @@ -212,13 +219,14 @@ class AdminMain(Thread): metadata) allowed_states = ['ACK_MAINTENANCE_COMPLETE', 'NACK_MAINTENANCE_COMPLETE'] - self.wait_projects_state(allowed_states, wait_seconds) + if not self.wait_projects_state(allowed_states, wait_seconds): + self.state = 'MAINTENANCE_FAILED' if self.projects_not_in_state('ACK_MAINTENANCE_COMPLETE'): - raise Exception('Admin tool session %s: all states not ' - 'ACK_MAINTENANCE_COMPLETE %s' % - (self.session_id, self.projects_state)) + self.log.error('%s: all states not ACK_MAINTENANCE_COMPLETE' % + self.session_id) + self.state = 'MAINTENANCE_FAILED' - def need_down_scale(self, host_servers): + def need_in_scale(self, host_servers): room_for_instances = 0 for host in host_servers: instances = 0 @@ -267,7 +275,8 @@ class AdminMain(Thread): self.projects_servers[project] = projects_servers[project].copy() self.log.info('%s to project %s' % (state, project)) self.project_servers_log_info(project, projects_servers) - instance_ids = '%s/%s/maintenance' % (self.url, project) + instance_ids = '%s/maintenance/%s/%s' % (self.url, self.session_id, + project) allowed_actions = ['MIGRATE', 'LIVE_MIGRATE', 'OWN_ACTION'] wait_seconds = 120 actions_at = (datetime.datetime.utcnow() + @@ -278,11 +287,14 @@ class AdminMain(Thread): allowed_actions, actions_at, state, metadata) allowed_states = [state_ack, state_nack] - self.wait_projects_state(allowed_states, wait_seconds) - if self.projects_not_in_state(state_ack): - raise Exception('Admin tool session %s: all states not %s %s' % - (self.session_id, state_ack, self.projects_state)) - self.actions_to_have_empty_host(host) + if not self.wait_projects_state(allowed_states, wait_seconds): + self.state = 'MAINTENANCE_FAILED' + elif self.projects_not_in_state(state_ack): + self.log.error('%s: all states not %s' % + (self.session_id, state_ack)) + self.state = 'MAINTENANCE_FAILED' + else: + self.actions_to_have_empty_host(host) def notify_action_done(self, project, instance_id): instance_ids = instance_id @@ -463,7 +475,8 @@ class AdminMain(Thread): time.sleep(5) def run(self): - while self.state != 'MAINTENANCE_COMPLETE': + while (self.state not in ['MAINTENANCE_DONE', 'MAINTENANCE_FAILED'] and + not self.stopped): self.log.info('--==session %s: processing state %s==--' % (self.session_id, self.state)) if self.state == 'MAINTENANCE': @@ -474,7 +487,8 @@ class AdminMain(Thread): raise Exception('all projects do not listen maintenance ' 'alarm') self.maintenance() - + if self.state == 'MAINTENANCE_FAILED': + continue maint_at = self.str_to_datetime(self.maintenance_at) if maint_at > datetime.datetime.utcnow(): time_now = (datetime.datetime.utcnow().strftime( @@ -492,14 +506,14 @@ class AdminMain(Thread): # True -> PLANNED_MAINTENANCE # False -> check if we can migrate VMs to 
get empty host # True -> PREPARE_MAINTENANCE - # False -> DOWN_SCALE + # False -> SCALE_IN maintenance_empty_hosts = ([h for h in self.hosts if h not in host_servers]) if len(maintenance_empty_hosts) == 0: - if self.need_down_scale(host_servers): + if self.need_in_scale(host_servers): self.log.info('Need to down scale') - self.state = 'DOWN_SCALE' + self.state = 'SCALE_IN' else: self.log.info('Free capacity, but need empty host') self.state = 'PREPARE_MAINTENANCE' @@ -508,14 +522,17 @@ class AdminMain(Thread): self.state = 'PLANNED_MAINTENANCE' self.log.info('--==State change from MAINTENANCE to %s==--' % self.state) - elif self.state == 'DOWN_SCALE': + elif self.state == 'SCALE_IN': # Test case is hard coded to have all compute capacity used # We need to down scale to have one empty compute host - self.down_scale() + self.update_server_info() + self.in_scale() + if self.state == 'MAINTENANCE_FAILED': + continue self.state = 'PREPARE_MAINTENANCE' host_servers = self.update_server_info() self.servers_log_info(host_servers) - self.log.info('--==State change from DOWN_SCALE to' + self.log.info('--==State change from SCALE_IN to' ' %s==--' % self.state) elif self.state == 'PREPARE_MAINTENANCE': @@ -527,7 +544,7 @@ class AdminMain(Thread): host_servers]) if len(maintenance_empty_hosts) == 0: self.log.info('no empty hosts for maintenance') - if self.need_down_scale(host_servers): + if self.need_in_scale(host_servers): raise Exception('Admin tool session %s: Not enough ' 'free capacity for maintenance' % self.session_id) @@ -535,6 +552,8 @@ class AdminMain(Thread): if host: self.make_compute_host_empty(host, host_servers[host], 'PREPARE_MAINTENANCE') + if self.state == 'MAINTENANCE_FAILED': + continue else: # We do not currently support another down scale if # first was not enough @@ -566,6 +585,7 @@ class AdminMain(Thread): maintenance_empty_hosts.append(host) self.log.info('--==Start to maintain empty hosts==--\n%s' % maintenance_empty_hosts) + self.update_server_info() for host in maintenance_empty_hosts: # scheduler has problems, let's see if just down scaled # host is really empty @@ -586,6 +606,8 @@ class AdminMain(Thread): self.log.info('PLANNED_MAINTENANCE host %s' % host) self.make_compute_host_empty(host, host_servers[host], 'PLANNED_MAINTENANCE') + if self.state == 'MAINTENANCE_FAILED': + continue self.log.info('IN_MAINTENANCE host %s' % host) self._admin_notify(admin_project, host, 'IN_MAINTENANCE', self.session_id) @@ -603,14 +625,16 @@ class AdminMain(Thread): self.log.info('Projects still need to up scale back to full ' 'capcity') self.maintenance_complete() + if self.state == 'MAINTENANCE_FAILED': + continue host_servers = self.update_server_info() self.servers_log_info(host_servers) - self.state = 'MAINTENANCE_COMPLETE' + self.state = 'MAINTENANCE_DONE' else: raise Exception('Admin tool session %s: session in invalid ' 'state %s' % (self.session_id, self.state)) - self.log.info('--==Maintenance session %s: ' - 'MAINTENANCE SESSION COMPLETE==--' % self.session_id) + self.log.info('--==Maintenance session %s: %s==--' % + (self.session_id, self.state)) def project_input(self, project_id, data): self.log.debug('Admin tool session %s: project %s input' % @@ -637,7 +661,6 @@ class AdminTool(Thread): self.admin_tool = admin_tool self.log = log self.conf = conf - self.port = self.conf.admin_tool.port self.maint_sessions = {} self.projects = {} self.maintenance_hosts = [] @@ -650,63 +673,55 @@ class AdminTool(Thread): def admin_maintenance_api_post(): data = 
json.loads(request.data.decode('utf8')) self.log.info('maintenance message: %s' % data) - if 'session_id' in data: - if data['state'] == 'REMOVE_MAINTENANCE_SESSION': - session_id = data['session_id'] - self.log.info('remove session %s' - % session_id) - self.maint_sessions[session_id].cleanup() - self.maint_sessions[session_id].stop() - del self.maint_sessions[session_id] - else: - session_id = str(generate_uuid()) - self.log.info('creating session: %s' % session_id) - self.maint_sessions[session_id] = ( - AdminMain(self.trasport_url, - session_id, - data, - self, - self.conf, - self.log)) - self.maint_sessions[session_id].start() + session_id = str(generate_uuid()) + self.log.info('creating session: %s' % session_id) + self.maint_sessions[session_id] = ( + AdminMain(self.trasport_url, + session_id, + data, + self, + self.conf, + self.log)) + self.maint_sessions[session_id].start() reply = json.dumps({'session_id': session_id, 'state': 'ACK_%s' % data['state']}) self.log.debug('reply: %s' % reply) return reply, 200, None - @app.route('/maintenance', methods=['GET']) - def admin_maintenance_api_get(): - data = json.loads(request.data.decode('utf8')) - self.log.debug('Admin get maintenance: %s' % data) - session_id = data['session_id'] + @app.route('/maintenance/<session_id>', methods=['GET']) + def admin_maintenance_api_get(session_id=None): + self.log.debug('Admin get maintenance') reply = json.dumps({'state': self.maint_sessions[session_id].state}) - self.log.debug('reply: %s' % reply) + self.log.info('reply: %s' % reply) return reply, 200, None - @app.route('/<projet_id>/maintenance', methods=['PUT']) - def project_maintenance_api_put(projet_id=None): + @app.route('/maintenance/<session_id>/<projet_id>', methods=['PUT']) + def project_maintenance_api_put(session_id=None, projet_id=None): data = json.loads(request.data.decode('utf8')) self.log.debug('%s project put: %s' % (projet_id, data)) - self.project_input(projet_id, data) + self.project_input(session_id, projet_id, data) return 'OK' - @app.route('/<projet_id>/maintenance', methods=['GET']) - def project_maintenance_api_get(projet_id=None): - data = json.loads(request.data.decode('utf8')) - self.log.debug('%s project get %s' % (projet_id, data)) - instances = self.project_get_instances(projet_id, data) + @app.route('/maintenance/<session_id>/<projet_id>', methods=['GET']) + def project_maintenance_api_get(session_id=None, projet_id=None): + self.log.debug('%s project get %s' % (projet_id, session_id)) + instances = self.project_get_instances(session_id, projet_id) reply = json.dumps({'instance_ids': instances}) self.log.debug('%s reply: %s' % (projet_id, reply)) return reply, 200, None + @app.route('/maintenance/<session_id>', methods=['DELETE']) + def remove_session(session_id=None): + self.log.info('remove session %s' + % session_id) + self.maint_sessions[session_id].cleanup() + self.maint_sessions[session_id].stop() + del self.maint_sessions[session_id] + return 'OK' + @app.route('/shutdown', methods=['POST']) def shutdown(): - for session in self.maint_sessions: - self.log.info('shutdown admin tool session %s thread' % - session) - self.maint_sessions[session].cleanup() - self.maint_sessions[session].stop() self.log.info('shutdown admin_tool server at %s' % time.time()) func = request.environ.get('werkzeug.server.shutdown') if func is None: @@ -714,13 +729,11 @@ class AdminTool(Thread): func() return 'admin_tool app shutting down...' 
- app.run(host='0.0.0.0', port=self.port) + app.run(host=self.conf.admin_tool.ip, port=self.conf.admin_tool.port) - def project_input(self, project_id, data): - session_id = data['session_id'] + def project_input(self, session_id, project_id, data): self.maint_sessions[session_id].project_input(project_id, data) - def project_get_instances(self, project_id, data): - session_id = data['session_id'] + def project_get_instances(self, session_id, project_id): return self.maint_sessions[session_id].project_get_instances( project_id) diff --git a/doctor_tests/app_manager/__init__.py b/doctor_tests/app_manager/__init__.py index 717d6587..c2f75918 100644 --- a/doctor_tests/app_manager/__init__.py +++ b/doctor_tests/app_manager/__init__.py @@ -8,12 +8,13 @@ ############################################################################## from oslo_config import cfg from oslo_utils import importutils +import os OPTS = [ cfg.StrOpt('type', - default='sample', - choices=['sample'], + default=os.environ.get('APP_MANAGER_TYPE', 'sample'), + choices=['sample', 'vnfm'], help='the component of doctor app manager', required=True), cfg.StrOpt('ip', @@ -28,7 +29,8 @@ OPTS = [ _app_manager_name_class_mapping = { - 'sample': 'doctor_tests.app_manager.sample.SampleAppManager' + 'sample': 'doctor_tests.app_manager.sample.SampleAppManager', + 'vnfm': 'doctor_tests.app_manager.vnfm.VNFM', } diff --git a/doctor_tests/app_manager/sample.py b/doctor_tests/app_manager/sample.py index 94926ee2..7ca35b97 100644 --- a/doctor_tests/app_manager/sample.py +++ b/doctor_tests/app_manager/sample.py @@ -17,6 +17,7 @@ import requests from doctor_tests.app_manager.base import BaseAppManager from doctor_tests.identity_auth import get_identity_auth from doctor_tests.identity_auth import get_session +from doctor_tests.os_clients import neutron_client from doctor_tests.os_clients import nova_client @@ -56,12 +57,16 @@ class AppManager(Thread): self.app_manager = app_manager self.log = log self.intance_ids = None + self.auth = get_identity_auth(project=self.conf.doctor_project) + self.session = get_session(auth=self.auth) + self.nova = nova_client(self.conf.nova_version, + self.session) + self.neutron = neutron_client(session=self.session) self.headers = { 'Content-Type': 'application/json', 'Accept': 'application/json'} - self.auth = get_identity_auth(project=self.conf.doctor_project) - self.nova = nova_client(self.conf.nova_version, - get_session(auth=self.auth)) + if self.conf.admin_tool.type == 'fenix': + self.headers['X-Auth-Token'] = self.session.get_token() self.orig_number_of_instances = self.number_of_instances() self.ha_instances = self.get_ha_instances() self.floating_ip = None @@ -85,7 +90,13 @@ class AppManager(Thread): if instance.id != self.active_instance_id: self.log.info('Switch over to: %s %s' % (instance.name, instance.id)) - instance.add_floating_ip(self.floating_ip) + # Deprecated, need to use neutron instead + # instance.add_floating_ip(self.floating_ip) + port = self.neutron.list_ports(device_id=instance.id)['ports'][0]['id'] # noqa + floating_id = self.neutron.list_floatingips(floating_ip_address=self.floating_ip)['floatingips'][0]['id'] # noqa + self.neutron.update_floatingip(floating_id, {'floatingip': {'port_id': port}}) # noqa + # Have to update ha_instances as floating_ip changed + self.ha_instances = self.get_ha_instances() self.active_instance_id = instance.id break @@ -114,8 +125,7 @@ class AppManager(Thread): for t in data['reason_data']['event']['traits']}) def get_session_instance_ids(self, url, 
session_id): - data = {'session_id': session_id} - ret = requests.get(url, data=json.dumps(data), headers=self.headers) + ret = requests.get(url, data=None, headers=self.headers) if ret.status_code != 200: raise Exception(ret.text) self.log.info('get_instance_ids %s' % ret.json()) @@ -155,7 +165,7 @@ class AppManager(Thread): data = json.loads(request.data.decode('utf8')) try: payload = self._alarm_traits_decoder(data) - except: + except Exception: payload = ({t[0]: t[2] for t in data['reason_data']['event']['traits']}) self.log.error('cannot parse alarm data: %s' % payload) @@ -177,12 +187,12 @@ class AppManager(Thread): reply['instance_ids'] = instance_ids reply_state = 'ACK_MAINTENANCE' - elif state == 'DOWN_SCALE': + elif state == 'SCALE_IN': # scale down 2 isntances that is VCPUS equaling to single # compute node self.scale_instances(-2) reply['instance_ids'] = self.get_instance_ids() - reply_state = 'ACK_DOWN_SCALE' + reply_state = 'ACK_SCALE_IN' elif state == 'MAINTENANCE_COMPLETE': # possibly need to upscale diff --git a/doctor_tests/app_manager/vnfm.py b/doctor_tests/app_manager/vnfm.py new file mode 100644 index 00000000..68fdbb88 --- /dev/null +++ b/doctor_tests/app_manager/vnfm.py @@ -0,0 +1,441 @@ +############################################################################## +# Copyright (c) 2018 Nokia Corporation and others. +# +# All rights reserved. This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +############################################################################## +from flask import Flask +from flask import request +import json +import requests +from threading import Thread +import time +import uuid +import yaml + +from doctor_tests.app_manager.base import BaseAppManager +from doctor_tests.identity_auth import get_identity_auth +from doctor_tests.identity_auth import get_session +from doctor_tests.os_clients import neutron_client +from doctor_tests.os_clients import nova_client +from doctor_tests.os_clients import keystone_client + + +class VNFM(BaseAppManager): + + def __init__(self, stack, conf, log): + super(VNFM, self).__init__(conf, log) + self.stack = stack + self.app = None + + def start(self): + self.log.info('VNFM start......') + self.app = VNFManager(self.stack, self.conf, self, self.log) + self.app.start() + + def stop(self): + self.log.info('VNFM stop......') + if not self.app: + return + self.app.delete_constraints() + headers = { + 'Content-Type': 'application/json', + 'Accept': 'application/json', + } + url = 'http://%s:%d/shutdown'\ + % (self.conf.app_manager.ip, + self.conf.app_manager.port) + requests.post(url, data='', headers=headers) + + +class VNFManager(Thread): + + def __init__(self, stack, conf, app_manager, log): + Thread.__init__(self) + self.stack = stack + self.conf = conf + self.port = self.conf.app_manager.port + self.app_manager = app_manager + self.log = log + self.intance_ids = None + self.auth = get_identity_auth(project=self.conf.doctor_project) + self.session = get_session(auth=self.auth) + self.keystone = keystone_client( + self.conf.keystone_version, self.session) + self.nova = nova_client(self.conf.nova_version, + self.session) + self.neutron = neutron_client(session=self.session) + self.headers = { + 'Content-Type': 'application/json', + 'Accept': 'application/json'} + if self.conf.admin_tool.type == 'fenix': + self.headers['X-Auth-Token'] = 
self.session.get_token() + self.orig_number_of_instances = self.number_of_instances() + # List of instances + self.ha_instances = [] + self.nonha_instances = [] + # Different instance_id specific constraints {instanse_id: {},...} + self.instance_constraints = None + # Update existing instances to instance lists + self.update_instances() + nonha_instances = len(self.nonha_instances) + if nonha_instances < 7: + self.scale = 2 + self.max_impacted = 2 + else: + self.scale = int((nonha_instances) / 2) + self.max_impacted = self.scale - 1 + self.log.info('Init nonha_instances: %s scale: %s: max_impacted %s' % + (nonha_instances, self.scale, self.max_impacted)) + # Different instance groups constraints dict + self.ha_group = None + self.nonha_group = None + # Floating IP used in HA instance + self.floating_ip = None + # VNF project_id + self.project_id = None + # HA instance_id that is active / has floating IP + self.active_instance_id = self.active_instance_id() + + services = self.keystone.services.list() + for service in services: + if service.type == 'maintenance': + self.log.info('maintenance service: %s:%s type %s' + % (service.name, service.id, service.type)) + maint_id = service.id + self.maint_endpoint = [ep.url for ep in self.keystone.endpoints.list() + if ep.service_id == maint_id and + ep.interface == 'public'][0] + self.log.info('maintenance endpoint: %s' % self.maint_endpoint) + self.update_constraints_lock = False + self.update_constraints() + + def delete_remote_instance_constraints(self, instance_id): + url = "%s/instance/%s" % (self.maint_endpoint, instance_id) + self.log.info('DELETE: %s' % url) + ret = requests.delete(url, data=None, headers=self.headers) + if ret.status_code != 200 and ret.status_code != 204: + raise Exception(ret.text) + + def update_remote_instance_constraints(self, instance): + url = "%s/instance/%s" % (self.maint_endpoint, instance["instance_id"]) + self.log.info('PUT: %s' % url) + ret = requests.put(url, data=json.dumps(instance), + headers=self.headers) + if ret.status_code != 200 and ret.status_code != 204: + raise Exception(ret.text) + + def delete_remote_group_constraints(self, instance_group): + url = "%s/instance_group/%s" % (self.maint_endpoint, + instance_group["group_id"]) + self.log.info('DELETE: %s' % url) + ret = requests.delete(url, data=None, headers=self.headers) + if ret.status_code != 200 and ret.status_code != 204: + raise Exception(ret.text) + + def update_remote_group_constraints(self, instance_group): + url = "%s/instance_group/%s" % (self.maint_endpoint, + instance_group["group_id"]) + self.log.info('PUT: %s' % url) + ret = requests.put(url, data=json.dumps(instance_group), + headers=self.headers) + if ret.status_code != 200 and ret.status_code != 204: + raise Exception(ret.text) + + def delete_constraints(self): + if self.conf.admin_tool.type == 'fenix': + self.headers['X-Auth-Token'] = self.session.get_token() + for instance_id in self.instance_constraints: + self.delete_remote_instance_constraints(instance_id) + self.delete_remote_group_constraints(self.nonha_group) + self.delete_remote_group_constraints(self.ha_group) + + def update_constraints(self): + while self.update_constraints_lock: + self.log.info('Waiting update_constraints_lock...') + time.sleep(1) + self.update_constraints_lock = True + self.log.info('Update constraints') + if self.project_id is None: + self.project_id = self.keystone.projects.list( + name=self.conf.doctor_project)[0].id + if self.nonha_group is None: + # Nova does not support groupping instances that 
do not belong to + # anti-affinity server_groups. Anyhow all instances need groupping + self.nonha_group = { + "group_id": str(uuid.uuid4()), + "project_id": self.project_id, + "group_name": "doctor_nonha_app_group", + "anti_affinity_group": False, + "max_instances_per_host": 0, + "max_impacted_members": self.max_impacted, + "recovery_time": 2, + "resource_mitigation": True} + self.log.info('create doctor_nonha_app_group constraints: %s' + % self.nonha_group) + self.update_remote_group_constraints(self.nonha_group) + if self.ha_group is None: + group_id = [sg.id for sg in self.nova.server_groups.list() + if sg.name == "doctor_ha_app_group"][0] + self.ha_group = { + "group_id": group_id, + "project_id": self.project_id, + "group_name": "doctor_ha_app_group", + "anti_affinity_group": True, + "max_instances_per_host": 1, + "max_impacted_members": 1, + "recovery_time": 4, + "resource_mitigation": True} + self.log.info('create doctor_ha_app_group constraints: %s' + % self.ha_group) + self.update_remote_group_constraints(self.ha_group) + instance_constraints = {} + for ha_instance in self.ha_instances: + instance = { + "instance_id": ha_instance.id, + "project_id": self.project_id, + "group_id": self.ha_group["group_id"], + "instance_name": ha_instance.name, + "max_interruption_time": 120, + "migration_type": "MIGRATE", + "resource_mitigation": True, + "lead_time": 40} + self.log.info('create ha instance constraints: %s' + % instance) + instance_constraints[ha_instance.id] = instance + for nonha_instance in self.nonha_instances: + instance = { + "instance_id": nonha_instance.id, + "project_id": self.project_id, + "group_id": self.nonha_group["group_id"], + "instance_name": nonha_instance.name, + "max_interruption_time": 120, + "migration_type": "MIGRATE", + "resource_mitigation": True, + "lead_time": 40} + self.log.info('create nonha instance constraints: %s' + % instance) + instance_constraints[nonha_instance.id] = instance + if not self.instance_constraints: + # Initial instance constraints + self.log.info('create initial instances constraints...') + for instance in [instance_constraints[i] for i + in instance_constraints]: + self.update_remote_instance_constraints(instance) + self.instance_constraints = instance_constraints.copy() + else: + self.log.info('check instances constraints changes...') + added = [i for i in instance_constraints.keys() + if i not in self.instance_constraints] + deleted = [i for i in self.instance_constraints.keys() + if i not in instance_constraints] + modified = [i for i in instance_constraints.keys() + if (i not in added and i not in deleted and + instance_constraints[i] != + self.instance_constraints[i])] + for instance_id in deleted: + self.delete_remote_instance_constraints(instance_id) + updated = added + modified + for instance in [instance_constraints[i] for i in updated]: + self.update_remote_instance_constraints(instance) + if updated or deleted: + # Some instance constraints have changed + self.instance_constraints = instance_constraints.copy() + self.update_constraints_lock = False + + def active_instance_id(self): + # Need rertry as it takes time after heat template done before + # Floating IP in place + retry = 5 + while retry > 0: + for instance in self.ha_instances: + network_interfaces = next(iter(instance.addresses.values())) + for network_interface in network_interfaces: + _type = network_interface.get('OS-EXT-IPS:type') + if _type == "floating": + if not self.floating_ip: + self.floating_ip = network_interface.get('addr') + 
+    def active_instance_id(self):
+        # Need to retry, as it takes time after the heat template has
+        # completed before the floating IP is in place
+        retry = 5
+        while retry > 0:
+            for instance in self.ha_instances:
+                network_interfaces = next(iter(instance.addresses.values()))
+                for network_interface in network_interfaces:
+                    _type = network_interface.get('OS-EXT-IPS:type')
+                    if _type == "floating":
+                        if not self.floating_ip:
+                            self.floating_ip = network_interface.get('addr')
+                        self.log.debug('active_instance: %s %s' %
+                                       (instance.name, instance.id))
+                        return instance.id
+            time.sleep(2)
+            self.update_instances()
+            retry -= 1
+        raise Exception("No active instance found")
+
+    def switch_over_ha_instance(self):
+        for instance in self.ha_instances:
+            if instance.id != self.active_instance_id:
+                self.log.info('Switch over to: %s %s' % (instance.name,
+                                                         instance.id))
+                # Deprecated, need to use neutron instead
+                # instance.add_floating_ip(self.floating_ip)
+                port = self.neutron.list_ports(device_id=instance.id)['ports'][0]['id']  # noqa
+                floating_id = self.neutron.list_floatingips(floating_ip_address=self.floating_ip)['floatingips'][0]['id']  # noqa
+                self.neutron.update_floatingip(floating_id, {'floatingip': {'port_id': port}})  # noqa
+                # Have to update ha_instances as floating_ip changed
+                self.update_instances()
+                self.active_instance_id = instance.id
+                break
+
+    def get_instance_ids(self):
+        ret = list()
+        for instance in self.nova.servers.list(detailed=False):
+            ret.append(instance.id)
+        return ret
+
+    def update_instances(self):
+        instances = self.nova.servers.list(detailed=True)
+        self.ha_instances = [i for i in instances
+                             if "doctor_ha_app_" in i.name]
+        self.nonha_instances = [i for i in instances
+                                if "doctor_nonha_app_" in i.name]
+
+    def _alarm_data_decoder(self, data):
+        if "[" in data or "{" in data:
+            # string to list or dict removing unicode
+            data = yaml.load(data.replace("u'", "'"))
+        return data
+
+    def _alarm_traits_decoder(self, data):
+        return ({str(t[0]): self._alarm_data_decoder(str(t[2]))
+                for t in data['reason_data']['event']['traits']})
+
+    def get_session_instance_ids(self, url, session_id):
+        ret = requests.get(url, data=None, headers=self.headers)
+        if ret.status_code != 200:
+            raise Exception(ret.text)
+        self.log.info('get_instance_ids %s' % ret.json())
+        return ret.json()['instance_ids']
+
+    def scale_instances(self, number_of_instances):
+        number_of_instances_before = self.number_of_instances()
+
+        parameters = self.stack.parameters
+        parameters['nonha_intances'] += number_of_instances
+        self.stack.update(self.stack.stack_name,
+                          self.stack.stack_id,
+                          self.stack.template,
+                          parameters=parameters,
+                          files=self.stack.files)
+
+        number_of_instances_after = self.number_of_instances()
+        if (number_of_instances_before + number_of_instances !=
+                number_of_instances_after):
+            self.log.error('scale_instances with: %d from: %d ends up to: %d'
+                           % (number_of_instances, number_of_instances_before,
+                              number_of_instances_after))
+            raise Exception('scale_instances failed')
+
+        self.log.info('scaled instances from %d to %d' %
+                      (number_of_instances_before,
+                       number_of_instances_after))
+
+    def number_of_instances(self):
+        return len(self.nova.servers.list(detailed=False))
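_alarm_traits_decoder above turns the traits of an Aodh event alarm, delivered as (name, dtype, value) triples inside reason_data, back into a plain dict, parsing stringified lists and dicts along the way. A worked example under that assumed payload shape (the trait values are made up)::

    import yaml

    def alarm_data_decoder(data):
        if "[" in data or "{" in data:
            data = yaml.load(data.replace("u'", "'"))  # drop unicode markers
        return data

    traits = [['session_id', 1, 'abc-123'],
              ['instance_ids', 1, "[u'uuid-1', u'uuid-2']"]]
    payload = {str(t[0]): alarm_data_decoder(str(t[2])) for t in traits}
    assert payload == {'session_id': 'abc-123',
                       'instance_ids': ['uuid-1', 'uuid-2']}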
+    def run(self):
+        app = Flask('VNFM')
+
+        @app.route('/maintenance', methods=['POST'])
+        def maintenance_alarm():
+            data = json.loads(request.data.decode('utf8'))
+            try:
+                payload = self._alarm_traits_decoder(data)
+            except Exception:
+                payload = ({t[0]: t[2] for t in
+                           data['reason_data']['event']['traits']})
+                self.log.error('cannot parse alarm data: %s' % payload)
+                raise Exception('VNFM cannot parse alarm. '
+                                'Possibly trait data over 256 char')
+
+            self.log.info('VNFM received data = %s' % payload)
+
+            state = payload['state']
+            reply_state = None
+            reply = dict()
+
+            self.log.info('VNFM state: %s' % state)
+
+            if state == 'MAINTENANCE':
+                instance_ids = (self.get_session_instance_ids(
+                    payload['instance_ids'],
+                    payload['session_id']))
+                my_instance_ids = self.get_instance_ids()
+                invalid_instances = (
+                    [instance_id for instance_id in instance_ids
+                     if instance_id not in my_instance_ids])
+                if invalid_instances:
+                    self.log.error('Invalid instances: %s'
+                                   % invalid_instances)
+                    reply_state = 'NACK_MAINTENANCE'
+                else:
+                    reply_state = 'ACK_MAINTENANCE'
+
+            elif state == 'SCALE_IN':
+                # scale down by "self.scale" instances, whose vCPUs equal
+                # at least those of a single compute node
+                self.scale_instances(-self.scale)
+                reply_state = 'ACK_SCALE_IN'
+
+            elif state == 'MAINTENANCE_COMPLETE':
+                # possibly need to scale back up
+                number_of_instances = self.number_of_instances()
+                if self.orig_number_of_instances > number_of_instances:
+                    scale_instances = (self.orig_number_of_instances -
+                                       number_of_instances)
+                    self.scale_instances(scale_instances)
+                reply_state = 'ACK_MAINTENANCE_COMPLETE'
+
+            elif state == 'PREPARE_MAINTENANCE':
+                # TBD derive this from the constraints
+                if "MIGRATE" not in payload['allowed_actions']:
+                    raise Exception('MIGRATE not supported')
+                instance_ids = payload['instance_ids'][0]
+                self.log.info('VNFM got instance: %s' % instance_ids)
+                if instance_ids == self.active_instance_id:
+                    self.switch_over_ha_instance()
+                # optional, also in the constraints
+                reply['instance_action'] = "MIGRATE"
+                reply_state = 'ACK_PREPARE_MAINTENANCE'
+
+            elif state == 'PLANNED_MAINTENANCE':
+                # TBD derive this from the constraints
+                if "MIGRATE" not in payload['allowed_actions']:
+                    raise Exception('MIGRATE not supported')
+                instance_ids = payload['instance_ids'][0]
+                self.log.info('VNFM got instance: %s' % instance_ids)
+                if instance_ids == self.active_instance_id:
+                    self.switch_over_ha_instance()
+                # optional, also in the constraints
+                reply['instance_action'] = "MIGRATE"
+                reply_state = 'ACK_PLANNED_MAINTENANCE'
+
+            elif state == 'INSTANCE_ACTION_DONE':
+                # TBD was the action done within the allowed window
+                self.log.info('%s' % payload['instance_ids'])
+            else:
+                raise Exception('VNFM received event with'
+                                ' unknown state %s' % state)
+
+            if reply_state:
+                if self.conf.admin_tool.type == 'fenix':
+                    self.headers['X-Auth-Token'] = self.session.get_token()
+                reply['state'] = reply_state
+                url = payload['reply_url']
+                self.log.info('VNFM reply: %s' % reply)
+                requests.put(url, data=json.dumps(reply),
+                             headers=self.headers)
+
+            return 'OK'
+
+        @app.route('/shutdown', methods=['POST'])
+        def shutdown():
+            self.log.info('shutdown VNFM server at %s' % time.time())
+            func = request.environ.get('werkzeug.server.shutdown')
+            if func is None:
+                raise RuntimeError('Not running with the Werkzeug Server')
+            func()
+            return 'VNFM shutting down...'
+
+        app.run(host="0.0.0.0", port=self.port)
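The /maintenance route is in effect a small state machine: every state announced by the admin tool is either acknowledged, NACKed, or acted upon (validate, scale, switch over) before acknowledging. A condensed sketch of that mapping, assuming the same state names as in the handler above::

    # Maps each received maintenance state to the reply the VNFM sends
    # after carrying out its side of the step.
    REPLIES = {
        'MAINTENANCE': 'ACK_MAINTENANCE',          # NACK_MAINTENANCE on
                                                   # unknown instance ids
        'SCALE_IN': 'ACK_SCALE_IN',                # after scaling down
        'PREPARE_MAINTENANCE': 'ACK_PREPARE_MAINTENANCE',    # switch-over
        'PLANNED_MAINTENANCE': 'ACK_PLANNED_MAINTENANCE',    # switch-over
        'MAINTENANCE_COMPLETE': 'ACK_MAINTENANCE_COMPLETE',  # scale back up
        'INSTANCE_ACTION_DONE': None,              # nothing to reply
    }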
diff --git a/doctor_tests/common/constants.py b/doctor_tests/common/constants.py
index 088ff633..201f3fc4 100644
--- a/doctor_tests/common/constants.py
+++ b/doctor_tests/common/constants.py
@@ -12,6 +12,10 @@ from collections import namedtuple
 Host = namedtuple('Host', ['name', 'ip'])
 
+def is_fenix(conf):
+    return conf.admin_tool.type == 'fenix'
+
+
 class Inspector(object):
     CONGRESS = 'congress'
     SAMPLE = 'sample'
diff --git a/doctor_tests/common/utils.py b/doctor_tests/common/utils.py
index 1a8840dd..67ca4f4b 100644
--- a/doctor_tests/common/utils.py
+++ b/doctor_tests/common/utils.py
@@ -10,6 +10,7 @@ import json
 import os
 import paramiko
 import re
+import subprocess
 
 def load_json_file(full_path):
@@ -97,6 +98,27 @@ class SSHClient(object):
         ftp.close()
 
+class LocalSSH(object):
+
+    def __init__(self, log):
+        self.log = log
+        self.log.info('Init local ssh client')
+
+    def ssh(self, cmd):
+        ret = 0
+        output = "%s failed!!!" % cmd
+        try:
+            output = subprocess.check_output((cmd), shell=True,
+                                             universal_newlines=True)
+        except subprocess.CalledProcessError:
+            ret = 1
+        return ret, output
+
+    def scp(self, src_file, dst_file):
+        return subprocess.check_output("cp %s %s" % (src_file, dst_file),
+                                       shell=True)
+
+
 def run_async(func):
     from threading import Thread
     from functools import wraps
diff --git a/doctor_tests/image.py b/doctor_tests/image.py
index 9961b22d..50841ef6 100644
--- a/doctor_tests/image.py
+++ b/doctor_tests/image.py
@@ -7,7 +7,11 @@
 # http://www.apache.org/licenses/LICENSE-2.0
 ##############################################################################
 import os
-import urllib.request
+try:
+    from urllib.request import urlopen
+except Exception:
+    from urllib2 import urlopen
+
 from oslo_config import cfg
@@ -46,11 +50,14 @@ class Image(object):
     def create(self):
         self.log.info('image create start......')
         images = {image.name: image for image in self.glance.images.list()}
+        if self.conf.image_name == 'cirros':
+            cirros = [image for image in images if 'cirros' in image]
+            if cirros:
+                self.conf.image_name = cirros[0]
         if self.conf.image_name not in images:
             if not os.path.exists(self.conf.image_filename):
-                resp = urllib.request.urlopen(self.conf.image_download_url)
+                resp = urlopen(self.conf.image_download_url)
                 with open(self.conf.image_filename, "wb") as file:
                     file.write(resp.read())
             self.image = \
diff --git a/doctor_tests/inspector/__init__.py b/doctor_tests/inspector/__init__.py
index 31291baf..50365a61 100644
--- a/doctor_tests/inspector/__init__.py
+++ b/doctor_tests/inspector/__init__.py
@@ -42,6 +42,10 @@ _inspector_name_class_mapping = {
 }
 
-def get_inspector(conf, log):
+def get_inspector(conf, log, transport_url=None):
     inspector_class = _inspector_name_class_mapping[conf.inspector.type]
-    return importutils.import_object(inspector_class, conf, log)
+    if conf.inspector.type == 'sample':
+        return importutils.import_object(inspector_class, conf, log,
+                                         transport_url)
+    else:
+        return importutils.import_object(inspector_class, conf, log)
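LocalSSH mirrors SSHClient's (ret, output) interface with subprocess so installers can run either locally or over SSH, and run_async (shown as context above) is the thread decorator the installers lean on. A minimal sketch of that decorator pattern and the join idiom used throughout, assuming run_async returns the started Thread::

    from functools import wraps
    from threading import Thread

    def run_async(func):
        @wraps(func)
        def async_func(*args, **kwargs):
            thread = Thread(target=func, args=args, kwargs=kwargs)
            thread.start()
            return thread  # caller may join() to wait for completion
        return async_func

    @run_async
    def apply_patch(node_ip):
        print('patching %s' % node_ip)  # stands in for real work

    threads = [apply_patch(ip) for ip in ('192.0.2.1', '192.0.2.2')]
    for thr in threads:
        thr.join()  # block until every node is done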
diff --git a/doctor_tests/inspector/sample.py b/doctor_tests/inspector/sample.py
index a55a12b7..c44db95d 100644
--- a/doctor_tests/inspector/sample.py
+++ b/doctor_tests/inspector/sample.py
@@ -10,6 +10,7 @@ import collections
 from flask import Flask
 from flask import request
 import json
+import oslo_messaging
 import time
 from threading import Thread
 import requests
@@ -26,7 +27,7 @@ from doctor_tests.inspector.base import BaseInspector
 class SampleInspector(BaseInspector):
     event_type = 'compute.host.down'
 
-    def __init__(self, conf, log):
+    def __init__(self, conf, log, transport_url):
         super(SampleInspector, self).__init__(conf, log)
         self.inspector_url = self.get_inspector_url()
         self.novaclients = list()
@@ -43,6 +44,17 @@ class SampleInspector(BaseInspector):
         self.hostnames = list()
         self.app = None
 
+        try:
+            transport = oslo_messaging.get_notification_transport(
+                self.conf, transport_url)
+            self.notif = oslo_messaging.Notifier(transport,
+                                                 'compute.instance.update',
+                                                 driver='messaging',
+                                                 topics=['notifications'])
+            self.notif = self.notif.prepare(publisher_id='sample')
+        except Exception:
+            self.notif = None
+
     def _init_novaclients(self):
         self.NUMBER_OF_CLIENTS = self.conf.instance_count
         auth = get_identity_auth(project=self.conf.doctor_project)
@@ -54,13 +66,13 @@ class SampleInspector(BaseInspector):
     def _init_servers_list(self):
         self.servers.clear()
         opts = {'all_tenants': True}
-        servers = self.nova.servers.list(search_opts=opts)
+        servers = self.nova.servers.list(detailed=True, search_opts=opts)
         for server in servers:
             try:
                 host = server.__dict__.get('OS-EXT-SRV-ATTR:host')
                 self.servers[host].append(server)
                 self.log.debug('get hostname=%s from server=%s'
-                               % (host, server))
+                               % (host, str(server.name)))
             except Exception as e:
                 self.log.info('can not get hostname from server=%s, error=%s'
                               % (server, e))
@@ -97,10 +109,14 @@ class SampleInspector(BaseInspector):
         event_type = event['type']
         if event_type == self.event_type:
             self.hostnames.append(hostname)
+            if self.notif is not None:
+                thr0 = self._send_notif(hostname)
             thr1 = self._disable_compute_host(hostname)
             thr2 = self._vms_reset_state('error', hostname)
             if self.conf.inspector.update_neutron_port_dp_status:
                 thr3 = self._set_ports_data_plane_status('DOWN', hostname)
+            if self.notif is not None:
+                thr0.join()
             thr1.join()
             thr2.join()
             if self.conf.inspector.update_neutron_port_dp_status:
@@ -119,7 +135,7 @@ class SampleInspector(BaseInspector):
     def maintenance(self, data):
         try:
             payload = self._alarm_traits_decoder(data)
-        except:
+        except Exception:
             payload = ({t[0]: t[2] for t in
                        data['reason_data']['event']['traits']})
             self.log.error('cannot parse alarm data: %s' % payload)
@@ -156,8 +172,8 @@ class SampleInspector(BaseInspector):
             nova.servers.reset_state(server, state)
             vmdown_time = time.time()
             self.vm_down_time = vmdown_time
-            self.log.info('doctor mark vm(%s) error at %s'
-                          % (server, vmdown_time))
+            self.log.info('doctor mark vm(%s) %s at %s'
+                          % (server, state, vmdown_time))
 
         thrs = []
         for nova, server in zip(self.novaclients, self.servers[hostname]):
@@ -167,6 +183,26 @@ class SampleInspector(BaseInspector):
             t.join()
 
     @utils.run_async
+    def _send_notif(self, hostname):
+
+        @utils.run_async
+        def _send_notif(server):
+            payload = dict(tenant_id=server.tenant_id,
+                           instance_id=server.id,
+                           state="error")
+            self.notif.info({'some': 'context'}, 'compute.instance.update',
+                            payload)
+            self.log.info('doctor compute.instance.update vm(%s) error %s'
+                          % (server, time.time()))
+
+        thrs = []
+        for server in self.servers[hostname]:
+            t = _send_notif(server)
+            thrs.append(t)
+        for t in thrs:
+            t.join()
+
+    @utils.run_async
     def _set_ports_data_plane_status(self, status, hostname):
         body = {'data_plane_status': status}
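The inspector now fakes the compute.instance.update notification that consumers would normally receive from Nova when an instance errors out. A standalone sketch of emitting such a notification with oslo.messaging, assuming a reachable RabbitMQ at the given URL; the URL and payload values are placeholders::

    from oslo_config import cfg
    import oslo_messaging

    transport_url = 'rabbit://user:pass@192.0.2.10:5672/'  # placeholder
    transport = oslo_messaging.get_notification_transport(cfg.CONF,
                                                          transport_url)
    notifier = oslo_messaging.Notifier(transport,
                                       publisher_id='sample',
                                       driver='messaging',
                                       topics=['notifications'])
    # info() publishes the event at INFO priority on the notifications topic.
    notifier.info({}, 'compute.instance.update',
                  {'tenant_id': '<project-uuid>',
                   'instance_id': '<server-uuid>',
                   'state': 'error'})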
diff --git a/doctor_tests/installer/__init__.py b/doctor_tests/installer/__init__.py
index 2b9ad83d..00a01667 100644
--- a/doctor_tests/installer/__init__.py
+++ b/doctor_tests/installer/__init__.py
@@ -13,8 +13,8 @@ from oslo_utils import importutils
 OPTS = [
     cfg.StrOpt('type',
-               default=os.environ.get('INSTALLER_TYPE', 'local'),
-               choices=['local', 'apex', 'daisy', 'fuel'],
+               default=os.environ.get('INSTALLER_TYPE', 'devstack'),
+               choices=['apex', 'daisy', 'fuel', 'devstack'],
                help='the type of installer',
                required=True),
     cfg.StrOpt('ip',
@@ -28,10 +28,10 @@ OPTS = [
 _installer_name_class_mapping = {
-    'local': 'doctor_tests.installer.local.LocalInstaller',
     'apex': 'doctor_tests.installer.apex.ApexInstaller',
     'daisy': 'doctor_tests.installer.daisy.DaisyInstaller',
-    'fuel': 'doctor_tests.installer.mcp.McpInstaller'
+    'fuel': 'doctor_tests.installer.mcp.McpInstaller',
+    'devstack': 'doctor_tests.installer.devstack.DevstackInstaller'
 }
diff --git a/doctor_tests/installer/apex.py b/doctor_tests/installer/apex.py
index 2aa81ff9..3ec2100c 100644
--- a/doctor_tests/installer/apex.py
+++ b/doctor_tests/installer/apex.py
@@ -6,10 +6,11 @@
 # which accompanies this distribution, and is available at
 # http://www.apache.org/licenses/LICENSE-2.0
 ##############################################################################
-import re
 import time
 
 from doctor_tests.common.constants import Inspector
+from doctor_tests.common.constants import is_fenix
+from doctor_tests.common.utils import get_doctor_test_root_dir
 from doctor_tests.common.utils import SSHClient
 from doctor_tests.installer.base import BaseInstaller
@@ -20,6 +21,7 @@ class ApexInstaller(BaseInstaller):
     cm_set_script = 'set_config.py'
     nc_set_compute_script = 'set_compute_config.py'
     cg_set_script = 'set_congress.py'
+    fe_set_script = 'set_fenix.sh'
     cm_restore_script = 'restore_config.py'
     nc_restore_compute_script = 'restore_compute_config.py'
     cg_restore_script = 'restore_congress.py'
@@ -36,13 +38,13 @@ class ApexInstaller(BaseInstaller):
         self.key_file = None
         self.controllers = list()
         self.computes = list()
-        self.controller_clients = list()
-        self.compute_clients = list()
 
     def setup(self):
         self.log.info('Setup Apex installer start......')
         self.key_file = self.get_ssh_key_from_installer()
         self._get_overcloud_conf()
+        if is_fenix(self.conf):
+            self._copy_overcloudrc_to_controllers()
         self.create_flavor()
         self.set_apply_patches()
         self.setup_stunnel()
@@ -56,6 +58,11 @@ class ApexInstaller(BaseInstaller):
         key_path = '/home/stack/.ssh/id_rsa'
         return self._get_ssh_key(self.client, key_path)
 
+    def _copy_overcloudrc_to_controllers(self):
+        for ip in self.controllers:
+            cmd = "scp overcloudrc %s@%s:" % (self.node_user_name, ip)
+            self._run_cmd_remote(self.client, cmd)
+
     def _get_overcloud_conf(self):
         self.log.info('Get overcloud config details from Apex installer'
                       '......')
@@ -83,26 +90,6 @@ class ApexInstaller(BaseInstaller):
         host_ips = self._run_cmd_remote(self.client, command)
         return host_ips[0]
 
-    def get_transport_url(self):
-        client = SSHClient(self.controllers[0], self.node_user_name,
-                           key_filename=self.key_file)
-        if self.use_containers:
-            ncbase = "/var/lib/config-data/puppet-generated/nova"
-        else:
-            ncbase = ""
-        command = 'sudo grep "^transport_url" %s/etc/nova/nova.conf' % ncbase
-
-        ret, url = client.ssh(command)
-        if ret:
-            raise Exception('Exec command to get host ip from controller(%s)'
-                            'in Apex installer failed, ret=%s, output=%s'
-                            % (self.controllers[0], ret, url))
-        # need to use ip instead of hostname
-        ret = (re.sub("@.*:", "@%s:" % self.controllers[0],
-               url[0].split("=", 1)[1]))
-        self.log.debug('get_transport_url %s' % ret)
-        return ret
-
     def _set_docker_restart_cmd(self, service):
         # There can be multiple instances running so need to restart all
         cmd = "for container in `sudo docker ps | grep "
         cmd += service
         cmd += " | awk '{print $1}'`; do sudo docker restart $container; \
                done;"
         return cmd
@@ -113,6 +100,7 @@
     def set_apply_patches(self):
         self.log.info('Set apply patches start......')
+        fenix_files = None
 
         set_scripts = [self.cm_set_script]
@@ -127,6 +115,10 @@
         if self.conf.test_case != 'fault_management':
             if self.use_containers:
                 restart_cmd += self._set_docker_restart_cmd("nova-scheduler")
+                if is_fenix(self.conf):
+                    set_scripts.append(self.fe_set_script)
+                    testdir = get_doctor_test_root_dir()
+                    fenix_files = ["Dockerfile", "run"]
             else:
                 restart_cmd += ' openstack-nova-scheduler.service'
             set_scripts.append(self.nc_set_compute_script)
@@ -141,29 +133,34 @@
         for node_ip in self.controllers:
             client = SSHClient(node_ip, self.node_user_name,
                               key_filename=self.key_file)
-            self.controller_clients.append(client)
+            if fenix_files is not None:
+                for fenix_file in fenix_files:
+                    src_file = '{0}/{1}/{2}'.format(testdir,
+                                                    'admin_tool/fenix',
+                                                    fenix_file)
+                    client.scp(src_file, fenix_file)
             self._run_apply_patches(client,
                                     restart_cmd,
                                     set_scripts,
                                     python=self.python)
+            time.sleep(5)
+
+        self.log.info('Set apply patches for computes......')
 
         if self.conf.test_case != 'fault_management':
             if self.use_containers:
-                restart_cmd = self._set_docker_restart_cmd("nova-compute")
+                restart_cmd = self._set_docker_restart_cmd("nova")
             else:
                 restart_cmd = 'sudo systemctl restart' \
                               ' openstack-nova-compute.service'
             for node_ip in self.computes:
                 client = SSHClient(node_ip, self.node_user_name,
                                    key_filename=self.key_file)
-                self.compute_clients.append(client)
                 self._run_apply_patches(client,
                                         restart_cmd,
                                         [self.nc_set_compute_script],
                                         python=self.python)
-
-        if self.conf.test_case != 'fault_management':
-            time.sleep(10)
+            time.sleep(5)
 
     def restore_apply_patches(self):
         self.log.info('restore apply patches start......')
@@ -192,39 +189,22 @@
             restart_cmd += ' openstack-congress-server.service'
             restore_scripts.append(self.cg_restore_script)
 
-        for client, node_ip in zip(self.controller_clients, self.controllers):
-            retry = 0
-            while retry < 2:
-                try:
-                    self._run_apply_patches(client,
-                                            restart_cmd,
-                                            restore_scripts,
-                                            python=self.python)
-                except Exception:
-                    if retry > 0:
-                        raise Exception("SSHClient to %s feiled" % node_ip)
-                    client = SSHClient(node_ip, self.node_user_name,
-                                       key_filename=self.key_file)
-                    retry += 1
-                break
+        for node_ip in self.controllers:
+            client = SSHClient(node_ip, self.node_user_name,
+                               key_filename=self.key_file)
+            self._run_apply_patches(client,
+                                    restart_cmd,
+                                    restore_scripts,
+                                    python=self.python)
+
         if self.conf.test_case != 'fault_management':
             if self.use_containers:
                 restart_cmd = self._set_docker_restart_cmd("nova-compute")
             else:
                 restart_cmd = 'sudo systemctl restart' \
                               ' openstack-nova-compute.service'
-            for client, node_ip in zip(self.compute_clients, self.computes):
-                retry = 0
-                while retry < 2:
-                    try:
-                        self._run_apply_patches(
-                            client, restart_cmd,
-                            [self.nc_restore_compute_script],
-                            python=self.python)
-                    except Exception:
-                        if retry > 0:
-                            raise Exception("SSHClient to %s feiled" % node_ip)
-                        client = SSHClient(node_ip, self.node_user_name,
-                                           key_filename=self.key_file)
-                        retry += 1
-                    break
+            for node_ip in self.computes:
+                client = SSHClient(node_ip, self.node_user_name,
+                                   key_filename=self.key_file)
+                self._run_apply_patches(
+                    client, restart_cmd,
+                    [self.nc_restore_compute_script],
+                    python=self.python)
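_set_docker_restart_cmd builds a shell one-liner that restarts every container whose ``docker ps`` row matches the pattern, which is why the compute-side pattern could be relaxed from "nova-compute" to "nova": all matching nova containers get restarted in one pass. A sketch of what it produces, with a hypothetical service name::

    def docker_restart_cmd(service):
        # Restart every running container whose listing matches `service`.
        cmd = "for container in `sudo docker ps | grep "
        cmd += service
        cmd += " | awk '{print $1}'`; do sudo docker restart $container; done;"
        return cmd

    print(docker_restart_cmd("nova"))
    # for container in `sudo docker ps | grep nova | awk '{print $1}'`;
    #     do sudo docker restart $container; done;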
diff --git a/doctor_tests/installer/base.py b/doctor_tests/installer/base.py
index 30435931..de4d2f2e 100644
--- a/doctor_tests/installer/base.py
+++ b/doctor_tests/installer/base.py
@@ -14,8 +14,9 @@ import pwd
 import six
 import stat
 import subprocess
+import time
 
-from doctor_tests.common.utils import get_doctor_test_root_dir
+from doctor_tests.common import utils
 from doctor_tests.identity_auth import get_session
 from doctor_tests.os_clients import nova_client
@@ -75,7 +76,7 @@ class BaseInstaller(object):
         cmd = ("ssh -o UserKnownHostsFile=/dev/null"
                " -o StrictHostKeyChecking=no"
                " -i %s %s@%s -R %s:localhost:%s"
-               " sleep %s > ssh_tunnel.%s"
+               " sleep %s > ssh_tunnel.%s.%s"
                " 2>&1 < /dev/null "
                % (self.key_file,
                   self.node_user_name,
@@ -83,9 +84,28 @@
                   port,
                   port,
                   tunnel_uptime,
-                  node_ip))
+                  node_ip,
+                  port))
         server = subprocess.Popen('exec ' + cmd, shell=True)
         self.servers.append(server)
+        if self.conf.admin_tool.type == 'fenix':
+            port = self.conf.admin_tool.port
+            self.log.info('tunnel for port %s' % port)
+            cmd = ("ssh -o UserKnownHostsFile=/dev/null"
+                   " -o StrictHostKeyChecking=no"
+                   " -i %s %s@%s -L %s:localhost:%s"
+                   " sleep %s > ssh_tunnel.%s.%s"
+                   " 2>&1 < /dev/null "
+                   % (self.key_file,
+                      self.node_user_name,
+                      node_ip,
+                      port,
+                      port,
+                      tunnel_uptime,
+                      node_ip,
+                      port))
+            server = subprocess.Popen('exec ' + cmd, shell=True)
+            self.servers.append(server)
 
     def _get_ssh_key(self, client, key_path):
         self.log.info('Get SSH keys from %s installer......'
@@ -96,7 +116,8 @@
               % self.conf.installer.type)
             return self.key_file
 
-        ssh_key = '{0}/{1}'.format(get_doctor_test_root_dir(), 'instack_key')
+        ssh_key = '{0}/{1}'.format(utils.get_doctor_test_root_dir(),
+                                   'instack_key')
         client.scp(key_path, ssh_key, method='get')
         user = getpass.getuser()
         uid = pwd.getpwnam(user).pw_uid
@@ -105,6 +126,10 @@
         os.chmod(ssh_key, stat.S_IREAD)
         return ssh_key
 
+    @abc.abstractmethod
+    def get_transport_url(self):
+        pass
+
     def _run_cmd_remote(self, client, command):
         self.log.info('Run command=%s in %s installer......'
                       % (command, self.conf.installer.type))
@@ -131,19 +156,36 @@
             ret = False
         return ret
 
+    @utils.run_async
     def _run_apply_patches(self, client, restart_cmd, script_names,
                            python='python3'):
         installer_dir = os.path.dirname(os.path.realpath(__file__))
-
         if isinstance(script_names, list):
             for script_name in script_names:
                 script_abs_path = '{0}/{1}/{2}'.format(installer_dir,
                                                        'common', script_name)
-                client.scp(script_abs_path, script_name)
-                cmd = 'sudo %s %s' % (python, script_name)
-                ret, output = client.ssh(cmd)
+                if self.conf.installer.type == "devstack":
+                    script_name = "/opt/stack/%s" % script_name
+                try:
+                    client.scp(script_abs_path, script_name)
+                except Exception:
+                    client.scp(script_abs_path, script_name)
+                try:
+                    if ".py" in script_name:
+                        cmd = 'sudo %s %s' % (python, script_name)
+                    else:
+                        cmd = 'sudo chmod 700 %s;sudo ./%s' % (script_name,
+                                                               script_name)
+                    ret, output = client.ssh(cmd)
+                    self.log.info('Command %s output %s' % (cmd, output))
+                except Exception:
+                    ret, output = client.ssh(cmd)
+                    self.log.info('Command %s output %s' % (cmd, output))
                 if ret:
                     raise Exception('Do the command in remote'
                                     ' node failed, ret=%s, cmd=%s, output=%s'
                                     % (ret, cmd, output))
+        if 'nova' in restart_cmd or 'devstack@n-' in restart_cmd:
+            # Make sure scheduler has proper cpu_allocation_ratio
+            time.sleep(5)
         client.ssh(restart_cmd)
diff --git a/doctor_tests/installer/common/restore_compute_config.py b/doctor_tests/installer/common/restore_compute_config.py
index 0e9939fd..82e10a66 100644
--- a/doctor_tests/installer/common/restore_compute_config.py
+++ b/doctor_tests/installer/common/restore_compute_config.py
@@ -11,18 +11,16 @@ import shutil
 
 def restore_cpu_allocation_ratio():
-    nova_base = "/var/lib/config-data/puppet-generated/nova"
-    if not os.path.isdir(nova_base):
-        nova_base = ""
-    nova_file = nova_base + '/etc/nova/nova.conf'
-    nova_file_bak = nova_base + '/etc/nova/nova.bak'
-
-    if not os.path.isfile(nova_file_bak):
-        print('Bak_file:%s does not exist.'
-              % nova_file_bak)
-    else:
-        print('restore: %s' % nova_file)
-        shutil.copyfile(nova_file_bak, nova_file)
-        os.remove(nova_file_bak)
+    for nova_file_bak in ["/var/lib/config-data/puppet-generated/nova_libvirt/etc/nova/nova.bak",  # noqa
+                          "/var/lib/config-data/puppet-generated/nova/etc/nova/nova.bak",  # noqa
+                          "/etc/nova/nova.bak"]:
+        if os.path.isfile(nova_file_bak):
+            nova_file = nova_file_bak.replace(".bak", ".conf")
+            print('restoring nova.bak.')
+            shutil.copyfile(nova_file_bak, nova_file)
+            os.remove(nova_file_bak)
+            return
+    print('nova.bak does not exist.')
     return
 
 restore_cpu_allocation_ratio()
diff --git a/doctor_tests/installer/common/set_compute_config.py b/doctor_tests/installer/common/set_compute_config.py
index 86266085..615f1895 100644
--- a/doctor_tests/installer/common/set_compute_config.py
+++ b/doctor_tests/installer/common/set_compute_config.py
@@ -10,37 +10,25 @@ import os
 import shutil
 
-def make_initial_config(service, dest):
-    for mk in ["", "/etc", "/%s" % service]:
-        dest += mk
-        os.mkdir(dest)
-    src = "/etc/%s/%s.conf" % (service, service)
-    dest += "/%s.conf" % service
-    shutil.copyfile(src, dest)
-
-
 def set_cpu_allocation_ratio():
-    docker_conf_base_dir = "/var/lib/config-data/puppet-generated"
-    if not os.path.isdir(docker_conf_base_dir):
-        nova_base = ""
-    else:
-        nova_base = "%s/nova" % docker_conf_base_dir
-        if not os.path.isdir(nova_base):
-            # nova.conf to be used might not exist
-            make_initial_config("nova", nova_base)
-    nova_file = nova_base + '/etc/nova/nova.conf'
-    nova_file_bak = nova_base + '/etc/nova/nova.bak'
+    nova_file_bak = None
+    for nova_file in ["/var/lib/config-data/puppet-generated/nova_libvirt/etc/nova/nova.conf",  # noqa
+                      "/var/lib/config-data/puppet-generated/nova/etc/nova/nova.conf",  # noqa
+                      "/etc/nova/nova.conf"]:
+        if os.path.isfile(nova_file):
+            nova_file_bak = nova_file.replace(".conf", ".bak")
+            break
 
-    if not os.path.isfile(nova_file):
-        raise Exception("File doesn't exist: %s." % nova_file)
+    if nova_file_bak is None:
+        raise Exception("Could not find nova.conf")
 
     # TODO (tojuvone): Unfortunately ConfigParser did not produce working conf
     fcheck = open(nova_file)
     found_list = ([ca for ca in fcheck.readlines()
                   if "cpu_allocation_ratio" in ca])
     fcheck.close()
+    change = False
+    found = False
     if found_list and len(found_list):
-        change = False
-        found = False
         for car in found_list:
             if car.startswith('#'):
                 continue
diff --git a/doctor_tests/installer/common/set_config.py b/doctor_tests/installer/common/set_config.py
index 3dc6cd9a..e66d4c2c 100644
--- a/doctor_tests/installer/common/set_config.py
+++ b/doctor_tests/installer/common/set_config.py
@@ -125,6 +125,7 @@ def set_event_definitions():
         'reply_url': {'fields': 'payload.reply_url'},
         'actions_at': {'fields': 'payload.actions_at',
                        'type': 'datetime'},
+        'reply_at': {'fields': 'payload.reply_at', 'type': 'datetime'},
         'state': {'fields': 'payload.state'},
         'session_id': {'fields': 'payload.session_id'},
         'project_id': {'fields': 'payload.project_id'},
diff --git a/doctor_tests/installer/common/set_fenix.sh b/doctor_tests/installer/common/set_fenix.sh
new file mode 100644
index 00000000..bd1eae47
--- /dev/null
+++ b/doctor_tests/installer/common/set_fenix.sh
@@ -0,0 +1,106 @@
+#!/usr/bin/env bash
+
+##############################################################################
+# Copyright (c) 2019 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+
+# Config files
+docker -v >/dev/null || {
+echo "Fenix needs docker to be installed..."
+ver=`grep "UBUNTU_CODENAME" /etc/os-release | cut -d '=' -f 2`
+curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
+add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $ver stable"
+apt install apt-transport-https ca-certificates curl software-properties-common
+apt update
+apt-cache policy docker-ce
+apt-get install -y docker-ce docker-ce-cli containerd.io
+dpkg -r --force-depends golang-docker-credential-helpers
+}
+
+docker ps | grep fenix -q && {
+REMOTE=`git ls-remote https://opendev.org/x/fenix HEAD | awk '{ print $1}'`
+LOCAL=`docker exec -t fenix git rev-parse @`
+if [[ "$LOCAL" =~ "$REMOTE" ]]; then
+    # The two strings differ in trailing characters, so containment is
+    # tested instead of equality
+    echo "Fenix start: Already running latest $LOCAL equals $REMOTE"
+    exit 0
+else
+    echo "Fenix container needs to be recreated $LOCAL not $REMOTE"
+    # Remove previous container
+    for img in `docker image list | grep "^fenix" | awk '{print $1}'`; do
+        for dock in `docker ps --all -f "ancestor=$img" | grep "$img" | awk '{print $1}'`; do
+            docker stop $dock; docker rm $dock;
+        done;
+        docker image rm $img;
+    done
+fi
+} || echo "Fenix container needs to be created..."
+
+cp /root/keystonercv3 .
+
+transport=`grep -m1 "^transport" /etc/nova/nova.conf`
+. keystonercv3
+
+echo "[DEFAULT]" > fenix.conf
+echo "port = 12347" >> fenix.conf
+echo $transport >> fenix.conf
+
+echo "[database]" >> fenix.conf
+MYSQLIP=`grep -m1 "^connection" /etc/nova/nova.conf | sed -e "s/.*@//;s/\/.*//"`
+echo "connection = mysql+pymysql://fenix:fenix@$MYSQLIP/fenix" >> fenix.conf
+
+echo "[service_user]" >> fenix.conf
+echo "os_auth_url = $OS_AUTH_URL" >> fenix.conf
+echo "os_username = $OS_USERNAME" >> fenix.conf
+echo "os_password = $OS_PASSWORD" >> fenix.conf
+echo "os_user_domain_name = $OS_USER_DOMAIN_NAME" >> fenix.conf
+echo "os_project_name = $OS_PROJECT_NAME" >> fenix.conf
+echo "os_project_domain_name = $OS_PROJECT_DOMAIN_NAME" >> fenix.conf
+
+echo "[DEFAULT]" > fenix-api.conf
+echo "port = 12347" >> fenix-api.conf
+echo $transport >> fenix-api.conf
+
+echo "[keystone_authtoken]" >> fenix-api.conf
+echo "auth_url = $OS_AUTH_URL" >> fenix-api.conf
+echo "auth_type = password" >> fenix-api.conf
+echo "project_domain_name = $OS_PROJECT_DOMAIN_NAME" >> fenix-api.conf
+echo "project_name = $OS_PROJECT_NAME" >> fenix-api.conf
+echo "user_domain_name = $OS_PROJECT_DOMAIN_NAME" >> fenix-api.conf
+echo "password = $OS_PASSWORD" >> fenix-api.conf
+echo "username = $OS_USERNAME" >> fenix-api.conf
+echo "cafile = /opt/stack/data/ca-bundle.pem" >> fenix-api.conf
+
+openstack service list | grep -q maintenance || {
+openstack service create --name fenix --enable maintenance
+openstack endpoint create --region $OS_REGION_NAME --enable fenix public http://localhost:12347/v1
+}
+
+# Mysql pw
+# MYSQLPW=`cat /var/lib/config-data/mysql/etc/puppet/hieradata/service_configs.json | grep mysql | grep root_password | awk -F": " '{print $2}' | awk -F"\"" '{print $2}'`
+MYSQLPW=root
+
+# Fenix DB
+[ `mysql -uroot -p$MYSQLPW -e "SELECT host, user FROM mysql.user;" | grep fenix | wc -l` -eq 0 ] && {
+    mysql -uroot -p$MYSQLPW -hlocalhost -e "CREATE USER 'fenix'@'localhost' IDENTIFIED BY 'fenix';"
+    mysql -uroot -p$MYSQLPW -hlocalhost -e "GRANT ALL PRIVILEGES ON fenix.* TO 'fenix'@'' identified by 'fenix';FLUSH PRIVILEGES;"
+}
+mysql -ufenix -pfenix -hlocalhost -e "DROP DATABASE IF EXISTS fenix;"
+mysql -ufenix -pfenix -hlocalhost -e "CREATE DATABASE fenix CHARACTER SET utf8;"
+
+# Build Fenix container and run it
+chmod 700 run
+docker build --build-arg OPENSTACK=master --build-arg BRANCH=master --network host $PWD -t fenix | tail -1
+docker run --network host -d --name fenix -p 12347:12347 -ti fenix
+if [ $? -eq 0 ]; then
+    echo "Fenix start: OK"
+else
+    echo "Fenix start: FAILED"
+fi
+# To debug check log from fenix container
+# docker exec -ti fenix tail -f /var/log/fenix-engine.log
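With the Fenix container exposing the maintenance API on port 12347, a client can drive a whole maintenance session over plain REST. A minimal sketch of creating and polling a session, assuming Fenix's v1 paths and response shape as used elsewhere in this change; the token and timestamp are placeholders::

    import json
    import time

    import requests

    endpoint = 'http://localhost:12347/v1/maintenance'
    headers = {'Content-Type': 'application/json',
               'Accept': 'application/json',
               'X-Auth-Token': '<keystone-token>'}  # placeholder

    # Create a session; the admin tool replies with its session_id.
    ret = requests.post(endpoint, headers=headers,
                        data=json.dumps({'state': 'MAINTENANCE',
                                         'workflow': 'default',
                                         'maintenance_at':
                                             '2019-06-01 12:00:30',
                                         'metadata': {}, 'hosts': []}))
    session_id = ret.json()['session_id']

    # Poll until the workflow reaches a terminal state.
    while True:
        state = requests.get('%s/%s' % (endpoint, session_id),
                             headers=headers).json()['state']
        if state in ('MAINTENANCE_DONE', 'MAINTENANCE_FAILED'):
            break
        time.sleep(10)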
diff --git a/doctor_tests/installer/devstack.py b/doctor_tests/installer/devstack.py
new file mode 100644
index 00000000..02f3601a
--- /dev/null
+++ b/doctor_tests/installer/devstack.py
@@ -0,0 +1,151 @@
+##############################################################################
+# Copyright (c) 2019 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import socket
+import time
+
+from doctor_tests.common.utils import SSHClient
+from doctor_tests.common.utils import LocalSSH
+from doctor_tests.identity_auth import get_session
+from doctor_tests.installer.base import BaseInstaller
+from doctor_tests.os_clients import nova_client
+
+
+class DevstackInstaller(BaseInstaller):
+    node_user_name = None
+    cm_set_script = 'set_config.py'
+    nc_set_compute_script = 'set_compute_config.py'
+    cm_restore_script = 'restore_config.py'
+    nc_restore_compute_script = 'restore_compute_config.py'
+    ac_restart_script = 'restart_aodh.py'
+    ac_restore_script = 'restore_aodh.py'
+    python = 'python'
+
+    def __init__(self, conf, log):
+        super(DevstackInstaller, self).__init__(conf, log)
+        # Run Doctor under the user's home; sudo would hide env params
+        # that are needed
+        home, self.node_user_name = (iter(os.environ.get('VIRTUAL_ENV')
+                                     .split('/', 3)[1:3]))
+        # Migration needs to work, so ssh should have a proper key defined
+        self.key_file = '/%s/%s/.ssh/id_rsa' % (home, self.node_user_name)
+        self.log.info('ssh uses: %s and %s' % (self.node_user_name,
+                                               self.key_file))
+        self.controllers = ([ip for ip in
+                             socket.gethostbyname_ex(socket.gethostname())[2]
+                             if not ip.startswith('127.')] or
+                            [[(s.connect(('8.8.8.8', 53)),
+                               s.getsockname()[0], s.close())
+                              for s in [socket.socket(socket.AF_INET,
+                                                      socket.SOCK_DGRAM)]][0][1]])
+        conf.admin_tool.ip = self.controllers[0]
+        self.computes = list()
+        self.nova = nova_client(conf.nova_version, get_session())
+
+    def setup(self):
+        self.log.info('Setup Devstack installer start......')
+        self._get_devstack_conf()
+        self.create_flavor()
+        self.set_apply_patches()
+
+    def cleanup(self):
+        self.restore_apply_patches()
+
+    def get_ssh_key_from_installer(self):
+        return self.key_file
+
+    def get_transport_url(self):
+        client = LocalSSH(self.log)
+        cmd = 'sudo grep -m1 "^transport_url" /etc/nova/nova.conf'
+        ret, url = client.ssh(cmd)
+        url = url.split("= ", 1)[1][:-1]
+        self.log.info('get_transport_url %s' % url)
+        return url
+
+    def get_host_ip_from_hostname(self, hostname):
+        return [hvisor.__getattr__('host_ip') for hvisor in self.hvisors
+                if hvisor.__getattr__('hypervisor_hostname') == hostname][0]
+
+    def _get_devstack_conf(self):
+        self.log.info('Get devstack config details for Devstack installer'
+                      '......')
+        self.hvisors = self.nova.hypervisors.list(detailed=True)
+        self.log.info('checking hypervisors.......')
+        self.computes = [hvisor.__getattr__('host_ip') for hvisor in
+                         self.hvisors]
+        self.use_containers = False
+        self.log.info('controller_ips:%s' % self.controllers)
+        self.log.info('compute_ips:%s' % self.computes)
+        self.log.info('use_containers:%s' % self.use_containers)
+
+    def _set_docker_restart_cmd(self, service):
+        # There can be multiple instances running so need to restart all
+        cmd = "for container in `sudo docker ps | grep "
+        cmd += service
+        cmd += " | awk '{print $1}'`; do sudo docker restart $container; \
+               done;"
+        return cmd
+
+    def set_apply_patches(self):
+        self.log.info('Set apply patches start......')
+
+        set_scripts = [self.cm_set_script]
+
+        restart_cmd = 'sudo systemctl restart' \
+                      ' devstack@ceilometer-anotification.service'
+
+        client = LocalSSH(self.log)
+        self._run_apply_patches(client,
+                                restart_cmd,
+                                set_scripts,
+                                python=self.python)
+        time.sleep(7)
+
+        self.log.info('Set apply patches for computes......')
+
+        if self.conf.test_case != 'fault_management':
+            restart_cmd = 'sudo systemctl restart' \
+                          ' devstack@n-cpu.service'
+            for node_ip in self.computes:
+                client = SSHClient(node_ip, self.node_user_name,
+                                   key_filename=self.key_file)
+                self._run_apply_patches(client,
+                                        restart_cmd,
+                                        [self.nc_set_compute_script],
+                                        python=self.python)
+            time.sleep(7)
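The controller address above is derived without any installer metadata: first from the host's own name resolution, then, if that only yields loopback addresses, from the source address of a UDP socket "connected" to a public IP (no packet is actually sent). The same trick in isolation::

    import socket

    def local_ip():
        ips = [ip for ip in
               socket.gethostbyname_ex(socket.gethostname())[2]
               if not ip.startswith('127.')]
        if ips:
            return ips[0]
        # Routing-table fallback: connect() on a UDP socket only selects
        # the local endpoint, nothing goes on the wire.
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            s.connect(('8.8.8.8', 53))
            return s.getsockname()[0]
        finally:
            s.close()

    print(local_ip())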
+    def restore_apply_patches(self):
+        self.log.info('restore apply patches start......')
+
+        restore_scripts = [self.cm_restore_script]
+
+        restart_cmd = 'sudo systemctl restart' \
+                      ' devstack@ceilometer-anotification.service'
+
+        if self.conf.test_case != 'fault_management':
+            restart_cmd += ' devstack@n-sch.service'
+            restore_scripts.append(self.nc_restore_compute_script)
+
+        client = LocalSSH(self.log)
+        self._run_apply_patches(client,
+                                restart_cmd,
+                                restore_scripts,
+                                python=self.python)
+
+        if self.conf.test_case != 'fault_management':
+
+            restart_cmd = 'sudo systemctl restart' \
+                          ' devstack@n-cpu.service'
+            for node_ip in self.computes:
+                client = SSHClient(node_ip, self.node_user_name,
+                                   key_filename=self.key_file)
+                self._run_apply_patches(
+                    client, restart_cmd,
+                    [self.nc_restore_compute_script],
+                    python=self.python)
diff --git a/doctor_tests/installer/local.py b/doctor_tests/installer/local.py
deleted file mode 100644
index fee14f33..00000000
--- a/doctor_tests/installer/local.py
+++ /dev/null
@@ -1,118 +0,0 @@
-##############################################################################
-# Copyright (c) 2017 ZTE Corporation and others.
-#
-# All rights reserved. This program and the accompanying materials
-# are made available under the terms of the Apache License, Version 2.0
-# which accompanies this distribution, and is available at
-# http://www.apache.org/licenses/LICENSE-2.0
-##############################################################################
-import os
-import shutil
-import subprocess
-
-from doctor_tests.installer.base import BaseInstaller
-from doctor_tests.installer.common.vitrage import \
-    set_vitrage_host_down_template
-from doctor_tests.common.constants import Inspector
-from doctor_tests.common.utils import load_json_file
-from doctor_tests.common.utils import write_json_file
-
-
-class LocalInstaller(BaseInstaller):
-    node_user_name = 'root'
-
-    nova_policy_file = '/etc/nova/policy.json'
-    nova_policy_file_backup = '%s%s' % (nova_policy_file, '.bak')
-
-    def __init__(self, conf, log):
-        super(LocalInstaller, self).__init__(conf, log)
-        self.policy_modified = False
-        self.add_policy_file = False
-
-    def setup(self):
-        self.get_ssh_key_from_installer()
-        self.set_apply_patches()
-
-    def cleanup(self):
-        self.restore_apply_patches()
-
-    def get_ssh_key_from_installer(self):
-        self.log.info('Assuming SSH keys already exchanged with computer'
-                      'for local installer type')
-        return None
-
-    def get_host_ip_from_hostname(self, hostname):
-        self.log.info('Get host ip from host name in local installer......')
-
-        cmd = "getent hosts %s | awk '{ print $1 }'" % (hostname)
-        server = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
-        stdout, stderr = server.communicate()
-        host_ip = stdout.strip().decode("utf-8")
-
-        self.log.info('Get host_ip:%s from host_name:%s in local installer'
-                      % (host_ip, hostname))
-        return host_ip
-
-    def set_apply_patches(self):
-        self._set_nova_policy()
-        if self.conf.inspector.type == Inspector.VITRAGE:
-            set_vitrage_host_down_template()
-            os.system('sudo systemctl restart devstack@vitrage-graph.service')
-
-    def restore_apply_patches(self):
-        self._restore_nova_policy()
-
-    def _set_nova_policy(self):
-        host_status_policy = 'os_compute_api:servers:show:host_status'
-        host_status_rule = 'rule:admin_or_owner'
-        policy_data = {
-            'context_is_admin': 'role:admin',
-            'owner': 'user_id:%(user_id)s',
-            'admin_or_owner': 'rule:context_is_admin or rule:owner',
-            host_status_policy: host_status_rule
-        }
-
-        if os.path.isfile(self.nova_policy_file):
-            data = load_json_file(self.nova_policy_file)
-            if host_status_policy in data:
-                rule_origion = data[host_status_policy]
-                if host_status_rule == rule_origion:
-                    self.log.info('Do not need to modify nova policy.')
-                    self.policy_modified = False
-                else:
-                    # update the host_status_policy
-                    data[host_status_policy] = host_status_rule
-                    self.policy_modified = True
-            else:
-                # add the host_status_policy, if the admin_or_owner is not
-                # defined, add it also
-                for policy, rule in policy_data.items():
-                    if policy not in data:
-                        data[policy] = rule
-                self.policy_modified = True
-            if self.policy_modified:
-                self.log.info('Nova policy is Modified.')
-                shutil.copyfile(self.nova_policy_file,
-                                self.nova_policy_file_backup)
-        else:
-            # file does not exit, create a new one and add the policy
-            self.log.info('Nova policy file not exist. Creating a new one')
-            data = policy_data
-            self.add_policy_file = True
-
-        if self.policy_modified or self.add_policy_file:
-            write_json_file(self.nova_policy_file, data)
-            os.system('sudo systemctl restart devstack@n-api.service')
-
-    def _restore_nova_policy(self):
-        if self.policy_modified:
-            shutil.copyfile(self.nova_policy_file_backup,
-                            self.nova_policy_file)
-            os.remove(self.nova_policy_file_backup)
-        elif self.add_policy_file:
-            os.remove(self.nova_policy_file)
-
-        if self.add_policy_file or self.policy_modified:
-            os.system('sudo systemctl restart devstack@n-api.service')
-            self.add_policy_file = False
-            self.policy_modified = False
diff --git a/doctor_tests/installer/mcp.py b/doctor_tests/installer/mcp.py
index 9cfff92d..7659c9e2 100644
--- a/doctor_tests/installer/mcp.py
+++ b/doctor_tests/installer/mcp.py
@@ -1,5 +1,5 @@
 ##############################################################################
-# Copyright (c) 2018 ZTE Corporation and others.
+# Copyright (c) 2019 ZTE Corporation and others.
 #
 # All rights reserved. This program and the accompanying materials
 # are made available under the terms of the Apache License, Version 2.0
@@ -7,15 +7,26 @@
 # http://www.apache.org/licenses/LICENSE-2.0
 ##############################################################################
 from os.path import isfile
+import re
+import time
 
+from doctor_tests.common.constants import is_fenix
+from doctor_tests.common.utils import get_doctor_test_root_dir
 from doctor_tests.common.utils import SSHClient
 from doctor_tests.installer.base import BaseInstaller
 
 
 class McpInstaller(BaseInstaller):
     node_user_name = 'ubuntu'
-    cm_set_script = 'set_ceilometer.py'
-    cm_restore_script = 'restore_ceilometer.py'
+
+    cm_set_script = 'set_config.py'
+    nc_set_compute_script = 'set_compute_config.py'
+    fe_set_script = 'set_fenix.sh'
+    cm_restore_script = 'restore_config.py'
+    nc_restore_compute_script = 'restore_compute_config.py'
+    ac_restart_script = 'restart_aodh.py'
+    ac_restore_script = 'restore_aodh.py'
+    python = 'python3'
 
     def __init__(self, conf, log):
         super(McpInstaller, self).__init__(conf, log)
@@ -26,40 +37,87 @@ class McpInstaller(BaseInstaller):
                                 look_for_keys=True)
         self.controllers = list()
         self.controller_clients = list()
+        self.computes = list()
 
     def setup(self):
         self.log.info('Setup MCP installer start......')
-
-        self.controllers = self.get_controller_ips()
+        self.get_node_ips()
         self.create_flavor()
-        self.set_apply_patches()
+        if is_fenix(self.conf):
+            self.set_apply_patches()
         self.setup_stunnel()
 
     def cleanup(self):
-        self.restore_apply_patches()
+        if is_fenix(self.conf):
+            self.restore_apply_patches()
         for server in self.servers:
             server.terminate()
 
     def get_ssh_key_from_installer(self):
         self.log.info('Get SSH keys from MCP......')
-        # Assuming mcp.rsa is already mapped to functest container
-        # if not, only the test runs on jumphost can get the ssh_key
-        # default in path /var/lib/opnfv/mcp.rsa
+        # Default in path /var/lib/opnfv/mcp.rsa
         ssh_key = '/root/.ssh/id_rsa'
         mcp_key = '/var/lib/opnfv/mcp.rsa'
-        return ssh_key if isfile(ssh_key) else mcp_key
-
-    def get_controller_ips(self):
-        self.log.info('Get controller ips from Mcp installer......')
-
-        command = "sudo salt --out yaml 'ctl*' " \
-                  "pillar.get _param:openstack_control_address |" \
\ - "awk '{print $2}'" - controllers = self._run_cmd_remote(self.client, command) - self.log.info('Get controller_ips:%s from Mcp installer' - % controllers) - return controllers + return mcp_key if isfile(mcp_key) else ssh_key + + def get_transport_url(self): + client = SSHClient(self.controllers[0], self.node_user_name, + key_filename=self.key_file) + try: + cmd = 'sudo grep -m1 "^transport_url" /etc/nova/nova.conf' + ret, url = client.ssh(cmd) + + if ret: + raise Exception('Exec command to get transport from ' + 'controller(%s) in MCP installer failed, ' + 'ret=%s, output=%s' + % (self.controllers[0], ret, url)) + elif self.controllers[0] not in url: + # need to use ip instead of hostname + url = (re.sub("@.*:", "@%s:" % self.controllers[0], + url[0].split("=", 1)[1])) + except Exception: + cmd = 'grep -i "^rabbit" /etc/nova/nova.conf' + ret, lines = client.ssh(cmd) + if ret: + raise Exception('Exec command to get transport from ' + 'controller(%s) in MCP installer failed, ' + 'ret=%s, output=%s' + % (self.controllers[0], ret, url)) + else: + for line in lines.split('\n'): + if line.startswith("rabbit_userid"): + rabbit_userid = line.split("=") + if line.startswith("rabbit_port"): + rabbit_port = line.split("=") + if line.startswith("rabbit_password"): + rabbit_password = line.split("=") + url = "rabbit://%s:%s@%s:%s/?ssl=0" % (rabbit_userid, + rabbit_password, + self.controllers[0], + rabbit_port) + self.log.info('get_transport_url %s' % url) + return url + + def _copy_overcloudrc_to_controllers(self): + for ip in self.controllers: + cmd = "scp overcloudrc %s@%s:" % (self.node_user_name, ip) + self._run_cmd_remote(self.client, cmd) + + def get_node_ips(self): + self.log.info('Get node ips from Mcp installer......') + + command = 'sudo salt "*" --out yaml pillar.get _param:single_address' + node_details = self._run_cmd_remote(self.client, command) + + self.controllers = [line.split()[1] for line in node_details + if line.startswith("ctl")] + self.computes = [line.split()[1] for line in node_details + if line.startswith("cmp")] + + self.log.info('controller_ips:%s' % self.controllers) + self.log.info('compute_ips:%s' % self.computes) def get_host_ip_from_hostname(self, hostname): command = "sudo salt --out yaml '%s*' " \ @@ -70,21 +128,80 @@ class McpInstaller(BaseInstaller): def set_apply_patches(self): self.log.info('Set apply patches start......') + fenix_files = None + set_scripts = [self.cm_set_script] + thrs = [] + + restart_cmd = 'sudo systemctl restart' \ + ' ceilometer-agent-notification.service' + + if self.conf.test_case != 'fault_management': + if is_fenix(self.conf): + set_scripts.append(self.fe_set_script) + testdir = get_doctor_test_root_dir() + fenix_files = ["Dockerfile", "run"] + restart_cmd += ' nova-scheduler.service' + set_scripts.append(self.nc_set_compute_script) - restart_cm_cmd = 'sudo service ceilometer-agent-notification restart' for node_ip in self.controllers: client = SSHClient(node_ip, self.node_user_name, key_filename=self.key_file) - self.controller_clients.append(client) - self._run_apply_patches(client, - restart_cm_cmd, - [self.cm_set_script]) + if fenix_files is not None: + for fenix_file in fenix_files: + src_file = '{0}/{1}/{2}'.format(testdir, + 'admin_tool/fenix', + fenix_file) + client.scp(src_file, fenix_file) + thrs.append(self._run_apply_patches(client, + restart_cmd, + set_scripts, + python=self.python)) + time.sleep(5) + + self.log.info('Set apply patches start......') + + if self.conf.test_case != 'fault_management': + restart_cmd = 'sudo 
     def get_host_ip_from_hostname(self, hostname):
         command = "sudo salt --out yaml '%s*' " \
@@ -70,21 +128,80 @@ class McpInstaller(BaseInstaller):
 
     def set_apply_patches(self):
         self.log.info('Set apply patches start......')
+        fenix_files = None
+        set_scripts = [self.cm_set_script]
+        thrs = []
+
+        restart_cmd = 'sudo systemctl restart' \
+                      ' ceilometer-agent-notification.service'
+
+        if self.conf.test_case != 'fault_management':
+            if is_fenix(self.conf):
+                set_scripts.append(self.fe_set_script)
+                testdir = get_doctor_test_root_dir()
+                fenix_files = ["Dockerfile", "run"]
+            restart_cmd += ' nova-scheduler.service'
+            set_scripts.append(self.nc_set_compute_script)
 
-        restart_cm_cmd = 'sudo service ceilometer-agent-notification restart'
         for node_ip in self.controllers:
             client = SSHClient(node_ip, self.node_user_name,
                                key_filename=self.key_file)
-            self.controller_clients.append(client)
-            self._run_apply_patches(client,
-                                    restart_cm_cmd,
-                                    [self.cm_set_script])
+            if fenix_files is not None:
+                for fenix_file in fenix_files:
+                    src_file = '{0}/{1}/{2}'.format(testdir,
+                                                    'admin_tool/fenix',
+                                                    fenix_file)
+                    client.scp(src_file, fenix_file)
+            thrs.append(self._run_apply_patches(client,
+                                                restart_cmd,
+                                                set_scripts,
+                                                python=self.python))
+        time.sleep(5)
+
+        self.log.info('Set apply patches for computes......')
+
+        if self.conf.test_case != 'fault_management':
+            restart_cmd = 'sudo systemctl restart nova-compute.service'
+            for node_ip in self.computes:
+                client = SSHClient(node_ip, self.node_user_name,
+                                   key_filename=self.key_file)
+                thrs.append(self._run_apply_patches(
+                    client,
+                    restart_cmd,
+                    [self.nc_set_compute_script],
+                    python=self.python))
+            time.sleep(5)
+        # If the Fenix container is built, it needs to be ready before
+        # continuing
+        for thr in thrs:
+            thr.join()
 
     def restore_apply_patches(self):
         self.log.info('restore apply patches start......')
-        restart_cm_cmd = 'sudo service ceilometer-agent-notification restart'
-        for client in self.controller_clients:
+        restore_scripts = [self.cm_restore_script]
+
+        restore_scripts.append(self.ac_restore_script)
+        restart_cmd = 'sudo systemctl restart' \
+                      ' ceilometer-agent-notification.service'
+
+        if self.conf.test_case != 'fault_management':
+            restart_cmd += ' nova-scheduler.service'
+            restore_scripts.append(self.nc_restore_compute_script)
+
+        for node_ip in self.controllers:
+            client = SSHClient(node_ip, self.node_user_name,
+                               key_filename=self.key_file)
             self._run_apply_patches(client,
-                                    restart_cm_cmd,
-                                    [self.cm_restore_script])
+                                    restart_cmd,
+                                    restore_scripts,
+                                    python=self.python)
+
+        if self.conf.test_case != 'fault_management':
+            restart_cmd = 'sudo systemctl restart nova-compute.service'
+            for node_ip in self.computes:
+                client = SSHClient(node_ip, self.node_user_name,
+                                   key_filename=self.key_file)
+                self._run_apply_patches(
+                    client, restart_cmd,
+                    [self.nc_restore_compute_script],
+                    python=self.python)
diff --git a/doctor_tests/main.py b/doctor_tests/main.py
index 438d8324..7573faec 100644
--- a/doctor_tests/main.py
+++ b/doctor_tests/main.py
@@ -1,5 +1,5 @@
 ##############################################################################
-# Copyright (c) 2017 ZTE Corporation and others.
+# Copyright (c) 2019 ZTE Corporation and others.
 #
 # All rights reserved. This program and the accompanying materials
 # are made available under the terms of the Apache License, Version 2.0
@@ -43,7 +43,6 @@ class DoctorTest(object):
     def setup(self):
         # prepare the cloud env
         self.installer.setup()
-
         # preparing VM image...
         self.image.create()
@@ -51,37 +50,50 @@ class DoctorTest(object):
         self.user.create()
 
     def test_fault_management(self):
-        try:
-            LOG.info('doctor fault management test starting.......')
-
-            self.fault_management = \
-                FaultManagement(self.conf, self.installer, self.user, LOG)
-
-            # prepare test env
-            self.fault_management.setup()
-
-            # wait for aodh alarms are updated in caches for event evaluator,
-            # sleep time should be larger than event_alarm_cache_ttl
-            # (default 60)
-            # (tojuvone) Fraser currently needs 120
-            time.sleep(120)
-
-            # injecting host failure...
-            # NOTE (umar) add INTERFACE_NAME logic to host injection
-            self.fault_management.start()
-            time.sleep(30)
-
-            # verify the test results
-            # NOTE (umar) copy remote monitor.log file when monitor=collectd
-            self.fault_management.check_host_status('down')
-            self.fault_management.check_notification_time()
-
-        except Exception as e:
-            LOG.error('doctor fault management test failed, '
-                      'Exception=%s' % e)
-            sys.exit(1)
-        finally:
-            self.fault_management.cleanup()
+        retry = 2
+        # Retry once if notified_time is None
+        while retry > 0:
+            try:
+                self.fault_management = None
+                LOG.info('doctor fault management test starting.......')
+                transport_url = self.installer.get_transport_url()
+                self.fault_management = \
+                    FaultManagement(self.conf, self.installer, self.user, LOG,
+                                    transport_url)
+
+                # prepare test env
+                self.fault_management.setup()
+
+                # wait until aodh alarms are updated in caches for the event
+                # evaluator; sleep time should be larger than
+                # event_alarm_cache_ttl (default 60)
+                # (tojuvone) Fraser currently needs 120
+                time.sleep(120)
+
+                # injecting host failure...
+                # NOTE (umar) add INTERFACE_NAME logic to host injection
+                self.fault_management.start()
+                time.sleep(30)
+
+                # verify the test results
+                # NOTE (umar) copy remote monitor.log file when
+                # monitor=collectd
+                self.fault_management.check_host_status('down')
+                self.fault_management.check_notification_time()
+                retry = 0
+
+            except Exception as e:
+                LOG.error('doctor fault management test failed, '
+                          'Exception=%s' % e)
+                if 'notified_time=None' in str(e):
+                    retry -= 1
+                    LOG.info('doctor fault management retry')
+                    continue
+                LOG.error(format_exc())
+                sys.exit(1)
+            finally:
+                if self.fault_management is not None:
+                    self.fault_management.cleanup()
 
     def _amount_compute_nodes(self):
         services = self.nova.services.list(binary='nova-compute')
@@ -94,11 +106,12 @@
             LOG.info('not enough compute nodes, skipping doctor '
                      'maintenance test')
             return
-        elif self.conf.installer.type != 'apex':
+        elif self.conf.installer.type not in ['apex', 'fuel', 'devstack']:
             LOG.info('not supported installer, skipping doctor '
                      'maintenance test')
             return
         try:
+            maintenance = None
             LOG.info('doctor maintenance test starting.......')
             trasport_url = self.installer.get_transport_url()
             maintenance = Maintenance(trasport_url, self.conf, LOG)
@@ -120,7 +133,8 @@
             LOG.error(format_exc())
             sys.exit(1)
         finally:
-            maintenance.cleanup_maintenance()
+            if maintenance is not None:
+                maintenance.cleanup_maintenance()
 
     def run(self):
         """run doctor tests"""
@@ -143,6 +157,7 @@
                          % function)
         except Exception as e:
             LOG.error('doctor test failed, Exception=%s' % e)
+            LOG.error(format_exc())
            sys.exit(1)
         finally:
             self.cleanup()
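test_fault_management now tolerates one spurious run: only a failure whose message carries notified_time=None is retried, anything else aborts immediately. The shape of that loop reduced to its control flow, with run_once standing in for the actual test body::

    def run_with_retry(run_once, log, attempts=2):
        while attempts > 0:
            try:
                run_once()
                return
            except Exception as e:
                if 'notified_time=None' in str(e):
                    attempts -= 1
                    log('retrying after spurious run: %s' % e)
                    continue
                raise  # any other failure aborts at once
        raise Exception('fault management test kept failing')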
diff --git a/doctor_tests/scenario/fault_management.py b/doctor_tests/scenario/fault_management.py
index 869311bd..0271dffe 100644
--- a/doctor_tests/scenario/fault_management.py
+++ b/doctor_tests/scenario/fault_management.py
@@ -40,7 +40,7 @@ sleep 1
 
 class FaultManagement(object):
 
-    def __init__(self, conf, installer, user, log):
+    def __init__(self, conf, installer, user, log, transport_url):
         self.conf = conf
         self.log = log
         self.user = user
@@ -55,7 +55,7 @@
         self.network = Network(self.conf, log)
         self.instance = Instance(self.conf, log)
         self.alarm = Alarm(self.conf, log)
-        self.inspector = get_inspector(self.conf, log)
+        self.inspector = get_inspector(self.conf, log, transport_url)
         self.monitor = get_monitor(self.conf,
                                    self.inspector.get_inspector_url(),
                                    log)
@@ -111,7 +111,10 @@
         server = servers.get(vm_name)
         if not server:
             raise Exception('Can not find instance: vm_name(%s)' % vm_name)
-        host_name = server.__dict__.get('OS-EXT-SRV-ATTR:hypervisor_hostname')
+        # use the hostname without the domain name, as it is mapped to
+        # the cell
+        hostname = \
+            server.__dict__.get('OS-EXT-SRV-ATTR:hypervisor_hostname')
+        host_name = hostname.split('.')[0]
         host_ip = self.installer.get_host_ip_from_hostname(host_name)
 
         self.log.info('Get host info(name:%s, ip:%s) which vm(%s) launched at'
@@ -209,6 +212,10 @@
         detected = self.monitor.detected_time
         notified = self.consumer.notified_time
 
+        if None in [vmdown, hostdown, detected, notified]:
+            self.log.info('one of the times for the profiler is None, return')
+            return
+
         # TODO(yujunz) check the actual delay to verify time sync status
         # expected ~1s delay from $trigger to $linkdown
         relative_start = linkdown
diff --git a/doctor_tests/scenario/maintenance.py b/doctor_tests/scenario/maintenance.py
index 9fcd4128..e6cdcccd 100644
--- a/doctor_tests/scenario/maintenance.py
+++ b/doctor_tests/scenario/maintenance.py
@@ -1,5 +1,5 @@
 ##############################################################################
-# Copyright (c) 2018 Nokia Corporation and others.
+# Copyright (c) 2019 Nokia Corporation and others.
 #
 # All rights reserved. This program and the accompanying materials
 # are made available under the terms of the Apache License, Version 2.0
@@ -28,15 +28,25 @@ class Maintenance(object):
 
     def __init__(self, trasport_url, conf, log):
         self.conf = conf
         self.log = log
+        self.admin_session = get_session()
         self.keystone = keystone_client(
             self.conf.keystone_version, get_session())
         self.nova = nova_client(conf.nova_version, get_session())
         auth = get_identity_auth(project=self.conf.doctor_project)
         self.neutron = neutron_client(get_session(auth=auth))
         self.stack = Stack(self.conf, self.log)
-        self.admin_tool = get_admin_tool(trasport_url, self.conf, self.log)
+        if self.conf.installer.type == "devstack":
+            self.endpoint_ip = trasport_url.split("@", 1)[1].split(":", 1)[0]
+        else:
+            self.endpoint_ip = self.conf.admin_tool.ip
+        self.endpoint = "http://%s:12347/" % self.endpoint_ip
+        if self.conf.admin_tool.type == 'sample':
+            self.admin_tool = get_admin_tool(trasport_url, self.conf,
+                                             self.log)
+            self.endpoint += 'maintenance'
+        else:
+            self.endpoint += 'v1/maintenance'
         self.app_manager = get_app_manager(self.stack, self.conf, self.log)
-        self.inspector = get_inspector(self.conf, self.log)
+        self.inspector = get_inspector(self.conf, self.log, trasport_url)
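On devstack the admin tool's address is not configured anywhere, so it is peeled out of the AMQP transport URL: the host sits between the '@' and the port. The same derivation in isolation, under that single-host assumption and with a sample URL::

    transport_url = 'rabbit://openstack:secret@192.0.2.10:5672/'  # sample
    endpoint_ip = transport_url.split('@', 1)[1].split(':', 1)[0]
    endpoint = 'http://%s:12347/v1/maintenance' % endpoint_ip
    assert endpoint == 'http://192.0.2.10:12347/v1/maintenance'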
     def get_external_network(self):
         ext_net = None
@@ -64,8 +74,16 @@
             raise Exception('not enough vcpus (%d) on %s'
                             % (vcpus, hostname))
         if vcpus_used > 0:
-            raise Exception('%d vcpus used on %s'
-                            % (vcpus_used, hostname))
+            if self.conf.test_case == 'all':
+                # VCPUs might not yet be free after the fault_management test
+                self.log.info('%d vcpus used on %s, retry...'
+                              % (vcpus_used, hostname))
+                time.sleep(15)
+                hvisor = self.nova.hypervisors.get(hvisor.id)
+                vcpus_used = hvisor.__getattr__('vcpus_used')
+            if vcpus_used > 0:
+                raise Exception('%d vcpus used on %s'
+                                % (vcpus_used, hostname))
         if prev_vcpus != 0 and prev_vcpus != vcpus:
             raise Exception('%d vcpus on %s does not match to'
                             '%d on %s'
@@ -110,9 +128,14 @@
                                  parameters=parameters,
                                  files=files)
 
-        self.admin_tool.start()
-        self.app_manager.start()
+        if self.conf.admin_tool.type == 'sample':
+            self.admin_tool.start()
+        else:
+            # TBD For now we expect Fenix to be already running on
+            # self.conf.admin_tool.port
+            pass
+        # Inspector before app_manager, as the floating IP might come late
         self.inspector.start()
+        self.app_manager.start()
 
     def start_maintenance(self):
         self.log.info('start maintenance.......')
@@ -121,22 +144,49 @@
         for hvisor in hvisors:
             hostname = hvisor.__getattr__('hypervisor_hostname')
             maintenance_hosts.append(hostname)
-
-        url = 'http://0.0.0.0:%s/maintenance' % self.conf.admin_tool.port
-        # let's start maintenance 20sec from now, so projects will have
-        # time to ACK to it before that
-        maintenance_at = (datetime.datetime.utcnow() +
-                          datetime.timedelta(seconds=20)
-                          ).strftime('%Y-%m-%d %H:%M:%S')
-        data = {'hosts': maintenance_hosts,
-                'state': 'MAINTENANCE',
-                'maintenance_at': maintenance_at,
-                'metadata': {'openstack_version': 'Pike'}}
+        url = self.endpoint
         headers = {
             'Content-Type': 'application/json',
             'Accept': 'application/json'}
-
-        ret = requests.post(url, data=json.dumps(data), headers=headers)
+        if self.conf.admin_tool.type == 'fenix':
+            headers['X-Auth-Token'] = self.admin_session.get_token()
+        self.log.info('url %s headers %s' % (url, headers))
+        retries = 12
+        ret = None
+        while retries > 0:
+            # let's start maintenance 30sec from now, so projects will have
+            # time to ACK to it before that
+            maintenance_at = (datetime.datetime.utcnow() +
+                              datetime.timedelta(seconds=30)
+                              ).strftime('%Y-%m-%d %H:%M:%S')
+
+            data = {'state': 'MAINTENANCE',
+                    'maintenance_at': maintenance_at,
+                    'metadata': {'openstack_version': 'Train'}}
+
+            if self.conf.app_manager.type == 'vnfm':
+                data['workflow'] = 'vnf'
+            else:
+                data['workflow'] = 'default'
+
+            if self.conf.admin_tool.type == 'sample':
+                data['hosts'] = maintenance_hosts
+            else:
+                data['hosts'] = []
+            try:
+                ret = requests.post(url, data=json.dumps(data),
+                                    headers=headers)
+            except Exception:
+                if retries == 0:
+                    raise Exception('admin tool did not respond in 120s')
+                else:
+                    self.log.info('admin tool not ready, retry in 10s')
+                retries = retries - 1
+                time.sleep(10)
+                continue
+            break
+        if not ret:
+            raise Exception("admin tool did not respond")
         if ret.status_code != 200:
             raise Exception(ret.text)
 
         return ret.json()['session_id']
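The maintenance window is always scheduled slightly in the future so every project has time to ACK the announcement before it starts. Computing such a start time, with the same format string as above::

    import datetime

    maintenance_at = (datetime.datetime.utcnow() +
                      datetime.timedelta(seconds=30)
                      ).strftime('%Y-%m-%d %H:%M:%S')
    print(maintenance_at)  # e.g. '2019-06-01 12:00:30'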
@@ -144,48 +194,56 @@
     def remove_maintenance_session(self, session_id):
         self.log.info('remove maintenance session %s.......'
                       % session_id)
-        url = 'http://0.0.0.0:%s/maintenance' % self.conf.admin_tool.port
+        url = ('%s/%s' % (self.endpoint, session_id))
 
-        data = {'state': 'REMOVE_MAINTENANCE_SESSION',
-                'session_id': session_id}
         headers = {
             'Content-Type': 'application/json',
             'Accept': 'application/json'}
-        ret = requests.post(url, data=json.dumps(data), headers=headers)
+        if self.conf.admin_tool.type == 'fenix':
+            headers['X-Auth-Token'] = self.admin_session.get_token()
+
+        ret = requests.delete(url, data=None, headers=headers)
         if ret.status_code != 200:
             raise Exception(ret.text)
 
     def get_maintenance_state(self, session_id):
-        url = 'http://0.0.0.0:%s/maintenance' % self.conf.admin_tool.port
-        data = {'session_id': session_id}
+
+        url = ('%s/%s' % (self.endpoint, session_id))
+
         headers = {
             'Content-Type': 'application/json',
             'Accept': 'application/json'}
-        ret = requests.get(url, data=json.dumps(data), headers=headers)
+
+        if self.conf.admin_tool.type == 'fenix':
+            headers['X-Auth-Token'] = self.admin_session.get_token()
+
+        ret = requests.get(url, data=None, headers=headers)
         if ret.status_code != 200:
             raise Exception(ret.text)
         return ret.json()['state']
 
     def wait_maintenance_complete(self, session_id):
-        retries = 66
+        retries = 90
         state = None
-        time.sleep(540)
-        while state != 'MAINTENANCE_COMPLETE' and retries > 0:
+        time.sleep(300)
+        while (state not in ['MAINTENANCE_DONE', 'MAINTENANCE_FAILED'] and
+               retries > 0):
             time.sleep(10)
             state = self.get_maintenance_state(session_id)
             retries = retries - 1
-        if retries == 0 and state != 'MAINTENANCE_COMPLETE':
-            raise Exception('maintenance %s not completed within 20min, status'
-                            ' %s' % (session_id, state))
-        elif state == 'MAINTENANCE_COMPLETE':
-            self.log.info('maintenance %s %s' % (session_id, state))
-            self.remove_maintenance_session(session_id)
-        elif state == 'MAINTENANCE_FAILED':
+        self.remove_maintenance_session(session_id)
+        self.log.info('maintenance %s ended with state %s' %
+                      (session_id, state))
+        if state == 'MAINTENANCE_FAILED':
             raise Exception('maintenance %s failed' % session_id)
+        elif retries == 0:
+            raise Exception('maintenance %s not completed within 20min' %
+                            session_id)
 
     def cleanup_maintenance(self):
-        self.admin_tool.stop()
+        if self.conf.admin_tool.type == 'sample':
+            self.admin_tool.stop()
         self.app_manager.stop()
         self.inspector.stop()
         self.log.info('stack delete start.......')
diff --git a/doctor_tests/stack.py b/doctor_tests/stack.py
index ee586fa8..8a921beb 100644
--- a/doctor_tests/stack.py
+++ b/doctor_tests/stack.py
@@ -94,7 +94,7 @@ class Stack(object):
             # It might not always work at first
             self.log.info('retry creating maintenance stack.......')
             self.delete()
-            time.sleep(3)
+            time.sleep(5)
             stack = self.heat.stacks.create(stack_name=self.stack_name,
                                             files=files,
                                             template=template,
diff --git a/doctor_tests/user.py b/doctor_tests/user.py
index 29aa004b..2cd9757f 100644
--- a/doctor_tests/user.py
+++ b/doctor_tests/user.py
@@ -129,7 +129,6 @@ class User(object):
 
     def _add_user_role_in_project(self, is_admin=False):
         """add test user with test role in test project"""
-
         project = self.projects.get(self.conf.doctor_project)
 
         user_name = 'admin' if is_admin else self.conf.doctor_user
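
The polling in ``wait_maintenance_complete()`` above reduces to the same
request pattern. This sketch reuses the hypothetical ``endpoint``,
``headers`` and ``session_id`` from the previous example, polls until a
terminal state is reached, and then deletes the session::

    import time

    import requests

    url = '%s/%s' % (endpoint, session_id)
    state = None
    retries = 90  # up to 15 minutes of polling at 10s intervals
    while (state not in ['MAINTENANCE_DONE', 'MAINTENANCE_FAILED'] and
           retries > 0):
        time.sleep(10)
        ret = requests.get(url, headers=headers)
        if ret.status_code != 200:
            raise Exception(ret.text)
        state = ret.json()['state']
        retries -= 1
    # remove the session whether it succeeded or failed
    requests.delete(url, headers=headers)
    if state == 'MAINTENANCE_FAILED':
        raise Exception('maintenance %s failed' % session_id)
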
@@ -1,12 +1,12 @@
 [tox]
 minversion = 2.3.1
-envlist = py34, pep8,docs,docs-linkcheck
+envlist = py36,pep8,docs,docs-linkcheck
 skipsdist = True
 
 [testenv]
 usedevelop = True
-install_command = pip install \
-    -chttps://git.openstack.org/cgit/openstack/requirements/plain/upper-constraints.txt?h=stable/pike \
+install_command = pip3 install \
+    -chttps://git.openstack.org/cgit/openstack/requirements/plain/upper-constraints.txt?h=stable/stein \
     {opts} {packages}
 setenv = VIRTUAL_ENV={envdir}
 deps = -r{toxinidir}/requirements.txt
@@ -29,10 +29,13 @@ passenv =
     INSTALLER_TYPE
     INSTALLER_IP
     INSPECTOR_TYPE
+    ADMIN_TOOL_TYPE
     TEST_CASE
     SSH_KEY
+    APP_MANAGER_TYPE
 changedir = {toxinidir}/doctor_tests
 commands = doctor-test
+    /usr/bin/find {toxinidir} -type f -name '*.py[co]' -delete -o -type d -name __pycache__ -delete
 
 [testenv:pep8]
 changedir = {toxinidir}
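
The two new ``passenv`` entries let the test environment pick up the admin
tool and app manager types from the caller's shell. A minimal sketch of how
such variables are typically consumed; the default values here are
hypothetical::

    import os

    # Hypothetical defaults; 'sample' and 'fenix' / 'vnfm' are the types
    # that appear in the patches above.
    admin_tool_type = os.environ.get('ADMIN_TOOL_TYPE', 'sample')
    app_manager_type = os.environ.get('APP_MANAGER_TYPE', 'sample')
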