-rw-r--r--   docs/development/design/index.rst                          |   1
-rw-r--r--   docs/development/design/maintenance-design-guideline.rst   | 155
-rw-r--r--   tests/installer/apex.py                                    |   6
-rw-r--r--   tests/logger.py                                            |   5
4 files changed, 159 insertions, 8 deletions
diff --git a/docs/development/design/index.rst b/docs/development/design/index.rst
index 87d14d42..e50c1704 100644
--- a/docs/development/design/index.rst
+++ b/docs/development/design/index.rst
@@ -26,3 +26,4 @@ See also https://wiki.opnfv.org/requirements_projects .
    port-data-plane-status.rst
    inspector-design-guideline.rst
    performance-profiler.rst
+   maintenance-design-guideline.rst
diff --git a/docs/development/design/maintenance-design-guideline.rst b/docs/development/design/maintenance-design-guideline.rst
new file mode 100644
index 00000000..93c3cf4e
--- /dev/null
+++ b/docs/development/design/maintenance-design-guideline.rst
@@ -0,0 +1,155 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+====================================
+Planned Maintenance Design Guideline
+====================================
+
+.. NOTE::
+   This is a draft spec of the design guideline for planned maintenance.
+   JIRA ticket to track the update and collect comments: `DOCTOR-52`_.
+
+This document describes how one can implement planned maintenance by utilizing
+the `OPNFV Doctor project`_ framework and meet the set requirements.
+
+Problem Description
+===================
+
+A Telco application needs to know when planned maintenance is going to happen
+in order to guarantee zero downtime in its operation. It needs to be able to
+take its own actions to keep the application running on unaffected resources,
+or to give guidance for admin actions like migration. More details are defined
+in the requirement documentation: `use cases`_, `architecture`_ and
+`implementation`_. See also the OPNFV summit discussion about the
+`planned maintenance session`_.
+
+Guidelines
+==========
+
+The cloud admin needs to send a notification about planned maintenance,
+including all the details that the application needs in order to make
+decisions about its affected service. This notification payload can be
+consumed by the application by subscribing to the corresponding event alarm
+through an alarming service like OpenStack AODH, as sketched below.
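+
+A minimal sketch of such a subscription with python-aodhclient, assuming a
+hypothetical `maintenance.planned` event type and consumer endpoint (neither
+is fixed by this document):
+
+.. code-block:: python
+
+    # Sketch: subscribe to project-specific planned maintenance
+    # notifications as an AODH event alarm. The event type
+    # 'maintenance.planned', the credentials and the alarm action URL
+    # are assumptions made for illustration only.
+    from aodhclient import client as aodh_client
+    from keystoneauth1.identity import v3
+    from keystoneauth1 import session
+
+    auth = v3.Password(auth_url='http://controller:5000/v3',  # assumed endpoint
+                       username='app_manager',                # assumed credentials
+                       password='secret',
+                       project_name='demo',
+                       user_domain_name='Default',
+                       project_domain_name='Default')
+    aodh = aodh_client.Client('2', session.Session(auth=auth))
+
+    # Fire the alarm whenever a planned maintenance notification is
+    # received, and POST the alarm payload to the application manager.
+    aodh.alarm.create({
+        'name': 'maintenance_alarm',
+        'type': 'event',
+        'event_rule': {'event_type': 'maintenance.planned'},
+        'alarm_actions': ['http://app-manager:12348/maintenance'],
+    })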
+
+Before maintenance starts, the application needs to be able to make a
+switchover for its affected ACT-STBY service, do an operation to move the
+service to an unaffected part of the infrastructure, or give a hint for an
+admin operation like migration that can be automatically issued by the admin
+tool according to the agreed policy.
+
+Flow diagram::
+
+    admin alarming project controller  inspector
+    |    service app manager   |           |
+    | 1.    |         |        |           |
+    +------------------------->+           |
+    +<-------------------------+           |
+    | 2.    |         |        |           |
+    +------>+ 3.      |        |           |
+    |       +-------->+ 4.     |           |
+    |       |         +------->+           |
+    |       |    5.   +<-------+           |
+    +<----------------+        |           |
+    |       |     6.           |           |
+    +------------------------->+           |
+    +<-------------------------+ 7.        |
+    +------------------------------------->+
+    | 8.    |         |        |           |
+    +------>+ 9.      |        |           |
+    |       +-------->+        |           |
+    +--------------------------------------+
+    |                 10.                  |
+    +--------------------------------------+
+    | 11.   |         |        |           |
+    +------------------------->+           |
+    +<-------------------------+           |
+    | 12.   |         |        |           |
+    +------>+-------->+        | 13.       |
+    +------------------------------------->+
+    +-------+---------+--------+-----------+
+
+Concepts used below:
+
+- `full maintenance`: The maintenance will take a longer time and the resource
+  should be emptied, meaning a container or VM needs to be moved or deleted.
+  The admin might need to test that the resource works after the maintenance.
+
+- `reboot`: Only a reboot is needed and the admin does not need separate
+  testing after that. A container or VM can be left in place if so wanted.
+
+- `notification`: A notification to RabbitMQ.
+
+The admin creates a planned maintenance session and sets
+a `maintenance_session_id` that is a unique ID for all the hardware resources
+that are going to be in maintenance at the same time. Mostly maintenance
+should be done node by node, meaning a single compute node at a time would be
+in a single planned maintenance session with its own unique
+`maintenance_session_id`. This ID is carried through the whole session in all
+places and can be used to query the maintenance in the admin tool API. A
+project running a Telco application should have a specific role set, so the
+admin tool knows it cannot do planned maintenance unless the project has
+agreed to the actions to be done for its VMs or containers. This means the
+project has configured itself to get alarms upon planned maintenance and is
+capable of agreeing on the needed actions. The admin is expected to use an
+admin tool to automate the maintenance process partially or entirely.
+
+The flow of a successful planned maintenance session in an OpenStack example
+case:
+
+1. The admin disables nova-compute in order to do planned maintenance on a
+   compute host and gets an ACK from the API call. This needs to be done to
+   ensure nothing new will be placed on this compute host by any user. The
+   action is always done regardless of whether the whole compute host will be
+   affected or not.
+2. The admin sends a project-specific maintenance notification with state
+   `planned maintenance` (a sketch of such a notification follows this list).
+   This includes detailed information about the maintenance, like when it is
+   going to start and whether it is `reboot` or `full maintenance`, as well as
+   information about the project containers or VMs running on the host, or on
+   the part of it that will need maintenance. A default action like migration,
+   to be issued by the admin before the maintenance starts if no other action
+   is set by the project, is also mentioned. In case the project has the
+   specific role set, planned maintenance cannot start unless the project has
+   agreed to the admin action. Available admin actions are also listed in the
+   notification.
+3. The application manager of the project receives an AODH alarm about the
+   same.
+4. The application manager can switch over its ACT-STBY service, or delete and
+   re-instantiate its service on an unaffected resource if so wanted.
+5. The application manager may call the admin tool API to give preferred
+   instructions to leave VMs and containers in place, or to have an admin
+   action migrate them. In case the admin does not receive this instruction
+   before the maintenance is to start, it will apply the pre-configured
+   default action, like migration, to projects without the specific role that
+   requires the project to agree to the action. VMs or containers can be left
+   on the host if the type of maintenance is just `reboot`.
+6. The admin performs the possible actions on VMs and containers and receives
+   an ACK.
+7. In case everything went OK, the admin sends an admin type of maintenance
+   notification with state `in maintenance`. This notification can be consumed
+   by the Inspector and other cloud services to know there is ongoing
+   maintenance, which means things like automatic fault management actions for
+   the hardware resources should be disabled.
+8. If the maintenance type is `reboot` and the project still has containers or
+   VMs running on the affected hardware resource, the admin sends a
+   project-specific maintenance notification with the state updated to
+   `in maintenance`. If the project does not have anything left running on the
+   affected hardware resource, the state will be `maintenance over` instead.
+   If the maintenance cannot be performed for some reason, the state should be
+   `maintenance cancelled`. In that case the last operation remaining for the
+   admin is to re-enable the nova-compute service, ensure everything is
+   running, and not proceed with any further steps.
+9. The application manager of the project receives an AODH alarm about the
+   same.
+10. The admin does the maintenance. This is out of Doctor scope.
+11. The admin enables the nova-compute service when the maintenance is over
+    and the host can be put back into production. An ACK is received from the
+    API call.
+12. In case the project had left containers or VMs on the hardware resource
+    over the maintenance, the admin sends a project-specific maintenance
+    notification with the state updated to `maintenance over`.
+13. The admin sends an admin type of maintenance notification with the state
+    updated to `maintenance over`. The Inspector and other cloud services can
+    consume this to know the hardware resource is back in use.
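+
+The notifications in steps 2, 8, 12 and 13 could be emitted by the admin tool
+with oslo.messaging. A minimal sketch, assuming a hypothetical
+`maintenance.planned` event type and payload layout (this document fixes
+neither):
+
+.. code-block:: python
+
+    # Sketch: emit a project-specific maintenance notification onto the
+    # notification bus (RabbitMQ). The event type, payload keys and
+    # broker URL below are assumptions made for illustration only.
+    from oslo_config import cfg
+    import oslo_messaging
+
+    transport = oslo_messaging.get_notification_transport(
+        cfg.CONF, url='rabbit://user:pass@controller:5672/')  # assumed broker
+    notifier = oslo_messaging.Notifier(transport,
+                                       publisher_id='maintenance.host1',
+                                       driver='messaging',
+                                       topics=['notifications'])
+
+    payload = {
+        'maintenance_session_id': 'b86bfa5c',  # unique ID for the session
+        'state': 'planned maintenance',        # later 'in maintenance', ...
+        'host': 'overcloud-novacompute-0',     # assumed host name
+        'type': 'reboot',                      # or 'full maintenance'
+        'start': '2017-10-01T06:00:00Z',
+        'default_action': 'migrate',
+        'available_actions': ['migrate', 'live migrate'],
+    }
+    # info() publishes an INFO-level notification of the given event type.
+    notifier.info({}, 'maintenance.planned', payload)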
+
+POC
+---
+
+There was a `Maintenance POC`_ demo for planned maintenance at the OPNFV
+Beijing summit to show the basic concept of using the framework defined by the
+project.
+
+.. _DOCTOR-52: https://jira.opnfv.org/browse/DOCTOR-52
+.. _OPNFV Doctor project: https://wiki.opnfv.org/doctor
+.. _use cases: http://artifacts.opnfv.org/doctor/docs/requirements/02-use_cases.html#nvfi-maintenance
+.. _architecture: http://artifacts.opnfv.org/doctor/docs/requirements/03-architecture.html#nfvi-maintenance
+.. _implementation: http://artifacts.opnfv.org/doctor/docs/requirements/05-implementation.html#nfvi-maintenance
+.. _planned maintenance session: https://lists.opnfv.org/pipermail/opnfv-tech-discuss/2017-June/016677.html
+.. _Maintenance POC: https://wiki.opnfv.org/download/attachments/5046291/Doctor%20Maintenance%20PoC%202017.pptx?version=1&modificationDate=1498182869000&api=v2
diff --git a/tests/installer/apex.py b/tests/installer/apex.py
index 24cd5a75..e0960a5f 100644
--- a/tests/installer/apex.py
+++ b/tests/installer/apex.py
@@ -13,8 +13,6 @@ import pwd
 import stat
 import sys
 
-from installer.common.congress import set_doctor_driver_conf
-from installer.common.congress import restore_doctor_driver_conf
 from installer.base import BaseInstaller
 from utils import SSHClient
 
@@ -74,16 +72,12 @@ class ApexInstaller(BaseInstaller):
             client = SSHClient(node_ip, self.node_user_name,
                                key_filename=self.key_file)
             self.controller_clients.append(client)
             self._ceilometer_apply_patches(client, self.cm_set_script)
-            cmd = 'sudo systemctl restart openstack-congress-server.service'
-            set_doctor_driver_conf(client, cmd)
 
     def restore_apply_patches(self):
         self.log.info('restore apply patches start......')
         for client in self.controller_clients:
             self._ceilometer_apply_patches(client, self.cm_restore_script)
-            cmd = 'sudo systemctl restart openstack-congress-server.service'
-            restore_doctor_driver_conf(client, cmd)
 
     def _ceilometer_apply_patches(self, ssh_client, script_name):
         installer_dir = os.path.dirname(os.path.realpath(__file__))
diff --git a/tests/logger.py b/tests/logger.py
index 021389d9..80d19bb9 100644
--- a/tests/logger.py
+++ b/tests/logger.py
@@ -21,7 +21,8 @@ class Logger(object):
 
         CI_DEBUG = os.getenv('CI_DEBUG')
 
-        logging.basicConfig(filemode='w')
+        filename = '%s.log' % logger_name
+        logging.basicConfig(filemode='w', filename=filename)
         self.logger = logging.getLogger(logger_name)
         self.logger.propagate = 0
         self.logger.setLevel(logging.DEBUG)
@@ -37,7 +38,7 @@ class Logger(object):
         ch.setLevel(logging.INFO)
         self.logger.addHandler(ch)
 
-        file_handler = logging.FileHandler('%s.log' % logger_name)
+        file_handler = logging.FileHandler(filename)
         file_handler.setFormatter(formatter)
         file_handler.setLevel(logging.DEBUG)
         self.logger.addHandler(file_handler)