From 61a851845546300cc2f5ee9f3dd6761c9ecd093e Mon Sep 17 00:00:00 2001 From: Jie Hu Date: Wed, 16 Dec 2015 18:52:19 +0800 Subject: ESCALATOR-31 Adjusting documentation JIRA: ESCALATOR-31 Change-Id: I0b83511a542982f07c2ab9d60517f4b5f357569b Signed-off-by: Jie Hu --- docs/00-Authors.rst | 15 - docs/01-Scope.rst | 28 -- docs/02-Background_and_Terminologies.rst | 535 --------------------- docs/03-Functional_Requirements.rst | 240 --------- docs/04-Use_Cases_and_Scenarios.rst | 211 -------- docs/05-Reference_Architecture.rst | 113 ----- docs/06-Information_Flows.rst | 56 --- docs/07-Interfaces_and_Files.rst | 27 -- docs/08-Requirements_from_other_OPNFV_Project.rst | 40 -- docs/09-Reference.rst | 17 - docs/10-Useful_Working_Drafts_of_ETSI_NFV.rst | 11 - docs/A1-Appendix.rst | 49 -- docs/design/201-Reference_Architecture.rst | 54 +++ docs/design/202-Information_Flows.rst | 57 +++ docs/design/203-Administrative_Interfaces.rst | 16 + docs/design/204-Configuration_and_Logging.rst | 18 + docs/design/2A1-Appendix.rst | 50 ++ docs/design/etc/conf.py | 34 ++ docs/design/etc/opnfv-logo.png | Bin 0 -> 2829 bytes docs/design/images/figure2.png | Bin 0 -> 9196 bytes docs/design/images/figure3.png | Bin 0 -> 25188 bytes docs/design/images/figure4.png | Bin 0 -> 122040 bytes docs/design/images/figure5.png | Bin 0 -> 28490 bytes docs/design/images/figure6.png | Bin 0 -> 46314 bytes docs/design/index.rst | 33 ++ docs/etc/conf.py | 34 ++ docs/etc/opnfv-logo.png | Bin 0 -> 2829 bytes docs/gap_analysis/301-Impact_Analysis.rst | 47 ++ docs/gap_analysis/etc/conf.py | 34 ++ docs/gap_analysis/etc/opnfv-logo.png | Bin 0 -> 2829 bytes docs/gap_analysis/index.rst | 30 ++ docs/how-to-use-docs/README.txt | 1 - docs/index.rst | 38 -- docs/requirements/000-Contributors.rst | 16 + docs/requirements/101-Scope.rst | 45 ++ docs/requirements/102-Terminologies.rst | 129 +++++ docs/requirements/103-Background.rst | 226 +++++++++ docs/requirements/104-Requirements.rst | 478 ++++++++++++++++++ 
docs/requirements/105-Use_Cases.rst | 213 ++++++++ docs/requirements/106-Reference.rst | 18 + .../1A1-Requirements_from_other_Projects.rst | 34 ++ .../1A2-Questionnaire_of_Escalator.rst | 11 + docs/requirements/300-Gap_Analysis_Report.rst | 50 ++ docs/requirements/etc/conf.py | 34 ++ docs/requirements/etc/opnfv-logo.png | Bin 0 -> 2829 bytes docs/requirements/images/figure1.png | Bin 0 -> 118003 bytes docs/requirements/index.rst | 37 ++ 47 files changed, 1698 insertions(+), 1381 deletions(-) delete mode 100644 docs/00-Authors.rst delete mode 100644 docs/01-Scope.rst delete mode 100644 docs/02-Background_and_Terminologies.rst delete mode 100644 docs/03-Functional_Requirements.rst delete mode 100644 docs/04-Use_Cases_and_Scenarios.rst delete mode 100644 docs/05-Reference_Architecture.rst delete mode 100644 docs/06-Information_Flows.rst delete mode 100644 docs/07-Interfaces_and_Files.rst delete mode 100644 docs/08-Requirements_from_other_OPNFV_Project.rst delete mode 100644 docs/09-Reference.rst delete mode 100644 docs/10-Useful_Working_Drafts_of_ETSI_NFV.rst delete mode 100644 docs/A1-Appendix.rst create mode 100644 docs/design/201-Reference_Architecture.rst create mode 100644 docs/design/202-Information_Flows.rst create mode 100644 docs/design/203-Administrative_Interfaces.rst create mode 100644 docs/design/204-Configuration_and_Logging.rst create mode 100644 docs/design/2A1-Appendix.rst create mode 100644 docs/design/etc/conf.py create mode 100644 docs/design/etc/opnfv-logo.png create mode 100644 docs/design/images/figure2.png create mode 100644 docs/design/images/figure3.png create mode 100644 docs/design/images/figure4.png create mode 100644 docs/design/images/figure5.png create mode 100644 docs/design/images/figure6.png create mode 100644 docs/design/index.rst create mode 100644 docs/etc/conf.py create mode 100644 docs/etc/opnfv-logo.png create mode 100644 docs/gap_analysis/301-Impact_Analysis.rst create mode 100644 docs/gap_analysis/etc/conf.py create mode 
100644 docs/gap_analysis/etc/opnfv-logo.png create mode 100644 docs/gap_analysis/index.rst delete mode 100644 docs/how-to-use-docs/README.txt delete mode 100644 docs/index.rst create mode 100644 docs/requirements/000-Contributors.rst create mode 100644 docs/requirements/101-Scope.rst create mode 100644 docs/requirements/102-Terminologies.rst create mode 100644 docs/requirements/103-Background.rst create mode 100644 docs/requirements/104-Requirements.rst create mode 100644 docs/requirements/105-Use_Cases.rst create mode 100644 docs/requirements/106-Reference.rst create mode 100644 docs/requirements/1A1-Requirements_from_other_Projects.rst create mode 100644 docs/requirements/1A2-Questionnaire_of_Escalator.rst create mode 100644 docs/requirements/300-Gap_Analysis_Report.rst create mode 100644 docs/requirements/etc/conf.py create mode 100644 docs/requirements/etc/opnfv-logo.png create mode 100644 docs/requirements/images/figure1.png create mode 100644 docs/requirements/index.rst diff --git a/docs/00-Authors.rst b/docs/00-Authors.rst deleted file mode 100644 index fdbf61b..0000000 --- a/docs/00-Authors.rst +++ /dev/null @@ -1,15 +0,0 @@ -Authors: --------- - -| Jie Hu (ZTE, hu.jie@zte.com.cn) -| Qiao Fu (China Mobile, fuqiao@chinamobile.com) -| Ulrich Kleber (Huawei, Ulrich.Kleber@huawei.com) -| Maria Toeroe (Ericsson, maria.toeroe@ericsson.com) -| Sama, Malla Reddy (DOCOMO, sama@docomolab-euro.com) -| Zhong Chao (ZTE, chao.zhong@zte.com.cn) -| Julien Zhang (ZTE, zhang.jun3g@zte.com.cn) -| Yuri Yuan (ZTE, yuan.yue@zte.com.cn) -| Zhipeng Huang (Huawei, huangzhipeng@huawei.com) -| Jia Meng (ZTE, meng.jia@zte.com.cn) -| Liyi Meng (Ericsson, liyi.meng@ericsson.com) -| Pasi Vaananen (Stratus, pasi.vaananen@stratus.com) \ No newline at end of file diff --git a/docs/01-Scope.rst b/docs/01-Scope.rst deleted file mode 100644 index 5247e40..0000000 --- a/docs/01-Scope.rst +++ /dev/null @@ -1,28 +0,0 @@ -Scope ------ - -This document describes the user requirements on the smooth 
upgrade -function of the NFVI and VIM with respect to the upgrades of the OPNFV -platform from one version to another. Smooth upgrade means that the -upgrade results in no service outage for the end-users. This requires -that the process of the upgrade is automatically carried out by a tool -(code name: Escalator) with pre-configured data. The upgrade process -includes preparation, validation, execution, monitoring and -conclusion. - -.. While it is good to have a tool for the entire upgrade process, - but it is a challenging task, so maybe we shouldn't require automation - for the entire process right away. Automation is essential at - execution. - -.. Maybe we can analysis information flows of the upgrade tool, - abstract the basic / essential actions from the tool (or tools), and - map them to a command set of NFVI / VIM's interfaces. - -The requirements are defined in a stepwise approach, i.e. in the first -phase focusing on the upgrade of the VIM then widening the scope to the -NFVI. - -The requirements may apply to different NFV functions (NFVI, or VIM, or -both of them). They will be classified in the Appendix of this -document. \ No newline at end of file diff --git a/docs/02-Background_and_Terminologies.rst b/docs/02-Background_and_Terminologies.rst deleted file mode 100644 index 488968b..0000000 --- a/docs/02-Background_and_Terminologies.rst +++ /dev/null @@ -1,535 +0,0 @@ -General Requirements Background and Terminology ------------------------------------------------ - -Terminologies and definitions -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -NFVI - The term is an abbreviation for Network Function Virtualization - Infrastructure; sometimes it is also referred as data plane in this - document. The NFVI provides the virtual resources to the virtual - network functions under the control of the VIM. - -VIM - The term is an abbreviation for Virtual Infrastructure Manager; - sometimes it is also referred as control plane in this document. 
- The VIM controls and manages the NFVI compute, network and storage - resources to provide the required virtual resources to the VNFs. - -Operator - The term refers to network service providers and Virtual Network - Function (VNF) providers. - -End-User - The term refers to a subscriber of the Operator's services. - -Network Service - The term refers to a service provided by an Operator to its - end-users using a set of (virtualized) Network Functions - -Infrastructure Services - The term refers to services provided by the NFV Infrastructure to the VNFs - as required by the Management & Orchestration functions and especially the VIM. - I.e. these are the virtual resources as perceived by the VNFs. - -Smooth Upgrade - The term refers to an upgrade that results in no service outage - for the end-users. - -Rolling Upgrade - The term refers to an upgrade strategy, which upgrades a node or a subset - of nodes at a time in a wave style rolling through the data centre. It - is a popular upgrade strategy to maintain service availability. - -Parallel Universe Upgrade - The term refers to an upgrade strategy, which creates and deploys - a new universe - a system with the new configuration - while the old - system continues running. The state of the old system is transferred - to the new system after sufficient testing of the new system. - -Infrastructure Resource Model - The term refers to the representation of infrastructure resources, - namely: the physical resources, the virtualization - facility resources and the virtual resources. - -Physical Resource - The term refers to a piece of hardware in the NFV infrastructure that may - also include firmware enabling this piece of hardware. - -Virtual Resource - The term refers to a resource, which is provided as services built on top - of the physical resources via the virtualization facilities; in particular, - virtual resources are the resources on which VNFs are deployed. 
Examples of - virtual resources are: VMs, virtual switches, virtual routers, virtual disks. - -Virtualization Facility - The term refers to a resource that enables the creation - of virtual environments on top of the physical resources, e.g. - hypervisor, OpenStack, etc. - -Upgrade Campaign - The term refers to a choreography that describes how the upgrade should - be performed in terms of its targets (i.e. upgrade objects), the - steps/actions required for upgrading each, and the coordination of these - steps so that service availability can be maintained. It is an input to an - upgrade tool (Escalator) to carry out the upgrade. - -Upgrade Duration - The duration of an upgrade characterized by the time elapsed between its - initiation and its completion. E.g. from the moment the execution of an - upgrade campaign has started until it has been committed. Depending on - the upgrade strategy, the state of the configuration and the upgrade target - some parts of the system may be in a more vulnerable state with respect to - service availability. - -Outage - The period of time during which a given service is not provided is referred - to as the outage of that given service. If a subsystem or the entire system - does not provide any service, it is the outage of the given subsystem or the - system. Smooth upgrade means upgrade with no outage for the user plane, i.e. - no VNF should experience service outage. - -Rollback - The term refers to a failure handling strategy that reverts the changes - done by a potentially failed upgrade execution one by one in a reverse order. - I.e. it is like undoing the changes done by the upgrade. - -Backup - The term refers to data persisted to a storage, so that it can be used to - restore the system or a given part of it in the same state as it was when the - backup was created assuming a cold restart.
Changes made to the system from - the moment the backup was created till the moment it is used to restore the - (sub)system are lost in the restoration process. - -Restore - The term refers to a failure handling strategy that reverts the changes - done, for example, by an upgrade by restoring the system from some backup - data. This results in the loss of any change and data persisted after the - backup was taken. To recover those, additional measures need to be taken - if necessary (e.g. rollforward). - -Rollforward - The term refers to a failure handling strategy applied after a restore - (from a backup) operation to recover any loss of data persisted between - the time the backup has been taken and the moment it is restored. Rollforward - requires that data that needs to survive the restore operation is logged at - a location not impacted by the restore so that it can be re-applied to the - system after its restoration from the backup. - -Downgrade - The term refers to an upgrade in which an earlier version of the software - is restored through the upgrade procedure. A system can be downgraded to any - earlier version and the compatibility of the versions will determine the - applicable upgrade strategies and whether service outage can be avoided. - In particular any data conversion needs special attention. - - - -Upgrade Objects -~~~~~~~~~~~~~~~ - -Physical Resource -^^^^^^^^^^^^^^^^^ - -Most cloud infrastructures support the dynamic addition and removal of -hardware. Accordingly a hardware upgrade could be done by adding the new -piece of hardware and removing the old one. From the perspective of smooth -upgrade the orchestration/scheduling of these actions is the primary concern. - -Upgrading a physical resource may involve as well the upgrade of its firmware -and/or modifying its configuration data. This may require the restart of the -hardware.
- - - -Virtual Resources -^^^^^^^^^^^^^^^^^ - -Addition and removal of virtual resources may be initiated by the users or be -a result of an elasticity action. Users may also request the upgrade of their -virtual resources using a new VM image. - -.. Needs to be moved to requirement section: Escalator should facilitate such an -option and allow for a smooth upgrade. - -On the other hand changes in the infrastructure, namely, in the hardware and/or -the virtualization facility resources may result in the upgrade of the virtual -resources. For example if for some reason the hypervisor is changed and -the current VMs cannot be migrated to the new hypervisor - they are -incompatible - then the VMs need to be upgraded too. This is not -something the NFVI user (i.e. VNFs ) would know about. - - -Virtualization Facility Resources -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Based on the functionality they provide, virtualization facility -resources could be divided into computing node, networking node, -storage node and management node. - -The possible upgrade objects in these nodes are considered below: -(Note: hardware based virtualization may be considered as virtualization -facility resource, but from escalator perspective, it is better to -consider it as part of the hardware upgrade. ) - -**Computing node** - -1. OS Kernel - -2. Hypervisor and virtual switch - -3. Other kernel modules, like drivers - -4. User space software packages, like nova-compute agents and other - control plane programs. - -Updating 1 and 2 will cause the loss of virtualization functionality of -the compute node, which may lead to the interruption of data plane services -if the virtual resource is not redundant. - -Updating 3 might have the same result. - -Updating 4 might lead to control plane services interruption if not an -HA deployment. - -.. I'm not sure why would 4 cause control plane interruption on a - compute node. My understanding is that simply the node cannot be managed.
- Redundancy won't help in that either. - - -**Networking node** - -1. OS kernel, optional, not all switches/routers allow the upgrade their - OS since it is more like a firmware than a generic OS. - -2. User space software package, like neutron agents and other control - plane programs - -Updating 1 if allowed will cause a node reboot and therefore leads to -data plane service interruption if the virtual resource is not -redundant. - -Updating 2 might lead to control plane services interruption if not an -HA deployment. - -**Storage node** - -1. OS kernel, optional, not all storage nodes allow the upgrade their OS - since it is more like a firmware than a generic OS. - -2. Kernel modules - -3. User space software packages, control plane programs - -Updating 1 if allowed will cause a node reboot and therefore leads to -data plane services interruption if the virtual resource is not -redundant. - -Update 2 might result in the same. - -Updating 3 might lead to control plane services interruption if not an -HA deployment. - -**Management node** - -1. OS Kernel - -2. Kernel modules, like driver - -3. User space software packages, like database, message queue and - control plane programs. - -Updating 1 will cause a node reboot and therefore leads to control -plane services interruption if not an HA deployment. Updating 2 might -result in the same. - -Updating 3 might lead to control plane services interruption if not an -HA deployment. - - - - - -Upgrade Granularity -~~~~~~~~~~~~~~~~~~~ - -The granularity of an upgrade can be characterized from two perspective: -- the physical dimension and -- the software dimension - - -Physical Dimension -^^^^^^^^^^^^^^^^^^ - -The physical dimension characterizes the number of similar upgrade objects -targeted by the upgrade, i.e. whether it is full / partial upgrade of a -data centre, cluster, zone. -Because of the upgrade of a data centre or a zone, it may be divided into -several batches. 
Thus there is a need for efficiency in the execution of -upgrades of a potentially huge number of upgrade objects while still maintaining -availability to fulfill the requirement of smooth upgrade. - -The upgrade of a cloud environment (cluster) may also -be partial. For example, in one cloud environment running a number of -VNFs, we may just try to upgrade one of them to check the stability and -performance, before we upgrade all of them. -Thus there is a need for proper organization of the artifacts associated with -the different upgrade objects. Also the different versions should be able -to coexist beyond the upgrade period. - -From this perspective special attention may be needed when upgrading -objects that are collaborating in a redundancy schema as in this case -different versions not only need to coexist but also collaborate. This -puts requirement on the upgrade objects primarily. If this is not possible -the upgrade campaign should be designed in such a way that the proper -isolation is ensured. - -Software Dimension -^^^^^^^^^^^^^^^^^^ - -The software dimension of the upgrade characterizes the upgrade object -type targeted and the combination in which they are upgraded together. - -Even though the upgrade may -initially target only one type of upgrade object, e.g. the hypervisor -the dependency of other upgrade objects on this initial target object may -require their upgrade as well. I.e. the upgrades need to be combined. From this -perspective the main concern is compatibility of the dependent and -sponsor objects. To take these dependencies into consideration -they need to be described together with the version compatibility information. -Breaking dependencies is the major cause of outages during upgrades. - -In other cases it is more efficient to upgrade a combination of upgrade -objects than to do it one by one.
One aspect of the combination is how -the upgrade packages can be combined, whether a new image can be created for -them beforehand or the different packages can be installed during the upgrade -independently, but activated together. - -The combination of upgrade objects may span across -layers (e.g. software stack in the host and the VM of the VNF). -Thus, it may require additional coordination between the management layers. - -With respect to each upgrade object type and even stacks we can -distinguish major and minor upgrades: - -**Major Upgrade** - -Upgrades between major releases may introduce significant changes in -function, configuration and data, such as the upgrade of OPNFV from -Arno to Brahmaputra. - -**Minor Upgrade** - -Upgrades inside one major release which would not lead to changing -the structure of the platform and may not affect the schema of the -system data. - -Scope of Impact -~~~~~~~~~~~~~~~ - -Considering availability and therefore smooth upgrade, one of the major -concerns is the predictability and control of the outcome of the different -upgrade operations. Ideally an upgrade can be performed without impacting any -entity in the system, which means none of the operations change or potentially -change the behaviour of any entity in the system in an uncontrolled manner. -Accordingly the operations of such an upgrade can be performed any time while -the system is running, while all the entities are online. No entity needs to be -taken offline to avoid such adverse effects. Hence such upgrade operations -are referred to as online operations. The effects of the upgrade might be activated -next time it is used, or may require a special activation action such as a -restart. Note that the activation action provides more control and predictability. - -If an entity's behavior in the system may change due to the upgrade it may -be better to take it offline for the time of the relevant upgrade operations.
-The main question is however considering the hosting relation of an upgrade -object what hosted entities are impacted. Accordingly we can identify a scope -which is impacted by taking the given upgrade object offline. The entities -that are in the scope of impact may need to be taken offline or moved out of -this scope i.e. migrated. - -If the impacted entity is in a different layer managed by another manager -this may require coordination because taking out of service some -infrastructure resources for the time of their upgrade which support virtual -resources used by VNFs that should not experience outages. The hosted VNFs -may or may not allow for the hot migration of their VMs. In case of migration -the VMs placement policy should be considered. - - - -Upgrade duration -~~~~~~~~~~~~~~~~ - -As the OPNFV end-users are primarily Telecom operators, the network -services provided by the VNFs deployed on the NFVI should meet the -requirement of 'Carrier Grade'.:: - - In telecommunication, a "carrier grade" or"carrier class" refers to a - system, or a hardware or software component that is extremely reliable, - well tested and proven in its capabilities. Carrier grade systems are - tested and engineered to meet or exceed "five nines" high availability - standards, and provide very fast fault recovery through redundancy - (normally less than 50 milliseconds). [from wikipedia.org] - -"five nines" means working all the time in ONE YEAR except 5'15". - -:: - - We have learnt that a well prepared upgrade of OpenStack needs 10 - minutes. The major time slot in the outage time is used spent on - synchronizing the database. [from ' Ten minutes OpenStack Upgrade? Done! - ' by Symantec] - -This 10 minutes of downtime of the OpenStack services however did not impact the -users, i.e. the VMs running on the compute nodes. This was the outage of -the control plane only. 
On the other hand with respect to the -preparations this was a manually tailored upgrade specific to the -particular deployment and the versions of each OpenStack service. - -The project targets to achieve a more generic methodology, which however -requires that the upgrade objects fulfil certain requirements. Since -this is only possible on the long run we target first the upgrade -of the different VIM services from version to version. - -**Questions:** - -1. Can we manage to upgrade OPNFV in only 5 minutes? - -.. The first question is whether we have the same carrier grade - requirement on the control plane as on the user plane. I.e. how - much control plane outage we can/willing to tolerate? - In the above case probably if the database is only half of the size - we can do the upgrade in 5 minutes, but is that good? It also means - that if the database is twice as much then the outage is 20 - minutes. - For the user plane we should go for less as with two release yearly - that means 10 minutes outage per year. - -.. 10 minutes outage per year to the users? Plus, if we take - control plane into the consideration, then total outage will be - more than 10 minute in whole network, right? - -.. The control plane outage does not have to cause outage to - the users, but it may of course depending on the size of the system - as it's more likely that there's a failure that needs to be handled - by the control plane. - -2. Is it acceptable for end users ? Such as a planed service - interruption will lasting more than ten minutes for software - upgrade. - -.. For user plane, no it's not acceptable in case of - carrier-grade. The 5' 15" downtime should include unplanned and - planned downtimes. - -.. I go agree with Maria, it is not acceptable. - -3. Will any VNFs still working well when VIM is down? - -.. In case of OpenStack it seems yes. 
.:) - -The maximum duration of an upgrade -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The duration of an upgrade is related to and proportional with the -scale and the complexity of the OPNFV platform as well as the -granularity (in function and in space) of the upgrade. - -.. Also, if is a partial upgrade like module upgrade, it depends - also on the OPNFV modules and their tight connection entities as well. - -.. Since the maintenance window is shrinking and becoming non-existent - the duration of the upgrade is secondary to the requirement of smooth upgrade. - But probably we want to be able to put a time constraint on each upgrade - during which it must complete otherwise it is considered failed and the system - should be rolled back. I.e. in case of automatic execution it might not be clear - if an upgrade is long or just hanging. The time constraints may be a function - of the size of the system in terms of the upgrade object(s). - -The maximum duration of a roll back when an upgrade is failed -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The duration of a roll back is short than the corresponding upgrade. It -depends on the duration of restore the software and configure data from -pre-upgrade backup / snapshot. - -.. During the upgrade process two types of failure may happen: - In case we can recover from the failure by undoing the upgrade - actions it is possible to roll back the already executed part of the - upgrade in graceful manner introducing no more service outage than - what was introduced during the upgrade. Such a graceful roll back - requires typically the same amount of time as the executed portion of - the upgrade and impose minimal state/data loss. - -.. Requirement: It should be possible to roll back gracefully the - failed upgrade of stateful services of the control plane. - In case we cannot recover from the failure by just undoing the - upgrade actions, we have to restore the upgraded entities from their - backed up state. 
In other terms the system falls back to an earlier - state, which is typically a faster recovery procedure than graceful - roll back and depending on the statefulness of the entities involved it - may result in significant state/data loss. - -.. Two possible types of failures can happen during an upgrade - -.. We can recover from the failure that occurred in the upgrade process: - In this case, a graceful rolling back of the executed part of the - upgrade may be possible which would "undo" the executed part in a - similar fashion. Thus, such a roll back introduces no more service - outage during an upgrade than the executed part introduced. This - process typically requires the same amount of time as the executed - portion of the upgrade and impose minimal state/data loss. - -.. We cannot recover from the failure that occurred in the upgrade - process: In this case, the system needs to fall back to an earlier - consistent state by reloading this backed-up state. This is typically - a faster recovery procedure than the graceful roll back, but can cause - state/data loss. The state/data loss usually depends on the - statefulness of the entities whose state is restored from the backup. - -The maximum duration of a VNF interruption (Service outage) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Since not the entire process of a smooth upgrade will affect the VNFs, -the duration of the VNF interruption may be shorter than the duration -of the upgrade. In some cases, the VNF running without the control -from of the VIM is acceptable. - -.. Should require explicitly that the NFVI should be able to - provide its services to the VNFs independent of the control plane? - -.. Requirement: The upgrade of the control plane must not cause - interruption of the NFVI services provided to the VNFs. - -.. With respect to carrier-grade the yearly service outage of the - VNF should not exceed 5' 15" regardless whether it is planned or - unplanned outage. 
Considering the HA requirements TL-9000 requires an - end-to-end service recovery time of 15 seconds based on which the ETSI - GS NFV-REL 001 V1.1.1 (2015-01) document defines three service - availability levels (SAL). The proposed example service recovery times - for these levels are: - -.. SAL1: 5-6 seconds - -.. SAL2: 10-15 seconds - -.. SAL3: 20-25 seconds - -.. my comment was actually that the downtime metrics of the - underlying elements, components and services are small fraction of the - total E2E service availability time. No-one on the E2E service path - will get the whole downtime allocation (in this context it includes - upgrade process related outages for the services provided by VIM etc. - elements that are subject to upgrade process). - -.. So what you are saying is that the upgrade of any entity - (component, service) shouldn't cause even this much service - interruption. This was the reason I brought these figures here as well - that they are posing some kind of upper-upper boundary. Ideally the - interruption is in the millisecond range i.e. no more than a - switch-over or a live migration. - -.. Requirement: Any interruption caused to the VNF by the upgrade - of the NFVI should be in the sub-second range. - -.. In the future we also need to consider the upgrade of the NFVI, - i.e. HW, firmware, hypervisors, host OS etc. \ No newline at end of file diff --git a/docs/03-Functional_Requirements.rst b/docs/03-Functional_Requirements.rst deleted file mode 100644 index c0695bb..0000000 --- a/docs/03-Functional_Requirements.rst +++ /dev/null @@ -1,240 +0,0 @@ -Functional Requirements ------------------------ - -Basic Actions -~~~~~~~~~~~~~ - -This section describes the basic functions may required by Escalator. - -Preparation (offline) -^^^^^^^^^^^^^^^^^^^^^ - -This is the design phase when the upgrade plan (or upgrade campaign) is -being designed so that it can be executed automatically with minimal -service outage. 
It may include the following work: - -1. Check the dependencies of the software modules and their impact, - backward compatibilities to figure out the appropriate upgrade method - and ordering. -2. Find out if a rolling upgrade could be planned with several rolling - steps to avoid any service outage due to the upgrade some - parts/services at the same time. -3. Collect the proper version files and check the integration for - upgrading. -4. The preparation step should produce an output (i.e. upgrade - campaign/plan), which is executable automatically in an NFV Framework - and which can be validated before execution. - - - The upgrade campaign should not be referring to scalable entities - directly, but allow for adaptation to the system configuration and - state at any given moment. - - The upgrade campaign should describe the ordering of the upgrade - of different entities so that dependencies, redundancies can be - maintained during the upgrade execution - - The upgrade campaign should provide information about the - applicable recovery procedures and their ordering. - - The upgrade campaign should consider information about the - verification/testing procedures to be performed during the upgrade - so that upgrade failures can be detected as soon as possible and - the appropriate recovery procedure can be identified and applied. - - The upgrade campaign should provide information on the expected - execution time so that hanging execution can be identified - - The upgrade campaign should indicate any point in the upgrade when - coordination with the users (VNFs) is required. - -.. Depends on the attributes of the object being upgraded, the - upgrade plan may be slitted into step(s) and/or sub-plan(s), and even - more small sub-plans in design phase. The plan(s) or sub-plan(s) my - include step(s) or sub-plan(s). 
- -Validation the upgrade plan / Checking the pre-requisites of System( offline / online) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The upgrade plan should be validated before the execution by testing -it in a test environment which is similar to the product environment. - -.. However it could also mean that we can identify some properties - that it should satisfy e.g. what operations can or cannot be executed - simultaneously like never take out two VMs of the same VNF. - -.. Another question is if it requires that the system is in a particular - state when the upgrade is applied. I.e. if there's certain amount of - redundancy in the system, migration is enabled for VMs, when the NFVI - is upgraded the VIM is healthy, when the VIM is upgraded the NFVI is - healthy, etc. - -.. I'm not sure what online validation means: Is it the validation of the - upgrade plan/campaign or the validation of the system that it is in a - state that the upgrade can be performed without too much risk?== - -Before the upgrade plan being executed, the system healthy of the -online product environment should be checked and confirmed to satisfy -the requirements which were described in the upgrade plan. The -sysinfo, e.g. which included system alarms, performance statistics and -diagnostic logs, will be collected and analogized. It is required to -resolve all of the system faults or exclude the unhealthy part before -executing the upgrade plan. - - -Backup/Snapshot (online) -^^^^^^^^^^^^^^^^^^^^^^^^ - -For avoid loss of data when a unsuccessful upgrade was encountered, the -data should be back-upped and the system state snapshot should be taken -before the execution of upgrade plan. This would be considered in the -upgrade plan. - -Several backups/Snapshots may be generated and stored before the single -steps of changes. The following data/files are required to be -considered: - -1. running version files for each node. -2. 
system components' configuration file and database. -3. image and storage, if it is necessary. - -.. Does 3 imply VNF image and storage? I.e. VNF state and data?== - -.. The following text is derived from previous "4. Negotiate - with the VNF if it's ready for the upgrade" - -Although the upper layer, which include VNFs and VNFMs, is out of the -scope of Escalator, but it is still recommended to let it ready for a -smooth system upgrade. The escalator could not guarantee the safe of -VNFs. The upper layer should have some safe guard mechanism in design, -and ready for avoiding failure in system upgrade. - -Execution (online) -^^^^^^^^^^^^^^^^^^ - -The execution of upgrade plan should be a dynamical procedure which is - controlled by Escalator. - -.. Revised text to be general.== - -1. It is required to supporting execution ether in sequence or in - parallel. -2. It is required to check the result of the execution and take the - action according the situation and the policies in the upgrade plan. -3. It is required to execute properly on various configurations of - system object. I.e. stand-alone, HA, etc. -4. It is required to execute on the designated different parts of the - system. I.e. physical server, virtualized server, rack, chassis, - cluster, even different geographical places. - -Testing (online) -^^^^^^^^^^^^^^^^ - -The testing after upgrade the whole system or parts of system to make -sure the upgraded system(object) is working normally. - -.. Revised text to be general. - -1. It is recommended to run the prepared test cases to see if the - functionalities are available without any problem. -2. It is recommended to check the sysinfo, e.g. system alarms, - performance statistics and diagnostic logs to see if there are any - abnormal. - -Restore/Roll-back (online) -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -When upgrade is failure unfortunately, a quick system restore or system -roll-back should be taken to recovery the system and the services. - -.. 
Revised text to be general. - -1. It is recommend to support system restore from backup when upgrade - was failed. -2. It is recommend to support graceful roll-back with reverse order - steps if possible. - -Monitoring (online) -^^^^^^^^^^^^^^^^^^^ - -Escalator should continually monitor the process of upgrade. It is -keeping update status of each module, each node, each cluster into a -status table during upgrade. - -.. Revised text to be general. - -1. It is required to collect the status of every objects being upgraded - and sending abnormal alarms during the upgrade. -2. It is recommend to reuse the existing monitoring system, like alarm. -3. It is recommend to support pro-actively query. -4. It is recommend to support passively wait for notification. - -**Two possible ways for monitoring:** - -**Pro-Actively Query** requires NFVI/VIM provides proper API or CLI -interface. If Escalator serves as a service, it should pass on these -interfaces. - -**Passively Wait for Notification** requires Escalator provides -callback interface, which could be used by NFVI/VIM systems or upgrade -agent to send back notification. - -.. I am not sure why not to subscribe the notification. - -Logging (online) -^^^^^^^^^^^^^^^^ - -Record the information generated by escalator into log files. The log -file is used for manual diagnostic of exceptions. - -1. It is required to support logging. -2. It is recommended to include time stamp, object id, action name, - error code, etc. - -Administrative Control (online) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Administrative Control is used for control the privilege to start any -escalator's actions for avoiding unauthorized operations. - -#. It is required to support administrative control mechanism -#. It is recommend to reuse the system's own secure system. -#. It is required to avoid conflicts when the system's own secure system - being upgraded. - -Requirements on Object being upgraded -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
We can develop BPs in future from requirements of this section and - gap analysis for upper stream projects - -Escalator focus on smooth upgrade. In practical implementation, it -might be combined with installer/deplorer, or act as an independent -tool/service. In either way, it requires targeting systems(NFVI and -VIM) are developed/deployed in a way that Escalator could perform -upgrade on them. - -On NFVI system, live-migration is likely used to maintain availability -because OPNFV would like to make HA transparent from end user. This -requires VIM system being able to put compute node into maintenance mode -and then isolated from normal service. Otherwise, new NFVI instances -might risk at being schedule into the upgrading node. - -On VIM system, availability is likely achieved by redundancy. This -impose less requirements on system/services being upgrade (see PVA -comments in early version). However, there should be a way to put the -target system into standby mode. Because starting upgrade on the -master node in a cluster is likely a bad idea. - -.. Revised text to be general. - -1. It is required for NFVI/VIM to support **service handover** mechanism - that minimize interruption to 0.001%(i.e. 99.999% service - availability). Possible implementations are live-migration, redundant - deployment, etc, (Note: for VIM, interruption could be less - restrictive) - -2. It is required for NFVI/VIM to restore the early version in a efficient - way, such as **snapshot**. - -3. It is required for NFVI/VIM to **migration data** efficiently between - base and upgraded system. - -4. It is recommend for NFV/VIM's interface to support upgrade - orchestration, e.g. reading/setting system state. 
- - diff --git a/docs/04-Use_Cases_and_Scenarios.rst b/docs/04-Use_Cases_and_Scenarios.rst deleted file mode 100644 index ee9b488..0000000 --- a/docs/04-Use_Cases_and_Scenarios.rst +++ /dev/null @@ -1,211 +0,0 @@ -Use Cases and Scenarios ------------------------ - -This section describes the use cases and scenarios to verify the -requirements of Escalator. - -Scenarios -~~~~~~~~~ -1. Upgrade a system with HA configuration -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A HA configuration system is very popular in the operator's data centre. -It is a typical product environment. It is always running 7\*24 with VNFs -running on it to provide services to the end users. - - -2. Upgrade a system with non-HA configuration -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A non-HA configuration system is normally deployed for experimental or -development usages, such as a Vagrant/VM environment. - -Escalator supports the upgrade in this scenario, but it does not guarantee a -smooth upgrade. - -Use cases -~~~~~~~~~ -Use case #1: Smooth upgrade in a HA configuration -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -For a system with HA configuration, the operator can use Escalator to -smooth-upgrade NFVI/VIM components into a new version without any service -outage. - -When a compute node being upgraded, the VMs on the node may need to be migrated -to other compute nodes to avoid service outage, so it is requred that there are -enough redundant resources to migrate VMs on this compute node. - -Before upgrade, the operator can use Escalator to check whether smooth upgrade -conditions are all satisfied. These conditions include whether there are enough -idle resources to migrate VMs during updrading, and whether the new version is -compatible with the current one, etc. If there are some conditions not -satisfied, Escalator will show them. Escalator can also provide the solutions if -there is any, such as the number and configuration of spare compute nodes which -are needed. 
- -When upgrade starts, Escalator will also automatically check whether smooth -upgrade conditions are all satisfied. If some smooth upgrade conditions are not -satisfied, Escalator will show the failure of smooth upgrade. - -- Pre-Conditions - - 1. The system is running as normal. - 2. The VNFs are providing services as usual. - -- Upgrading steps - - 1. The VNFs are continually providing services during the upgrade. - 2. The operator successfully logged in the GUI of Escalator to select the - software packages including Linux OS, Hypervisor, OpenStack, ODL and other - OPNFV components, ect. (All or part of components could be selected.) - 3. Select the nodes to be upgraded. i.e. controller node, network node, - storage node and compute node, etc. - 4. Select "Disable Scale-up". It will limit the scale-up operation when - upgrade is in progress to prevent failures due to the shortage of - resources. - 5. Select "Check Smooth Upgrade Conditions". If Escalator shows that there are - some conditions not satisfied, try to resolve them according to the - solutions provided. - 6. Select "Smooth Upgrade", then apply the upgrade operation. - 7. Select "Restore Scale-up" after the upgrade. It will restore scale-up to - the original enabled/disabled state before upgrade. - -- Post-Conditions - - 1. The system is upgraded successfully. - 2. There is no service outage during the upgrade. - 3. The VNFs are providing services as usual after the upgrade. - -Use case #2: Roll-back after a failed smooth upgrade in a HA configuration -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -For a system with HA configuration, if the upgrade fails when the operator is -smooth-upgrading NFVI/VIM components into a new version using Escalator, the -operator can roll-back the system without any service outage. - -- Pre-Conditions - - 1. The system is running as normal. - 2. The VNFs are providing services as usual. - 3. Scale-up operation is disabled. - 4. 
Smooth upgrade failed. - -- Roll-back steps - - 1. Escalator concludes that the upgrade has failed and provides the operator - with the reason. - 2. Select the "Roll-back" operation. - 3. If the roll-back is successful, go to step 4, otherwise the operator can - select "Restore Backup" to restore the system from the backup data. - 4. Select "Restore Scale-up" after the roll-back. It will restore scale-up to - the original enabled/disabled state before upgrade. - -- Post-Conditions - - 1. The system is rolled-back successfully when the upgrade failed. - 2. There is no service outage during the roll-back. - 3. The VNFs are providing services as usual after the roll-back. - -Use case #3: Roll-back after a successful smooth upgrade in a HA configuration -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -When a smooth upgrade in a HA configuration is successful, the operator may want -to roll-back for some reasons, such as performance issues. -Escalator supports roll-back after a successful smooth upgrade without any -service outage. - -- Pre-Conditions - - 1. The system is running as normal. - 2. The VNFs are providing services as usual. - 3. Smooth upgrade succeeded. - -- Roll-back steps - - 1. Select "Disable Scale-up". It will limit the scale-up operation when roll- - back is in progress to prevent failures due to the shortage of resources. - 2. Select "Check Smooth Roll-back Conditions". If Escalator shows that there - are some conditions not satisfied, try to resolve them according to the - solutions provided. - 3. Select "Roll-back", then apply the roll-back operation. - 4. If the roll-back is successful, go to step 5, otherwise the operator can - select "Restore Backup" to restore the system from the backup data. - 5. Select "Restore Scale-up" after the roll-back. It will restore scale-up to - the original enabled/disabled state before roll-back. - -- Post-Conditions - - 1. The system is rolled-back successfully. - 2. 
There is no service outage during the roll-back. - 3. The VNFs are providing services as usual after the roll-back. - -Use case #4: Non-smooth upgrade in a non-HA/HA configuration -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -For a system with non-HA configuration, the operator can also use Escalator to -upgrade NFVI/VIM components into a new version. In this case, the upgrade may -result in service outage. In other words, the upgrade is non-smooth. -For a system with HA configuration, if the service outage is acceptable or -inevitable, the operator can also use Escalator to non-smoothly upgrade the -system. - -- Pre-Conditions - - 1. The system is running as normal. - -- Upgrading steps - - 1. The operator successfully logged in the GUI of Escalator to select the - software packages including Linux OS, Hypervisor, OpenStack, ODL and other - OPNFV components, ect. (All or part of components could be selected.) - 2. Select the nodes to be upgraded. i.e. controller node, network node, - storage node and compute node, etc. - 3. Select "Non-Smooth Upgrade", then apply the upgrade operation. - -- Post-Conditions - - 1. The system is upgraded successfully. - -Use case #5: Roll-back after a failed non-smooth upgrade in a non-HA/HA configuration -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -For a system with non-HA/HA configuration, if the upgrade fails when the -operator is non-smoothly upgrading NFVI/VIM components into a new version using -Escalator, the operator can roll-back the system. In this case, the roll-back -may result in service outage. - -- Pre-Conditions - - 1. The system is running as normal. - 2. Non-smooth upgrade failed. - -- Roll-back steps - - 1. Escalator concludes that the upgrade has failed and provides the operator - with the reason. - 2. Select the "Roll-back" operation. - 3. If the roll-back fails, the operator can select "Restore Backup" to restore - the system from the backup data. 
- -- Post-Conditions - - 1. The system is rolled-back successfully when the upgrade failed. - -Use case #6: Roll-back after a successful non-smooth upgrade in a non-HA/HA configuration -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -When a non-smooth upgrade in a non-HA/HA configuration is successful, the -operator may want to roll-back for some reasons, such as performance issues. -Escalator supports roll-back after a successful non-smooth upgrade. In this -case,the roll-back may result in service outage. - -- Pre-Conditions - - 1. The system is running as normal. - 2. Non-smooth upgrade succeeded. - -- Roll-back steps - - 1. Select the "Roll-back" operation. - 2. If the roll-back fails, the operator can select "Restore Backup" to restore - the system from the backup data. - -- Post-Conditions - - 1. The system is rolled-back successfully when the upgrade failed. - diff --git a/docs/05-Reference_Architecture.rst b/docs/05-Reference_Architecture.rst deleted file mode 100644 index 63b54c2..0000000 --- a/docs/05-Reference_Architecture.rst +++ /dev/null @@ -1,113 +0,0 @@ -Reference Architecture ----------------------- - -This section describes the reference architecture, the function blocks, -and the function entities of Escalator for the reader to well understand how -the basic functions to be organized. - -Upgrade Scope -~~~~~~~~~~~~~~~~ - -Upgrade objects described in this document are software programs covered by -red box in the picture below which includes: VIM and NFVI. -The target of the upgrade is to reduce the impact on the applications in the -blue box below as much as possible. -Note that this upgrade process does not take into consideration the effects -of Vi-Vnfm and Or-Vi. In other words, the unserviceability of the two -interfaces during upgrade can be accepted. - -.. figure:: images/figure1.png - :name: figure1 - :width: 100% - -The software stack on each node is generally as shown in the table below. - -.. 
figure:: images/figure2.png - :name: figure2 - :width: 100% - -Because the control node upgrade will not affect the VNFs service in the blue -box, this scheme focuses on upgrading of compute nodes. - -Precondition of Upgrade -~~~~~~~~~~~~~~~~~~~~~~~ - -1 The environmental requirements -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -1. System is running normally. If there are any faults before the upgrade, -it is difficult to distinguish between upgrade introduced and the environment -itself. - -2. The environment should have the redundant resources. Because the upgrade -process is based on the business migration, in the absence of resource -redundancy,it is impossible to realize the business migration, as well as to -achieve a smooth upgrade. - -Resource redundancy in two levels: - -1 NFVI level: This level is mainly the compute nodes resource redundancy. -During the upgrade, the virtual machine on business can be migrated to another -free compute node. - -2 VNF level: This level depends on HA mechanism in VNF, such as: -active-standby, load balance. In this case, as long as business of the target -node on VMs is migrated to other free nodes, the migration of VM might not be -necessary. - -The way of redundancy to be used is subject to the specific environment. -Generally speaking, During the upgrade, the VNF's service level availability -mechanism should be used in higher priority than the NFVI's. This will help -us to reduce the service outage. - -2 The requirements for component release version -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -This is primarily a compatibility requirement. You can refer to Linux/Python -Compatible Semantic Versioning 3.0.0: - -Given a version number MAJOR.MINOR.PATCH, increment the: - -1. MAJOR version when you make incompatible API changes, - -2. MINOR version when you add functionality in a backwards-compatible manner, - -3. PATCH version when you make backwards-compatible bug fixes. 
- -Some internal interfaces of OpenStack will be used by Escalator indirectly, -such as VM migration related interface between VIM and NFVI. So it is required -to be backward compatible on these interfaces. Refer to "Interface" chapter -for details. - -Upgrade related modules in VIM -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Upgrade operations are initiated by the user through the VIM. For VIM, upgrade -management mainly contains the object: - -- **Upgrade Manager** - -Mainly responsible for control of the upgrade process.The Escalator is required -to know the software deployment information of the platform and will use these -information during the upgrading. It will be collected from some place, such -as the Installer, Deploy Manager and Escalator itself, etc. - -- **VIM Interface**: - -Mainly responsible for the external interface, include Vi-Vnfm, Or-Vi. This -module stores VNFO and VNFM external information such as address and -authentication. - -- **Cloud Manager**: - -Mainly responsible for virtualization resources management,which might be -considered made up of Openstack and SDN control node. - -- **System Support**: - -This layer is the runtime support environment of upper layers, e.g. Cloud -Manager and VIM interface., including:OS, HA, etc. To upgrade the upper -software is based on this module. - -.. figure:: images/figure3.png - :name: figure3 - :width: 100% \ No newline at end of file diff --git a/docs/06-Information_Flows.rst b/docs/06-Information_Flows.rst deleted file mode 100644 index 641b59b..0000000 --- a/docs/06-Information_Flows.rst +++ /dev/null @@ -1,56 +0,0 @@ -Information Flows ------------------ - -This section describes the information flows among the function -entities when Escalator is in actions. - -1. Upgrade process of Compute nodes - -1.1 consider VIM as a whole - -.. figure:: images/figure4.png - :name: figure4 - :width: 100% - -process is: -1. Operators add new version files on the VIM,initiate the upgrade. -2. 
VIM chooses some compute nodes as the upgrade target nodes, and set them -into maintenance mode. VIM queries the list of running VMs on target nodes. -3. VIM notice VNFM corresponding to the virtual machine, to migrate the -business. -4. VNFM migrates the business. If the business is in active of active-standby -mode, it will initiate switch-over. If the business is in loading balance mode, -it will move the business to other node. -5. After VNFM moves business, it notifies the VIM. -6. VIM judges whether the business on the target VM has all been moved. If -not, VIM migrates the VM with business loaded to other free nodes. Then VIM -upgrades the target computer nodes. After upgrade, VIM set the target compute -nodes into normal nodes. -7. If there are computer nodes remained to be upgraded, goto step 2. - -4.2 from inside VIM - -.. figure:: images/figure5.png - :name: figure5 - :width: 100% - -.. figure:: images/figure6.png - :name: figure6 - :width: 100% - -process is: -1. Upgrade manager receives user operation commands. Add new version files. -Upgrade is began. -2. Upgrade Manager selects compute node A to Upgrade. Query list of the VMs -running the compute nodes A to the Cloud Manager, and set the node to -maintenance mode, that is to say creation or migration of new VM on this node -is impossible anymore. -3. Upgrade Manager notifies VNFM compute node A into maintenance mode by VIM -interface, temporarily disabling the inserting of business, and business on -compute node A need move to the other available compute nodes. -4. When receives the VNFM reply, or waited for a timeout, Upgrade Manager -notifies the system support on compute node A to do software upgrade. -5. After upgraded, Upgrade Manager removes maintenance mode for the compute -node A. -6. Upgrade Manager claims VNFM computing nodes A available. -7. 
Select computer node B to upgrade \ No newline at end of file diff --git a/docs/07-Interfaces_and_Files.rst b/docs/07-Interfaces_and_Files.rst deleted file mode 100644 index 87f916e..0000000 --- a/docs/07-Interfaces_and_Files.rst +++ /dev/null @@ -1,27 +0,0 @@ -Interfaces and Files --------------------- - -This section describes the required interfaces and files of Escalator. - - -CLI Interface -~~~~~~~~~~~~~~~~ - -This section describes CLI of Escalator. - -RESTful API -~~~~~~~~~~~ - -This section describes the API of Escalator for developer. - -Configuration File -~~~~~~~~~~~~~~~~~~ - -This section will suggest a format of the configuration files and how to -deal with it. - -Log File -~~~~~~~~ - -This section will suggest a format of the log files and how to deal with -it. \ No newline at end of file diff --git a/docs/08-Requirements_from_other_OPNFV_Project.rst b/docs/08-Requirements_from_other_OPNFV_Project.rst deleted file mode 100644 index 62e611f..0000000 --- a/docs/08-Requirements_from_other_OPNFV_Project.rst +++ /dev/null @@ -1,40 +0,0 @@ -Requirements from other OPNFV projects --------------------------------------- - -We have created a questionnaire_ for collecting other projects requirements. -Please advertise it. - -.. _questionnaire: https://docs.google.com/forms/d/11o1mt15zcq0WBtXYK0n6lKF8XuIzQTwvv8ePTjmcoF0/viewform?usp=send_form - - - -Doctor Project -~~~~~~~~~~~~~~ - -.. This scenario could be out of scope in Escalator project, but - having the option to support this should be better to align with - Doctor requirements. - -The scope of Doctor project also covers maintenance scenario in which - -1. The VIM administrator requests host maintenance to VIM. - -2. VIM will notify it to consumer such as VNFM to trigger application level - migration or switching active-standby nodes. - -3. VIM waits response from the consumer for a short while. 
- -- VIM should send out notification of VM migration to consumer (VNFM) - as abstracted message like "maintenance". - -- VIM could wait VM migration until it receives "VM ready to - maintenance" message from the owner (VNFM) - -HA Project -~~~~~~~~~~ - -Multi-site Project -~~~~~~~~~~~~~~~~~~ - -- Escalator upgrade one site should at least not lead to the other site - API token validation failed. diff --git a/docs/09-Reference.rst b/docs/09-Reference.rst deleted file mode 100644 index 0b5ff17..0000000 --- a/docs/09-Reference.rst +++ /dev/null @@ -1,17 +0,0 @@ -Reference ---------- - -[1] ETSI GS NFV 002 (V1.1.1): “Architectural Framework” - -[2] ETSI GS NFV 003 (V1.1.1): "Terminology for Main Concepts in NFV" - -[3] ETSI GS NFV-SWA001:“Virtual Network Function Architecture” - -[4] ETSI GS NFV-MAN001:“Management and Orchestration” - -[5] ETSI GS NFV-REL001:"Resiliency Requirements" - -[6] QuEST Forum TL-9000:"Quality Management System Requirement -Handbook" - -[7] Service Availability Forum AIS:"Software Management Framework" diff --git a/docs/10-Useful_Working_Drafts_of_ETSI_NFV.rst b/docs/10-Useful_Working_Drafts_of_ETSI_NFV.rst deleted file mode 100644 index 5c2195b..0000000 --- a/docs/10-Useful_Working_Drafts_of_ETSI_NFV.rst +++ /dev/null @@ -1,11 +0,0 @@ -Useful Working Drafts of ETSI NFV ---------------------------------- - -Access them with your own ETSI account, please DO NOT disclose the -content. 
- -[1] Migrate Virtualised Compute Resource operation @ 7.3.1.8 -ftp://docbox.etsi.org/ISG/NFV/Open/Drafts/IFA005_Or-Vi_ref_point_Spec/NFV-IFA005v070.zip - -[2] Reliability issues during NFV Software upgrade and improvement mechanisms @ 8 -ftp://@docbox.etsi.org/ISG/NFV/Open/Drafts/REL003_E2E_reliability_models/NFV-REL003v030.zip diff --git a/docs/A1-Appendix.rst b/docs/A1-Appendix.rst deleted file mode 100644 index 85f0717..0000000 --- a/docs/A1-Appendix.rst +++ /dev/null @@ -1,49 +0,0 @@ -Appendix --------- - -A.1 Impact Analysis -~~~~~~~~~~~~~~~~~~~ - -Upgrading the different software modules may cause different impact on -the availability of the infrastructure resources and even on the service -continuity of the vNFs. - -**Software modules in the computing nodes** - -#. Host OS patch - -#. Hypervisor, such as KVM, QEMU, XEN, libvirt -#. Openstack agent in computing nodes (like Nova agent, Ceilometer - agent...) - -.. As SW module, we should list the host OS and maybe its - drivers as well. From upgrade perspective do we limit host OS - upgrades to patches only? - -**Software modules in network nodes** - -#. Neutron L2/L3 agent -#. OVS, SR-IOV Driver - -**Software modules storage nodes** - -#. Ceph - -The table below analyses such an impact - considering a single instance -of each software module - from the following aspects: - -- the function which will be lost during upgrade, -- the duration of the loss of this specific function, -- if this causes the loss of the vNF function, -- if it causes incompatibility in the different parts of the software, -- what should be backed up before the upgrade, -- the duration of restoration time if the upgrade fails - -These values provided come from internal testing and based on some -assumptions, they may vary depending on the deployment techniques. -Please feel free to add if you find more efficient values during your -testing. 
- -https://wiki.opnfv.org/_media/upgrade_analysis_v0.5.xlsx - -Note that no redundancy of the software modules is considered in the table. diff --git a/docs/design/201-Reference_Architecture.rst b/docs/design/201-Reference_Architecture.rst new file mode 100644 index 0000000..75aa461 --- /dev/null +++ b/docs/design/201-Reference_Architecture.rst @@ -0,0 +1,54 @@ +====================== +Reference Architecture +====================== + +This section describes the reference architecture, the function blocks, +and the function entities of Escalator so that the reader can understand how +the basic functions are organized. + +The software stack on each node is generally as shown in the table below. + +.. figure:: images/figure2.png + :name: figure2 + :width: 100% + +Since upgrading the control node will not affect the VNF services in the blue +box, this chapter focuses on the upgrading of compute nodes. + + +Precondition of Upgrade +======================= + +Upgrade related modules in VIM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Upgrade operations are initiated by the user through the VIM. For VIM, upgrade +management mainly contains the following objects: + +- **Upgrade Manager**: + +Mainly responsible for control of the upgrade process. The Escalator is required +to know the software deployment information of the platform and will use this +information during the upgrading. It will be collected from some place, such +as the Installer, Deploy Manager and Escalator itself, etc. + +- **VIM Interface**: + +Mainly responsible for the external interfaces, including Vi-Vnfm and Or-Vi. This +module stores NFVO and VNFM external information such as address and +authentication. + +- **Cloud Manager**: + +Mainly responsible for virtualization resources management, which might be +considered made up of OpenStack and SDN control node. + +- **System Support**: + +This layer is the runtime support environment of upper layers, e.g. Cloud +Manager and VIM Interface, including OS, HA, etc.
Upgrading the upper-layer +software relies on this module. + +.. figure:: images/figure3.png + :name: figure3 + :width: 100% diff --git a/docs/design/202-Information_Flows.rst b/docs/design/202-Information_Flows.rst new file mode 100644 index 0000000..eaa39f0 --- /dev/null +++ b/docs/design/202-Information_Flows.rst @@ -0,0 +1,57 @@ +================= +Information Flows +================= + +This section describes the information flows among the function +entities when Escalator is in action. + +1. Upgrade process of compute nodes + +1.1 Consider VIM as a whole + +.. figure:: images/figure4.png + :name: figure4 + :width: 100% + +The process is: +1. Operators add new version files on the VIM and initiate the upgrade. +2. VIM chooses some compute nodes as the upgrade target nodes, and sets them +into maintenance mode. VIM queries the list of running VMs on target nodes. +3. VIM notifies the VNFM corresponding to each virtual machine to migrate the +business. +4. VNFM migrates the business. If the business is the active one in active-standby +mode, it will initiate a switch-over. If the business is in load balancing mode, +it will move the business to another node. +5. After VNFM moves the business, it notifies the VIM. +6. VIM judges whether the business on the target VMs has all been moved. If +not, VIM migrates the VMs with business loaded to other free nodes. Then VIM +upgrades the target compute nodes. After the upgrade, VIM sets the target compute +nodes back to normal mode. +7. If there are compute nodes remaining to be upgraded, go to step 2. + +1.2 From inside VIM + +.. figure:: images/figure5.png + :name: figure5 + :width: 100% + +.. figure:: images/figure6.png + :name: figure6 + :width: 100% + +The process is: +1. Upgrade Manager receives user operation commands. New version files are added +and the upgrade begins. +2. Upgrade Manager selects compute node A to upgrade.
It queries the Cloud Manager for the list of VMs +running on compute node A, and sets the node to +maintenance mode, that is to say creation or migration of new VMs on this node +is no longer possible. +3. Upgrade Manager notifies VNFM through the VIM interface that compute node A is in +maintenance mode, temporarily disabling the inserting of business, and that business on +compute node A needs to move to the other available compute nodes. +4. When it receives the VNFM reply, or after waiting for a timeout, Upgrade Manager +notifies the system support on compute node A to do the software upgrade. +5. After the upgrade, Upgrade Manager removes maintenance mode for the compute +node A. +6. Upgrade Manager notifies VNFM that compute node A is available. +7. Select compute node B to upgrade diff --git a/docs/design/203-Administrative_Interfaces.rst b/docs/design/203-Administrative_Interfaces.rst new file mode 100644 index 0000000..5d8148b --- /dev/null +++ b/docs/design/203-Administrative_Interfaces.rst @@ -0,0 +1,16 @@ +========================= +Administrative Interfaces +========================= + +This section describes the required administrative interfaces of Escalator. + +CLI Interface +============= + +This section describes CLI of Escalator. + +RESTful API +=========== + +This section describes the API of Escalator for developers. + diff --git a/docs/design/204-Configuration_and_Logging.rst b/docs/design/204-Configuration_and_Logging.rst new file mode 100644 index 0000000..309b2c8 --- /dev/null +++ b/docs/design/204-Configuration_and_Logging.rst @@ -0,0 +1,18 @@ +========================= +Configuration and Logging +========================= + +This section describes the required configuration and logging of Escalator. + + +Configuration Format +==================== + +This section will suggest a format of the configuration files and how to +deal with it. + +Logging Format +============== + +This section will suggest a format of the log files and how to deal with +it.
diff --git a/docs/design/2A1-Appendix.rst b/docs/design/2A1-Appendix.rst new file mode 100644 index 0000000..80fe447 --- /dev/null +++ b/docs/design/2A1-Appendix.rst @@ -0,0 +1,50 @@ +======== +Appendix +======== + +A.1 Impact Analysis +=================== + +Upgrading the different software modules may cause different impact on +the availability of the infrastructure resources and even on the service +continuity of the vNFs. + +**Software modules in the computing nodes** + +#. Host OS patch + +#. Hypervisor, such as KVM, QEMU, XEN, libvirt +#. Openstack agent in computing nodes (like Nova agent, Ceilometer + agent...) + +.. As SW module, we should list the host OS and maybe its + drivers as well. From upgrade perspective do we limit host OS + upgrades to patches only? + +**Software modules in network nodes** + +#. Neutron L2/L3 agent +#. OVS, SR-IOV Driver + +**Software modules storage nodes** + +#. Ceph + +The table below analyses such an impact - considering a single instance +of each software module - from the following aspects: + +- the function which will be lost during upgrade, +- the duration of the loss of this specific function, +- if this causes the loss of the vNF function, +- if it causes incompatibility in the different parts of the software, +- what should be backed up before the upgrade, +- the duration of restoration time if the upgrade fails + +These values provided come from internal testing and based on some +assumptions, they may vary depending on the deployment techniques. +Please feel free to add if you find more efficient values during your +testing. + +https://wiki.opnfv.org/_media/upgrade_analysis_v0.5.xlsx + +Note that no redundancy of the software modules is considered in the table. 
diff --git a/docs/design/etc/conf.py b/docs/design/etc/conf.py new file mode 100644 index 0000000..0066035 --- /dev/null +++ b/docs/design/etc/conf.py @@ -0,0 +1,34 @@ +import datetime +import sys +import os + +try: + __import__('imp').find_module('sphinx.ext.numfig') + extensions = ['sphinx.ext.numfig'] +except ImportError: + # 'pip install sphinx_numfig' + extensions = ['sphinx_numfig'] + +# numfig: +number_figures = True +figure_caption_prefix = "Fig." + +source_suffix = '.rst' +master_doc = 'index' +pygments_style = 'sphinx' +html_use_index = False + +pdf_documents = [('index', u'OPNFV', u'OPNFV Project', u'OPNFV')] +pdf_fit_mode = "shrink" +pdf_stylesheets = ['sphinx','kerning','a4'] +#latex_domain_indices = False +#latex_use_modindex = False + +latex_elements = { + 'printindex': '', +} + +project = u'OPNFV: Template documentation config' +copyright = u'%s, OPNFV' % datetime.date.today().year +version = u'1.0.0' +release = u'1.0.0' diff --git a/docs/design/etc/opnfv-logo.png b/docs/design/etc/opnfv-logo.png new file mode 100644 index 0000000..1519503 Binary files /dev/null and b/docs/design/etc/opnfv-logo.png differ diff --git a/docs/design/images/figure2.png b/docs/design/images/figure2.png new file mode 100644 index 0000000..70d16c7 Binary files /dev/null and b/docs/design/images/figure2.png differ diff --git a/docs/design/images/figure3.png b/docs/design/images/figure3.png new file mode 100644 index 0000000..38346de Binary files /dev/null and b/docs/design/images/figure3.png differ diff --git a/docs/design/images/figure4.png b/docs/design/images/figure4.png new file mode 100644 index 0000000..e74e24b Binary files /dev/null and b/docs/design/images/figure4.png differ diff --git a/docs/design/images/figure5.png b/docs/design/images/figure5.png new file mode 100644 index 0000000..a49955d Binary files /dev/null and b/docs/design/images/figure5.png differ diff --git a/docs/design/images/figure6.png b/docs/design/images/figure6.png new file mode 100644 index 
0000000..efe7d6f Binary files /dev/null and b/docs/design/images/figure6.png differ diff --git a/docs/design/index.rst b/docs/design/index.rst new file mode 100644 index 0000000..993b5e6 --- /dev/null +++ b/docs/design/index.rst @@ -0,0 +1,33 @@ +.. OPNFV Release Engineering documentation, created by + sphinx-quickstart on Tue Jun 9 19:12:31 2015. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +.. image:: etc/opnfv-logo.png + :height: 40 + :width: 200 + :alt: OPNFV + :align: left + +ESCALATOR DESIGN CONSIDERATIONS +======================================= + +Contents: + +.. toctree:: + :maxdepth: 4 + :titlesonly: + + + + 201-Reference_Architecture.rst + 202-Information_Flows.rst + 203-Administrative_Interfaces.rst + 204-Configuration_and_Logging.rst + 2A1-Appendix.rst + +* :ref:`search` + +Revision: _sha1_ + +Build date: |today| diff --git a/docs/etc/conf.py b/docs/etc/conf.py new file mode 100644 index 0000000..0066035 --- /dev/null +++ b/docs/etc/conf.py @@ -0,0 +1,34 @@ +import datetime +import sys +import os + +try: + __import__('imp').find_module('sphinx.ext.numfig') + extensions = ['sphinx.ext.numfig'] +except ImportError: + # 'pip install sphinx_numfig' + extensions = ['sphinx_numfig'] + +# numfig: +number_figures = True +figure_caption_prefix = "Fig."
+ +source_suffix = '.rst' +master_doc = 'index' +pygments_style = 'sphinx' +html_use_index = False + +pdf_documents = [('index', u'OPNFV', u'OPNFV Project', u'OPNFV')] +pdf_fit_mode = "shrink" +pdf_stylesheets = ['sphinx','kerning','a4'] +#latex_domain_indices = False +#latex_use_modindex = False + +latex_elements = { + 'printindex': '', +} + +project = u'OPNFV: Template documentation config' +copyright = u'%s, OPNFV' % datetime.date.today().year +version = u'1.0.0' +release = u'1.0.0' diff --git a/docs/etc/opnfv-logo.png b/docs/etc/opnfv-logo.png new file mode 100644 index 0000000..1519503 Binary files /dev/null and b/docs/etc/opnfv-logo.png differ diff --git a/docs/gap_analysis/301-Impact_Analysis.rst b/docs/gap_analysis/301-Impact_Analysis.rst new file mode 100644 index 0000000..c520c7e --- /dev/null +++ b/docs/gap_analysis/301-Impact_Analysis.rst @@ -0,0 +1,47 @@ +=============== +Impact Analysis +=============== + +Upgrading the different software modules may cause different impact on +the availability of the infrastructure resources and even on the service +continuity of the vNFs. + +**Software modules in the computing nodes** + +#. Host OS patch + +#. Hypervisor, such as KVM, QEMU, XEN, libvirt +#. Openstack agent in computing nodes (like Nova agent, Ceilometer + agent...) + +.. As SW module, we should list the host OS and maybe its + drivers as well. From upgrade perspective do we limit host OS + upgrades to patches only? + +**Software modules in network nodes** + +#. Neutron L2/L3 agent +#. OVS, SR-IOV Driver + +**Software modules storage nodes** + +#. 
Ceph + +The table below analyses such an impact - considering a single instance +of each software module - from the following aspects: + +- the function which will be lost during upgrade, +- the duration of the loss of this specific function, +- if this causes the loss of the vNF function, +- if it causes incompatibility in the different parts of the software, +- what should be backed up before the upgrade, +- the duration of restoration time if the upgrade fails + +These values provided come from internal testing and based on some +assumptions, they may vary depending on the deployment techniques. +Please feel free to add if you find more efficient values during your +testing. + +https://wiki.opnfv.org/_media/upgrade_analysis_v0.5.xlsx + +Note that no redundancy of the software modules is considered in the table. diff --git a/docs/gap_analysis/etc/conf.py b/docs/gap_analysis/etc/conf.py new file mode 100644 index 0000000..0066035 --- /dev/null +++ b/docs/gap_analysis/etc/conf.py @@ -0,0 +1,34 @@ +import datetime +import sys +import os + +try: + __import__('imp').find_module('sphinx.ext.numfig') + extensions = ['sphinx.ext.numfig'] +except ImportError: + # 'pip install sphinx_numfig' + extensions = ['sphinx_numfig'] + +# numfig: +number_figures = True +figure_caption_prefix = "Fig." 
+ +source_suffix = '.rst' +master_doc = 'index' +pygments_style = 'sphinx' +html_use_index = False + +pdf_documents = [('index', u'OPNFV', u'OPNFV Project', u'OPNFV')] +pdf_fit_mode = "shrink" +pdf_stylesheets = ['sphinx','kerning','a4'] +#latex_domain_indices = False +#latex_use_modindex = False + +latex_elements = { + 'printindex': '', +} + +project = u'OPNFV: Template documentation config' +copyright = u'%s, OPNFV' % datetime.date.today().year +version = u'1.0.0' +release = u'1.0.0' diff --git a/docs/gap_analysis/etc/opnfv-logo.png b/docs/gap_analysis/etc/opnfv-logo.png new file mode 100644 index 0000000..1519503 Binary files /dev/null and b/docs/gap_analysis/etc/opnfv-logo.png differ diff --git a/docs/gap_analysis/index.rst b/docs/gap_analysis/index.rst new file mode 100644 index 0000000..e6018f6 --- /dev/null +++ b/docs/gap_analysis/index.rst @@ -0,0 +1,30 @@ +.. OPNFV Release Engineering documentation, created by + sphinx-quickstart on Tue Jun 9 19:12:31 2015. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +.. image:: etc/opnfv-logo.png + :height: 40 + :width: 200 + :alt: OPNFV + :align: left + +ESCALATOR GAP ANALYSIS REPORT +======================================= + +Contents: + +.. toctree:: + :maxdepth: 4 + :titlesonly: + + + + + 301-Impact_Analysis.rst + +* :ref:`search` + +Revision: _sha1_ + +Build date: |today| diff --git a/docs/how-to-use-docs/README.txt b/docs/how-to-use-docs/README.txt deleted file mode 100644 index 0e69174..0000000 --- a/docs/how-to-use-docs/README.txt +++ /dev/null @@ -1 +0,0 @@ -See https://wiki.opnfv.org/documentation/tools . diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index 1ae82e1..0000000 --- a/docs/index.rst +++ /dev/null @@ -1,38 +0,0 @@ -.. OPNFV Release Engineering documentation, created by - sphinx-quickstart on Tue Jun 9 19:12:31 2015. 
- You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -.. image:: etc/opnfv-logo.png - :height: 40 - :width: 200 - :alt: OPNFV - :align: left - -ESCALATOR -======================================= - -Contents: - -.. toctree:: - :maxdepth: 4 - :titlesonly: - - 00-Authors.rst - 01-Scope.rst - 02-Background_and_Terminologies.rst - 03-Functional_Requirements.rst - 04-Use_Cases_and_Scenarios.rst - 05-Reference_Architecture.rst - 06-Information_Flows.rst - 07-Interfaces_and_Files.rst - 08-Requirements_from_other_OPNFV_Project.rst - 09-Reference.rst - 10-Useful_Working_Drafts_of_ETSI_NFV.rst - A1-Appendix.rst - -* :ref:`search` - -Revision: _sha1_ - -Build date: |today| diff --git a/docs/requirements/000-Contributors.rst b/docs/requirements/000-Contributors.rst new file mode 100644 index 0000000..bf70f59 --- /dev/null +++ b/docs/requirements/000-Contributors.rst @@ -0,0 +1,16 @@ +============ +Contributors +============ + +| Jie Hu (ZTE, hu.jie@zte.com.cn) +| Qiao Fu (China Mobile, fuqiao@chinamobile.com) +| Ulrich Kleber (Huawei, Ulrich.Kleber@huawei.com) +| Maria Toeroe (Ericsson, maria.toeroe@ericsson.com) +| Sama, Malla Reddy (DOCOMO, sama@docomolab-euro.com) +| Zhong Chao (ZTE, chao.zhong@zte.com.cn) +| Julien Zhang (ZTE, zhang.jun3g@zte.com.cn) +| Yuri Yuan (ZTE, yuan.yue@zte.com.cn) +| Zhipeng Huang (Huawei, huangzhipeng@huawei.com) +| Jia Meng (ZTE, meng.jia@zte.com.cn) +| Liyi Meng (Ericsson, liyi.meng@ericsson.com) +| Pasi Vaananen (Stratus, pasi.vaananen@stratus.com) diff --git a/docs/requirements/101-Scope.rst b/docs/requirements/101-Scope.rst new file mode 100644 index 0000000..42da7b9 --- /dev/null +++ b/docs/requirements/101-Scope.rst @@ -0,0 +1,45 @@ +===== +Scope +===== + +This document describes the user requirements on the smooth upgrade +function of the NFVI and VIM with respect to the upgrades of the OPNFV +platform from one version to another. 
Smooth upgrade means that the +upgrade results in no service outage for the end-users. This requires +that the process of the upgrade is automatically carried out by a tool +(code name: Escalator) with pre-configured data. The upgrade process +includes preparation, validation, execution, monitoring and +conclusion. + +.. While it is good to have a tool for the entire upgrade process, + it is a challenging task, so maybe we shouldn't require automation + for the entire process right away. Automation is essential at + execution. + +.. Maybe we can analyse the information flows of the upgrade tool, + abstract the basic / essential actions from the tool (or tools), and + map them to a command set of NFVI / VIM's interfaces. + +The requirements are defined in a stepwise approach, i.e. in the first +phase focusing on the upgrade of the VIM then widening the scope to the +NFVI. + +The requirements may apply to different NFV functions (NFVI, or VIM, or +both of them). They will be classified in the Appendix of this +document. + +The objects being upgraded described in this document are software modules covered by +the red box in the picture below, which include: VIM and NFVI. + +The target of the upgrade is to reduce the impact on the applications in the +blue box below as much as possible. + +Please keep in mind that the upgrade tool does not take Vi-Vnfm and Or-Vi into +consideration. In other words, these two interfaces may not provide service normally +during the upgrade procedure. + + +.. figure:: images/figure1.png + :name: figure1 + :width: 100% + diff --git a/docs/requirements/102-Terminologies.rst b/docs/requirements/102-Terminologies.rst new file mode 100644 index 0000000..221196b --- /dev/null +++ b/docs/requirements/102-Terminologies.rst @@ -0,0 +1,129 @@ +=========== +Terminology +=========== + +Terminologies +============= + +Operator + The term refers to network service providers and Virtual Network + Function (VNF) providers.
+ +End-User + The term refers to a subscriber of the Operator's services. + +Network Service + The term refers to a service provided by an Operator to its + end-users using a set of (virtualized) Network Functions. + +Infrastructure Services + The term refers to services provided by the NFV Infrastructure to the VNFs + as required by the Management & Orchestration functions and especially the VIM. + I.e. these are the virtual resources as perceived by the VNFs. + +Smooth Upgrade + The term refers to an upgrade that results in no service outage + for the end-users. + +Rolling Upgrade + The term refers to an upgrade strategy, which upgrades a node or a subset + of nodes at a time in a wave style rolling through the data centre. It + is a popular upgrade strategy to maintain service availability. + +Parallel Universe Upgrade + The term refers to an upgrade strategy, which creates and deploys + a new universe - a system with the new configuration - while the old + system continues running. The state of the old system is transferred + to the new system after sufficient testing of the new system. + +Infrastructure Resource Model + The term refers to the representation of infrastructure resources, + namely: the physical resources, the virtualization + facility resources and the virtual resources. + +Physical Resource + The term refers to a piece of hardware in the NFV infrastructure that may + also include firmware enabling this piece of hardware. + +Virtual Resource + The term refers to a resource, which is provided as services built on top + of the physical resources via the virtualization facilities; in particular, + virtual resources are the resources on which VNFs are deployed. Examples of + virtual resources are: VMs, virtual switches, virtual routers, virtual disks. + +Virtualization Facility + The term refers to a resource that enables the creation + of virtual environments on top of the physical resources, e.g. + hypervisor, OpenStack, etc.
+ +Upgrade Campaign + The term refers to a choreography that describes how the upgrade should + be performed in terms of its targets (i.e. upgrade objects), the + steps/actions required for upgrading each, and the coordination of these + steps so that service availability can be maintained. It is an input to an + upgrade tool (Escalator) to carry out the upgrade. + +Upgrade Duration + The duration of an upgrade is characterized by the time elapsed between its + initiation and its completion. E.g. from the moment the execution of an + upgrade campaign has started until it has been committed. Depending on + the upgrade strategy, the state of the configuration and the upgrade target + some parts of the system may be in a more vulnerable state with respect to + service availability. + +Outage + The period of time during which a given service is not provided is referred to + as the outage of that given service. If a subsystem or the entire system + does not provide any service, it is the outage of the given subsystem or the + system. Smooth upgrade means upgrade with no outage for the user plane, i.e. + no VNF should experience service outage. + +Rollback + The term refers to a failure handling strategy that reverts the changes + done by a potentially failed upgrade execution one by one in a reverse order. + I.e. it is like undoing the changes done by the upgrade. + +Backup + The term refers to data persisted to a storage, so that it can be used to + restore the system or a given part of it in the same state as it was when the + backup was created assuming a cold restart. Changes made to the system from + the moment the backup was created till the moment it is used to restore the + (sub)system are lost in the restoration process. + +Restore + The term refers to a failure handling strategy that reverts the changes + done, for example, by an upgrade by restoring the system from some backup + data.
This results in the loss of any change and data persisted after the + backup was taken. To recover those, additional measures need to be taken + if necessary (e.g. rollforward). + +Rollforward + The term refers to a failure handling strategy applied after a restore + (from a backup) operation to recover any loss of data persisted between + the time the backup has been taken and the moment it is restored. Rollforward + requires that data that needs to survive the restore operation is logged at + a location not impacted by the restore so that it can be re-applied to the + system after its restoration from the backup. + +Downgrade + The term refers to an upgrade in which an earlier version of the software + is restored through the upgrade procedure. A system can be downgraded to any + earlier version and the compatibility of the versions will determine the + applicable upgrade strategies and whether service outage can be avoided. + In particular any data conversion needs special attention. + +Abbreviations +============= + +NFVI + The term is an abbreviation for Network Function Virtualization + Infrastructure; sometimes it is also referred to as data plane in this + document. The NFVI provides the virtual resources to the virtual + network functions under the control of the VIM. + +VIM + The term is an abbreviation for Virtual Infrastructure Manager; + sometimes it is also referred to as control plane in this document. + The VIM controls and manages the NFVI compute, network and storage + resources to provide the required virtual resources to the VNFs. + diff --git a/docs/requirements/103-Background.rst b/docs/requirements/103-Background.rst new file mode 100644 index 0000000..e21e310 --- /dev/null +++ b/docs/requirements/103-Background.rst @@ -0,0 +1,226 @@ +========== +Background +========== + +Upgrade Objects +=============== + +Physical Resource +^^^^^^^^^^^^^^^^^ + +Most cloud infrastructures support the dynamic addition and removal of +hardware.
Accordingly a hardware upgrade could be done by adding the new +piece of hardware and removing the old one. From the perspective of smooth +upgrade the orchestration/scheduling of these actions is the primary concern. + +Upgrading a physical resource may involve as well the upgrade of its firmware +and/or modifying its configuration data. This may require the restart of the +hardware. + +Virtual Resources +^^^^^^^^^^^^^^^^^ + +Addition and removal of virtual resources may be initiated by the users or be +a result of an elasticity action. Users may also request the upgrade of their +virtual resources using a new VM image. + +.. Needs to be moved to requirement section: Escalator should facilitate such an + option and allow for a smooth upgrade. + +On the other hand changes in the infrastructure, namely, in the hardware and/or +the virtualization facility resources may result in the upgrade of the virtual +resources. For example if for some reason the hypervisor is changed and +the current VMs cannot be migrated to the new hypervisor - they are +incompatible - then the VMs need to be upgraded too. This is not +something the NFVI user (i.e. VNFs) would know about. + + +Virtualization Facility Resources +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Based on the functionality they provide, virtualization facility +resources could be divided into computing node, networking node, +storage node and management node. + +The possible upgrade objects in these nodes are considered below: +(Note: hardware based virtualization may be considered as virtualization +facility resource, but from escalator perspective, it is better to +consider it as part of the hardware upgrade.) + +**Computing node** + +1. OS Kernel + +2. Hypervisor and virtual switch + +3. Other kernel modules, like drivers + +4. User space software packages, like nova-compute agents and other + control plane programs.
+ +Updating 1 and 2 will cause the loss of virtualization functionality of +the compute node, which may lead to the interruption of data plane services +if the virtual resource is not redundant. + +Updating 3 might have the same result. + +Updating 4 might lead to control plane services interruption if not an +HA deployment. + +.. I'm not sure why would 4 cause control plane interruption on a + compute node. My understanding is that simply the node cannot be managed. + Redundancy won't help in that either. + + +**Networking node** + +1. OS kernel, optional, not all switches/routers allow the upgrade of their + OS since it is more like a firmware than a generic OS. + +2. User space software package, like neutron agents and other control + plane programs + +Updating 1 if allowed will cause a node reboot and therefore leads to +data plane service interruption if the virtual resource is not +redundant. + +Updating 2 might lead to control plane services interruption if not an +HA deployment. + +**Storage node** + +1. OS kernel, optional, not all storage nodes allow the upgrade of their OS + since it is more like a firmware than a generic OS. + +2. Kernel modules + +3. User space software packages, control plane programs + +Updating 1 if allowed will cause a node reboot and therefore leads to +data plane services interruption if the virtual resource is not +redundant. + +Updating 2 might result in the same. + +Updating 3 might lead to control plane services interruption if not an +HA deployment. + +**Management node** + +1. OS Kernel + +2. Kernel modules, like drivers + +3. User space software packages, like database, message queue and + control plane programs. + +Updating 1 will cause a node reboot and therefore leads to control +plane services interruption if not an HA deployment. Updating 2 might +result in the same. + +Updating 3 might lead to control plane services interruption if not an +HA deployment.
+ +Upgrade Granularity +=================== + +The granularity of an upgrade can be characterized from two perspectives: +- the physical dimension and +- the software dimension + +Physical Dimension +^^^^^^^^^^^^^^^^^^ + +The physical dimension characterizes the number of similar upgrade objects +targeted by the upgrade, i.e. whether it is a full / partial upgrade of a +data centre, cluster, zone. +The upgrade of a data centre or a zone may be divided into +several batches. Thus there is a need for efficiency in the execution of +upgrades of a potentially huge number of upgrade objects while still maintaining +availability to fulfill the requirement of smooth upgrade. + +The upgrade of a cloud environment (cluster) may also +be partial. For example, in one cloud environment running a number of +VNFs, we may just try to upgrade one of them to check the stability and +performance, before we upgrade all of them. +Thus there is a need for proper organization of the artifacts associated with +the different upgrade objects. Also the different versions should be able +to coexist beyond the upgrade period. + +From this perspective special attention may be needed when upgrading +objects that are collaborating in a redundancy schema as in this case +different versions not only need to coexist but also collaborate. This +puts requirements on the upgrade objects primarily. If this is not possible +the upgrade campaign should be designed in such a way that the proper +isolation is ensured. + +Software Dimension +^^^^^^^^^^^^^^^^^^ + +The software dimension of the upgrade characterizes the upgrade object +type targeted and the combination in which they are upgraded together. + +Even though the upgrade may +initially target only one type of upgrade object, e.g. the hypervisor, +the dependency of other upgrade objects on this initial target object may +require their upgrade as well. I.e. the upgrades need to be combined.
From this +perspective the main concern is compatibility of the dependent and +sponsor objects. To take these dependencies into consideration +they need to be described together with the version compatibility information. +Breaking dependencies is the major cause of outages during upgrades. + +In other cases it is more efficient to upgrade a combination of upgrade +objects than to do it one by one. One aspect of the combination is how +the upgrade packages can be combined, whether a new image can be created for +them beforehand or the different packages can be installed during the upgrade +independently, but activated together. + +The combination of upgrade objects may span across +layers (e.g. software stack in the host and the VM of the VNF). +Thus, it may require additional coordination between the management layers. + +With respect to each upgrade object type and even stacks we can +distinguish major and minor upgrades: + +**Major Upgrade** + +Upgrades between major releases may introduce significant changes in +function, configuration and data, such as the upgrade of OPNFV from +Arno to Brahmaputra. + +**Minor Upgrade** + +Upgrades inside one major release, which would not lead to changing +the structure of the platform and may not affect the schema of the +system data. + +Scope of Impact +=============== + +Considering availability and therefore smooth upgrade, one of the major +concerns is the predictability and control of the outcome of the different +upgrade operations. Ideally an upgrade can be performed without impacting any +entity in the system, which means none of the operations change or potentially +change the behaviour of any entity in the system in an uncontrolled manner. +Accordingly the operations of such an upgrade can be performed any time while +the system is running, while all the entities are online. No entity needs to be +taken offline to avoid such adverse effects. Hence such upgrade operations +are referred to as online operations.
The effects of the upgrade might be activated +next time it is used, or may require a special activation action such as a +restart. Note that the activation action provides more control and predictability. + +If an entity's behavior in the system may change due to the upgrade it may +be better to take it offline for the time of the relevant upgrade operations. +The main question is however considering the hosting relation of an upgrade +object what hosted entities are impacted. Accordingly we can identify a scope +which is impacted by taking the given upgrade object offline. The entities +that are in the scope of impact may need to be taken offline or moved out of +this scope i.e. migrated. + +If the impacted entity is in a different layer managed by another manager +this may require coordination because some infrastructure resources are taken +out of service for the time of their upgrade, and these support virtual +resources used by VNFs that should not experience outages. The hosted VNFs +may or may not allow for the hot migration of their VMs. In case of migration +the VMs' placement policy should be considered. + diff --git a/docs/requirements/104-Requirements.rst b/docs/requirements/104-Requirements.rst new file mode 100644 index 0000000..b6e7f57 --- /dev/null +++ b/docs/requirements/104-Requirements.rst @@ -0,0 +1,478 @@ +============ +Requirements +============ + +Upgrade duration +================ + +As the OPNFV end-users are primarily Telecom operators, the network +services provided by the VNFs deployed on the NFVI should meet the +requirement of 'Carrier Grade'.:: + + In telecommunication, a "carrier grade" or "carrier class" refers to a + system, or a hardware or software component that is extremely reliable, + well tested and proven in its capabilities. Carrier grade systems are + tested and engineered to meet or exceed "five nines" high availability + standards, and provide very fast fault recovery through redundancy + (normally less than 50 milliseconds).
[from wikipedia.org] + +"five nines" means working all the time in ONE YEAR except 5'15". + +:: + + We have learnt that a well prepared upgrade of OpenStack needs 10 + minutes. The major time slot in the outage time is spent on + synchronizing the database. [from ' Ten minutes OpenStack Upgrade? Done! + ' by Symantec] + +This 10 minutes of downtime of the OpenStack services however did not impact the +users, i.e. the VMs running on the compute nodes. This was the outage of +the control plane only. On the other hand with respect to the +preparations this was a manually tailored upgrade specific to the +particular deployment and the versions of each OpenStack service. + +The project targets to achieve a more generic methodology, which however +requires that the upgrade objects fulfil certain requirements. Since +this is only possible in the long run we target first the upgrade +of the different VIM services from version to version. + +**Questions:** + +1. Can we manage to upgrade OPNFV in only 5 minutes? + +.. The first question is whether we have the same carrier grade + requirement on the control plane as on the user plane. I.e. how + much control plane outage we can/are willing to tolerate? + In the above case probably if the database is only half of the size + we can do the upgrade in 5 minutes, but is that good? It also means + that if the database is twice as much then the outage is 20 + minutes. + For the user plane we should go for less as with two releases yearly + that means 10 minutes outage per year. + +.. 10 minutes outage per year to the users? Plus, if we take + control plane into the consideration, then total outage will be + more than 10 minutes in the whole network, right? + +.. The control plane outage does not have to cause outage to + the users, but it may of course depending on the size of the system + as it's more likely that there's a failure that needs to be handled + by the control plane. + +2. Is it acceptable for end users?
Such as a planned service + interruption lasting more than ten minutes for software + upgrade. + +.. For user plane, no it's not acceptable in case of + carrier-grade. The 5' 15" downtime should include unplanned and + planned downtimes. + +.. I agree with Maria, it is not acceptable. + +3. Will any VNFs still work well when VIM is down? + +.. In case of OpenStack it seems yes. .:) + +The maximum duration of an upgrade +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The duration of an upgrade is related to and proportional to the +scale and the complexity of the OPNFV platform as well as the +granularity (in function and in space) of the upgrade. + +.. Also, if it is a partial upgrade like module upgrade, it depends + also on the OPNFV modules and their tight connection entities as well. + +.. Since the maintenance window is shrinking and becoming non-existent + the duration of the upgrade is secondary to the requirement of smooth upgrade. + But probably we want to be able to put a time constraint on each upgrade + during which it must complete otherwise it is considered failed and the system + should be rolled back. I.e. in case of automatic execution it might not be clear + if an upgrade is long or just hanging. The time constraints may be a function + of the size of the system in terms of the upgrade object(s). + +The maximum duration of a roll back when an upgrade is failed +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The duration of a roll back is shorter than the corresponding upgrade. It +depends on the duration of restoring the software and configuration data from +the pre-upgrade backup / snapshot. + +.. During the upgrade process two types of failure may happen: + In case we can recover from the failure by undoing the upgrade + actions it is possible to roll back the already executed part of the + upgrade in a graceful manner introducing no more service outage than + what was introduced during the upgrade.
Such a graceful roll back + requires typically the same amount of time as the executed portion of + the upgrade and impose minimal state/data loss. + +.. Requirement: It should be possible to roll back gracefully the + failed upgrade of stateful services of the control plane. + In case we cannot recover from the failure by just undoing the + upgrade actions, we have to restore the upgraded entities from their + backed up state. In other terms the system falls back to an earlier + state, which is typically a faster recovery procedure than graceful + roll back and depending on the statefulness of the entities involved it + may result in significant state/data loss. + +.. Two possible types of failures can happen during an upgrade + +.. We can recover from the failure that occurred in the upgrade process: + In this case, a graceful rolling back of the executed part of the + upgrade may be possible which would "undo" the executed part in a + similar fashion. Thus, such a roll back introduces no more service + outage during an upgrade than the executed part introduced. This + process typically requires the same amount of time as the executed + portion of the upgrade and impose minimal state/data loss. + +.. We cannot recover from the failure that occurred in the upgrade + process: In this case, the system needs to fall back to an earlier + consistent state by reloading this backed-up state. This is typically + a faster recovery procedure than the graceful roll back, but can cause + state/data loss. The state/data loss usually depends on the + statefulness of the entities whose state is restored from the backup. + +The maximum duration of a VNF interruption (Service outage) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Since not the entire process of a smooth upgrade will affect the VNFs, +the duration of the VNF interruption may be shorter than the duration +of the upgrade. 
In some cases, the VNF running without the control +from of the VIM is acceptable. + +.. Should require explicitly that the NFVI should be able to + provide its services to the VNFs independent of the control plane? + +.. Requirement: The upgrade of the control plane must not cause + interruption of the NFVI services provided to the VNFs. + +.. With respect to carrier-grade the yearly service outage of the + VNF should not exceed 5' 15" regardless whether it is planned or + unplanned outage. Considering the HA requirements TL-9000 requires an + end-to-end service recovery time of 15 seconds based on which the ETSI + GS NFV-REL 001 V1.1.1 (2015-01) document defines three service + availability levels (SAL). The proposed example service recovery times + for these levels are: + +.. SAL1: 5-6 seconds + +.. SAL2: 10-15 seconds + +.. SAL3: 20-25 seconds + +.. my comment was actually that the downtime metrics of the + underlying elements, components and services are small fraction of the + total E2E service availability time. No-one on the E2E service path + will get the whole downtime allocation (in this context it includes + upgrade process related outages for the services provided by VIM etc. + elements that are subject to upgrade process). + +.. So what you are saying is that the upgrade of any entity + (component, service) shouldn't cause even this much service + interruption. This was the reason I brought these figures here as well + that they are posing some kind of upper-upper boundary. Ideally the + interruption is in the millisecond range i.e. no more than a + switch-over or a live migration. + +.. Requirement: Any interruption caused to the VNF by the upgrade + of the NFVI should be in the sub-second range. + +.. In the future we also need to consider the upgrade of the NFVI, + i.e. HW, firmware, hypervisors, host OS etc. + +Pre-upgrading Environment +========================= + +System is running normally. 
If there are any faults before the upgrade, +it is difficult to distinguish between upgrade introduced and the environment +itself. + +The environment should have the redundant resources. Because the upgrade +process is based on the business migration, in the absence of resource +redundancy,it is impossible to realize the business migration, as well as to +achieve a smooth upgrade. + +Resource redundancy in two levels: + +NFVI level: This level is mainly the compute nodes resource redundancy. +During the upgrade, the virtual machine on business can be migrated to another +free compute node. + +VNF level: This level depends on HA mechanism in VNF, such as: +active-standby, load balance. In this case, as long as business of the target +node on VMs is migrated to other free nodes, the migration of VM might not be +necessary. + +The way of redundancy to be used is subject to the specific environment. +Generally speaking, During the upgrade, the VNF's service level availability +mechanism should be used in higher priority than the NFVI's. This will help +us to reduce the service outage. + +Release version of software components +====================================== + +This is primarily a compatibility requirement. You can refer to Linux/Python +Compatible Semantic Versioning 3.0.0: + +Given a version number MAJOR.MINOR.PATCH, increment the: + +MAJOR version when you make incompatible API changes, + +MINOR version when you add functionality in a backwards-compatible manner, + +PATCH version when you make backwards-compatible bug fixes. + +Some internal interfaces of OpenStack will be used by Escalator indirectly, +such as VM migration related interface between VIM and NFVI. So it is required +to be backward compatible on these interfaces. Refer to "Interface" chapter +for details. + +Work Flows +========== + +Describes the different types of requirements. To have a table to label the source of +the requirements, e.g. Doctor, Multi-site, etc. 
+ +Basic Actions +============= + +This section describes the basic functions that may be required by Escalator. + +Preparation (offline) +^^^^^^^^^^^^^^^^^^^^^ + +This is the design phase when the upgrade plan (or upgrade campaign) is +being designed so that it can be executed automatically with minimal +service outage. It may include the following work: + +1. Check the dependencies of the software modules and their impact, + backward compatibilities to figure out the appropriate upgrade method + and ordering. +2. Find out if a rolling upgrade could be planned with several rolling + steps to avoid any service outage due to the upgrade of some + parts/services at the same time. +3. Collect the proper version files and check the integration for + upgrading. +4. The preparation step should produce an output (i.e. upgrade + campaign/plan), which is executable automatically in an NFV Framework + and which can be validated before execution. + + - The upgrade campaign should not be referring to scalable entities + directly, but allow for adaptation to the system configuration and + state at any given moment. + - The upgrade campaign should describe the ordering of the upgrade + of different entities so that dependencies and redundancies can be + maintained during the upgrade execution. + - The upgrade campaign should provide information about the + applicable recovery procedures and their ordering. + - The upgrade campaign should consider information about the + verification/testing procedures to be performed during the upgrade + so that upgrade failures can be detected as soon as possible and + the appropriate recovery procedure can be identified and applied. + - The upgrade campaign should provide information on the expected + execution time so that hanging execution can be identified. + - The upgrade campaign should indicate any point in the upgrade when + coordination with the users (VNFs) is required. + +..
Depending on the attributes of the object being upgraded, the + upgrade plan may be split into step(s) and/or sub-plan(s), and even + smaller sub-plans in the design phase. The plan(s) or sub-plan(s) may + include step(s) or sub-plan(s). + +Validation the upgrade plan / Checking the pre-requisites of System( offline / online) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The upgrade plan should be validated before the execution by testing +it in a test environment which is similar to the product environment. + +.. However it could also mean that we can identify some properties + that it should satisfy e.g. what operations can or cannot be executed + simultaneously like never take out two VMs of the same VNF. + +.. Another question is if it requires that the system is in a particular + state when the upgrade is applied. I.e. if there's certain amount of + redundancy in the system, migration is enabled for VMs, when the NFVI + is upgraded the VIM is healthy, when the VIM is upgraded the NFVI is + healthy, etc. + +.. I'm not sure what online validation means: Is it the validation of the + upgrade plan/campaign or the validation of the system that it is in a + state that the upgrade can be performed without too much risk? + +Before the upgrade plan is executed, the system health of the +online product environment should be checked and confirmed to satisfy +the requirements which were described in the upgrade plan. The +sysinfo, which includes e.g. system alarms, performance statistics and +diagnostic logs, will be collected and analyzed. It is required to +resolve all of the system faults or exclude the unhealthy part before +executing the upgrade plan. + + +Backup/Snapshot (online) +^^^^^^^^^^^^^^^^^^^^^^^^ + +To avoid loss of data when an unsuccessful upgrade is encountered, the +data should be backed up and the system state snapshot should be taken +before the execution of the upgrade plan.
This would be considered in the +upgrade plan. + +Several backups/Snapshots may be generated and stored before the single +steps of changes. The following data/files are required to be +considered: + +1. running version files for each node. +2. system components' configuration file and database. +3. image and storage, if it is necessary. + +.. Does 3 imply VNF image and storage? I.e. VNF state and data? + +.. The following text is derived from previous "4. Negotiate + with the VNF if it's ready for the upgrade" + +Although the upper layer, which includes VNFs and VNFMs, is out of the +scope of Escalator, it is still recommended to make it ready for a +smooth system upgrade. The escalator cannot guarantee the safety of +VNFs. The upper layer should have some safeguard mechanism in its design, +and be ready to avoid failures during system upgrade. + +Execution (online) +^^^^^^^^^^^^^^^^^^ + +The execution of the upgrade plan should be a dynamic procedure which is +controlled by Escalator. + +.. Revised text to be general. + +1. It is required to support execution either in sequence or in + parallel. +2. It is required to check the result of the execution and take the + action according to the situation and the policies in the upgrade plan. +3. It is required to execute properly on various configurations of + system object. I.e. stand-alone, HA, etc. +4. It is required to execute on the designated different parts of the + system. I.e. physical server, virtualized server, rack, chassis, + cluster, even different geographical places. + +Testing (online) +^^^^^^^^^^^^^^^^ + +Testing is performed after upgrading the whole system or parts of the system to make +sure the upgraded system (object) is working normally. + +.. Revised text to be general. + +1. It is recommended to run the prepared test cases to see if the + functionalities are available without any problem. +2. It is recommended to check the sysinfo, e.g.
system alarms, + performance statistics and diagnostic logs to see if there are any + abnormalities. + +Restore/Roll-back (online) +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When an upgrade fails, a quick system restore or system +roll-back should be performed to recover the system and the services. + +.. Revised text to be general. + +1. It is recommended to support system restore from backup when an upgrade + has failed. +2. It is recommended to support graceful roll-back with reverse order + steps if possible. + +Monitoring (online) +^^^^^^^^^^^^^^^^^^^ + +Escalator should continually monitor the process of upgrade. It +keeps the status of each module, each node and each cluster updated in a +status table during the upgrade. + +.. Revised text to be general. + +1. It is required to collect the status of every object being upgraded + and to send abnormal alarms during the upgrade. +2. It is recommended to reuse the existing monitoring system, like alarm. +3. It is recommended to support pro-active querying. +4. It is recommended to support passively waiting for notification. + +**Two possible ways for monitoring:** + +**Pro-Actively Query** requires that NFVI/VIM provides a proper API or CLI +interface. If Escalator serves as a service, it should pass on these +interfaces. + +**Passively Wait for Notification** requires that Escalator provides a +callback interface, which could be used by NFVI/VIM systems or an upgrade +agent to send back notification. + +.. I am not sure why not to subscribe the notification. + +Logging (online) +^^^^^^^^^^^^^^^^ + +Record the information generated by escalator into log files. The log +file is used for manual diagnosis of exceptions. + +1. It is required to support logging. +2. It is recommended to include time stamp, object id, action name, + error code, etc. + +Administrative Control (online) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Administrative Control is used to control the privilege to start any +of escalator's actions in order to avoid unauthorized operations. + +#.
It is required to support an administrative control mechanism. +#. It is recommended to reuse the system's own security system. +#. It is required to avoid conflicts when the system's own security system + is being upgraded. + +Requirements on Object being upgraded +===================================== + +.. We can develop BPs in future from requirements of this section and + gap analysis for upper stream projects + +Escalator focuses on smooth upgrade. In practical implementation, it +might be combined with installer/deployer, or act as an independent +tool/service. Either way, it requires that the target systems (NFVI and +VIM) are developed/deployed in a way that Escalator could perform +upgrade on them. + +On NFVI system, live-migration is likely used to maintain availability +because OPNFV would like to make HA transparent to the end user. This +requires the VIM system to be able to put a compute node into maintenance mode +and then isolate it from normal service. Otherwise, new NFVI instances +risk being scheduled onto the upgrading node. + +On VIM system, availability is likely achieved by redundancy. This +imposes fewer requirements on the system/services being upgraded (see PVA +comments in early version). However, there should be a way to put the +target system into standby mode, because starting an upgrade on the +master node in a cluster is likely a bad idea. + +.. Revised text to be general. + +1. It is required for NFVI/VIM to support a **service handover** mechanism + that minimizes interruption to 0.001% (i.e. 99.999% service + availability). Possible implementations are live-migration, redundant + deployment, etc. (Note: for VIM, interruption could be less + restrictive) + +2. It is required for NFVI/VIM to restore the early version in an efficient + way, such as **snapshot**. + +3. It is required for NFVI/VIM to **migrate data** efficiently between + base and upgraded system. + +4. It is recommended for NFVI/VIM's interface to support upgrade + orchestration, e.g.
reading/setting system state. + +Functional Requirements +======================= + +Availability mechanism, etc. + +Non-functional Requirements +=========================== diff --git a/docs/requirements/105-Use_Cases.rst b/docs/requirements/105-Use_Cases.rst new file mode 100644 index 0000000..9f13110 --- /dev/null +++ b/docs/requirements/105-Use_Cases.rst @@ -0,0 +1,213 @@ +========= +Use Cases +========= + +This section describes the use cases in different system configurations +to verify the requirements of Escalator. + +System Configurations +===================== + +HA configuration +^^^^^^^^^^^^^^^^ + +A HA configuration system is very popular in the operator's data centre. +It is a typical product environment. It is always running 7\*24 with VNFs +running on it to provide services to the end users. + + +Non-HA configuration +^^^^^^^^^^^^^^^^^^^^ + +A non-HA configuration system is normally deployed for experimental or +development usages, such as a Vagrant/VM environment. + +Escalator supports upgrading the system in this configuration, but it may +not guarantee a smooth upgrade. + +Use cases +========= + +Use case #1: Smooth upgrade in a HA configuration +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +For a system with HA configuration, the operator can use Escalator to +smooth-upgrade NFVI/VIM components into a new version without any service +outage. + +When a compute node is being upgraded, the VMs on the node may need to be migrated +to other compute nodes to avoid service outage, so it is required that there are +enough redundant resources to migrate VMs on this compute node. + +Before upgrade, the operator can use Escalator to check whether smooth upgrade +conditions are all satisfied. These conditions include whether there are enough +idle resources to migrate VMs during upgrading, and whether the new version is +compatible with the current one, etc. If there are some conditions not +satisfied, Escalator will show them.
Escalator can also provide the solutions if +there is any, such as the number and configuration of spare compute nodes which +are needed. + +When upgrade starts, Escalator will also automatically check whether smooth +upgrade conditions are all satisfied. If some smooth upgrade conditions are not +satisfied, Escalator will show the failure of smooth upgrade. + +- Pre-Conditions + + 1. The system is running as normal. + 2. The VNFs are providing services as usual. + +- Upgrading steps + + 1. The VNFs are continually providing services during the upgrade. + 2. The operator successfully logged in the GUI of Escalator to select the + software packages including Linux OS, Hypervisor, OpenStack, ODL and other + OPNFV components, ect. (All or part of components could be selected.) + 3. Select the nodes to be upgraded. i.e. controller node, network node, + storage node and compute node, etc. + 4. Select "Disable Scale-up". It will limit the scale-up operation when + upgrade is in progress to prevent failures due to the shortage of + resources. + 5. Select "Check Smooth Upgrade Conditions". If Escalator shows that there are + some conditions not satisfied, try to resolve them according to the + solutions provided. + 6. Select "Smooth Upgrade", then apply the upgrade operation. + 7. Select "Restore Scale-up" after the upgrade. It will restore scale-up to + the original enabled/disabled state before upgrade. + +- Post-Conditions + + 1. The system is upgraded successfully. + 2. There is no service outage during the upgrade. + 3. The VNFs are providing services as usual after the upgrade. + +Use case #2: Roll-back after a failed smooth upgrade in a HA configuration +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +For a system with HA configuration, if the upgrade fails when the operator is +smooth-upgrading NFVI/VIM components into a new version using Escalator, the +operator can roll-back the system without any service outage. 
+ +- Pre-Conditions + + 1. The system is running as normal. + 2. The VNFs are providing services as usual. + 3. Scale-up operation is disabled. + 4. Smooth upgrade failed. + +- Roll-back steps + + 1. Escalator concludes that the upgrade has failed and provides the operator + with the reason. + 2. Select the "Roll-back" operation. + 3. If the roll-back is successful, go to step 4, otherwise the operator can + select "Restore Backup" to restore the system from the backup data. + 4. Select "Restore Scale-up" after the roll-back. It will restore scale-up to + the original enabled/disabled state before upgrade. + +- Post-Conditions + + 1. The system is rolled-back successfully when the upgrade failed. + 2. There is no service outage during the roll-back. + 3. The VNFs are providing services as usual after the roll-back. + +Use case #3: Roll-back after a successful smooth upgrade in a HA configuration +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +When a smooth upgrade in a HA configuration is successful, the operator may want +to roll-back for some reasons, such as performance issues. +Escalator supports roll-back after a successful smooth upgrade without any +service outage. + +- Pre-Conditions + + 1. The system is running as normal. + 2. The VNFs are providing services as usual. + 3. Smooth upgrade succeeded. + +- Roll-back steps + + 1. Select "Disable Scale-up". It will limit the scale-up operation when roll- + back is in progress to prevent failures due to the shortage of resources. + 2. Select "Check Smooth Roll-back Conditions". If Escalator shows that there + are some conditions not satisfied, try to resolve them according to the + solutions provided. + 3. Select "Roll-back", then apply the roll-back operation. + 4. If the roll-back is successful, go to step 5, otherwise the operator can + select "Restore Backup" to restore the system from the backup data. + 5. Select "Restore Scale-up" after the roll-back. 
It will restore scale-up to + the original enabled/disabled state before roll-back. + +- Post-Conditions + + 1. The system is rolled-back successfully. + 2. There is no service outage during the roll-back. + 3. The VNFs are providing services as usual after the roll-back. + +Use case #4: Non-smooth upgrade in a non-HA/HA configuration +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +For a system with non-HA configuration, the operator can also use Escalator to +upgrade NFVI/VIM components into a new version. In this case, the upgrade may +result in service outage. In other words, the upgrade is non-smooth. +For a system with HA configuration, if the service outage is acceptable or +inevitable, the operator can also use Escalator to non-smoothly upgrade the +system. + +- Pre-Conditions + + 1. The system is running as normal. + +- Upgrading steps + + 1. The operator successfully logged in the GUI of Escalator to select the + software packages including Linux OS, Hypervisor, OpenStack, ODL and other + OPNFV components, ect. (All or part of components could be selected.) + 2. Select the nodes to be upgraded. i.e. controller node, network node, + storage node and compute node, etc. + 3. Select "Non-Smooth Upgrade", then apply the upgrade operation. + +- Post-Conditions + + 1. The system is upgraded successfully. + +Use case #5: Roll-back after a failed non-smooth upgrade in a non-HA/HA configuration +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +For a system with non-HA/HA configuration, if the upgrade fails when the +operator is non-smoothly upgrading NFVI/VIM components into a new version using +Escalator, the operator can roll-back the system. In this case, the roll-back +may result in service outage. + +- Pre-Conditions + + 1. The system is running as normal. + 2. Non-smooth upgrade failed. + +- Roll-back steps + + 1. Escalator concludes that the upgrade has failed and provides the operator + with the reason. 
+ 2. Select the "Roll-back" operation. + 3. If the roll-back fails, the operator can select "Restore Backup" to restore + the system from the backup data. + +- Post-Conditions + + 1. The system is rolled-back successfully when the upgrade failed. + +Use case #6: Roll-back after a successful non-smooth upgrade in a non-HA/HA configuration +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +When a non-smooth upgrade in a non-HA/HA configuration is successful, the +operator may want to roll-back for some reasons, such as performance issues. +Escalator supports roll-back after a successful non-smooth upgrade. In this +case, the roll-back may result in service outage. + +- Pre-Conditions + + 1. The system is running as normal. + 2. Non-smooth upgrade succeeded. + +- Roll-back steps + + 1. Select the "Roll-back" operation. + 2. If the roll-back fails, the operator can select "Restore Backup" to restore + the system from the backup data. + +- Post-Conditions + + 1. The system is rolled-back successfully.
diff --git a/docs/requirements/106-Reference.rst b/docs/requirements/106-Reference.rst new file mode 100644 index 0000000..ff087fa --- /dev/null +++ b/docs/requirements/106-Reference.rst @@ -0,0 +1,18 @@ +========= +Reference +========= + +[1] ETSI GS NFV 002 (V1.1.1): “Architectural Framework” + +[2] ETSI GS NFV 003 (V1.1.1): "Terminology for Main Concepts in NFV" + +[3] ETSI GS NFV-SWA001:“Virtual Network Function Architecture” + +[4] ETSI GS NFV-MAN001:“Management and Orchestration” + +[5] ETSI GS NFV-REL001:"Resiliency Requirements" + +[6] QuEST Forum TL-9000:"Quality Management System Requirement +Handbook" + +[7] Service Availability Forum AIS:"Software Management Framework" diff --git a/docs/requirements/1A1-Requirements_from_other_Projects.rst b/docs/requirements/1A1-Requirements_from_other_Projects.rst new file mode 100644 index 0000000..a62405d --- /dev/null +++ b/docs/requirements/1A1-Requirements_from_other_Projects.rst @@ -0,0 +1,34 @@ +================================ +Requirements from other Projects +================================ + +Doctor Project +============== + +.. This scenario could be out of scope in Escalator project, but + having the option to support this should be better to align with + Doctor requirements. + +The scope of Doctor project also covers maintenance scenario in which + +1. The VIM administrator requests host maintenance to VIM. + +2. VIM will notify it to consumer such as VNFM to trigger application level + migration or switching active-standby nodes. + +3. VIM waits response from the consumer for a short while. + +- VIM should send out notification of VM migration to consumer (VNFM) + as abstracted message like "maintenance". + +- VIM could wait VM migration until it receives "VM ready to + maintenance" message from the owner (VNFM) + +HA Project +========== + +Multi-site Project +================== + +- Escalator upgrade one site should at least not lead to the other site + API token validation failed. 
diff --git a/docs/requirements/1A2-Questionnaire_of_Escalator.rst b/docs/requirements/1A2-Questionnaire_of_Escalator.rst new file mode 100644 index 0000000..c92a391 --- /dev/null +++ b/docs/requirements/1A2-Questionnaire_of_Escalator.rst @@ -0,0 +1,11 @@ +========================== +Questionnaire of Escalator +========================== + +A Questionnaire was created for collecting requirements from other projects. + +Escalator Questionnaire: +https://wiki.opnfv.org/_media/wiki/opnfv_escalator_questionnaire_20150723.pptx + +Answer the questionnaire: https://docs.google.com/forms/d/11o1mt15zcq0WBtXYK0n6lKF8XuIzQTwvv8ePTjmcoF0/viewform?usp=send_form + diff --git a/docs/requirements/300-Gap_Analysis_Report.rst b/docs/requirements/300-Gap_Analysis_Report.rst new file mode 100644 index 0000000..1f1d3fe --- /dev/null +++ b/docs/requirements/300-Gap_Analysis_Report.rst @@ -0,0 +1,50 @@ +=================== +Gap Analysis Report +=================== + +Impact Analysis +=============== + +Upgrading the different software modules may cause different impact on +the availability of the infrastructure resources and even on the service +continuity of the vNFs. + +**Software modules in the computing nodes** + +#. Host OS patch + +#. Hypervisor, such as KVM, QEMU, XEN, libvirt +#. Openstack agent in computing nodes (like Nova agent, Ceilometer + agent...) + +.. As SW module, we should list the host OS and maybe its + drivers as well. From upgrade perspective do we limit host OS + upgrades to patches only? + +**Software modules in network nodes** + +#. Neutron L2/L3 agent +#. OVS, SR-IOV Driver + +**Software modules storage nodes** + +#. 
Ceph + +The table below analyses such an impact - considering a single instance +of each software module - from the following aspects: + +- the function which will be lost during upgrade, +- the duration of the loss of this specific function, +- if this causes the loss of the vNF function, +- if it causes incompatibility in the different parts of the software, +- what should be backed up before the upgrade, +- the duration of restoration time if the upgrade fails + +These values provided come from internal testing and based on some +assumptions, they may vary depending on the deployment techniques. +Please feel free to add if you find more efficient values during your +testing. + +https://wiki.opnfv.org/_media/upgrade_analysis_v0.5.xlsx + +Note that no redundancy of the software modules is considered in the table. diff --git a/docs/requirements/etc/conf.py b/docs/requirements/etc/conf.py new file mode 100644 index 0000000..0066035 --- /dev/null +++ b/docs/requirements/etc/conf.py @@ -0,0 +1,34 @@ +import datetime +import sys +import os + +try: + __import__('imp').find_module('sphinx.ext.numfig') + extensions = ['sphinx.ext.numfig'] +except ImportError: + # 'pip install sphinx_numfig' + extensions = ['sphinx_numfig'] + +# numfig: +number_figures = True +figure_caption_prefix = "Fig." 
+ +source_suffix = '.rst' +master_doc = 'index' +pygments_style = 'sphinx' +html_use_index = False + +pdf_documents = [('index', u'OPNFV', u'OPNFV Project', u'OPNFV')] +pdf_fit_mode = "shrink" +pdf_stylesheets = ['sphinx','kerning','a4'] +#latex_domain_indices = False +#latex_use_modindex = False + +latex_elements = { + 'printindex': '', +} + +project = u'OPNFV: Template documentation config' +copyright = u'%s, OPNFV' % datetime.date.today().year +version = u'1.0.0' +release = u'1.0.0' diff --git a/docs/requirements/etc/opnfv-logo.png b/docs/requirements/etc/opnfv-logo.png new file mode 100644 index 0000000..1519503 Binary files /dev/null and b/docs/requirements/etc/opnfv-logo.png differ diff --git a/docs/requirements/images/figure1.png b/docs/requirements/images/figure1.png new file mode 100644 index 0000000..5a83842 Binary files /dev/null and b/docs/requirements/images/figure1.png differ diff --git a/docs/requirements/index.rst b/docs/requirements/index.rst new file mode 100644 index 0000000..599f8bd --- /dev/null +++ b/docs/requirements/index.rst @@ -0,0 +1,37 @@ +.. OPNFV Release Engineering documentation, created by + sphinx-quickstart on Tue Jun 9 19:12:31 2015. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +.. image:: etc/opnfv-logo.png + :height: 40 + :width: 200 + :alt: OPNFV + :align: left + +ESCALATOR USER REQUIREMENTS +======================================= + +Contents: + +.. toctree:: + :maxdepth: 4 + :titlesonly: + + + 000-Contributors.rst + 101-Scope.rst + 102-Terminologies.rst + 103-Background.rst + 104-Requirements.rst + 105-Use_Cases.rst + 106-Reference.rst + 1A1-Requirements_from_other_Projects.rst + 1A2-Questionnaire_of_Escalator.rst + + +* :ref:`search` + +Revision: _sha1_ + +Build date: |today| -- cgit 1.2.3-korg