author     Alexandru Avadanii <Alexandru.Avadanii@enea.com>  2017-08-23 22:40:36 +0200
committer  Alexandru Avadanii <Alexandru.Avadanii@enea.com>  2017-08-23 22:40:36 +0200
commit     72baec2c0c7c9dde7c95b8528aa680ff623c8cae (patch)
tree       afd42de78f1f7fb5b31a56fcd62f8743225ae4ed
parent     92c6cf9a373eb7c2cbe8f09a193d9fa43935bee3 (diff)
p/fuel: MaaS: commissioning/deployment retry
Change-Id: Ic4aa514e773d2963ad4a45b5be475e17b22885b4
Signed-off-by: Alexandru Avadanii <Alexandru.Avadanii@enea.com>
-rw-r--r--  patches/opnfv-fuel/0018-MaaS-commissioning-deployment-retry.patch  168
1 file changed, 168 insertions, 0 deletions
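
In short, the patch below turns the deploy script's wait_for helper into one that takes the attempt budget as its first argument, and routes the MaaS status checks through a retried maas_fixup function. A minimal sketch of the resulting call convention, with the command strings taken from the patch itself:

  # attempt budget comes first, then the command string eval'ed until it succeeds
  wait_for 10 "salt -C 'mas01*' state.apply maas.region"
  wait_for 10 maas_fixup    # one attempt = one full commissioning/deployment fix-up pass
  wait_for 90 "! salt '*' test.ping | tee /dev/stderr | fgrep -q 'Not connected'"
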
diff --git a/patches/opnfv-fuel/0018-MaaS-commissioning-deployment-retry.patch b/patches/opnfv-fuel/0018-MaaS-commissioning-deployment-retry.patch
new file mode 100644
index 00000000..3a26d9b0
--- /dev/null
+++ b/patches/opnfv-fuel/0018-MaaS-commissioning-deployment-retry.patch
@@ -0,0 +1,168 @@
+From: Alexandru Avadanii <Alexandru.Avadanii@enea.com>
+Date: Wed, 23 Aug 2017 04:23:26 +0200
+Subject: [PATCH] MaaS: commissioning/deployment retry
+
+While at it, parametrize the maximum attempt count in the maas state's
+"wait_for" and reduce the retry count for certain simpler tasks.
+
+Change-Id: I3ac2877719cdd32613bcf41186ebbb9f3f3aee93
+Signed-off-by: Alexandru Avadanii <Alexandru.Avadanii@enea.com>
+---
+ mcp/config/states/maas | 63 ++++++++++++++++------
+ mcp/salt-formulas/maas/machines/delete.sls | 12 +++++
+ .../maas/machines/mark_broken_fixed.sls | 12 +++++
+ 3 files changed, 70 insertions(+), 17 deletions(-)
+ create mode 100644 mcp/salt-formulas/maas/machines/delete.sls
+ create mode 100644 mcp/salt-formulas/maas/machines/mark_broken_fixed.sls
+
+diff --git a/mcp/config/states/maas b/mcp/config/states/maas
+index 0cf4f68..080bbf5 100755
+--- a/mcp/config/states/maas
++++ b/mcp/config/states/maas
+@@ -1,17 +1,47 @@
+ #!/bin/bash
+ function wait_for() {
++ local total_attempts=$1; shift
+ local cmdstr=$@
+- local total_attempts=360
+ local sleep_time=10
+- local attempt=1
+ echo "[NOTE] Waiting for cmd to return success: ${cmdstr}"
+- while ((attempt <= total_attempts)); do
++ for attempt in $(seq "${total_attempts}"); do
+ eval "${cmdstr}" && break || true
+ echo -n '.'; sleep "${sleep_time}"
+- ((attempt+=1))
+ done
+ }
+
++# Wait for MaaS commissioning/deploy to finish, retry on failure
++function maas_fixup() {
++ local statuscmd="salt 'mas01*' --out yaml state.apply maas.machines.status"
++ wait_for 180 "${statuscmd} | tee /dev/stderr | " \
++ "grep -Eq '((Deployed|Ready): 5|status:Failed|status:Allocated)'"
++ local statusout=$(eval "${statuscmd}")
++
++ local fcnodes=$(echo "${statusout}" | \
++ grep -Po '(?<=system_id:)(.*)(?=,status:Failed commissioning)')
++ for node_system_id in ${fcnodes}; do
++ salt -C 'mas01*' state.apply maas.machines.delete \
++ pillar="{'system_id': '${node_system_id}'}"
++ done
++ if [ -n "${fcnodes}" ]; then
++ salt -C 'mas01*' state.apply maas.machines
++ return 1
++ fi
++
++ local fdnodes=$(echo "${statusout}" | \
++ grep -Po '(?<=system_id:)(.*)(?=,status:(Failed deployment|Allocated))')
++ for node_system_id in ${fdnodes}; do
++ salt -C 'mas01*' state.apply maas.machines.mark_broken_fixed \
++ pillar="{'system_id': '${node_system_id}'}"
++ done
++ if [ -n "${fdnodes}" ]; then
++ salt -C 'mas01*' state.apply maas.machines.deploy
++ return 1
++ fi
++
++ return 0
++}
++
+ # MaaS rack/region controller, node commissioning
+ salt -C 'mas01*' cmd.run "add-apt-repository ppa:maas/stable"
+
+@@ -22,20 +52,18 @@ salt -C 'mas01*' state.apply maas.cluster
+ salt -C 'cfg01*' cmd.run \
+ "route add -net 192.168.11.0/24 gw ${MAAS_IP:-192.168.10.2}"
+
+-wait_for "salt -C 'mas01*' state.apply maas.region"
++wait_for 10 "salt -C 'mas01*' state.apply maas.region"
+
+ salt -C 'mas01*' state.apply maas.machines
+-# TODO: relax cond, as this is not re-entrant (e.g. nodes already deployed)
+-wait_for "salt 'mas01*' --out yaml state.apply maas.machines.status | " \
+- "tee /dev/stderr | fgrep -q 'Ready: 5'"
++wait_for 10 maas_fixup
+
+ # cleanup outdated salt keys
+-salt-key --out yaml | awk '!/^(minions|- cfg01|- mas01)/ {print $2}' | xargs -I{} salt-key -yd {}
++salt-key --out yaml | awk '!/^(minions|- cfg01|- mas01)/ {print $2}' | \
++ xargs -I{} salt-key -yd {}
+
+ # MaaS node deployment
+ salt -C 'mas01*' state.apply maas.machines.deploy
+-wait_for "salt 'mas01*' --out yaml state.apply maas.machines.status | " \
+- "tee /dev/stderr | fgrep -q 'Deployed: 5'"
++wait_for 10 maas_fixup
+
+ salt -C 'mas01*' pillar.item\
+ maas:region:admin:username \
+@@ -48,7 +76,7 @@ salt -C 'kvm*' pkg.install bridge-utils
+ salt -C 'kvm*' state.apply linux.network
+ salt -C 'kvm*' state.apply armband.bootstrap_script_arm64
+ salt -C 'kvm*' system.reboot
+-wait_for "! salt '*' test.ping | tee /dev/stderr | fgrep -q 'Not connected'"
++wait_for 90 "! salt '*' test.ping | tee /dev/stderr | fgrep -q 'Not connected'"
+
+ salt -C '* and not cfg01* and not mas01*' state.apply linux,ntp
+
+@@ -59,7 +87,8 @@ salt -C '* and not cfg01* and not mas01*' state.apply salt
+ salt -C 'kvm*' saltutil.sync_all
+ salt -C 'kvm*' state.sls salt.control
+
+-vcp_nodes=$(salt --out yaml 'kvm01*' pillar.get salt:control:cluster:internal:node | awk '/\s+\w+:$/ {gsub(/:$/, "*"); print $1}')
++vcp_nodes=$(salt --out yaml 'kvm01*' pillar.get salt:control:cluster:internal:node | \
++ awk '/\s+\w+:$/ {gsub(/:$/, "*"); print $1}')
+
+ # Check all vcp nodes are available
+ rc=1
+@@ -71,9 +100,9 @@ while [ $rc -ne 0 ]; do
+ sleep 5
+ done
+
+-wait_for "salt -C '* and not cfg01* and not mas01*' ssh.set_auth_key ${SUDO_USER} \
++wait_for 10 "salt -C '* and not cfg01* and not mas01*' ssh.set_auth_key ${SUDO_USER} \
+ $(awk 'NR==1{print $2}' $(eval echo ~${SUDO_USER}/.ssh/authorized_keys))"
+
+-wait_for "salt -C '* and not cfg01* and not mas01*' saltutil.sync_all"
+-salt -C '* and not cfg01* and not mas01*' state.apply salt
+-wait_for "salt -C '* and not cfg01* and not mas01*' state.apply linux,ntp"
++wait_for 10 "salt -C '* and not cfg01* and not mas01*' saltutil.sync_all"
++wait_for 10 "salt -C '* and not cfg01* and not mas01*' state.apply salt"
++wait_for 10 "salt -C '* and not cfg01* and not mas01*' state.apply linux,ntp"
+diff --git a/mcp/salt-formulas/maas/machines/delete.sls b/mcp/salt-formulas/maas/machines/delete.sls
+new file mode 100644
+index 0000000..306dbca
+--- /dev/null
++++ b/mcp/salt-formulas/maas/machines/delete.sls
+@@ -0,0 +1,12 @@
++{%- from "maas/map.jinja" import region with context %}
++
++maas_login_admin:
++ cmd.run:
++ - name: "maas-region apikey --username {{ region.admin.username }} > /var/lib/maas/.maas_credentials"
++
++# TODO: implement delete_machine via _modules/maas.py
++delete_machine:
++ cmd.run:
++ - name: "maas login {{ region.admin.username }} http://{{ region.bind.host }}:5240/MAAS/api/2.0 - < /var/lib/maas/.maas_credentials && maas opnfv machine delete {{ pillar['system_id'] }}"
++ - require:
++ - cmd: maas_login_admin
+diff --git a/mcp/salt-formulas/maas/machines/mark_broken_fixed.sls b/mcp/salt-formulas/maas/machines/mark_broken_fixed.sls
+new file mode 100644
+index 0000000..874718f
+--- /dev/null
++++ b/mcp/salt-formulas/maas/machines/mark_broken_fixed.sls
+@@ -0,0 +1,12 @@
++{%- from "maas/map.jinja" import region with context %}
++
++maas_login_admin:
++ cmd.run:
++ - name: "maas-region apikey --username {{ region.admin.username }} > /var/lib/maas/.maas_credentials"
++
++# TODO: implement mark_broken_fixed_machine via _modules/maas.py
++mark_broken_fixed_machine:
++ cmd.run:
++ - name: "maas login {{ region.admin.username }} http://{{ region.bind.host }}:5240/MAAS/api/2.0 - < /var/lib/maas/.maas_credentials && maas opnfv machine mark-broken {{ pillar['system_id'] }} && maas opnfv machine mark-fixed {{ pillar['system_id'] }}"
++ - require:
++ - cmd: maas_login_admin
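
The two new states can also be applied by hand against a single stuck node, outside the maas_fixup loop; a rough usage sketch, where the system_id value 'abc123' is purely hypothetical:

  # node failed commissioning: delete it from MaaS, then re-enlist the machines
  salt -C 'mas01*' state.apply maas.machines.delete pillar="{'system_id': 'abc123'}"
  salt -C 'mas01*' state.apply maas.machines

  # node failed deployment (or is stuck in Allocated): mark it broken, then fixed, then redeploy
  salt -C 'mas01*' state.apply maas.machines.mark_broken_fixed pillar="{'system_id': 'abc123'}"
  salt -C 'mas01*' state.apply maas.machines.deploy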