path: root/patches/opnfv-fuel/0018-MaaS-commissioning-deployment-retry.patch
blob: 3a26d9b09aad6d85fe47e1627df0766d3ecde24a (plain)
From: Alexandru Avadanii <Alexandru.Avadanii@enea.com>
Date: Wed, 23 Aug 2017 04:23:26 +0200
Subject: [PATCH] MaaS: commissioning/deployment retry

While at it, parametrize max attempt number in maas state's "wait_for",
and reduce retries count for certain simpler tasks.

Change-Id: I3ac2877719cdd32613bcf41186ebbb9f3f3aee93
Signed-off-by: Alexandru Avadanii <Alexandru.Avadanii@enea.com>
---
 mcp/config/states/maas                             | 63 ++++++++++++++++------
 mcp/salt-formulas/maas/machines/delete.sls         | 12 +++++
 .../maas/machines/mark_broken_fixed.sls            | 12 +++++
 3 files changed, 70 insertions(+), 17 deletions(-)
 create mode 100644 mcp/salt-formulas/maas/machines/delete.sls
 create mode 100644 mcp/salt-formulas/maas/machines/mark_broken_fixed.sls

diff --git a/mcp/config/states/maas b/mcp/config/states/maas
index 0cf4f68..080bbf5 100755
--- a/mcp/config/states/maas
+++ b/mcp/config/states/maas
@@ -1,17 +1,47 @@
 #!/bin/bash
 function wait_for() {
+  local total_attempts=$1; shift
   local cmdstr=$@
-  local total_attempts=360
   local sleep_time=10
-  local attempt=1
   echo "[NOTE] Waiting for cmd to return success: ${cmdstr}"
-  while ((attempt <= total_attempts)); do
+  for attempt in $(seq "${total_attempts}"); do
     eval "${cmdstr}" && break || true
     echo -n '.'; sleep "${sleep_time}"
-    ((attempt+=1))
   done
 }

+# Wait for MaaS commissioning/deploy to finish, retry on failure
+function maas_fixup() {
+  local statuscmd="salt 'mas01*' --out yaml state.apply maas.machines.status"
+  wait_for 180 "${statuscmd} | tee /dev/stderr | " \
+           "grep -Eq '((Deployed|Ready): 5|status:Failed|status:Allocated)'"
+  local statusout=$(eval "${statuscmd}")
+
+  local fcnodes=$(echo "${statusout}" | \
+    grep -Po '(?<=system_id:)(.*)(?=,status:Failed commissioning)')
+  for node_system_id in ${fcnodes}; do
+    salt -C 'mas01*' state.apply maas.machines.delete \
+      pillar="{'system_id': '${node_system_id}'}"
+  done
+  if [ -n "${fcnodes}" ]; then
+    salt -C 'mas01*' state.apply maas.machines
+    return 1
+  fi
+
+  local fdnodes=$(echo "${statusout}" | \
+    grep -Po '(?<=system_id:)(.*)(?=,status:(Failed deployment|Allocated))')
+  for node_system_id in ${fdnodes}; do
+    salt -C 'mas01*' state.apply maas.machines.mark_broken_fixed \
+      pillar="{'system_id': '${node_system_id}'}"
+  done
+  if [ -n "${fdnodes}" ]; then
+    salt -C 'mas01*' state.apply maas.machines.deploy
+    return 1
+  fi
+
+  return 0
+}
+
 # MaaS rack/region controller, node commissioning
 salt -C 'mas01*' cmd.run "add-apt-repository ppa:maas/stable"

@@ -22,20 +52,18 @@ salt -C 'mas01*' state.apply maas.cluster
 salt -C 'cfg01*' cmd.run \
   "route add -net 192.168.11.0/24 gw ${MAAS_IP:-192.168.10.2}"

-wait_for "salt -C 'mas01*' state.apply maas.region"
+wait_for 10 "salt -C 'mas01*' state.apply maas.region"

 salt -C 'mas01*' state.apply maas.machines
-# TODO: relax cond, as this is not re-entrant (e.g. nodes already deployed)
-wait_for "salt 'mas01*' --out yaml state.apply maas.machines.status | " \
-         "tee /dev/stderr | fgrep -q 'Ready: 5'"
+wait_for 10 maas_fixup

 # cleanup outdated salt keys
-salt-key --out yaml | awk '!/^(minions|- cfg01|- mas01)/ {print $2}' | xargs -I{} salt-key -yd {}
+salt-key --out yaml | awk '!/^(minions|- cfg01|- mas01)/ {print $2}' | \
+  xargs -I{} salt-key -yd {}

 # MaaS node deployment
 salt -C 'mas01*' state.apply maas.machines.deploy
-wait_for "salt 'mas01*' --out yaml state.apply maas.machines.status | " \
-         "tee /dev/stderr | fgrep -q 'Deployed: 5'"
+wait_for 10 maas_fixup

 salt -C 'mas01*' pillar.item\
   maas:region:admin:username \
@@ -48,7 +76,7 @@ salt -C 'kvm*' pkg.install bridge-utils
 salt -C 'kvm*' state.apply linux.network
 salt -C 'kvm*' state.apply armband.bootstrap_script_arm64
 salt -C 'kvm*' system.reboot
-wait_for "! salt '*' test.ping | tee /dev/stderr | fgrep -q 'Not connected'"
+wait_for 90 "! salt '*' test.ping | tee /dev/stderr | fgrep -q 'Not connected'"

 salt -C '* and not cfg01* and not mas01*' state.apply linux,ntp

@@ -59,7 +87,8 @@ salt -C '* and not cfg01* and not mas01*' state.apply salt
 salt -C 'kvm*' saltutil.sync_all
 salt -C 'kvm*' state.sls salt.control

-vcp_nodes=$(salt --out yaml 'kvm01*' pillar.get salt:control:cluster:internal:node | awk '/\s+\w+:$/ {gsub(/:$/, "*"); print $1}')
+vcp_nodes=$(salt --out yaml 'kvm01*' pillar.get salt:control:cluster:internal:node | \
+            awk '/\s+\w+:$/ {gsub(/:$/, "*"); print $1}')

 # Check all vcp nodes are available
 rc=1
@@ -71,9 +100,9 @@ while [ $rc -ne 0 ]; do
   sleep 5
 done

-wait_for "salt -C '* and not cfg01* and not mas01*' ssh.set_auth_key ${SUDO_USER} \
+wait_for 10 "salt -C '* and not cfg01* and not mas01*' ssh.set_auth_key ${SUDO_USER} \
   $(awk 'NR==1{print $2}' $(eval echo ~${SUDO_USER}/.ssh/authorized_keys))"

-wait_for "salt -C '* and not cfg01* and not mas01*' saltutil.sync_all"
-salt -C '* and not cfg01* and not mas01*' state.apply salt
-wait_for "salt -C '* and not cfg01* and not mas01*' state.apply linux,ntp"
+wait_for 10 "salt -C '* and not cfg01* and not mas01*' saltutil.sync_all"
+wait_for 10 "salt -C '* and not cfg01* and not mas01*' state.apply salt"
+wait_for 10 "salt -C '* and not cfg01* and not mas01*' state.apply linux,ntp"
diff --git a/mcp/salt-formulas/maas/machines/delete.sls b/mcp/salt-formulas/maas/machines/delete.sls
new file mode 100644
index 0000000..306dbca
--- /dev/null
+++ b/mcp/salt-formulas/maas/machines/delete.sls
@@ -0,0 +1,12 @@
+{%- from "maas/map.jinja" import region with context %}
+
+maas_login_admin:
+  cmd.run:
+  - name: "maas-region apikey --username {{ region.admin.username }} > /var/lib/maas/.maas_credentials"
+
+# TODO: implement delete_machine via _modules/maas.py
+delete_machine:
+  cmd.run:
+  - name: "maas login {{ region.admin.username }} http://{{ region.bind.host }}:5240/MAAS/api/2.0 - < /var/lib/maas/.maas_credentials && maas opnfv machine delete {{ pillar['system_id'] }}"
+  - require:
+    - cmd: maas_login_admin
diff --git a/mcp/salt-formulas/maas/machines/mark_broken_fixed.sls b/mcp/salt-formulas/maas/machines/mark_broken_fixed.sls
new file mode 100644
index 0000000..874718f
--- /dev/null
+++ b/mcp/salt-formulas/maas/machines/mark_broken_fixed.sls
@@ -0,0 +1,12 @@
+{%- from "maas/map.jinja" import region with context %}
+
+maas_login_admin:
+  cmd.run:
+  - name: "maas-region apikey --username {{ region.admin.username }} > /var/lib/maas/.maas_credentials"
+
+# TODO: implement mark_broken_fixed_machine via _modules/maas.py
+mark_broken_fixed_machine:
+  cmd.run:
+  - name: "maas login {{ region.admin.username }} http://{{ region.bind.host }}:5240/MAAS/api/2.0 - < /var/lib/maas/.maas_credentials && maas opnfv machine mark-broken {{ pillar['system_id'] }} && maas opnfv machine mark-fixed {{ pillar['system_id'] }}"
+  - require:
+    - cmd: maas_login_admin