From 8da73521d3b9347a982ea6e77114bba0d0f0adeb Mon Sep 17 00:00:00 2001 From: Alexandru Avadanii Date: Mon, 18 Dec 2017 22:12:23 +0100 Subject: [baremetal] MaaS: Reduce timeout values `maas_fixup` is already re-entrant, so we can execute it more than once during a commissioning/deploy cycle. Reduce the timeout waiting for all nodes to reach a stable state, so nodes stuck in 'Ready' state instead of reaching 'Deploying' get dealt with sooner (~5 min vs old 30 min). While at it, let `maas_fixup` handle machine deploy as well, so we can catch nodes stuck in 'Ready' state and re-trigger the deploy. Change-Id: Id24cc97b17489835c5846288639a9a6032bd320a Signed-off-by: Alexandru Avadanii --- mcp/config/states/maas | 17 ++++++++--------- .../0003-maas-region-force-artifact-download.patch | 4 ++-- mcp/salt-formulas/maas/machines/mark_broken_fixed.sls | 2 +- 3 files changed, 11 insertions(+), 12 deletions(-) (limited to 'mcp') diff --git a/mcp/config/states/maas b/mcp/config/states/maas index 5e33f3a08..f472c0154 100755 --- a/mcp/config/states/maas +++ b/mcp/config/states/maas @@ -19,7 +19,7 @@ function maas_fixup() { local statuscmd="salt 'mas01*' --out yaml state.apply maas.machines.status" local ncount=$(salt --out yaml 'mas01*' pillar.get maas:region:machines | \ grep -cE '^\s{2}\w+:$') - wait_for 180 "${statuscmd} | tee /dev/stderr | " \ + wait_for 30 "${statuscmd} | tee /dev/stderr | " \ "grep -Eq '((Deployed|Ready): ${ncount}|status: (Failed|Allocated))'" local statusout=$(eval "${statuscmd}") @@ -28,7 +28,7 @@ function maas_fixup() { for node_system_id in ${fcnodes}; do salt -C 'mas01*' state.apply maas.machines.delete \ pillar="{'system_id': '${node_system_id}'}" - sleep 30 + sleep 10 done if [ -n "${fcnodes}" ]; then salt -C 'mas01*' state.apply maas.machines @@ -37,12 +37,14 @@ function maas_fixup() { local fdnodes=$(echo "${statusout}" | \ grep -Pzo 'status: (Failed deployment|Allocated)\n\s+system_id: \K.+\n') + local rnodes=$(echo "${statusout}" | \ + grep -Pzo 'status: Ready\n\s+system_id: \K.+\n') for node_system_id in ${fdnodes}; do salt -C 'mas01*' state.apply maas.machines.mark_broken_fixed \ pillar="{'system_id': '${node_system_id}'}" - sleep 30 + sleep 10 done - if [ -n "${fdnodes}" ]; then + if [ -n "${fdnodes}" ] || [ -n "${rnodes}" ]; then salt -C 'mas01*' state.apply maas.machines.deploy return 1 fi @@ -70,16 +72,13 @@ salt -C 'mas01*' state.apply maas.cluster wait_for 10 "salt -C 'mas01*' state.apply maas.region" salt -C 'mas01*' state.apply maas.machines -wait_for 10 maas_fixup +# MaaS node deployment +wait_for 20 maas_fixup # cleanup outdated salt keys salt-key --out yaml | awk '!/^(minions|- cfg01|- mas01)/ {print $2}' | \ xargs -I{} salt-key -yd {} -# MaaS node deployment -salt -C 'mas01*' state.apply maas.machines.deploy -wait_for 10 maas_fixup - salt -C 'mas01*' pillar.item\ maas:region:admin:username \ maas:region:admin:password diff --git a/mcp/patches/0003-maas-region-force-artifact-download.patch b/mcp/patches/0003-maas-region-force-artifact-download.patch index ecda80a02..56e3bd504 100644 --- a/mcp/patches/0003-maas-region-force-artifact-download.patch +++ b/mcp/patches/0003-maas-region-force-artifact-download.patch @@ -66,8 +66,8 @@ new file mode 100644 +maas login {{ region.admin.username }} \ + http://{{ region.bind.host }}:5240/MAAS/api/2.0 - < \ + /var/lib/maas/.maas_credentials || exit 1 -+# wait max 15 min for service up / image download, 5 min region to rack sync -+wait_for 90 "grep -qzE '(Unable to probe for DHCP servers|DHCP probe complete).*Rack controller' /var/log/maas/rackd.log" ++# wait max 5 min for service up, 15 min image download, 5 min region to rack sync ++wait_for 30 "grep -qzE '(Unable to probe for DHCP servers|DHCP probe complete).*Rack controller' /var/log/maas/rackd.log" +maas opnfv boot-resources import || exit 2 +wait_for 90 "! maas opnfv boot-resources is-importing | grep -q 'true'" +maas opnfv rack-controllers import-boot-images || exit 3 diff --git a/mcp/salt-formulas/maas/machines/mark_broken_fixed.sls b/mcp/salt-formulas/maas/machines/mark_broken_fixed.sls index e036d610d..17a7df8d8 100644 --- a/mcp/salt-formulas/maas/machines/mark_broken_fixed.sls +++ b/mcp/salt-formulas/maas/machines/mark_broken_fixed.sls @@ -14,6 +14,6 @@ maas_login_admin: # TODO: implement mark_broken_fixed_machine via _modules/maas.py mark_broken_fixed_machine: cmd.run: - - name: "maas login {{ region.admin.username }} http://{{ region.bind.host }}:5240/MAAS/api/2.0 - < /var/lib/maas/.maas_credentials && maas opnfv machine mark-broken {{ pillar['system_id'] }} && sleep 30 && maas opnfv machine mark-fixed {{ pillar['system_id'] }}" + - name: "maas login {{ region.admin.username }} http://{{ region.bind.host }}:5240/MAAS/api/2.0 - < /var/lib/maas/.maas_credentials && maas opnfv machine mark-broken {{ pillar['system_id'] }} && sleep 10 && maas opnfv machine mark-fixed {{ pillar['system_id'] }}" - require: - cmd: maas_login_admin -- cgit 1.2.3-korg