aboutsummaryrefslogtreecommitdiffstats
path: root/mcp/config/states/maas
blob: 2a0f94809a378f211813a3940ee085a91d5c3fab (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/bin/bash
set -x

#######################################
# Retry a shell command until it succeeds or the attempt budget runs out.
# Arguments:
#   $1   - maximum number of attempts
#   $2.. - command string; all remaining arguments are joined with spaces
#          and run through eval, so a pipeline may be passed as several
#          quoted fragments
# Outputs:
#   A "[NOTE]" banner, then one '.' per failed attempt, on stdout
# Returns:
#   0 as soon as the command succeeds, 1 if every attempt failed
#######################################
function wait_for() {
  local total_attempts=$1; shift
  local cmdstr=$*
  local sleep_time=10
  # Keep the loop counter out of the global scope (wait_for can be nested,
  # e.g. wait_for -> maas_fixup -> wait_for).
  local attempt
  echo "[NOTE] Waiting for cmd to return success: ${cmdstr}"
  # shellcheck disable=SC2034
  for attempt in $(seq "${total_attempts}"); do
    # eval is required: cmdstr may contain pipes, '!' negation and quotes.
    if eval "${cmdstr}"; then
      return 0
    fi
    echo -n '.'; sleep "${sleep_time}"
  done
  # Previously the function fell through with the status of the last
  # 'sleep' (0), which made a permanent failure look like success.
  return 1
}

# Wait for MaaS commissioning/deploy to settle, then repair failed nodes.
#
# Polls the maas.machines.status state (via wait_for, up to 180 tries) until
# its output reports all machines as Deployed or Ready, or mentions a Failed
# or Allocated status. The status is then fetched once more and inspected:
#   - nodes in "Failed commissioning" are deleted and maas.machines is
#     re-applied;
#   - nodes in "Failed deployment" or "Allocated" are passed to
#     maas.machines.mark_broken_fixed and maas.machines.deploy is re-applied.
# Returns:
#   1 when a repair was triggered (so the caller can run the check again),
#   0 when no failed node was found.
function maas_fixup() {
  local status_cmd machine_total settled_re status_report
  local commissioning_failures deployment_failures

  status_cmd="salt 'mas01*' --out yaml state.apply maas.machines.status"

  # Number of machines defined in pillar: count the keys indented by exactly
  # two characters in the YAML output.
  machine_total=$(salt --out yaml 'mas01*' pillar.get maas:region:machines \
    | grep -cE '^\s{2}\w+:$')

  settled_re="((Deployed|Ready): ${machine_total}|status:Failed|status:Allocated)"
  wait_for 180 "${status_cmd} | tee /dev/stderr | " "grep -Eq '${settled_re}'"

  # Take a fresh snapshot to decide what, if anything, needs fixing.
  status_report=$(eval "${status_cmd}")

  # Commissioning failures: drop the node and define the machines again.
  commissioning_failures=$(echo "${status_report}" \
    | grep -Po '(?<=system_id:)(.*)(?=,status:Failed commissioning)')
  if [ -n "${commissioning_failures}" ]; then
    for node_system_id in ${commissioning_failures}; do
      salt -C 'mas01*' state.apply maas.machines.delete \
        pillar="{'system_id': '${node_system_id}'}"
      sleep 30
    done
    salt -C 'mas01*' state.apply maas.machines
    return 1
  fi

  # Deployment failures (or nodes stuck in Allocated): reset and redeploy.
  deployment_failures=$(echo "${status_report}" \
    | grep -Po '(?<=system_id:)(.*)(?=,status:(Failed deployment|Allocated))')
  if [ -n "${deployment_failures}" ]; then
    for node_system_id in ${deployment_failures}; do
      salt -C 'mas01*' state.apply maas.machines.mark_broken_fixed \
        pillar="{'system_id': '${node_system_id}'}"
      sleep 30
    done
    salt -C 'mas01*' state.apply maas.machines.deploy
    return 1
  fi

  return 0
}

# MaaS rack/region controller, node commissioning
# Everything in this section targets the mas01* minion unless noted.
# First add the MaaS stable PPA there through a remote shell command.
salt -C 'mas01*' cmd.run "add-apt-repository ppa:maas/stable"

# Apply the base states, then the network interface, PXE NAT and MaaS
# cluster states on mas01*; the PXE route state is applied on cfg01*.
salt -C 'mas01*' state.apply linux,salt,openssh,ntp
salt -C 'mas01*' state.apply linux.network.interface
salt -C 'mas01*' state.apply maas.pxe_nat
salt -C 'mas01*' state.apply maas.cluster
salt -C 'cfg01*' state.apply maas.pxe_route

# maas.region is retried up to 10 times (wait_for pauses 10s between tries).
wait_for 10 "salt -C 'mas01*' state.apply maas.region"

# Apply maas.machines, then run maas_fixup up to 10 times: it returns
# non-zero whenever it had to re-trigger failed nodes, which makes wait_for
# call it again.
salt -C 'mas01*' state.apply maas.machines
wait_for 10 maas_fixup

# cleanup outdated salt keys
# The awk filter skips lines starting with "minions", "- cfg01" or "- mas01"
# and prints the second field of every other line (presumably the minion id
# following the YAML "- " list marker); each printed id is then deleted with
# salt-key -yd.
salt-key --out yaml | awk '!/^(minions|- cfg01|- mas01)/ {print $2}' | \
  xargs -I{} salt-key -yd {}

# MaaS node deployment
# Same retry pattern as for commissioning above.
salt -C 'mas01*' state.apply maas.machines.deploy
wait_for 10 maas_fixup

# Look up the MaaS region admin username and password pillar items.
# NOTE(review): this prints the password in clear text to the script
# output -- confirm that is intended.
salt -C 'mas01*' pillar.item\
  maas:region:admin:username \
  maas:region:admin:password

# KVM, compute node prereqs (libvirt first), VCP deployment
# Run saltutil.sync_all on every minion except cfg01* and mas01*.
salt -C '* and not cfg01* and not mas01*' saltutil.sync_all

# Install bridge-utils and apply linux.network on the KVM hosts, reboot
# them, then poll (up to 90 tries) until test.ping output no longer
# contains 'Not connected'.
salt -C 'kvm*' pkg.install bridge-utils
salt -C 'kvm*' state.apply linux.network
salt -C 'kvm*' system.reboot
# grep -F replaces the obsolescent fgrep (newer GNU grep warns on each call).
wait_for 90 "! salt 'kvm*' test.ping | tee /dev/stderr | grep -Fq 'Not connected'"

salt -C '* and not cfg01* and not mas01*' state.apply linux,ntp

salt -C 'kvm*' state.sls libvirt

salt -C '* and not cfg01* and not mas01*' state.apply salt
salt -C 'kvm*' saltutil.sync_all
# Apply salt.control on the KVM hosts; retried while any minion is reported
# as 'Not connected'.
wait_for 10 "! salt -C 'kvm*' state.sls salt.control | " \
  "tee /dev/stderr | grep -Fq 'Not connected'"

# Build the list of VCP node targets from pillar: every "name:" key line in
# the YAML output becomes a "name*" glob (trailing ':' replaced by '*').
vcp_nodes=$(salt --out yaml 'kvm01*' pillar.get salt:control:cluster:internal:node | \
            awk '/\s+\w+:$/ {gsub(/:$/, "*"); print $1}')

# Check all vcp nodes are available
# Ping each target in ${vcp_nodes} in turn; on the first failure remember
# its exit status, wait 5s and start over. The loop ends once a full pass
# succeeds (or immediately when ${vcp_nodes} is empty).
# NOTE(review): there is no upper bound on the number of passes -- a node
# that never comes up blocks the script forever; confirm an outer timeout
# exists.
rc=1
while [ "${rc}" -ne 0 ]; do
  rc=0
  # Intentionally unquoted: vcp_nodes is a whitespace-separated target list.
  for node in ${vcp_nodes}; do
    salt "${node}" test.ping 2>/dev/null || { rc=$?; break; }
  done
  # Pause only when another pass is needed; the original also slept 5s
  # after the final, successful pass.
  [ "${rc}" -eq 0 ] || sleep 5
done

# Post-boot setup. The E@ regex below selects every minion whose id does
# not start with cfg01, mas01, kvm or cmp00.
wait_for 10 "salt -C '* and not cfg01* and not mas01*' saltutil.sync_all"
wait_for 10 "salt -C 'E@^(?!cfg01|mas01|kvm|cmp00).*' state.apply salt"
# grep -F replaces the obsolescent fgrep (newer GNU grep warns on each call).
wait_for 10 "! salt -C 'E@^(?!cfg01|mas01|kvm|cmp00).*' state.apply linux,ntp | " \
  "tee /dev/stderr | grep -Fq 'Not connected'"

# Install an SSH key for ${SUDO_USER} on those minions. The key is read
# once, while this command string is being built: it is the second field of
# the first line of ${SUDO_USER}'s ~/.ssh/authorized_keys (eval is what
# expands the "~user" prefix).
# NOTE(review): relies on SUDO_USER being set, i.e. the script being run
# through sudo -- confirm.
wait_for 10 "salt -C 'E@^(?!cfg01|mas01|kvm|cmp00).*' ssh.set_auth_key ${SUDO_USER} \
  $(awk 'NR==1{print $2}' "$(eval echo "~${SUDO_USER}/.ssh/authorized_keys")")"

# Get the latest packages
# Retried while any minion is reported as 'Not connected'.
wait_for 10 "! salt '*' pkg.upgrade refresh=False | " \
  "tee /dev/stderr | grep -Fq 'Not connected'"