aboutsummaryrefslogtreecommitdiffstats
path: root/extraconfig/tasks/yum_update.sh
blob: bab67e01c7f33b15f7e2d6e8d9f3eed731794a01 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#!/bin/bash

# A heat-config-script which runs yum update during a stack-update.
# Inputs:
#   deploy_action - yum will only be run if this is UPDATE
#   update_identifier - yum will only run for previously unused values of update_identifier
#   command - yum sub-command to run, defaults to "update"
#   command_arguments - yum command arguments, defaults to ""

echo "Started yum_update.sh on server $deploy_server_id at `date`"
echo -n "false" > $heat_outputs_path.update_managed_packages

if [ -f /.dockerenv ]; then
    echo "Not running due to running inside a container"
    exit 0
fi

if [[ -z "$update_identifier" ]]; then
    echo "Not running due to unset update_identifier"
    exit 0
fi

timestamp_dir=/var/lib/overcloud-yum-update
mkdir -p $timestamp_dir

# sanitise to remove unusual characters
update_identifier=${update_identifier//[^a-zA-Z0-9-_]/}

# seconds to wait for this node to rejoin the cluster after update
cluster_start_timeout=600
galera_sync_timeout=1800
cluster_settle_timeout=1800

timestamp_file="$timestamp_dir/$update_identifier"
if [[ -a "$timestamp_file" ]]; then
    echo "Not running for already-run timestamp \"$update_identifier\""
    exit 0
fi
touch "$timestamp_file"

pacemaker_status=""
if hiera -c /etc/puppet/hiera.yaml service_names | grep -q pacemaker; then
    pacemaker_status=$(systemctl is-active pacemaker)
fi

# (NB: when backporting this s/pacemaker_short_bootstrap_node_name/bootstrap_nodeid)
# This runs before the yum_update so we are guaranteed to run it even in the absence
# of packages to update (the check for -z "$update_identifier" guarantees that this
# is run only on overcloud stack update -i)
if [[ "$pacemaker_status" == "active" && \
        "$(hiera -c /etc/puppet/hiera.yaml pacemaker_short_bootstrap_node_name)" == "$(facter hostname)" ]] ; then \
    # OCF scripts don't cope with -eu
    echo "Verifying if we need to fix up any IPv6 VIPs"
    set +eu
    fixup_wrong_ipv6_vip
    ret=$?
    set -eu
    if [ $ret -ne 0 ]; then
        echo "Fixing up IPv6 VIPs failed. Stopping here. (See https://bugs.launchpad.net/tripleo/+bug/1686357 for more info)"
        exit 1
    fi
fi

command_arguments=${command_arguments:-}

# yum check-update exits 100 if updates are available
set +e
check_update=$(yum check-update 2>&1)
check_update_exit=$?
set -e

if [[ "$check_update_exit" == "1" ]]; then
    echo "Failed to check for package updates"
    echo "$check_update"
    exit 1
elif [[ "$check_update_exit" != "100" ]]; then
    echo "No packages require updating"
    exit 0
fi


# TODO: FIXME: remove this in Pike.
# Hack around mod_ssl update and puppet https://bugs.launchpad.net/tripleo/+bug/1682448
touch /etc/httpd/conf.d/ssl.conf

# Fix the redis/rabbit resource start/stop timeouts. See https://bugs.launchpad.net/tripleo/+bug/1633455
# and https://bugs.launchpad.net/tripleo/+bug/1634851
if [[ "$pacemaker_status" == "active" && \
      "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid)" = "$(facter hostname)" ]] ; then
    if pcs resource show rabbitmq | grep -E "start.*timeout=100"; then
        pcs resource update rabbitmq op start timeout=200s
    fi
    if pcs resource show rabbitmq | grep -E "stop.*timeout=90"; then
        pcs resource update rabbitmq op stop timeout=200s
    fi
    if pcs resource show redis | grep -E "start.*timeout=120"; then
        pcs resource update redis op start timeout=200s
    fi
    if pcs resource show redis | grep -E "stop.*timeout=120"; then
        pcs resource update redis op stop timeout=200s
    fi
fi

# special case https://bugs.launchpad.net/tripleo/+bug/1635205 +bug/1669714
special_case_ovs_upgrade_if_needed

if [[ "$pacemaker_status" == "active" ]] ; then
    echo "Pacemaker running, stopping cluster node and doing full package update"
    node_count=$(pcs status xml | grep -o "<nodes_configured.*/>" | grep -o 'number="[0-9]*"' | grep -o "[0-9]*")
    if [[ "$node_count" == "1" ]] ; then
        echo "Active node count is 1, stopping node with --force"
        pcs cluster stop --force
    else
        pcs cluster stop
    fi
else
    echo "Upgrading openstack-puppet-modules and its dependencies"
    yum -q -y update openstack-puppet-modules
    yum deplist openstack-puppet-modules | awk '/dependency/{print $2}' | xargs yum -q -y update
    echo "Upgrading other packages is handled by config management tooling"
    echo -n "true" > $heat_outputs_path.update_managed_packages
    exit 0
fi

command=${command:-update}
full_command="yum -q -y $command $command_arguments"
echo "Running: $full_command"

result=$($full_command)
return_code=$?
echo "$result"
echo "yum return code: $return_code"

if [[ "$pacemaker_status" == "active" ]] ; then
    echo "Starting cluster node"
    pcs cluster start

    hostname=$(hostname -s)
    tstart=$(date +%s)
    while [[ "$(pcs status | grep "^Online" | grep -F -o $hostname)" == "" ]]; do
        sleep 5
        tnow=$(date +%s)
        if (( tnow-tstart > cluster_start_timeout )) ; then
            echo "ERROR $hostname failed to join cluster in $cluster_start_timeout seconds"
            pcs status
            exit 1
        fi
    done

    RETVAL=$( pcs resource show galera-master | grep wsrep_cluster_address | grep -q `crm_node -n` ; echo $? )

    if [[ $RETVAL -eq 0 && -e /etc/sysconfig/clustercheck ]]; then
        tstart=$(date +%s)
        while ! clustercheck; do
            sleep 5
            tnow=$(date +%s)
            if (( tnow-tstart > galera_sync_timeout )) ; then
                echo "ERROR galera sync timed out"
                exit 1
            fi
        done
    fi

    echo "Waiting for pacemaker cluster to settle"
    if ! timeout -k 10 $cluster_settle_timeout crm_resource --wait; then
        echo "ERROR timed out while waiting for the cluster to settle"
        exit 1
    fi

    pcs status
fi


echo "Finished yum_update.sh on server $deploy_server_id at `date` with return code: $return_code"

exit $return_code