extraconfig/tasks/major_upgrade_pacemaker_migrations.sh


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171

#!/bin/bash

# Special pieces of upgrade migration logic go into this
# file. E.g. Pacemaker cluster transitions for existing deployments,
# matching changes to overcloud_controller_pacemaker.pp (Puppet
# handles deployment, this file handles migrations).
#
# This file shouldn't execute any action on its own, all logic should
# be wrapped into bash functions. Upgrade scripts will source this
# file and call the functions defined in this file where appropriate.
#
# The migration functions should be idempotent. If the migration has
# been already applied, it should be possible to call the function
# again without damaging the deployment or failing the upgrade.

# If the major version of mysql is going to change after the major
# upgrade, the database must be upgraded on disk to avoid failures
# due to internal incompatibilities between major mysql versions
# https://bugs.launchpad.net/tripleo/+bug/1587449
# This function detects whether a database upgrade is required
# after a mysql package upgrade. It returns 0 when no major upgrade
# has to take place, 1 otherwise.
function is_mysql_upgrade_needed {
    # The name of the package which provides mysql might differ
    # after the upgrade. Consider the generic package name, which
    # should capture the major version change (e.g. 5.5 -> 10.1)
    local name="mariadb"
    local output
    local ret
    set +e
    output=$(yum -q check-update $name)
    ret=$?
    set -e
    if [ $ret -ne 100 ]; then
        # no updates so we exit
        echo "0"
        return
    fi

    local currentepoch=$(rpm -q --qf "%{epoch}" $name)
    local currentversion=$(rpm -q --qf "%{version}" $name | cut -d. -f-2)
    local currentrelease=$(rpm -q --qf "%{release}" $name)
    local newoutput=$(repoquery -a --pkgnarrow=updates --qf "%{epoch} %{version} %{release}\n" $name)
    local newepoch=$(echo "$newoutput" | awk '{ print $1 }')
    local newversion=$(echo "$newoutput" | awk '{ print $2 }' | cut -d. -f-2)
    local newrelease=$(echo "$newoutput" | awk '{ print $3 }')

    # With this we trigger the dump restore/path if we change either epoch or
    # version in the package If only the release tag changes we do not do it
    # FIXME: we could refine this by trying to parse the mariadb version
    # into X.Y.Z and trigger the update only if X and/or Y change.
    output=$(python -c "import rpm; rc = rpm.labelCompare((\"$currentepoch\", \"$currentversion\", None), (\"$newepoch\", \"$newversion\", None)); print rc")
    if [ "$output" != "-1" ]; then
        echo "0"
        return
    fi
    echo "1"
}

# This function returns the list of services to be migrated away from pacemaker
# and to systemd. The reason to have these services in a separate function is because
# this list is needed in three different places: major_upgrade_controller_pacemaker_{1,2}
# and in the function to migrate the cluster from full HA to HA NG
function services_to_migrate {
    # The following PCMK resources the ones the we are going to delete
    PCMK_RESOURCE_TODELETE="
    httpd-clone
    memcached-clone
    mongod-clone
    neutron-dhcp-agent-clone
    neutron-l3-agent-clone
    neutron-metadata-agent-clone
    neutron-netns-cleanup-clone
    neutron-openvswitch-agent-clone
    neutron-ovs-cleanup-clone
    neutron-server-clone
    openstack-aodh-evaluator-clone
    openstack-aodh-listener-clone
    openstack-aodh-notifier-clone
    openstack-ceilometer-api-clone
    openstack-ceilometer-central-clone
    openstack-ceilometer-collector-clone
    openstack-ceilometer-notification-clone
    openstack-cinder-api-clone
    openstack-cinder-scheduler-clone
    openstack-glance-api-clone
    openstack-glance-registry-clone
    openstack-gnocchi-metricd-clone
    openstack-gnocchi-statsd-clone
    openstack-heat-api-cfn-clone
    openstack-heat-api-clone
    openstack-heat-api-cloudwatch-clone
    openstack-heat-engine-clone
    openstack-nova-api-clone
    openstack-nova-conductor-clone
    openstack-nova-consoleauth-clone
    openstack-nova-novncproxy-clone
    openstack-nova-scheduler-clone
    openstack-sahara-api-clone
    openstack-sahara-engine-clone
    "
    echo $PCMK_RESOURCE_TODELETE
}

# This function will migrate a mitaka system where all the resources are managed
# via pacemaker to a newton setup where only a few services will be managed by pacemaker
# On a high-level it will operate as follows:
# 1. Set the cluster in maintenance-mode so no start/stop action will actually take place
#    during the conversion
# 2. Remove all the colocation constraints and then the ordering constraints, except the
#    ones related to haproxy/VIPs which exist in Newton as well
# 3. Remove all the resources that won't be managed by pacemaker in newton. Note that they
#    will show up as ORPHANED but they will keep running normally via systemd. They will be
#    enabled to start at boot by puppet during the converge step
# 4. Take the cluster out of maintenance-mode and do a resource cleanup
function migrate_full_to_ng_ha {
    if [[ -n $(pcmk_running) ]]; then
        pcs property set maintenance-mode=true
        # We are making sure here that the property has propagated everywhere
        if ! timeout -k 10 300 crm_resource --wait; then
            echo_error "ERROR: cluster remained unstable after setting maintenance-mode for more than 300 seconds, exiting."
            exit 1
        fi
        # First we go through all the colocation constraints (except the ones we want to keep, i.e. the haproxy/ip ones)
        # and we remove those
        COL_CONSTRAINTS=$(pcs config show | sed -n '/^Colocation Constraints:$/,/^$/p' | grep -v "Colocation Constraints:" | egrep -v "ip-.*haproxy" | awk '{print $NF}' | cut -f2 -d: |cut -f1 -d\))
        for constraint in $COL_CONSTRAINTS; do
            log_debug "Deleting colocation constraint $constraint from CIB"
            pcs constraint remove "$constraint"
        done

        # Now we kill all the ordering constraints (except the haproxy/ip ones)
        ORD_CONSTRAINTS=$(pcs config show | sed -n '/^Ordering Constraints:/,/^Colocation Constraints:$/p' | grep -v "Ordering Constraints:"  | awk '{print $NF}' | cut -f2 -d: |cut -f1 -d\))
        for constraint in $ORD_CONSTRAINTS; do
            log_debug "Deleting ordering constraint $constraint from CIB"
            pcs constraint remove "$constraint"
        done

        # At this stage there are no constraints whatsoever except the haproxy/ip ones
        # which we want to keep. We now delete each resource that will move to systemd
        # Note that the corresponding systemd resource will stay running, which means that
        # later when we do the "yum update", things will be a bit slower because each
        # "systemctl try-restart <service>" is not a no-op any longer because the service is up
        # and running and it will be restarted with rabbitmq being down.
        PCS_STATUS_OUTPUT="$(pcs status)"
        for resource in $(services_to_migrate) "delay-clone" "openstack-core-clone"; do
             if echo "$PCS_STATUS_OUTPUT" | grep "$resource"; then
                 log_debug "Deleting $resource from the CIB"

                 # We need to add --force because the cluster is in maintenance mode and the resource
                 # is unmanaged. The if serves to make this idempotent
                 pcs resource delete --force "$resource"
             else
                 log_debug "Service $service not found as a pacemaker resource, not trying to delete."
             fi
        done

        # At this stage all the pacemaker resources are removed from the CIB. Once we remove the
        # maintenance-mode those systemd resources will keep on running. They shall be systemd enabled
        # via the puppet converge step later on
        pcs property set maintenance-mode=false
        # We need to do a pcs resource cleanup here + crm_resource --wait to make sure the
        # cluster is in a clean state before we stop everything, upgrade and restart everything
        pcs resource cleanup
        # We are making sure here that the cluster is stable before proceeding
        if ! timeout -k 10 600 crm_resource --wait; then
            echo_error "ERROR: cluster remained unstable after resource cleanup for more than 600 seconds, exiting."
            exit 1
        fi
    fi
}