aboutsummaryrefslogtreecommitdiffstats
path: root/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh
blob: d4200e5f2ee65474a39c341c3395cfca66d02e89 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#!/bin/bash

set -eu

cluster_sync_timeout=1800

check_cluster
check_pcsd
if [[ -n $(is_bootstrap_node) ]]; then
    check_clean_cluster
fi
check_python_rpm
check_galera_root_password
check_disk_for_mysql_dump

# We want to disable fencing during the cluster --stop as it might fence
# nodes where a service fails to stop, which could be fatal during an upgrade
# procedure. So we remember the stonith state. If it was enabled we reenable it
# at the end of this script
STONITH_STATE=$(pcs property show stonith-enabled | grep "stonith-enabled" | awk '{ print $2 }')
pcs property set stonith-enabled=false

# Migrate to HA NG
if [[ -n $(is_bootstrap_node) ]]; then
    migrate_full_to_ng_ha
fi

# After migrating the cluster to HA-NG the services not under pacemaker's control
# are still up and running. We need to stop them explicitely otherwise during the yum
# upgrade the rpm %post sections will try to do a systemctl try-restart <service>, which
# is going to take a long time because rabbit is down. By having the service stopped
# systemctl try-restart is a noop

for service in $(services_to_migrate); do
    manage_systemd_service stop "${service%%-clone}"
    # So the reason for not reusing check_resource_systemd is that
    # I have observed systemctl is-active returning unknown with at least
    # one service that was stopped (See LP 1627254)
    timeout=600
    tstart=$(date +%s)
    tend=$(( $tstart + $timeout ))
    check_interval=3
    while (( $(date +%s) < $tend )); do
      if [[ "$(systemctl is-active ${service%%-clone})" = "active" ]]; then
        echo "$service still active, sleeping $check_interval seconds."
        sleep $check_interval
      else
        # we do not care if it is inactive, unknown or failed as long as it is
        # not running
        break
      fi

    done
done

# In case the mysql package is updated, the database on disk must be
# upgraded as well. This typically needs to happen during major
# version upgrades (e.g. 5.5 -> 5.6, 5.5 -> 10.1...)
#
# Because in-place upgrades are not supported across 2+ major versions
# (e.g. 5.5 -> 10.1), we rely on logical upgrades via dump/restore cycle
# https://bugzilla.redhat.com/show_bug.cgi?id=1341968
#
# The default is to determine automatically if upgrade is needed based
# on mysql package versionning, but this can be overriden manually
# to support specific upgrade scenario

if [[ -n $(is_bootstrap_node) ]]; then
    if [ $DO_MYSQL_UPGRADE -eq 1 ]; then
        mysqldump $backup_flags > "$MYSQL_BACKUP_DIR/openstack_database.sql"
        cp -rdp /etc/my.cnf* "$MYSQL_BACKUP_DIR"
    fi

    pcs resource disable redis
    check_resource redis stopped 600
    pcs resource disable rabbitmq
    check_resource rabbitmq stopped 600
    pcs resource disable galera
    check_resource galera stopped 600
    pcs resource disable openstack-cinder-volume
    check_resource openstack-cinder-volume stopped 600
    # Disable all VIPs before stopping the cluster, so that pcs doesn't use one as a source address:
    #   https://bugzilla.redhat.com/show_bug.cgi?id=1330688
    for vip in $(pcs resource show | grep ocf::heartbeat:IPaddr2 | grep Started | awk '{ print $1 }'); do
      pcs resource disable $vip
      check_resource $vip stopped 60
    done
    pcs cluster stop --all
fi


# Swift isn't controlled by pacemaker
systemctl_swift stop

tstart=$(date +%s)
while systemctl is-active pacemaker; do
    sleep 5
    tnow=$(date +%s)
    if (( tnow-tstart > cluster_sync_timeout )) ; then
        echo_error "ERROR: cluster shutdown timed out"
        exit 1
    fi
done

# The reason we do an sql dump *and* we move the old dir out of
# the way is because it gives us an extra level of safety in case
# something goes wrong during the upgrade. Once the restore is
# successful we go ahead and remove it. If the directory exists
# we bail out as it means the upgrade process had issues in the last
# run.
if [ $DO_MYSQL_UPGRADE -eq 1 ]; then
    if [ -d $MYSQL_TEMP_UPGRADE_BACKUP_DIR ]; then
        echo_error "ERROR: mysql backup dir already exist"
        exit 1
    fi
    mv /var/lib/mysql $MYSQL_TEMP_UPGRADE_BACKUP_DIR
fi

yum -y install python-zaqarclient  # needed for os-collect-config
yum -y -q update

# We need to ensure at least those two configuration settings, otherwise
# mariadb 10.1+ won't activate galera replication.
# wsrep_cluster_address must only be set though, its value does not
# matter because it's overriden by the galera resource agent.
cat >> /etc/my.cnf.d/galera.cnf <<EOF
[mysqld]
wsrep_on = ON
wsrep_cluster_address = gcomm://localhost
EOF

if [ $DO_MYSQL_UPGRADE -eq 1 ]; then
    # Scripts run via heat have no HOME variable set and this confuses
    # mysqladmin
    export HOME=/root

    mkdir /var/lib/mysql || /bin/true
    chown mysql:mysql /var/lib/mysql
    chmod 0755 /var/lib/mysql
    restorecon -R /var/lib/mysql/
    mysql_install_db --datadir=/var/lib/mysql --user=mysql
    chown -R mysql:mysql /var/lib/mysql/

    if [ "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid)" = "$(facter hostname)" ]; then
        mysqld_safe --wsrep-new-cluster &
        # We have a populated /root/.my.cnf with root/password here so
        # we need to temporarily rename it because the newly created
        # db is empty and no root password is set
        mv /root/.my.cnf /root/.my.cnf.temporary
        timeout 60 sh -c 'while ! mysql -e "" &> /dev/null; do sleep 1; done'
        mysql -u root < "$MYSQL_BACKUP_DIR/openstack_database.sql"
        mv /root/.my.cnf.temporary /root/.my.cnf
        mysqladmin -u root shutdown
        # The import was successful so we may remove the folder
        rm -r "$MYSQL_BACKUP_DIR"
    fi
fi

# If we reached here without error we can safely blow away the origin
# mysql dir from every controller

# TODO: What if the upgrade fails on the bootstrap node, but not on
# this controller.  Data may be lost.
if [ $DO_MYSQL_UPGRADE -eq 1 ]; then
    rm -r $MYSQL_TEMP_UPGRADE_BACKUP_DIR
fi

# Let's reset the stonith back to true if it was true, before starting the cluster
if [ $STONITH_STATE == "true" ]; then
    pcs -f /var/lib/pacemaker/cib/cib.xml property set stonith-enabled=true
fi

# Pin messages sent to compute nodes to kilo, these will be upgraded later
crudini  --set /etc/nova/nova.conf upgrade_levels compute "$upgrade_level_nova_compute"
# https://bugzilla.redhat.com/show_bug.cgi?id=1284047
# Change-Id: Ib3f6c12ff5471e1f017f28b16b1e6496a4a4b435
crudini  --set /etc/ceilometer/ceilometer.conf DEFAULT rpc_backend rabbit
# https://bugzilla.redhat.com/show_bug.cgi?id=1284058
# Ifd1861e3df46fad0e44ff9b5cbd58711bbc87c97 Swift Ceilometer middleware no longer exists
crudini --set /etc/swift/proxy-server.conf pipeline:main pipeline "catch_errors healthcheck cache ratelimit tempurl formpost authtoken keystone staticweb proxy-logging proxy-server"
# LP: 1615035, required only for M/N upgrade.
crudini --set /etc/nova/nova.conf DEFAULT scheduler_host_manager host_manager
# LP: 1627450, required only for M/N upgrade
crudini --set /etc/nova/nova.conf DEFAULT scheduler_driver filter_scheduler

crudini --set /etc/sahara/sahara.conf DEFAULT plugins ambari,cdh,mapr,vanilla,spark,storm