Diffstat (limited to 'extraconfig')
-rwxr-xr-x  extraconfig/tasks/major_upgrade_ceph_mon.sh      21
-rw-r--r--  extraconfig/tasks/major_upgrade_ceph_storage.sh  77
-rw-r--r--  extraconfig/tasks/major_upgrade_pacemaker.yaml   15
-rwxr-xr-x  extraconfig/tasks/pacemaker_resource_restart.sh  24
4 files changed, 115 insertions, 22 deletions
diff --git a/extraconfig/tasks/major_upgrade_ceph_mon.sh b/extraconfig/tasks/major_upgrade_ceph_mon.sh
index 38befbbf..b76dd7c3 100755
--- a/extraconfig/tasks/major_upgrade_ceph_mon.sh
+++ b/extraconfig/tasks/major_upgrade_ceph_mon.sh
@@ -17,6 +17,21 @@ if ! [[ "$INSTALLED_VERSION" =~ ^0\.94.* ]]; then
exit 0
fi
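+# "ceph health" prints the cluster status (HEALTH_OK/HEALTH_WARN/HEALTH_ERR) as its first word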
+CEPH_STATUS=$(ceph health | awk '{print $1}')
+if [ "${CEPH_STATUS}" = HEALTH_ERR ]; then
+ echo ERROR: Ceph cluster status is HEALTH_ERR, cannot be upgraded
+ exit 1
+fi
+
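+# Note: in the wait loop below the command substitution is escaped (\$) so
+# that the inner "bash -c" shell re-evaluates the cluster health on every
+# iteration instead of comparing a value captured once by the outer shell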
+# Useful when upgrading with fewer OSDs than the replica size.
+# The heat boolean may render as "True", hence the lowercased comparison.
+if [[ ${ignore_ceph_upgrade_warnings,,} != "true" ]]; then
+ timeout 300 bash -c "while [ \$(ceph health | awk '{print \$1}') != HEALTH_OK ]; do
+ echo WARNING: Waiting for Ceph cluster status to go HEALTH_OK;
+ sleep 30;
+ done"
+fi
+
MON_PID=$(pidof ceph-mon)
MON_ID=$(hostname -s)
@@ -37,8 +52,6 @@ if [[ "$UPDATED_VERSION" =~ ^0\.94.* ]]; then
echo WARNING: Ceph was not upgraded, restarting daemons
service ceph start mon.${MON_ID}
elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then
- echo INFO: Ceph was upgraded to Jewel
-
# RPM could own some of these but we can't take risks on the pre-existing files
for d in /var/lib/ceph/mon /var/log/ceph /var/run/ceph /etc/ceph; do
chown -R ceph:ceph $d
@@ -54,9 +67,11 @@ elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then
# Wait for daemon to be back in the quorum
timeout 300 bash -c "until (ceph quorum_status | jq .quorum_names | grep -sq ${MON_ID}); do
- echo Waiting for mon.${MON_ID} to re-join quorum;
+ echo WARNING: Waiting for mon.${MON_ID} to re-join quorum;
sleep 10;
done"
+
+ echo INFO: Ceph was upgraded to Jewel
else
echo ERROR: Ceph was upgraded to an unknown release, daemon is stopped, need manual intervention
exit 1
diff --git a/extraconfig/tasks/major_upgrade_ceph_storage.sh b/extraconfig/tasks/major_upgrade_ceph_storage.sh
index de42b16d..03a1c1c2 100644
--- a/extraconfig/tasks/major_upgrade_ceph_storage.sh
+++ b/extraconfig/tasks/major_upgrade_ceph_storage.sh
@@ -4,32 +4,89 @@
# major upgrade workflow.
#
set -eu
+set -o pipefail
UPGRADE_SCRIPT=/root/tripleo_upgrade_node.sh
-cat > $UPGRADE_SCRIPT << ENDOFCAT
+cat > $UPGRADE_SCRIPT << 'ENDOFCAT'
+#!/bin/bash
### DO NOT MODIFY THIS FILE
### This file is automatically delivered to the ceph-storage nodes as part of the
### tripleo upgrades workflow
+set -eu
+
+echo INFO: starting $(basename "$0")
+# Exit if not running
+if ! pidof ceph-osd; then
+ echo INFO: ceph-osd is not running, skipping
+ exit 0
+fi
-function systemctl_ceph {
- action=\$1
- systemctl \$action ceph
-}
+# Exit if not Hammer
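+# "ceph --version" prints e.g. "ceph version 0.94.5 (<hash>)"; field 3 is the version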
+INSTALLED_VERSION=$(ceph --version | awk '{print $3}')
+if ! [[ "$INSTALLED_VERSION" =~ ^0\.94.* ]]; then
+ echo INFO: version of Ceph installed is not 0.94, skipping
+ exit 0
+fi
-# "so that mirrors aren't rebalanced as if the OSD died" - gfidente
+OSD_PIDS=$(pidof ceph-osd)
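+# Directories under /var/lib/ceph/osd are named "<cluster>-<id>", so field 2
+# of the "-" split is the OSD id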
+OSD_IDS=$(ls /var/lib/ceph/osd | awk 'BEGIN { FS = "-" } ; { print $2 }')
+
+# "so that mirrors aren't rebalanced as if the OSD died" - gfidente / leseb
ceph osd set noout
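+# Also pause rebalancing and scrubbing so no needless data movement or scrub
+# load occurs while the OSDs are down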
+ceph osd set norebalance
+ceph osd set nodeep-scrub
+ceph osd set noscrub
+
+# Stop daemon using Hammer sysvinit script
+for OSD_ID in $OSD_IDS; do
+ service ceph stop osd.${OSD_ID}
+done
+
+# kill -0 returns non-zero only once every listed PID is gone, i.e. when all OSDs have exited
+timeout 60 bash -c "while kill -0 ${OSD_PIDS} 2> /dev/null; do
+ sleep 2;
+done"
-systemctl_ceph stop
+# Update (Ceph to Jewel)
yum -y install python-zaqarclient # needed for os-collect-config
yum -y update
-systemctl_ceph start
-ceph osd unset noout
+# If we are not on Jewel, restart the daemons (or bail out); the steps below are only needed on Jewel
+UPDATED_VERSION=$(ceph --version | awk '{print $3}')
+if [[ "$UPDATED_VERSION" =~ ^0\.94.* ]]; then
+ echo WARNING: Ceph was not upgraded, restarting daemon
+ for OSD_ID in $OSD_IDS; do
+ service ceph start osd.${OSD_ID}
+ done
+elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then
+ # RPM could own some of these but we can't take risks on the pre-existing files
+ for d in /var/lib/ceph/osd /var/log/ceph /var/run/ceph /etc/ceph; do
+ chown -R ceph:ceph $d
+ done
+
+ # Replay udev events with newer rules
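+ # (the Jewel rules re-set ownership of the OSD devices to ceph:ceph)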
+ udevadm trigger && udevadm settle
+
+ # Enable systemd unit
+ systemctl enable ceph-osd.target
+ for OSD_ID in $OSD_IDS; do
+ systemctl enable ceph-osd@${OSD_ID}
+ systemctl start ceph-osd@${OSD_ID}
+ done
+ echo INFO: Ceph was upgraded to Jewel
+else
+ echo ERROR: Ceph was upgraded to an unknown release, daemon is stopped, need manual intervention
+ exit 1
+fi
+
+ceph osd unset noout
+ceph osd unset norebalance
+ceph osd unset nodeep-scrub
+ceph osd unset noscrub
ENDOFCAT
# ensure the permissions are OK
chmod 0755 $UPGRADE_SCRIPT
-
diff --git a/extraconfig/tasks/major_upgrade_pacemaker.yaml b/extraconfig/tasks/major_upgrade_pacemaker.yaml
index c2e14880..598d22d0 100644
--- a/extraconfig/tasks/major_upgrade_pacemaker.yaml
+++ b/extraconfig/tasks/major_upgrade_pacemaker.yaml
@@ -26,6 +26,10 @@ parameters:
    constraints:
      - allowed_values: ['auto', 'yes', 'no']
    default: 'auto'
+  IgnoreCephUpgradeWarnings:
+    type: boolean
+    default: false
+    description: If enabled, the Ceph upgrade will be forced even if the cluster or PG status is not clean
resources:
# TODO(jistr): for Mitaka->Newton upgrades and further we can use
@@ -36,7 +40,16 @@ resources:
    type: OS::Heat::SoftwareConfig
    properties:
      group: script
-      config: {get_file: major_upgrade_ceph_mon.sh}
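+      # Prepend a shell prologue that passes IgnoreCephUpgradeWarnings to the
+      # script as the ignore_ceph_upgrade_warnings variable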
+      config:
+        list_join:
+        - ''
+        - - str_replace:
+              template: |
+                #!/bin/bash
+                ignore_ceph_upgrade_warnings='IGNORE_CEPH_UPGRADE_WARNINGS'
+              params:
+                IGNORE_CEPH_UPGRADE_WARNINGS: {get_param: IgnoreCephUpgradeWarnings}
+          - get_file: major_upgrade_ceph_mon.sh
CephMonUpgradeDeployment:
type: OS::Heat::SoftwareDeploymentGroup
diff --git a/extraconfig/tasks/pacemaker_resource_restart.sh b/extraconfig/tasks/pacemaker_resource_restart.sh
index 1637cee2..fd1fd0dc 100755
--- a/extraconfig/tasks/pacemaker_resource_restart.sh
+++ b/extraconfig/tasks/pacemaker_resource_restart.sh
@@ -7,15 +7,23 @@ pacemaker_status=$(systemctl is-active pacemaker)
# Run if pacemaker is running, we're the bootstrap node,
# and we're updating the deployment (not creating).
if [ "$pacemaker_status" = "active" -a \
- "$(hiera bootstrap_nodeid)" = "$(facter hostname)" -a \
- "$(hiera stack_action)" = "UPDATE" ]; then
+ "$(hiera bootstrap_nodeid)" = "$(facter hostname)" ]; then
- PCMK_RESOURCES="haproxy-clone redis-master rabbitmq-clone galera-master openstack-cinder-volume openstack-cinder-backup"
- # Ten minutes of timeout to restart each resource, given there are no constraints should be enough
TIMEOUT=600
- for resource in $PCMK_RESOURCES; do
- if pcs status | grep $resource; then
- pcs resource restart --wait=$TIMEOUT $resource
- fi
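+ # Each file under /var/lib/tripleo/pacemaker-restarts names a pacemaker
+ # resource flagged for restart; the flag is removed once the restart succeeds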
+ SERVICES_TO_RESTART="$(ls /var/lib/tripleo/pacemaker-restarts)"
+ PCS_STATUS_OUTPUT="$(pcs status)"
+
+ for service in $SERVICES_TO_RESTART; do
+ if ! echo "$PCS_STATUS_OUTPUT" | grep "$service"; then
+ echo "Service $service not found as a pacemaker resource, cannot restart it."
+ exit 1
+ fi
+ done
+
+ for service in $SERVICES_TO_RESTART; do
+ echo "Restarting $service..."
+ pcs resource restart --wait=$TIMEOUT $service
+ rm -f /var/lib/tripleo/pacemaker-restarts/$service
done
+
fi