aboutsummaryrefslogtreecommitdiffstats
path: root/extraconfig/tasks
diff options
context:
space:
mode:
authorGiulio Fidente <gfidente@redhat.com>2016-08-23 22:24:57 +0200
committerGiulio Fidente <gfidente@redhat.com>2016-08-30 18:27:38 +0200
commit4ea34edb793ab4bd3d9d5198ea9ac7568930c384 (patch)
treef7fb92ec5c8433703b3f8252c26da7952833b4a3 /extraconfig/tasks
parente5a627e43af307490a2c4d0fee4f4fe4044aecdb (diff)
Add Ceph cluster health validation on upgrade
This will prevent the Ceph Mon upgrade script from starting if the Ceph cluster is in error state. It also adds a parameter to ignore warning states, useful when performing an upgrade of a cluster where the number of healthy OSDs does not guarantee the desired replica size. Closes-Bug: 1618533 Change-Id: I1beb8ad0812f19b1018ba19b5a9fc85fa132d7f7
Diffstat (limited to 'extraconfig/tasks')
-rwxr-xr-xextraconfig/tasks/major_upgrade_ceph_mon.sh21
-rw-r--r--extraconfig/tasks/major_upgrade_pacemaker.yaml15
2 files changed, 32 insertions, 4 deletions
diff --git a/extraconfig/tasks/major_upgrade_ceph_mon.sh b/extraconfig/tasks/major_upgrade_ceph_mon.sh
index 38befbbf..b76dd7c3 100755
--- a/extraconfig/tasks/major_upgrade_ceph_mon.sh
+++ b/extraconfig/tasks/major_upgrade_ceph_mon.sh
@@ -17,6 +17,21 @@ if ! [[ "$INSTALLED_VERSION" =~ ^0\.94.* ]]; then
exit 0
fi
+CEPH_STATUS=$(ceph health | awk '{print $1}')
+if [ ${CEPH_STATUS} = HEALTH_ERR ]; do
+ echo ERROR: Ceph cluster status is HEALTH_ERR, cannot be upgraded
+ exit 1
+fi
+
+# Useful when upgrading with OSDs num < replica size
+if [ $ignore_ceph_upgrade_warnings != "true" ]; then
+ timeout 300 bash -c "while [ ${CEPH_STATUS} != HEALTH_OK ]; do
+ echo WARNING: Waiting for Ceph cluster status to go HEALTH_OK;
+ sleep 30;
+ CEPH_STATUS=$(ceph health | awk '{print $1}')
+ done"
+fi
+
MON_PID=$(pidof ceph-mon)
MON_ID=$(hostname -s)
@@ -37,8 +52,6 @@ if [[ "$UPDATED_VERSION" =~ ^0\.94.* ]]; then
echo WARNING: Ceph was not upgraded, restarting daemons
service ceph start mon.${MON_ID}
elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then
- echo INFO: Ceph was upgraded to Jewel
-
# RPM could own some of these but we can't take risks on the pre-existing files
for d in /var/lib/ceph/mon /var/log/ceph /var/run/ceph /etc/ceph; do
chown -R ceph:ceph $d
@@ -54,9 +67,11 @@ elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then
# Wait for daemon to be back in the quorum
timeout 300 bash -c "until (ceph quorum_status | jq .quorum_names | grep -sq ${MON_ID}); do
- echo Waiting for mon.${MON_ID} to re-join quorum;
+ echo WARNING: Waiting for mon.${MON_ID} to re-join quorum;
sleep 10;
done"
+
+ echo INFO: Ceph was upgraded to Jewel
else
echo ERROR: Ceph was upgraded to an unknown release, daemon is stopped, need manual intervention
exit 1
diff --git a/extraconfig/tasks/major_upgrade_pacemaker.yaml b/extraconfig/tasks/major_upgrade_pacemaker.yaml
index c2e14880..598d22d0 100644
--- a/extraconfig/tasks/major_upgrade_pacemaker.yaml
+++ b/extraconfig/tasks/major_upgrade_pacemaker.yaml
@@ -26,6 +26,10 @@ parameters:
constraints:
- allowed_values: ['auto', 'yes', 'no']
default: 'auto'
+ IgnoreCephUpgradeWarnings:
+ type: boolean
+ default: false
+ description: If enabled, Ceph upgrade will be forced even though cluster or PGs status is not clean
resources:
# TODO(jistr): for Mitaka->Newton upgrades and further we can use
@@ -36,7 +40,16 @@ resources:
type: OS::Heat::SoftwareConfig
properties:
group: script
- config: {get_file: major_upgrade_ceph_mon.sh}
+ config:
+ list_join:
+ - ''
+ - - str_replace:
+ template: |
+ #!/bin/bash
+ ignore_ceph_upgrade_warnings='IGNORE_CEPH_UPGRADE_WARNINGS'
+ params:
+ IGNORE_CEPH_UPGRADE_WARNINGS: {get_param: IgnoreCephUpgradeWarnings}
+ - get_file: major_upgrade_ceph_mon.sh
CephMonUpgradeDeployment:
type: OS::Heat::SoftwareDeploymentGroup