Relax pre-upgrade check for failed actions

Before this change, we checked the cluster for any failed actions and stopped the upgrade process if there were any. This is likely excessive, as a failed action could have happened in the past while the cluster is now fully functional. It is better to check whether any of the resources are in the Stopped state and abort the upgrade process if any of them are. We also need to restrict this check to the bootstrap node, because otherwise the following might happen: 1) The bootstrap node does the check, it is successful, and it starts the full HA -> HA NG migration, which *will* create failed actions and will start stopping resources. 2) If the check now starts on a non-bootstrap node while 1) is ongoing, it will find either failed actions or stopped resources, so it will fail. Change-Id: Ib091f6dd8884025d2e23bf2fa700169e2dec778f Closes-Bug: #1628653
author: Michele Baldessari <michele@acksyn.org> 2016-09-28 22:55:25 +0200
committer: Michele Baldessari <michele@acksyn.org> 2016-09-29 09:02:24 +0200
commit: 32c54304f489405ea2e3ab67f5de236ab6f2e5ec (patch)
tree: d187c2ccdb7c938d859ee214ce84606b11d20240 /extraconfig
parent: 1b634c21c118a54071abd2bc5181453f3aa8d7a7 (diff)
2 files changed, 5 insertions, 3 deletions
diff --git a/extraconfig/tasks/major_upgrade_check.sh b/extraconfig/tasks/major_upgrade_check.sh
index dc7ec71a..b65f6915 100755
--- a/extraconfig/tasks/major_upgrade_check.sh
+++ b/extraconfig/tasks/major_upgrade_check.sh
@@ -88,8 +88,8 @@ check_python_rpm()
 
 check_clean_cluster()
 {
-    if crm_mon -1 | grep -A3 Failed; then
-        echo_error "ERROR: upgrade cannot start with failed resources on the cluster. Clean them up before starting: pcs resource cleanup."
+    if pcs status | grep -q Stopped:; then
+        echo_error "ERROR: upgrade cannot start with stopped resources on the cluster. Make sure that all the resources are up and running."
         exit 1
     fi
 }
diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh
index cdf3fa70..d4200e5f 100755
--- a/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh
+++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh
@@ -6,7 +6,9 @@ cluster_sync_timeout=1800
 
 check_cluster
 check_pcsd
-check_clean_cluster
+if [[ -n $(is_bootstrap_node) ]]; then
+    check_clean_cluster
+fi
 check_python_rpm
 check_galera_root_password
 check_disk_for_mysql_dump
author	Michele Baldessari <michele@acksyn.org>	2016-09-28 22:55:25 +0200
committer	Michele Baldessari <michele@acksyn.org>	2016-09-29 09:02:24 +0200
commit	32c54304f489405ea2e3ab67f5de236ab6f2e5ec (patch)
tree	d187c2ccdb7c938d859ee214ce84606b11d20240 /extraconfig
parent	1b634c21c118a54071abd2bc5181453f3aa8d7a7 (diff)