5 files changed, 33 insertions, 6 deletions
diff --git a/extraconfig/tasks/major_upgrade_ceph_mon.sh b/extraconfig/tasks/major_upgrade_ceph_mon.sh
index b633e658..e0d160f1 100755
--- a/extraconfig/tasks/major_upgrade_ceph_mon.sh
+++ b/extraconfig/tasks/major_upgrade_ceph_mon.sh
@@ -5,7 +5,7 @@ set -o pipefail
 echo INFO: starting $(basename "$0")
 
 # Exit if not running
-if ! pidof ceph-mon; then
+if ! pidof ceph-mon &> /dev/null; then
     echo INFO: ceph-mon is not running, skipping
     exit 0
 fi
@@ -54,7 +54,7 @@ if [[ "$UPDATED_VERSION" =~ ^0\.94.* ]]; then
 elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then
     # RPM could own some of these but we can't take risks on the pre-existing files
     for d in /var/lib/ceph/mon /var/log/ceph /var/run/ceph /etc/ceph; do
-        chown -R ceph:ceph $d || echo WARNING: chown of $d failed
+        chown -L -R ceph:ceph $d || echo WARNING: chown of $d failed
     done
 
     # Replay udev events with newer rules
@@ -71,6 +71,10 @@ elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then
       sleep 10;
     done"
 
+    # if tunables become legacy, cluster status will be HEALTH_WARN causing
+    # upgrade to fail on following node
+    ceph osd crush tunables default
+
     echo INFO: Ceph was upgraded to Jewel
 else
     echo ERROR: Ceph was upgraded to an unknown release, daemon is stopped, need manual intervention
diff --git a/extraconfig/tasks/major_upgrade_ceph_storage.sh b/extraconfig/tasks/major_upgrade_ceph_storage.sh
index dc80a724..705e84eb 100644
--- a/extraconfig/tasks/major_upgrade_ceph_storage.sh
+++ b/extraconfig/tasks/major_upgrade_ceph_storage.sh
@@ -18,7 +18,7 @@ set -eu
 echo INFO: starting $(basename "$0")
 
 # Exit if not running
-if ! pidof ceph-osd; then
+if ! pidof ceph-osd &> /dev/null; then
     echo INFO: ceph-osd is not running, skipping
     exit 0
 fi
@@ -63,7 +63,7 @@ if [[ "$UPDATED_VERSION" =~ ^0\.94.* ]]; then
 elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then
     # RPM could own some of these but we can't take risks on the pre-existing files
     for d in /var/lib/ceph/osd /var/log/ceph /var/run/ceph /etc/ceph; do
-        chown -R ceph:ceph $d || echo WARNING: chown of $d failed
+        chown -L -R ceph:ceph $d || echo WARNING: chown of $d failed
     done
 
     # Replay udev events with newer rules
diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh
index d4200e5f..23074fcb 100755
--- a/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh
+++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh
@@ -20,9 +20,13 @@ check_disk_for_mysql_dump
 STONITH_STATE=$(pcs property show stonith-enabled | grep "stonith-enabled" | awk '{ print $2 }')
 pcs property set stonith-enabled=false
 
-# Migrate to HA NG
+# Migrate to HA NG and fix up rabbitmq queues
+# We fix up the rabbitmq ha queues after the migration because it will
+# restart the rabbitmq resource. Doing it after the migration means no other
+# services will be restart as there are no other constraints
 if [[ -n $(is_bootstrap_node) ]]; then
     migrate_full_to_ng_ha
+    rabbitmq_mitaka_newton_upgrade
 fi
 
 # After migrating the cluster to HA-NG the services not under pacemaker's control
diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh
index fc365939..4203eba9 100755
--- a/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh
+++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh
@@ -33,7 +33,7 @@ fi
 start_or_enable_service galera
 check_resource galera started 600
 start_or_enable_service redis
-check_resource galera started 600
+check_resource redis started 600
 # We need mongod which is now a systemd service up and running before calling
 # ceilometer-dbsync. There is still a race here: mongod might not be up on all nodes
 # so ceilometer-dbsync will fail a couple of times before that. As it retries indefinitely
diff --git a/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh b/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh
index cd78f838..5b834b2b 100644
--- a/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh
+++ b/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh
@@ -169,3 +169,22 @@ function migrate_full_to_ng_ha {
         fi
     fi
 }
+
+# This function will make sure that the rabbitmq ha policies are converted from mitaka to newton
+# In mitaka we had: Attributes: set_policy="ha-all ^(?!amq\.).* {"ha-mode":"all"}"
+# In newton we want: Attributes: set_policy="ha-all ^(?!amq\.).* {"ha-mode":"exactly","ha-params":2}"
+# The nr "2" should be CEIL(N/2) where N is the number of Controllers (i.e. rabbit instances)
+# Note that changing an attribute like this makes the rabbitmq resource restart
+function rabbitmq_mitaka_newton_upgrade {
+    if pcs resource show rabbitmq-clone | grep -q -E "Attributes:.*\"ha-mode\":\"all\""; then
+        # Number of controller is obtained by counting how many hostnames we
+        # have in controller_node_names hiera key
+        nr_controllers=$(($(hiera controller_node_names | grep -o "," |wc -l) + 1))
+        nr_queues=$(($nr_controllers / 2 + ($nr_controllers % 2)))
+        if ! [ $nr_queues -gt 0 -a $nr_queues -le $nr_controllers ]; then
+            echo_error "ERROR: The nr. of HA queues during the M/N upgrade is out of range $nr_queues"
+            exit 1
+        fi
+        pcs resource update rabbitmq set_policy='ha-all ^(?!amq\\.).* {"ha-mode":"exactly","ha-params":'"$nr_queues}" --wait=600
+    fi
+}