Diffstat (limited to 'extraconfig/tasks')
8 files changed, 61 insertions, 34 deletions
diff --git a/extraconfig/tasks/major_upgrade_ceph_mon.sh b/extraconfig/tasks/major_upgrade_ceph_mon.sh
index b633e658..e0d160f1 100755
--- a/extraconfig/tasks/major_upgrade_ceph_mon.sh
+++ b/extraconfig/tasks/major_upgrade_ceph_mon.sh
@@ -5,7 +5,7 @@ set -o pipefail
 echo INFO: starting $(basename "$0")
 
 # Exit if not running
-if ! pidof ceph-mon; then
+if ! pidof ceph-mon &> /dev/null; then
     echo INFO: ceph-mon is not running, skipping
     exit 0
 fi
@@ -54,7 +54,7 @@ if [[ "$UPDATED_VERSION" =~ ^0\.94.* ]]; then
 elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then
     # RPM could own some of these but we can't take risks on the pre-existing files
     for d in /var/lib/ceph/mon /var/log/ceph /var/run/ceph /etc/ceph; do
-        chown -R ceph:ceph $d || echo WARNING: chown of $d failed
+        chown -L -R ceph:ceph $d || echo WARNING: chown of $d failed
     done
 
     # Replay udev events with newer rules
@@ -71,6 +71,10 @@ elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then
         sleep 10;
     done"
 
+    # if tunables become legacy, cluster status will be HEALTH_WARN causing
+    # upgrade to fail on following node
+    ceph osd crush tunables default
+
     echo INFO: Ceph was upgraded to Jewel
 else
     echo ERROR: Ceph was upgraded to an unknown release, daemon is stopped, need manual intervention
diff --git a/extraconfig/tasks/major_upgrade_ceph_storage.sh b/extraconfig/tasks/major_upgrade_ceph_storage.sh
index dc80a724..705e84eb 100644
--- a/extraconfig/tasks/major_upgrade_ceph_storage.sh
+++ b/extraconfig/tasks/major_upgrade_ceph_storage.sh
@@ -18,7 +18,7 @@ set -eu
 echo INFO: starting $(basename "$0")
 
 # Exit if not running
-if ! pidof ceph-osd; then
+if ! pidof ceph-osd &> /dev/null; then
     echo INFO: ceph-osd is not running, skipping
     exit 0
 fi
@@ -63,7 +63,7 @@ if [[ "$UPDATED_VERSION" =~ ^0\.94.* ]]; then
 elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then
     # RPM could own some of these but we can't take risks on the pre-existing files
     for d in /var/lib/ceph/osd /var/log/ceph /var/run/ceph /etc/ceph; do
-        chown -R ceph:ceph $d || echo WARNING: chown of $d failed
+        chown -L -R ceph:ceph $d || echo WARNING: chown of $d failed
     done
 
     # Replay udev events with newer rules
diff --git a/extraconfig/tasks/major_upgrade_check.sh b/extraconfig/tasks/major_upgrade_check.sh
index dc7ec71a..b65f6915 100755
--- a/extraconfig/tasks/major_upgrade_check.sh
+++ b/extraconfig/tasks/major_upgrade_check.sh
@@ -88,8 +88,8 @@ check_python_rpm()
 
 check_clean_cluster()
 {
-    if crm_mon -1 | grep -A3 Failed; then
-        echo_error "ERROR: upgrade cannot start with failed resources on the cluster. Clean them up before starting: pcs resource cleanup."
+    if pcs status | grep -q Stopped:; then
+        echo_error "ERROR: upgrade cannot start with stopped resources on the cluster. Make sure that all the resources are up and running."
         exit 1
     fi
 }
diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh
index cdf3fa70..d4200e5f 100755
--- a/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh
+++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh
@@ -6,7 +6,9 @@ cluster_sync_timeout=1800
 
 check_cluster
 check_pcsd
-check_clean_cluster
+if [[ -n $(is_bootstrap_node) ]]; then
+    check_clean_cluster
+fi
 check_python_rpm
 check_galera_root_password
 check_disk_for_mysql_dump
diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh
index 158b57ae..4203eba9 100755
--- a/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh
+++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh
@@ -32,6 +32,8 @@ fi
 
 start_or_enable_service galera
 check_resource galera started 600
+start_or_enable_service redis
+check_resource redis started 600
 # We need mongod which is now a systemd service up and running before calling
 # ceilometer-dbsync. There is still a race here: mongod might not be up on all nodes
 # so ceilometer-dbsync will fail a couple of times before that. As it retries indefinitely
@@ -62,25 +64,7 @@ if [[ -n $(is_bootstrap_node) ]]; then
     nova-manage db sync
     nova-manage api_db sync
     nova-manage db online_data_migrations
+    gnocchi-upgrade
     #TODO(marios):someone from sahara needs to check this:
     # sahara-db-manage --config-file /etc/sahara/sahara.conf upgrade head
 fi
-
-start_or_enable_service rabbitmq
-check_resource rabbitmq started 600
-start_or_enable_service redis
-check_resource redis started 600
-start_or_enable_service openstack-cinder-volume
-check_resource openstack-cinder-volume started 600
-
-
-# Swift isn't controled by pacemaker
-systemctl_swift start
-
-# We need to start the systemd services we explicitely stopped at step _1.sh
-# FIXME: Should we let puppet during the convergence step do the service enabling or
-# should we add it here?
-for service in $(services_to_migrate); do
-    manage_systemd_service start "${service%%-clone}"
-    check_resource_systemd "${service%%-clone}" started 600
-done
diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_3.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_3.sh
new file mode 100755
index 00000000..4d72fbd8
--- /dev/null
+++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_3.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+set -eu
+
+start_or_enable_service rabbitmq
+check_resource rabbitmq started 600
+start_or_enable_service redis
+check_resource redis started 600
+start_or_enable_service openstack-cinder-volume
+check_resource openstack-cinder-volume started 600
+
+
+# Swift isn't controled by pacemaker
+systemctl_swift start
+
+# We need to start the systemd services we explicitely stopped at step _1.sh
+# FIXME: Should we let puppet during the convergence step do the service enabling or
+# should we add it here?
+for service in $(services_to_migrate); do
+    manage_systemd_service start "${service%%-clone}"
+    check_resource_systemd "${service%%-clone}" started 600
+done
diff --git a/extraconfig/tasks/major_upgrade_pacemaker.yaml b/extraconfig/tasks/major_upgrade_pacemaker.yaml
index a2a1bb5d..30ae8d1e 100644
--- a/extraconfig/tasks/major_upgrade_pacemaker.yaml
+++ b/extraconfig/tasks/major_upgrade_pacemaker.yaml
@@ -120,3 +120,22 @@ resources:
       config: {get_resource: ControllerPacemakerUpgradeConfig_Step2}
       input_values: {get_param: input_values}
 
+  ControllerPacemakerUpgradeConfig_Step3:
+    type: OS::Heat::SoftwareConfig
+    properties:
+      group: script
+      config:
+        list_join:
+        - ''
+        - - get_file: pacemaker_common_functions.sh
+          - get_file: major_upgrade_pacemaker_migrations.sh
+          - get_file: major_upgrade_controller_pacemaker_3.sh
+
+  ControllerPacemakerUpgradeDeployment_Step3:
+    type: OS::Heat::SoftwareDeploymentGroup
+    depends_on: ControllerPacemakerUpgradeDeployment_Step2
+    properties:
+      servers: {get_param: [servers, Controller]}
+      config: {get_resource: ControllerPacemakerUpgradeConfig_Step3}
+      input_values: {get_param: input_values}
+
diff --git a/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh b/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh
index d974bb79..cd78f838 100644
--- a/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh
+++ b/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh
@@ -109,7 +109,7 @@ function services_to_migrate {
 # during the conversion
 # 2. Remove all the colocation constraints and then the ordering constraints, except the
 #    ones related to haproxy/VIPs which exist in Newton as well
-# 3. Take the cluster out of maintenance-mode and do a resource cleanup
+# 3. Take the cluster out of maintenance-mode
 # 4. Remove all the resources that won't be managed by pacemaker in newton. The
 #    outcome will be
 #    that they are stopped and removed from pacemakers control
@@ -117,13 +117,9 @@ function services_to_migrate {
 function migrate_full_to_ng_ha {
     if [[ -n $(pcmk_running) ]]; then
         pcs property set maintenance-mode=true
-        # We are making sure here that the property has propagated everywhere
-        if ! timeout -k 10 300 crm_resource --wait; then
-            echo_error "ERROR: cluster remained unstable after setting maintenance-mode for more than 300 seconds, exiting."
-            exit 1
-        fi
-        # First we go through all the colocation constraints (except the ones we want to keep, i.e. the haproxy/ip ones)
-        # and we remove those
+
+        # First we go through all the colocation constraints (except the ones
+        # we want to keep, i.e. the haproxy/ip ones) and we remove those
         COL_CONSTRAINTS=$(pcs config show | sed -n '/^Colocation Constraints:$/,/^$/p' | grep -v "Colocation Constraints:" | egrep -v "ip-.*haproxy" | awk '{print $NF}' | cut -f2 -d: |cut -f1 -d\))
         for constraint in $COL_CONSTRAINTS; do
             log_debug "Deleting colocation constraint $constraint from CIB"
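
Note on the service-name handling used by the new major_upgrade_controller_pacemaker_3.sh: the loop maps each pacemaker clone resource returned by services_to_migrate to its underlying systemd unit by stripping a trailing "-clone" suffix. A minimal sketch of that expansion, using made-up service names rather than the real output of services_to_migrate:

#!/bin/bash
# Illustration only: "${service%%-clone}" removes a trailing "-clone" suffix,
# turning a pacemaker clone resource name into the matching systemd unit name;
# names without the suffix pass through unchanged.
# The names below are hypothetical examples, not actual pacemaker resources.
set -eu

for service in openstack-nova-api-clone openstack-glance-api-clone memcached; do
    unit="${service%%-clone}"
    echo "pacemaker resource: ${service} -> systemd unit: ${unit}"
done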