diff options
Diffstat (limited to 'extraconfig/tasks')
9 files changed, 166 insertions, 61 deletions
diff --git a/extraconfig/tasks/major_upgrade_ceph_mon.sh b/extraconfig/tasks/major_upgrade_ceph_mon.sh index 21a2b5bc..e0d160f1 100755 --- a/extraconfig/tasks/major_upgrade_ceph_mon.sh +++ b/extraconfig/tasks/major_upgrade_ceph_mon.sh @@ -5,7 +5,7 @@ set -o pipefail echo INFO: starting $(basename "$0") # Exit if not running -if ! pidof ceph-mon; then +if ! pidof ceph-mon &> /dev/null; then echo INFO: ceph-mon is not running, skipping exit 0 fi @@ -24,7 +24,7 @@ if [ ${CEPH_STATUS} = HEALTH_ERR ]; then fi # Useful when upgrading with OSDs num < replica size -if [ ${ignore_ceph_upgrade_warnings:-false} != "true" ]; then +if [[ ${ignore_ceph_upgrade_warnings:-False} != [Tt]rue ]]; then timeout 300 bash -c "while [ ${CEPH_STATUS} != HEALTH_OK ]; do echo WARNING: Waiting for Ceph cluster status to go HEALTH_OK; sleep 30; @@ -54,7 +54,7 @@ if [[ "$UPDATED_VERSION" =~ ^0\.94.* ]]; then elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then # RPM could own some of these but we can't take risks on the pre-existing files for d in /var/lib/ceph/mon /var/log/ceph /var/run/ceph /etc/ceph; do - chown -R ceph:ceph $d || echo WARNING: chown of $d failed + chown -L -R ceph:ceph $d || echo WARNING: chown of $d failed done # Replay udev events with newer rules @@ -71,6 +71,10 @@ elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then sleep 10; done" + # if tunables become legacy, cluster status will be HEALTH_WARN causing + # upgrade to fail on following node + ceph osd crush tunables default + echo INFO: Ceph was upgraded to Jewel else echo ERROR: Ceph was upgraded to an unknown release, daemon is stopped, need manual intervention diff --git a/extraconfig/tasks/major_upgrade_ceph_storage.sh b/extraconfig/tasks/major_upgrade_ceph_storage.sh index dc80a724..56b54e22 100644 --- a/extraconfig/tasks/major_upgrade_ceph_storage.sh +++ b/extraconfig/tasks/major_upgrade_ceph_storage.sh @@ -18,7 +18,7 @@ set -eu echo INFO: starting $(basename "$0") # Exit if not running -if ! pidof ceph-osd; then +if ! pidof ceph-osd &> /dev/null; then echo INFO: ceph-osd is not running, skipping exit 0 fi @@ -63,12 +63,22 @@ if [[ "$UPDATED_VERSION" =~ ^0\.94.* ]]; then elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then # RPM could own some of these but we can't take risks on the pre-existing files for d in /var/lib/ceph/osd /var/log/ceph /var/run/ceph /etc/ceph; do - chown -R ceph:ceph $d || echo WARNING: chown of $d failed + chown -L -R ceph:ceph $d || echo WARNING: chown of $d failed done # Replay udev events with newer rules udevadm trigger && udevadm settle + # If on ext4, we need to enforce lower values for name and namespace len + # or ceph-osd will refuse to start, see: http://tracker.ceph.com/issues/16187 + for OSD_ID in $OSD_IDS; do + OSD_FS=$(findmnt -n -o FSTYPE -T /var/lib/ceph/osd/ceph-${OSD_ID}) + if [ ${OSD_FS} = ext4 ]; then + crudini --set /etc/ceph/ceph.conf global osd_max_object_name_len 256 + crudini --set /etc/ceph/ceph.conf global osd_max_object_namespace_len 64 + fi + done + # Enable systemd unit systemctl enable ceph-osd.target for OSD_ID in $OSD_IDS; do diff --git a/extraconfig/tasks/major_upgrade_check.sh b/extraconfig/tasks/major_upgrade_check.sh index dc7ec71a..b65f6915 100755 --- a/extraconfig/tasks/major_upgrade_check.sh +++ b/extraconfig/tasks/major_upgrade_check.sh @@ -88,8 +88,8 @@ check_python_rpm() check_clean_cluster() { - if crm_mon -1 | grep -A3 Failed; then - echo_error "ERROR: upgrade cannot start with failed resources on the cluster. Clean them up before starting: pcs resource cleanup." + if pcs status | grep -q Stopped:; then + echo_error "ERROR: upgrade cannot start with stopped resources on the cluster. Make sure that all the resources are up and running." exit 1 fi } diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh index 2490ce27..23074fcb 100755 --- a/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh +++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh @@ -6,7 +6,9 @@ cluster_sync_timeout=1800 check_cluster check_pcsd -check_clean_cluster +if [[ -n $(is_bootstrap_node) ]]; then + check_clean_cluster +fi check_python_rpm check_galera_root_password check_disk_for_mysql_dump @@ -18,9 +20,13 @@ check_disk_for_mysql_dump STONITH_STATE=$(pcs property show stonith-enabled | grep "stonith-enabled" | awk '{ print $2 }') pcs property set stonith-enabled=false -# Migrate to HA NG -if [ "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid)" = "$(facter hostname)" ]; then +# Migrate to HA NG and fix up rabbitmq queues +# We fix up the rabbitmq ha queues after the migration because it will +# restart the rabbitmq resource. Doing it after the migration means no other +# services will be restart as there are no other constraints +if [[ -n $(is_bootstrap_node) ]]; then migrate_full_to_ng_ha + rabbitmq_mitaka_newton_upgrade fi # After migrating the cluster to HA-NG the services not under pacemaker's control @@ -29,9 +35,26 @@ fi # is going to take a long time because rabbit is down. By having the service stopped # systemctl try-restart is a noop -for $service in $(services_to_migrate); do +for service in $(services_to_migrate); do manage_systemd_service stop "${service%%-clone}" - check_resource_systemd "${service%%-clone}" stopped 600 + # So the reason for not reusing check_resource_systemd is that + # I have observed systemctl is-active returning unknown with at least + # one service that was stopped (See LP 1627254) + timeout=600 + tstart=$(date +%s) + tend=$(( $tstart + $timeout )) + check_interval=3 + while (( $(date +%s) < $tend )); do + if [[ "$(systemctl is-active ${service%%-clone})" = "active" ]]; then + echo "$service still active, sleeping $check_interval seconds." + sleep $check_interval + else + # we do not care if it is inactive, unknown or failed as long as it is + # not running + break + fi + + done done # In case the mysql package is updated, the database on disk must be @@ -46,7 +69,7 @@ done # on mysql package versionning, but this can be overriden manually # to support specific upgrade scenario -if [ "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid)" = "$(facter hostname)" ]; then +if [[ -n $(is_bootstrap_node) ]]; then if [ $DO_MYSQL_UPGRADE -eq 1 ]; then mysqldump $backup_flags > "$MYSQL_BACKUP_DIR/openstack_database.sql" cp -rdp /etc/my.cnf* "$MYSQL_BACKUP_DIR" @@ -58,6 +81,8 @@ if [ "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid)" = "$(facter hostname) check_resource rabbitmq stopped 600 pcs resource disable galera check_resource galera stopped 600 + pcs resource disable openstack-cinder-volume + check_resource openstack-cinder-volume stopped 600 # Disable all VIPs before stopping the cluster, so that pcs doesn't use one as a source address: # https://bugzilla.redhat.com/show_bug.cgi?id=1330688 for vip in $(pcs resource show | grep ocf::heartbeat:IPaddr2 | grep Started | awk '{ print $1 }'); do @@ -68,7 +93,7 @@ if [ "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid)" = "$(facter hostname) fi -# Swift isn't controled by pacemaker +# Swift isn't controlled by pacemaker systemctl_swift stop tstart=$(date +%s) @@ -151,5 +176,15 @@ fi # Pin messages sent to compute nodes to kilo, these will be upgraded later crudini --set /etc/nova/nova.conf upgrade_levels compute "$upgrade_level_nova_compute" +# https://bugzilla.redhat.com/show_bug.cgi?id=1284047 +# Change-Id: Ib3f6c12ff5471e1f017f28b16b1e6496a4a4b435 +crudini --set /etc/ceilometer/ceilometer.conf DEFAULT rpc_backend rabbit +# https://bugzilla.redhat.com/show_bug.cgi?id=1284058 +# Ifd1861e3df46fad0e44ff9b5cbd58711bbc87c97 Swift Ceilometer middleware no longer exists +crudini --set /etc/swift/proxy-server.conf pipeline:main pipeline "catch_errors healthcheck cache ratelimit tempurl formpost authtoken keystone staticweb proxy-logging proxy-server" +# LP: 1615035, required only for M/N upgrade. +crudini --set /etc/nova/nova.conf DEFAULT scheduler_host_manager host_manager +# LP: 1627450, required only for M/N upgrade +crudini --set /etc/nova/nova.conf DEFAULT scheduler_driver filter_scheduler crudini --set /etc/sahara/sahara.conf DEFAULT plugins ambari,cdh,mapr,vanilla,spark,storm diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh index 6bb2fa73..4203eba9 100755 --- a/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh +++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh @@ -32,6 +32,15 @@ fi start_or_enable_service galera check_resource galera started 600 +start_or_enable_service redis +check_resource redis started 600 +# We need mongod which is now a systemd service up and running before calling +# ceilometer-dbsync. There is still a race here: mongod might not be up on all nodes +# so ceilometer-dbsync will fail a couple of times before that. As it retries indefinitely +# we should be good. +# Due to LP Bug https://bugs.launchpad.net/tripleo/+bug/1627254 am using systemctl directly atm +systemctl start mongod +check_resource mongod started 600 if [[ -n $(is_bootstrap_node) ]]; then tstart=$(date +%s) @@ -53,22 +62,9 @@ if [[ -n $(is_bootstrap_node) ]]; then keystone-manage db_sync neutron-db-manage --config-file /etc/neutron/neutron.conf --config-file /etc/neutron/plugin.ini upgrade head nova-manage db sync + nova-manage api_db sync + nova-manage db online_data_migrations + gnocchi-upgrade #TODO(marios):someone from sahara needs to check this: # sahara-db-manage --config-file /etc/sahara/sahara.conf upgrade head fi - -start_or_enable_service rabbitmq -check_resource rabbitmq started 600 -start_or_enable_service redis -check_resource redis started 600 - -# Swift isn't controled by pacemaker -systemctl_swift start - -# We need to start the systemd services we explicitely stopped at step _1.sh -# FIXME: Should we let puppet during the convergence step do the service enabling or -# should we add it here? -for $service in $(services_to_migrate); do - manage_systemd_service stop "${service%%-clone}" - check_resource_systemd "${service%%-clone}" started 600 -done diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_3.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_3.sh new file mode 100755 index 00000000..4d72fbd8 --- /dev/null +++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_3.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +set -eu + +start_or_enable_service rabbitmq +check_resource rabbitmq started 600 +start_or_enable_service redis +check_resource redis started 600 +start_or_enable_service openstack-cinder-volume +check_resource openstack-cinder-volume started 600 + + +# Swift isn't controled by pacemaker +systemctl_swift start + +# We need to start the systemd services we explicitely stopped at step _1.sh +# FIXME: Should we let puppet during the convergence step do the service enabling or +# should we add it here? +for service in $(services_to_migrate); do + manage_systemd_service start "${service%%-clone}" + check_resource_systemd "${service%%-clone}" started 600 +done diff --git a/extraconfig/tasks/major_upgrade_pacemaker.yaml b/extraconfig/tasks/major_upgrade_pacemaker.yaml index 7244f949..30ae8d1e 100644 --- a/extraconfig/tasks/major_upgrade_pacemaker.yaml +++ b/extraconfig/tasks/major_upgrade_pacemaker.yaml @@ -46,7 +46,7 @@ resources: CephMonUpgradeDeployment: type: OS::Heat::SoftwareDeploymentGroup properties: - servers: {get_param: servers, Controller} + servers: {get_param: [servers, Controller]} config: {get_resource: CephMonUpgradeConfig} input_values: {get_param: input_values} update_policy: @@ -83,7 +83,7 @@ resources: type: OS::Heat::SoftwareDeploymentGroup depends_on: CephMonUpgradeDeployment properties: - servers: {get_param: servers, Controller} + servers: {get_param: [servers, Controller]} config: {get_resource: ControllerPacemakerUpgradeConfig_Step1} input_values: {get_param: input_values} @@ -97,7 +97,7 @@ resources: BlockStorageUpgradeDeployment: type: OS::Heat::SoftwareDeploymentGroup properties: - servers: {get_param: servers, BlockStorage} + servers: {get_param: [servers, BlockStorage]} config: {get_resource: BlockStorageUpgradeConfig} input_values: {get_param: input_values} @@ -116,7 +116,26 @@ resources: type: OS::Heat::SoftwareDeploymentGroup depends_on: BlockStorageUpgradeDeployment properties: - servers: {get_param: servers, Controller} + servers: {get_param: [servers, Controller]} config: {get_resource: ControllerPacemakerUpgradeConfig_Step2} input_values: {get_param: input_values} + ControllerPacemakerUpgradeConfig_Step3: + type: OS::Heat::SoftwareConfig + properties: + group: script + config: + list_join: + - '' + - - get_file: pacemaker_common_functions.sh + - get_file: major_upgrade_pacemaker_migrations.sh + - get_file: major_upgrade_controller_pacemaker_3.sh + + ControllerPacemakerUpgradeDeployment_Step3: + type: OS::Heat::SoftwareDeploymentGroup + depends_on: ControllerPacemakerUpgradeDeployment_Step2 + properties: + servers: {get_param: [servers, Controller]} + config: {get_resource: ControllerPacemakerUpgradeConfig_Step3} + input_values: {get_param: input_values} + diff --git a/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh b/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh index b8c5321b..df87c93f 100644 --- a/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh +++ b/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh @@ -109,20 +109,17 @@ function services_to_migrate { # during the conversion # 2. Remove all the colocation constraints and then the ordering constraints, except the # ones related to haproxy/VIPs which exist in Newton as well -# 3. Remove all the resources that won't be managed by pacemaker in newton. Note that they -# will show up as ORPHANED but they will keep running normally via systemd. They will be -# enabled to start at boot by puppet during the converge step -# 4. Take the cluster out of maintenance-mode and do a resource cleanup +# 3. Take the cluster out of maintenance-mode +# 4. Remove all the resources that won't be managed by pacemaker in newton. The +# outcome will be +# that they are stopped and removed from pacemakers control +# 5. Do a resource cleanup to make sure the cluster is in a clean state function migrate_full_to_ng_ha { if [[ -n $(pcmk_running) ]]; then pcs property set maintenance-mode=true - # We are making sure here that the property has propagated everywhere - if ! timeout -k 10 300 crm_resource --wait; then - echo_error "ERROR: cluster remained unstable after setting maintenance-mode for more than 300 seconds, exiting." - exit 1 - fi - # First we go through all the colocation constraints (except the ones we want to keep, i.e. the haproxy/ip ones) - # and we remove those + + # First we go through all the colocation constraints (except the ones + # we want to keep, i.e. the haproxy/ip ones) and we remove those COL_CONSTRAINTS=$(pcs config show | sed -n '/^Colocation Constraints:$/,/^$/p' | grep -v "Colocation Constraints:" | egrep -v "ip-.*haproxy" | awk '{print $NF}' | cut -f2 -d: |cut -f1 -d\)) for constraint in $COL_CONSTRAINTS; do log_debug "Deleting colocation constraint $constraint from CIB" @@ -135,32 +132,35 @@ function migrate_full_to_ng_ha { log_debug "Deleting ordering constraint $constraint from CIB" pcs constraint remove "$constraint" done + # At this stage all the pacemaker resources are removed from the CIB. + # Once we remove the maintenance-mode those systemd resources will keep + # on running. They shall be systemd enabled via the puppet converge + # step later on + pcs property set maintenance-mode=false # At this stage there are no constraints whatsoever except the haproxy/ip ones - # which we want to keep. We now delete each resource that will move to systemd - # Note that the corresponding systemd resource will stay running, which means that - # later when we do the "yum update", things will be a bit slower because each - # "systemctl try-restart <service>" is not a no-op any longer because the service is up - # and running and it will be restarted with rabbitmq being down. + # which we want to keep. We now disable and then delete each resource + # that will move to systemd. + # We want the systemd resources be stopped before doing "yum update", + # that way "systemctl try-restart <service>" is no-op because the + # service was down already PCS_STATUS_OUTPUT="$(pcs status)" for resource in $(services_to_migrate) "delay-clone" "openstack-core-clone"; do if echo "$PCS_STATUS_OUTPUT" | grep "$resource"; then log_debug "Deleting $resource from the CIB" - - # We need to add --force because the cluster is in maintenance mode and the resource - # is unmanaged. The if serves to make this idempotent + if ! pcs resource disable "$resource" --wait=600; then + echo_error "ERROR: resource $resource failed to be disabled" + exit 1 + fi pcs resource delete --force "$resource" else - log_debug "Service $service not found as a pacemaker resource, not trying to delete." + log_debug "Service $resource not found as a pacemaker resource, not trying to delete." fi done - # At this stage all the pacemaker resources are removed from the CIB. Once we remove the - # maintenance-mode those systemd resources will keep on running. They shall be systemd enabled - # via the puppet converge step later on - pcs property set maintenance-mode=false - # We need to do a pcs resource cleanup here + crm_resource --wait to make sure the - # cluster is in a clean state before we stop everything, upgrade and restart everything + # We need to do a pcs resource cleanup here + crm_resource --wait to + # make sure the cluster is in a clean state before we stop everything, + # upgrade and restart everything pcs resource cleanup # We are making sure here that the cluster is stable before proceeding if ! timeout -k 10 600 crm_resource --wait; then @@ -169,3 +169,22 @@ function migrate_full_to_ng_ha { fi fi } + +# This function will make sure that the rabbitmq ha policies are converted from mitaka to newton +# In mitaka we had: Attributes: set_policy="ha-all ^(?!amq\.).* {"ha-mode":"all"}" +# In newton we want: Attributes: set_policy="ha-all ^(?!amq\.).* {"ha-mode":"exactly","ha-params":2}" +# The nr "2" should be CEIL(N/2) where N is the number of Controllers (i.e. rabbit instances) +# Note that changing an attribute like this makes the rabbitmq resource restart +function rabbitmq_mitaka_newton_upgrade { + if pcs resource show rabbitmq-clone | grep -q -E "Attributes:.*\"ha-mode\":\"all\""; then + # Number of controller is obtained by counting how many hostnames we + # have in controller_node_names hiera key + nr_controllers=$(($(hiera controller_node_names | grep -o "," |wc -l) + 1)) + nr_queues=$(($nr_controllers / 2 + ($nr_controllers % 2))) + if ! [ $nr_queues -gt 0 -a $nr_queues -le $nr_controllers ]; then + echo_error "ERROR: The nr. of HA queues during the M/N upgrade is out of range $nr_queues" + exit 1 + fi + pcs resource update rabbitmq set_policy='ha-all ^(?!amq\\.).* {"ha-mode":"exactly","ha-params":'"$nr_queues}" --wait=600 + fi +} diff --git a/extraconfig/tasks/mitaka_to_newton_aodh_data_migration.yaml b/extraconfig/tasks/mitaka_to_newton_aodh_data_migration.yaml index 91406fba..b9a87d33 100644 --- a/extraconfig/tasks/mitaka_to_newton_aodh_data_migration.yaml +++ b/extraconfig/tasks/mitaka_to_newton_aodh_data_migration.yaml @@ -20,6 +20,6 @@ resources: AodhMysqlMigrationScriptDeployment: type: OS::Heat::SoftwareDeploymentGroup properties: - servers: {get_param: servers, Controller} + servers: {get_param: [servers, Controller]} config: {get_resource: AodhMysqlMigrationScriptConfig} input_values: {get_param: input_values} |