Diffstat (limited to 'extraconfig')
-rwxr-xr-x | extraconfig/tasks/major_upgrade_ceph_mon.sh                | 78
-rw-r--r-- | extraconfig/tasks/major_upgrade_ceph_storage.sh            | 77
-rwxr-xr-x | extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh  | 31
-rwxr-xr-x | extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh  |  1
-rw-r--r-- | extraconfig/tasks/major_upgrade_pacemaker.yaml             | 31
-rwxr-xr-x | extraconfig/tasks/pacemaker_resource_restart.sh            | 45
6 files changed, 214 insertions, 49 deletions
diff --git a/extraconfig/tasks/major_upgrade_ceph_mon.sh b/extraconfig/tasks/major_upgrade_ceph_mon.sh
new file mode 100755
index 00000000..b76dd7c3
--- /dev/null
+++ b/extraconfig/tasks/major_upgrade_ceph_mon.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+set -eu
+set -o pipefail
+
+echo INFO: starting $(basename "$0")
+
+# Exit if not running
+if ! pidof ceph-mon; then
+    echo INFO: ceph-mon is not running, skipping
+    exit 0
+fi
+
+# Exit if not Hammer
+INSTALLED_VERSION=$(ceph --version | awk '{print $3}')
+if ! [[ "$INSTALLED_VERSION" =~ ^0\.94.* ]]; then
+    echo INFO: version of Ceph installed is not 0.94, skipping
+    exit 0
+fi
+
+CEPH_STATUS=$(ceph health | awk '{print $1}')
+if [ ${CEPH_STATUS} = HEALTH_ERR ]; then
+    echo ERROR: Ceph cluster status is HEALTH_ERR, cannot be upgraded
+    exit 1
+fi
+
+# Useful when upgrading with OSDs num < replica size
+if [ $ignore_ceph_upgrade_warnings != "true" ]; then
+    timeout 300 bash -c "while [ ${CEPH_STATUS} != HEALTH_OK ]; do
+        echo WARNING: Waiting for Ceph cluster status to go HEALTH_OK;
+        sleep 30;
+        CEPH_STATUS=$(ceph health | awk '{print $1}')
+    done"
+fi
+
+MON_PID=$(pidof ceph-mon)
+MON_ID=$(hostname -s)
+
+# Stop daemon using Hammer sysvinit script
+service ceph stop mon.${MON_ID}
+
+# Ensure it's stopped
+timeout 60 bash -c "while kill -0 ${MON_PID} 2> /dev/null; do
+    sleep 2;
+done"
+
+# Update to Jewel
+yum -y -q update ceph-mon
+
+# Restart/Exit if not on Jewel, only in that case we need the changes
+UPDATED_VERSION=$(ceph --version | awk '{print $3}')
+if [[ "$UPDATED_VERSION" =~ ^0\.94.* ]]; then
+    echo WARNING: Ceph was not upgraded, restarting daemons
+    service ceph start mon.${MON_ID}
+elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then
+    # RPM could own some of these but we can't take risks on the pre-existing files
+    for d in /var/lib/ceph/mon /var/log/ceph /var/run/ceph /etc/ceph; do
+        chown -R ceph:ceph $d
+    done
+
+    # Replay udev events with newer rules
+    udevadm trigger
+
+    # Enable systemd unit
+    systemctl enable ceph-mon.target
+    systemctl enable ceph-mon@${MON_ID}
+    systemctl start ceph-mon@${MON_ID}
+
+    # Wait for daemon to be back in the quorum
+    timeout 300 bash -c "until (ceph quorum_status | jq .quorum_names | grep -sq ${MON_ID}); do
+        echo WARNING: Waiting for mon.${MON_ID} to re-join quorum;
+        sleep 10;
+    done"
+
+    echo INFO: Ceph was upgraded to Jewel
+else
+    echo ERROR: Ceph was upgraded to an unknown release, daemon is stopped, need manual intervention
+    exit 1
+fi
diff --git a/extraconfig/tasks/major_upgrade_ceph_storage.sh b/extraconfig/tasks/major_upgrade_ceph_storage.sh
index de42b16d..03a1c1c2 100644
--- a/extraconfig/tasks/major_upgrade_ceph_storage.sh
+++ b/extraconfig/tasks/major_upgrade_ceph_storage.sh
@@ -4,32 +4,89 @@
 # major upgrade workflow.
 #
 set -eu
+set -o pipefail
 
 UPGRADE_SCRIPT=/root/tripleo_upgrade_node.sh
 
-cat > $UPGRADE_SCRIPT << ENDOFCAT
+cat > $UPGRADE_SCRIPT << 'ENDOFCAT'
+#!/bin/bash
 ### DO NOT MODIFY THIS FILE
 ### This file is automatically delivered to the ceph-storage nodes as part of the
 ### tripleo upgrades workflow
+set -eu
+
+echo INFO: starting $(basename "$0")
+
+# Exit if not running
+if ! pidof ceph-osd; then
+    echo INFO: ceph-osd is not running, skipping
+    exit 0
+fi
 
-function systemctl_ceph {
-    action=\$1
-    systemctl \$action ceph
-}
+# Exit if not Hammer
+INSTALLED_VERSION=$(ceph --version | awk '{print $3}')
[[ "$INSTALLED_VERSION" =~ ^0\.94.* ]]; then + echo INFO: version of Ceph installed is not 0.94, skipping + exit 0 +fi -# "so that mirrors aren't rebalanced as if the OSD died" - gfidente +OSD_PIDS=$(pidof ceph-osd) +OSD_IDS=$(ls /var/lib/ceph/osd | awk 'BEGIN { FS = "-" } ; { print $2 }') + +# "so that mirrors aren't rebalanced as if the OSD died" - gfidente / leseb ceph osd set noout +ceph osd set norebalance +ceph osd set nodeep-scrub +ceph osd set noscrub + +# Stop daemon using Hammer sysvinit script +for OSD_ID in $OSD_IDS; do + service ceph stop osd.${OSD_ID} +done + +# Nice guy will return non-0 only when all failed +timeout 60 bash -c "while kill -0 ${OSD_PIDS} 2> /dev/null; do + sleep 2; +done" -systemctl_ceph stop +# Update (Ceph to Jewel) yum -y install python-zaqarclient # needed for os-collect-config yum -y update -systemctl_ceph start -ceph osd unset noout +# Restart/Exit if not on Jewel, only in that case we need the changes +UPDATED_VERSION=$(ceph --version | awk '{print $3}') +if [[ "$UPDATED_VERSION" =~ ^0\.94.* ]]; then + echo WARNING: Ceph was not upgraded, restarting daemon + for OSD_ID in $OSD_IDS; do + service ceph start osd.${OSD_ID} + done +elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then + # RPM could own some of these but we can't take risks on the pre-existing files + for d in /var/lib/ceph/osd /var/log/ceph /var/run/ceph /etc/ceph; do + chown -R ceph:ceph $d + done + + # Replay udev events with newer rules + udevadm trigger && udevadm settle + + # Enable systemd unit + systemctl enable ceph-osd.target + for OSD_ID in $OSD_IDS; do + systemctl enable ceph-osd@${OSD_ID} + systemctl start ceph-osd@${OSD_ID} + done + echo INFO: Ceph was upgraded to Jewel +else + echo ERROR: Ceph was upgraded to an unknown release, daemon is stopped, need manual intervention + exit 1 +fi + +ceph osd unset noout +ceph osd unset norebalance +ceph osd unset nodeep-scrub +ceph osd unset noscrub ENDOFCAT # ensure the permissions are OK chmod 0755 $UPGRADE_SCRIPT - diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh index d67d5a1a..0b702630 100755 --- a/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh +++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh @@ -18,7 +18,7 @@ STONITH_STATE=$(pcs property show stonith-enabled | grep "stonith-enabled" | awk pcs property set stonith-enabled=false # If for some reason rpm-python are missing we want to error out early enough -if [ ! rpm -q rpm-python &> /dev/null ]; then +if ! 
+if ! rpm -q rpm-python &> /dev/null; then
     echo_error "ERROR: upgrade cannot start without rpm-python installed"
     exit 1
 fi
@@ -155,17 +155,19 @@ wsrep_on = ON
 wsrep_cluster_address = gcomm://localhost
 EOF
 
-if [ "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid)" = "$(facter hostname)" ]; then
-    if [ $DO_MYSQL_UPGRADE -eq 1 ]; then
-        # Scripts run via heat have no HOME variable set and this confuses
-        # mysqladmin
-        export HOME=/root
-        mkdir /var/lib/mysql || /bin/true
-        chown mysql:mysql /var/lib/mysql
-        chmod 0755 /var/lib/mysql
-        restorecon -R /var/lib/mysql/
-        mysql_install_db --datadir=/var/lib/mysql --user=mysql
-        chown -R mysql:mysql /var/lib/mysql/
+if [ $DO_MYSQL_UPGRADE -eq 1 ]; then
+    # Scripts run via heat have no HOME variable set and this confuses
+    # mysqladmin
+    export HOME=/root
+
+    mkdir /var/lib/mysql || /bin/true
+    chown mysql:mysql /var/lib/mysql
+    chmod 0755 /var/lib/mysql
+    restorecon -R /var/lib/mysql/
+    mysql_install_db --datadir=/var/lib/mysql --user=mysql
+    chown -R mysql:mysql /var/lib/mysql/
+
+    if [ "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid)" = "$(facter hostname)" ]; then
         mysqld_safe --wsrep-new-cluster &
         # We have a populated /root/.my.cnf with root/password here so
         # we need to temporarily rename it because the newly created
@@ -182,6 +184,9 @@ fi
 
 # If we reached here without error we can safely blow away the origin
 # mysql dir from every controller
+
+# TODO: What if the upgrade fails on the bootstrap node, but not on
+# this controller. Data may be lost.
 if [ $DO_MYSQL_UPGRADE -eq 1 ]; then
     rm -r $MYSQL_TEMP_UPGRADE_BACKUP_DIR
 fi
@@ -199,3 +204,5 @@ crudini --set /etc/ceilometer/ceilometer.conf DEFAULT rpc_backend rabbit
 # https://bugzilla.redhat.com/show_bug.cgi?id=1284058
 # Ifd1861e3df46fad0e44ff9b5cbd58711bbc87c97 Swift Ceilometer middleware no longer exists
 crudini --set /etc/swift/proxy-server.conf pipeline:main pipeline "catch_errors healthcheck cache ratelimit tempurl formpost authtoken keystone staticweb proxy-logging proxy-server"
+# LP: 1615035, required only for M/N upgrade.
+crudini --set /etc/nova/nova.conf DEFAULT scheduler_host_manager host_manager
diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh
index 643ae57f..bc708cce 100755
--- a/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh
+++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh
@@ -53,6 +53,7 @@ if [ "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid)" = "$(facter hostname)
     keystone-manage db_sync
     neutron-db-manage --config-file /etc/neutron/neutron.conf --config-file /etc/neutron/plugin.ini upgrade head
     nova-manage db sync
+    nova-manage api_db sync
 
     pcs resource enable memcached
     check_resource memcached started 600
diff --git a/extraconfig/tasks/major_upgrade_pacemaker.yaml b/extraconfig/tasks/major_upgrade_pacemaker.yaml
index c70a954f..598d22d0 100644
--- a/extraconfig/tasks/major_upgrade_pacemaker.yaml
+++ b/extraconfig/tasks/major_upgrade_pacemaker.yaml
@@ -26,12 +26,42 @@ parameters:
     constraints:
     - allowed_values: ['auto', 'yes', 'no']
     default: 'auto'
+  IgnoreCephUpgradeWarnings:
+    type: boolean
+    default: false
+    description: If enabled, Ceph upgrade will be forced even though cluster or PGs status is not clean
 
 resources:
   # TODO(jistr): for Mitaka->Newton upgrades and further we can use
   # map_merge with input_values instead of feeding params into scripts
   # via str_replace on bash snippets
 
+  CephMonUpgradeConfig:
+    type: OS::Heat::SoftwareConfig
+    properties:
+      group: script
+      config:
+        list_join:
+        - ''
+        - - str_replace:
+              template: |
+                #!/bin/bash
+                ignore_ceph_upgrade_warnings='IGNORE_CEPH_UPGRADE_WARNINGS'
+              params:
+                IGNORE_CEPH_UPGRADE_WARNINGS: {get_param: IgnoreCephUpgradeWarnings}
+          - get_file: major_upgrade_ceph_mon.sh
+
+  CephMonUpgradeDeployment:
+    type: OS::Heat::SoftwareDeploymentGroup
+    properties:
+      servers: {get_param: controller_servers}
+      config: {get_resource: CephMonUpgradeConfig}
+      input_values: {get_param: input_values}
+      batch_create:
+        max_batch_size: 1
+      rolling_update:
+        max_batch_size: 1
+
   ControllerPacemakerUpgradeConfig_Step1:
     type: OS::Heat::SoftwareConfig
     properties:
@@ -57,6 +87,7 @@ resources:
 
   ControllerPacemakerUpgradeDeployment_Step1:
     type: OS::Heat::SoftwareDeploymentGroup
+    depends_on: CephMonUpgradeDeployment
     properties:
       servers: {get_param: controller_servers}
       config: {get_resource: ControllerPacemakerUpgradeConfig_Step1}
diff --git a/extraconfig/tasks/pacemaker_resource_restart.sh b/extraconfig/tasks/pacemaker_resource_restart.sh
index b2e5be16..fd1fd0dc 100755
--- a/extraconfig/tasks/pacemaker_resource_restart.sh
+++ b/extraconfig/tasks/pacemaker_resource_restart.sh
@@ -7,32 +7,23 @@ pacemaker_status=$(systemctl is-active pacemaker)
 
 # Run if pacemaker is running, we're the bootstrap node,
 # and we're updating the deployment (not creating).
if [ "$pacemaker_status" = "active" -a \ - "$(hiera bootstrap_nodeid)" = "$(facter hostname)" -a \ - "$(hiera stack_action)" = "UPDATE" ]; then - - #ensure neutron constraints like - #https://review.openstack.org/#/c/245093/ - if pcs constraint order show | grep "start neutron-server-clone then start neutron-ovs-cleanup-clone"; then - pcs constraint remove order-neutron-server-clone-neutron-ovs-cleanup-clone-mandatory - fi - - pcs resource disable httpd - check_resource httpd stopped 300 - pcs resource disable openstack-core - check_resource openstack-core stopped 1800 - - if pcs status | grep haproxy-clone; then - pcs resource restart haproxy-clone - fi - pcs resource restart redis-master - pcs resource restart mongod-clone - pcs resource restart rabbitmq-clone - pcs resource restart memcached-clone - pcs resource restart galera-master - - pcs resource enable openstack-core - check_resource openstack-core started 1800 - pcs resource enable httpd - check_resource httpd started 800 + "$(hiera bootstrap_nodeid)" = "$(facter hostname)" ]; then + + TIMEOUT=600 + SERVICES_TO_RESTART="$(ls /var/lib/tripleo/pacemaker-restarts)" + PCS_STATUS_OUTPUT="$(pcs status)" + + for service in $SERVICES_TO_RESTART; do + if ! echo "$PCS_STATUS_OUTPUT" | grep $service; then + echo "Service $service not found as a pacemaker resource, cannot restart it." + exit 1 + fi + done + + for service in $SERVICES_TO_RESTART; do + echo "Restarting $service..." + pcs resource restart --wait=$TIMEOUT $service + rm -f /var/lib/tripleo/pacemaker-restarts/$service + done fi |