From aa5d0120f2ec3965f58ad6b8deec342853e840d7 Mon Sep 17 00:00:00 2001 From: Derek Higgins Date: Thu, 3 Dec 2015 23:45:15 +0000 Subject: Add resources for major upgrade in Pacemaker scenario Add Heat software deployments to be used to upgrade major versions of OpenStack on the controller nodes. All controller services are taken down while the upgrade is in progress. The new updated yum repositories should be configured by another process e.g. the deployment artifacts transfer via Swift. Change-Id: Ia0a04e4a11d67e7a5acc53c1f8a8f01ed5ca8675 Co-Authored-By: Giulio Fidente Co-Authored-By: Jiri Stransky --- environments/major-upgrade-pacemaker.yaml | 3 ++ extraconfig/tasks/major_upgrade_pacemaker.yaml | 45 ++++++++++++++++ extraconfig/tasks/major_upgrade_pacemaker_1.sh | 58 +++++++++++++++++++++ extraconfig/tasks/major_upgrade_pacemaker_2.sh | 71 ++++++++++++++++++++++++++ 4 files changed, 177 insertions(+) create mode 100644 environments/major-upgrade-pacemaker.yaml create mode 100644 extraconfig/tasks/major_upgrade_pacemaker.yaml create mode 100755 extraconfig/tasks/major_upgrade_pacemaker_1.sh create mode 100755 extraconfig/tasks/major_upgrade_pacemaker_2.sh diff --git a/environments/major-upgrade-pacemaker.yaml b/environments/major-upgrade-pacemaker.yaml new file mode 100644 index 00000000..db078404 --- /dev/null +++ b/environments/major-upgrade-pacemaker.yaml @@ -0,0 +1,3 @@ +resource_registry: + OS::TripleO::Tasks::ControllerPrePuppet: ../extraconfig/tasks/major_upgrade_pacemaker.yaml + OS::TripleO::Tasks::ControllerPostPuppet: ../extraconfig/tasks/noop.yaml diff --git a/extraconfig/tasks/major_upgrade_pacemaker.yaml b/extraconfig/tasks/major_upgrade_pacemaker.yaml new file mode 100644 index 00000000..58d7b6d8 --- /dev/null +++ b/extraconfig/tasks/major_upgrade_pacemaker.yaml @@ -0,0 +1,45 @@ +heat_template_version: 2014-10-16 +description: 'Upgrade for Pacemaker deployments' + +parameters: + servers: + type: json + input_values: + type: json + description: input values for the software deployments + +resources: + ControllerPacemakerUpgradeConfig_Step1: + type: OS::Heat::SoftwareConfig + properties: + group: script + config: + list_join: + - '' + - - get_file: pacemaker_common_functions.sh + - get_file: major_upgrade_pacemaker_1.sh + + ControllerPacemakerUpgradeDeployment_Step1: + type: OS::Heat::SoftwareDeploymentGroup + properties: + servers: {get_param: servers} + config: {get_resource: ControllerPacemakerUpgradeConfig_Step1} + input_values: {get_param: input_values} + + ControllerPacemakerUpgradeConfig_Step2: + type: OS::Heat::SoftwareConfig + properties: + group: script + config: + list_join: + - '' + - - get_file: pacemaker_common_functions.sh + - get_file: major_upgrade_pacemaker_2.sh + + ControllerPacemakerUpgrade2Deployment_Step2: + type: OS::Heat::SoftwareDeploymentGroup + depends_on: ControllerPacemakerUpgradeDeployment_Step1 + properties: + servers: {get_param: servers} + config: {get_resource: ControllerPacemakerUpgradeConfig_Step2} + input_values: {get_param: input_values} diff --git a/extraconfig/tasks/major_upgrade_pacemaker_1.sh b/extraconfig/tasks/major_upgrade_pacemaker_1.sh new file mode 100755 index 00000000..bee9a939 --- /dev/null +++ b/extraconfig/tasks/major_upgrade_pacemaker_1.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +set -eu + +cluster_sync_timeout=600 + +if pcs status 2>&1 | grep -E '(cluster is not currently running)|(OFFLINE:)'; then + echo_error "ERROR: upgrade cannot start with some cluster nodes being offline" + exit 1 +fi + +if [ "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid)" = "$(facter hostname)" ]; then + pcs resource disable httpd + check_resource httpd stopped 1800 + if pcs status | grep openstack-keystone; then + pcs resource disable openstack-keystone + check_resource openstack-keystone stopped 1800 + fi + pcs resource disable redis + check_resource redis stopped 600 + pcs resource disable mongod + check_resource mongod stopped 600 + pcs resource disable rabbitmq + check_resource rabbitmq stopped 600 + pcs resource disable memcached + check_resource memcached stopped 600 + pcs resource disable galera + check_resource galera stopped 600 + pcs cluster stop --all +fi + +# Swift isn't controled by pacemaker +for S in openstack-swift-account-auditor openstack-swift-account-reaper openstack-swift-account-replicator openstack-swift-account \ +openstack-swift-container-auditor openstack-swift-container-replicator openstack-swift-container-updater openstack-swift-container \ +openstack-swift-object-auditor openstack-swift-object-replicator openstack-swift-object-updater openstack-swift-object openstack-swift-proxy; do + systemctl stop $S +done + +tstart=$(date +%s) +while systemctl is-active pacemaker; do + sleep 5 + tnow=$(date +%s) + if (( tnow-tstart > cluster_sync_timeout )) ; then + echo_error "ERROR: cluster shutdown timed out" + exit 1 + fi +done + +yum update -y + +# Pin messages sent to compute nodes to kilo, these will be upgraded later +crudini --set /etc/nova/nova.conf upgrade_levels compute liberty +# https://bugzilla.redhat.com/show_bug.cgi?id=1284047 +# Change-Id: Ib3f6c12ff5471e1f017f28b16b1e6496a4a4b435 +crudini --set /etc/ceilometer/ceilometer.conf DEFAULT rpc_backend rabbit +# https://bugzilla.redhat.com/show_bug.cgi?id=1284058 +# Ifd1861e3df46fad0e44ff9b5cbd58711bbc87c97 Swift Ceilometer middleware no longer exists +crudini --set /etc/swift/proxy-server.conf pipeline:main pipeline "catch_errors healthcheck cache ratelimit tempurl formpost authtoken keystone staticweb proxy-logging proxy-server" diff --git a/extraconfig/tasks/major_upgrade_pacemaker_2.sh b/extraconfig/tasks/major_upgrade_pacemaker_2.sh new file mode 100755 index 00000000..0b92a3bb --- /dev/null +++ b/extraconfig/tasks/major_upgrade_pacemaker_2.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +set -eu + +cluster_form_timeout=600 +cluster_settle_timeout=600 +galera_sync_timeout=600 + +if [ "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid)" = "$(facter hostname)" ]; then + pcs cluster start --all + + tstart=$(date +%s) + while pcs status 2>&1 | grep -E '(cluster is not currently running)|(OFFLINE:)'; do + sleep 5 + tnow=$(date +%s) + if (( tnow-tstart > cluster_form_timeout )) ; then + echo_error "ERROR: timed out forming the cluster" + exit 1 + fi + done + + if ! timeout -k 10 $cluster_settle_timeout crm_resource --wait; then + echo_error "ERROR: timed out waiting for cluster to finish transition" + exit 1 + fi + + pcs resource enable galera + check_resource galera started 600 + pcs resource enable mongod + check_resource mongod started 600 + + tstart=$(date +%s) + while ! clustercheck; do + sleep 5 + tnow=$(date +%s) + if (( tnow-tstart > galera_sync_timeout )) ; then + echo_error "ERROR galera sync timed out" + exit 1 + fi + done + + # Run all the db syncs + # TODO: check if this can be triggered in puppet and removed from here + ceilometer-dbsync --config-file=/etc/ceilometer/ceilometer.conf + cinder-manage db sync + glance-manage --config-file=/etc/glance/glance-registry.conf db_sync + heat-manage --config-file /etc/heat/heat.conf db_sync + keystone-manage db_sync + neutron-db-manage --config-file /etc/neutron/neutron.conf --config-file /etc/neutron/plugin.ini upgrade head + nova-manage db sync + + pcs resource enable memcached + check_resource memcached started 600 + pcs resource enable rabbitmq + check_resource rabbitmq started 600 + pcs resource enable redis + check_resource redis started 600 + if pcs status | grep openstack-keystone; then + pcs resource enable openstack-keystone + check_resource openstack-keystone started 1800 + fi + pcs resource enable httpd + check_resource httpd started 1800 +fi + +# Swift isn't controled by heat +for S in openstack-swift-account-auditor openstack-swift-account-reaper openstack-swift-account-replicator openstack-swift-account \ +openstack-swift-container-auditor openstack-swift-container-replicator openstack-swift-container-updater openstack-swift-container \ +openstack-swift-object-auditor openstack-swift-object-replicator openstack-swift-object-updater openstack-swift-object openstack-swift-proxy; do + systemctl start $S +done -- cgit 1.2.3-korg