Diffstat (limited to 'extraconfig/tasks')
-rw-r--r-- | extraconfig/tasks/noop.yaml                     |  10
-rwxr-xr-x | extraconfig/tasks/pacemaker_resource_restart.sh |  70
-rw-r--r-- | extraconfig/tasks/post_puppet_pacemaker.yaml    |  44
-rw-r--r-- | extraconfig/tasks/pre_puppet_pacemaker.yaml     |  30
-rwxr-xr-x | extraconfig/tasks/yum_update.sh                 | 117
5 files changed, 270 insertions, 1 deletions
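
Taken together, the new tasks bracket the Puppet run on the controllers during a stack update; only the restart script is limited to the bootstrap node. A rough sketch of the intended sequence, with the middle step inferred from the template names rather than shown in this diff:

    # pre_puppet_pacemaker.yaml: keep pacemaker from reacting to service restarts
    pcs property set maintenance-mode=true
    # ... Puppet applies the controller step configuration (not part of this change) ...
    # post_puppet_pacemaker.yaml: hand control back to pacemaker ...
    pcs property set maintenance-mode=false
    # ... then run pacemaker_resource_restart.sh, on the bootstrap node only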
diff --git a/extraconfig/tasks/noop.yaml b/extraconfig/tasks/noop.yaml
new file mode 100644
index 00000000..0cff7469
--- /dev/null
+++ b/extraconfig/tasks/noop.yaml
@@ -0,0 +1,10 @@
+heat_template_version: 2014-10-16
+description: 'No-op task'
+
+parameters:
+  servers:
+    type: json
+  input_values:
+    type: json
+    default: {}
+    description: input values for the software deployments
diff --git a/extraconfig/tasks/pacemaker_resource_restart.sh b/extraconfig/tasks/pacemaker_resource_restart.sh
new file mode 100755
index 00000000..12201097
--- /dev/null
+++ b/extraconfig/tasks/pacemaker_resource_restart.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+
+set -eux
+
+pacemaker_status=$(systemctl is-active pacemaker)
+check_interval=3
+
+function check_resource {
+
+  service=$1
+  state=$2
+  timeout=$3
+  tstart=$(date +%s)
+  tend=$(( $tstart + $timeout ))
+
+  if [ "$state" = "stopped" ]; then
+    match_for_incomplete='Started'
+  else # started
+    match_for_incomplete='Stopped'
+  fi
+
+  while (( $(date +%s) < $tend )); do
+    node_states=$(pcs status --full | grep "$service" | grep -v Clone)
+    if echo "$node_states" | grep -q "$match_for_incomplete"; then
+      echo "$service not yet $state, sleeping $check_interval seconds."
+      sleep $check_interval
+    else
+      echo "$service has $state"
+      timeout -k 10 $timeout crm_resource --wait
+      return
+    fi
+  done
+
+  echo "$service never $state after $timeout seconds" | tee /dev/fd/2
+  exit 1
+
+}
+
+# Run if pacemaker is running, we're the bootstrap node,
+# and we're updating the deployment (not creating).
+if [ "$pacemaker_status" = "active" -a \
+     "$(hiera bootstrap_nodeid)" = "$(facter hostname)" -a \
+     "$(hiera update_identifier)" != "nil" ]; then
+
+    #ensure neutron constraints like
+    #https://review.openstack.org/#/c/245093/
+    if pcs constraint order show | grep "start neutron-server-clone then start neutron-ovs-cleanup-clone"; then
+        pcs constraint remove order-neutron-server-clone-neutron-ovs-cleanup-clone-mandatory
+    fi
+
+    pcs resource disable httpd
+    check_resource httpd stopped 300
+    pcs resource disable openstack-keystone
+    check_resource openstack-keystone stopped 1800
+
+    if pcs status | grep haproxy-clone; then
+        pcs resource restart haproxy-clone
+    fi
+    pcs resource restart redis-master
+    pcs resource restart mongod-clone
+    pcs resource restart rabbitmq-clone
+    pcs resource restart memcached-clone
+    pcs resource restart galera-master
+
+    pcs resource enable openstack-keystone
+    check_resource openstack-keystone started 1800
+    pcs resource enable httpd
+    check_resource httpd started 800
+
+fi
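
In check_resource above, a stop is treated as complete only once no node still reports "Started" for the clone (and a start, once nothing reports "Stopped"); only then does the script wait for pacemaker's transition queue to drain. A hedged, one-shot version of that probe, handy when watching a restart by hand (the resource name and timeout are illustrative, not taken from the change):

    # One iteration of the loop check_resource runs for "httpd stopped 300":
    if pcs status --full | grep httpd | grep -v Clone | grep -q Started; then
        echo "httpd not yet stopped on every node"
    else
        timeout -k 10 300 crm_resource --wait   # no node reports Started; wait for the cluster to go idle
    fi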
diff --git a/extraconfig/tasks/post_puppet_pacemaker.yaml b/extraconfig/tasks/post_puppet_pacemaker.yaml
new file mode 100644
index 00000000..7de41d94
--- /dev/null
+++ b/extraconfig/tasks/post_puppet_pacemaker.yaml
@@ -0,0 +1,44 @@
+heat_template_version: 2014-10-16
+description: 'Post-Puppet Config for Pacemaker deployments'
+
+parameters:
+  servers:
+    type: json
+  input_values:
+    type: json
+    description: input values for the software deployments
+
+resources:
+
+  ControllerPostPuppetMaintenanceModeConfig:
+    type: OS::Heat::SoftwareConfig
+    properties:
+      group: script
+      config: |
+        #!/bin/bash
+        pacemaker_status=$(systemctl is-active pacemaker)
+
+        if [ "$pacemaker_status" = "active" ]; then
+            pcs property set maintenance-mode=false
+        fi
+
+  ControllerPostPuppetMaintenanceModeDeployment:
+    type: OS::Heat::SoftwareDeployments
+    properties:
+      servers: {get_param: servers}
+      config: {get_resource: ControllerPostPuppetMaintenanceModeConfig}
+      input_values: {get_param: input_values}
+
+  ControllerPostPuppetRestartConfig:
+    type: OS::Heat::SoftwareConfig
+    properties:
+      group: script
+      config: {get_file: pacemaker_resource_restart.sh}
+
+  ControllerPostPuppetRestartDeployment:
+    type: OS::Heat::SoftwareDeployments
+    depends_on: ControllerPostPuppetMaintenanceModeDeployment
+    properties:
+      servers: {get_param: servers}
+      config: {get_resource: ControllerPostPuppetRestartConfig}
+      input_values: {get_param: input_values}
diff --git a/extraconfig/tasks/pre_puppet_pacemaker.yaml b/extraconfig/tasks/pre_puppet_pacemaker.yaml
new file mode 100644
index 00000000..2cfe92a7
--- /dev/null
+++ b/extraconfig/tasks/pre_puppet_pacemaker.yaml
@@ -0,0 +1,30 @@
+heat_template_version: 2014-10-16
+description: 'Pre-Puppet Config for Pacemaker deployments'
+
+parameters:
+  servers:
+    type: json
+  input_values:
+    type: json
+    description: input values for the software deployments
+
+resources:
+
+  ControllerPrePuppetMaintenanceModeConfig:
+    type: OS::Heat::SoftwareConfig
+    properties:
+      group: script
+      config: |
+        #!/bin/bash
+        pacemaker_status=$(systemctl is-active pacemaker)
+
+        if [ "$pacemaker_status" = "active" ]; then
+            pcs property set maintenance-mode=true
+        fi
+
+  ControllerPrePuppetMaintenanceModeDeployment:
+    type: OS::Heat::SoftwareDeployments
+    properties:
+      servers: {get_param: servers}
+      config: {get_resource: ControllerPrePuppetMaintenanceModeConfig}
+      input_values: {get_param: input_values}
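
Both maintenance-mode templates gate on systemctl is-active pacemaker and then only flip the cluster-wide maintenance-mode property, so Puppet can restart services on the controllers without pacemaker reacting to the resulting failures. A hedged sketch of inspecting and reverting that property by hand if an update is interrupted between the pre and post hooks (pcs property show is standard pcs usage, not part of this change):

    # Is the cluster still in maintenance mode?
    pcs property show maintenance-mode
    # If so, hand control back to pacemaker once configuration is finished:
    pcs property set maintenance-mode=false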
diff --git a/extraconfig/tasks/yum_update.sh b/extraconfig/tasks/yum_update.sh
index 9125ca07..c6313d9d 100755
--- a/extraconfig/tasks/yum_update.sh
+++ b/extraconfig/tasks/yum_update.sh
@@ -22,7 +22,9 @@ mkdir -p $timestamp_dir
 update_identifier=${update_identifier//[^a-zA-Z0-9-_]/}
 
 # seconds to wait for this node to rejoin the cluster after update
-cluster_start_timeout=360
+cluster_start_timeout=600
+galera_sync_timeout=360
+cluster_settle_timeout=1800
 
 timestamp_file="$timestamp_dir/$update_identifier"
 if [[ -a "$timestamp_file" ]]; then
@@ -41,8 +43,97 @@ if [[ "$list_updates" == "" ]]; then
 fi
 
 pacemaker_status=$(systemctl is-active pacemaker)
+pacemaker_dumpfile=$(mktemp)
 
 if [[ "$pacemaker_status" == "active" ]] ; then
+SERVICES="memcached
+httpd
+neutron-dhcp-agent
+neutron-l3-agent
+neutron-metadata-agent
+neutron-openvswitch-agent
+neutron-server
+openstack-ceilometer-alarm-evaluator
+openstack-ceilometer-alarm-notifier
+openstack-ceilometer-api
+openstack-ceilometer-central
+openstack-ceilometer-collector
+openstack-ceilometer-notification
+openstack-cinder-api
+openstack-cinder-scheduler
+openstack-cinder-volume
+openstack-glance-api
+openstack-glance-registry
+openstack-heat-api
+openstack-heat-api-cfn
+openstack-heat-api-cloudwatch
+openstack-heat-engine
+openstack-keystone
+openstack-nova-api
+openstack-nova-conductor
+openstack-nova-consoleauth
+openstack-nova-novncproxy
+openstack-nova-scheduler"
+
+    echo "Dumping Pacemaker config"
+    pcs cluster cib $pacemaker_dumpfile
+
+    echo "Checking for missing constraints"
+
+    if ! pcs constraint order show | grep "start openstack-nova-novncproxy-clone then start openstack-nova-api-clone"; then
+        pcs -f $pacemaker_dumpfile constraint order start openstack-nova-novncproxy-clone then openstack-nova-api-clone
+    fi
+
+    if ! pcs constraint order show | grep "start rabbitmq-clone then start openstack-keystone-clone"; then
+        pcs -f $pacemaker_dumpfile constraint order start rabbitmq-clone then openstack-keystone-clone
+    fi
+
+    if ! pcs constraint order show | grep "promote galera-master then start openstack-keystone-clone"; then
+        pcs -f $pacemaker_dumpfile constraint order promote galera-master then openstack-keystone-clone
+    fi
+
+    if pcs resource | grep "haproxy-clone"; then
+        SERVICES="$SERVICES haproxy"
+        if ! pcs constraint order show | grep "start haproxy-clone then start openstack-keystone-clone"; then
+            pcs -f $pacemaker_dumpfile constraint order start haproxy-clone then openstack-keystone-clone
+        fi
+    fi
+
+    if ! pcs constraint order show | grep "start memcached-clone then start openstack-keystone-clone"; then
+        pcs -f $pacemaker_dumpfile constraint order start memcached-clone then openstack-keystone-clone
+    fi
+
+    if ! pcs constraint order show | grep "promote redis-master then start openstack-ceilometer-central-clone"; then
+        pcs -f $pacemaker_dumpfile constraint order promote redis-master then start openstack-ceilometer-central-clone require-all=false
+    fi
+
+    # ensure neutron constraints https://review.openstack.org/#/c/229466
+    # remove ovs-cleanup after server and add openvswitch-agent instead
+    if pcs constraint order show | grep "start neutron-server-clone then start neutron-ovs-cleanup-clone"; then
+        pcs -f $pacemaker_dumpfile constraint remove order-neutron-server-clone-neutron-ovs-cleanup-clone-mandatory
+    fi
+    if ! pcs constraint order show | grep "start neutron-server-clone then start neutron-openvswitch-agent-clone"; then
+        pcs -f $pacemaker_dumpfile constraint order start neutron-server-clone then neutron-openvswitch-agent-clone
+    fi
+
+
+    if ! pcs resource defaults | grep "resource-stickiness: INFINITY"; then
+        pcs -f $pacemaker_dumpfile resource defaults resource-stickiness=INFINITY
+    fi
+
+    echo "Setting resource start/stop timeouts"
+    for service in $SERVICES; do
+        pcs -f $pacemaker_dumpfile resource update $service op start timeout=200s op stop timeout=200s
+    done
+    # mongod start timeout is higher, setting only stop timeout
+    pcs -f $pacemaker_dumpfile resource update mongod op start timeout=370s op stop timeout=200s
+
+    echo "Applying new Pacemaker config"
+    if ! pcs cluster cib-push $pacemaker_dumpfile; then
+        echo "ERROR failed to apply new pacemaker config"
+        exit 1
+    fi
+
     echo "Pacemaker running, stopping cluster node and doing full package update"
     node_count=$(pcs status xml | grep -o "<nodes_configured.*/>" | grep -o 'number="[0-9]*"' | grep -o "[0-9]*")
     if [[ "$node_count" == "1" ]] ; then
@@ -51,6 +142,13 @@ if [[ "$pacemaker_status" == "active" ]] ; then
     else
         pcs cluster stop
     fi
+
+    # clean leftover keepalived and radvd instances from neutron
+    # (can be removed when we remove neutron-netns-cleanup from cluster services)
+    # see https://review.gerrithub.io/#/c/248931/1/neutron-netns-cleanup.init
+    killall neutron-keepalived-state-change 2>/dev/null || :
+    kill $(ps ax | grep -e "keepalived.*\.pid-vrrp" | awk '{print $1}') 2>/dev/null || :
+    kill $(ps ax | grep -e "radvd.*\.pid\.radvd" | awk '{print $1}') 2>/dev/null || :
 else
     echo "Excluding upgrading packages that are handled by config management tooling"
     command_arguments="$command_arguments --skip-broken"
@@ -83,6 +181,23 @@ if [[ "$pacemaker_status" == "active" ]] ; then
             exit 1
         fi
     done
+
+    tstart=$(date +%s)
+    while ! clustercheck; do
+        sleep 5
+        tnow=$(date +%s)
+        if (( tnow-tstart > galera_sync_timeout )) ; then
+            echo "ERROR galera sync timed out"
+            exit 1
+        fi
+    done
+
+    echo "Waiting for pacemaker cluster to settle"
+    if ! timeout -k 10 $cluster_settle_timeout crm_resource --wait; then
+        echo "ERROR timed out while waiting for the cluster to settle"
+        exit 1
+    fi
+
+    pcs status
 else
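
The yum_update.sh change batches all of the constraint and timeout edits against an offline copy of the CIB and pushes the result back in one step, rather than mutating the live cluster repeatedly. A minimal sketch of that pattern in isolation (the constraint shown is one of those the script adds; error handling trimmed):

    #!/bin/bash
    set -eu
    cib=$(mktemp)
    pcs cluster cib "$cib"    # dump the current cluster configuration to a scratch file

    # Queue the ordering constraint only if it is not already present.
    if ! pcs constraint order show | grep -q "start rabbitmq-clone then start openstack-keystone-clone"; then
        pcs -f "$cib" constraint order start rabbitmq-clone then openstack-keystone-clone
    fi

    pcs cluster cib-push "$cib"    # apply every queued change in a single transition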