diff options
Diffstat (limited to 'extraconfig')
27 files changed, 1318 insertions, 616 deletions
diff --git a/extraconfig/all_nodes/mac_hostname.yaml b/extraconfig/all_nodes/mac_hostname.j2.yaml index 7d8704e3..75ffc9e6 100644 --- a/extraconfig/all_nodes/mac_hostname.yaml +++ b/extraconfig/all_nodes/mac_hostname.j2.yaml @@ -9,15 +9,7 @@ description: > # out-of-tree templates they may require additional parameters if the # in-tree templates add a new role. parameters: - controller_servers: - type: json - compute_servers: - type: json - blockstorage_servers: - type: json - objectstorage_servers: - type: json - cephstorage_servers: + servers: type: json # Note extra parameters can be defined, then passed data via the # environment parameter_defaults, without modifying the parent template @@ -37,47 +29,17 @@ resources: # FIXME(shardy): Long term it'd be better if Heat SoftwareDeployments accepted # list instead of a map, then we could join the lists of servers into one # deployment instead of requiring one deployment per-role. - CollectMacDeploymentsController: +{% for role in roles %} + CollectMacDeployments{{role.name}}: type: OS::Heat::SoftwareDeployments properties: name: CollectMacDeploymentsController - servers: {get_param: controller_servers} - config: {get_resource: CollectMacConfig} - actions: ['CREATE'] # Only do this on CREATE - - CollectMacDeploymentsCompute: - type: OS::Heat::SoftwareDeployments - properties: - name: CollectMacDeploymentsCompute - servers: {get_param: compute_servers} - config: {get_resource: CollectMacConfig} - actions: ['CREATE'] # Only do this on CREATE - - CollectMacDeploymentsBlockStorage: - type: OS::Heat::SoftwareDeployments - properties: - name: CollectMacDeploymentsBlockStorage - servers: {get_param: blockstorage_servers} - config: {get_resource: CollectMacConfig} - actions: ['CREATE'] # Only do this on CREATE - - CollectMacDeploymentsObjectStorage: - type: OS::Heat::SoftwareDeployments - properties: - name: CollectMacDeploymentsObjectStorage - servers: {get_param: objectstorage_servers} - config: {get_resource: CollectMacConfig} - actions: ['CREATE'] # Only do this on CREATE - - CollectMacDeploymentsCephStorage: - type: OS::Heat::SoftwareDeployments - properties: - name: CollectMacDeploymentsCephStorage - servers: {get_param: cephstorage_servers} + servers: {get_param: [servers, {{role.name}}]} config: {get_resource: CollectMacConfig} actions: ['CREATE'] # Only do this on CREATE +{% endfor %} - # Now we distribute all-the-macs to all nodes + # Now we distribute all-the-macs to all Controller nodes DistributeMacConfig: type: OS::Heat::SoftwareConfig properties: @@ -101,7 +63,7 @@ resources: type: OS::Heat::SoftwareDeployments properties: name: DistributeMacDeploymentsController - servers: {get_param: controller_servers} + servers: {get_param: [servers, Controller]} config: {get_resource: DistributeMacConfig} input_values: # FIXME(shardy): It'd be more convenient if we could join these diff --git a/extraconfig/all_nodes/random_string.yaml b/extraconfig/all_nodes/random_string.j2.yaml index d38701e2..9ce2ca8a 100644 --- a/extraconfig/all_nodes/random_string.yaml +++ b/extraconfig/all_nodes/random_string.j2.yaml @@ -10,15 +10,7 @@ description: > # out-of-tree templates they may require additional parameters if the # in-tree templates add a new role. parameters: - controller_servers: - type: json - compute_servers: - type: json - blockstorage_servers: - type: json - objectstorage_servers: - type: json - cephstorage_servers: + servers: type: json # Note extra parameters can be defined, then passed data via the # environment parameter_defaults, without modifying the parent template @@ -42,7 +34,7 @@ resources: type: OS::Heat::SoftwareDeployments properties: name: RandomDeploymentsController - servers: {get_param: controller_servers} + servers: {get_param: [servers, Controller]} config: {get_resource: RandomConfig} actions: ['CREATE'] # Only do this on CREATE input_values: @@ -52,7 +44,7 @@ resources: type: OS::Heat::SoftwareDeployments properties: name: RandomDeploymentsCompute - servers: {get_param: compute_servers} + servers: {get_param: [servers, Compute]} config: {get_resource: RandomConfig} actions: ['CREATE'] # Only do this on CREATE input_values: diff --git a/extraconfig/all_nodes/swap-partition.j2.yaml b/extraconfig/all_nodes/swap-partition.j2.yaml new file mode 100644 index 00000000..36076b0c --- /dev/null +++ b/extraconfig/all_nodes/swap-partition.j2.yaml @@ -0,0 +1,44 @@ +heat_template_version: 2014-10-16 + +description: > + Extra config to add swap space to nodes. + +# Parameters passed from the parent template - note if you maintain +# out-of-tree templates they may require additional parameters if the +# in-tree templates add a new role. +parameters: + servers: + type: json + swap_partition_label: + type: string + description: Swap partition label + default: 'swap1' + + +resources: + + SwapConfig: + type: OS::Heat::SoftwareConfig + properties: + group: script + config: | + #!/bin/bash + set -eux + swap_partition=$(realpath /dev/disk/by-label/$swap_partition_label) + swapon $swap_partition + echo "$swap_partition swap swap defaults 0 0" >> /etc/fstab + inputs: + - name: swap_partition_label + description: Swap partition label + default: 'swap1' + +{% for role in roles %} + {{role.name}}SwapDeployment: + type: OS::Heat::SoftwareDeploymentGroup + properties: + config: {get_resource: SwapConfig} + servers: {get_param: [servers, {{role.name}}]} + input_values: + swap_partition_label: {get_param: swap_partition_label} + actions: ["CREATE"] +{% endfor %} diff --git a/extraconfig/all_nodes/swap-partition.yaml b/extraconfig/all_nodes/swap-partition.yaml deleted file mode 100644 index e6fa9eca..00000000 --- a/extraconfig/all_nodes/swap-partition.yaml +++ /dev/null @@ -1,86 +0,0 @@ -heat_template_version: 2014-10-16 - -description: > - Extra config to add swap space to nodes. - -# Parameters passed from the parent template - note if you maintain -# out-of-tree templates they may require additional parameters if the -# in-tree templates add a new role. -parameters: - controller_servers: - type: json - compute_servers: - type: json - blockstorage_servers: - type: json - objectstorage_servers: - type: json - cephstorage_servers: - type: json - swap_partition_label: - type: string - description: Swap partition label - default: 'swap1' - - -resources: - - SwapConfig: - type: OS::Heat::SoftwareConfig - properties: - group: script - config: | - #!/bin/bash - set -eux - swap_partition=$(realpath /dev/disk/by-label/$swap_partition_label) - swapon $swap_partition - echo "$swap_partition swap swap defaults 0 0" >> /etc/fstab - inputs: - - name: swap_partition_label - description: Swap partition label - default: 'swap1' - - ControllerSwapDeployment: - type: OS::Heat::SoftwareDeploymentGroup - properties: - config: {get_resource: SwapConfig} - servers: {get_param: controller_servers} - input_values: - swap_partition_label: {get_param: swap_partition_label} - actions: ["CREATE"] - - ComputeSwapDeployment: - type: OS::Heat::SoftwareDeploymentGroup - properties: - config: {get_resource: SwapConfig} - servers: {get_param: compute_servers} - input_values: - swap_partition_label: {get_param: swap_partition_label} - actions: ["CREATE"] - - BlockStorageSwapDeployment: - type: OS::Heat::SoftwareDeploymentGroup - properties: - config: {get_resource: SwapConfig} - servers: {get_param: blockstorage_servers} - input_values: - swap_partition_label: {get_param: swap_partition_label} - actions: ["CREATE"] - - ObjectStorageSwapDeployment: - type: OS::Heat::SoftwareDeploymentGroup - properties: - config: {get_resource: SwapConfig} - servers: {get_param: objectstorage_servers} - input_values: - swap_partition_label: {get_param: swap_partition_label} - actions: ["CREATE"] - - CephStorageSwapDeployment: - type: OS::Heat::SoftwareDeploymentGroup - properties: - config: {get_resource: SwapConfig} - servers: {get_param: cephstorage_servers} - input_values: - swap_partition_label: {get_param: swap_partition_label} - actions: ["CREATE"] diff --git a/extraconfig/all_nodes/swap.j2.yaml b/extraconfig/all_nodes/swap.j2.yaml new file mode 100644 index 00000000..ce65dacb --- /dev/null +++ b/extraconfig/all_nodes/swap.j2.yaml @@ -0,0 +1,58 @@ +heat_template_version: 2014-10-16 + +description: > + Extra config to add swap space to nodes. + +# Parameters passed from the parent template - note if you maintain +# out-of-tree templates they may require additional parameters if the +# in-tree templates add a new role. +parameters: + servers: + type: json + swap_size_megabytes: + type: string + description: Amount of swap space to allocate in megabytes + default: '4096' + swap_path: + type: string + description: Full path to location of swap file + default: '/swap' + + +resources: + + SwapConfig: + type: OS::Heat::SoftwareConfig + properties: + group: script + config: | + #!/bin/bash + set -eux + if [ ! -f $swap_path ]; then + dd if=/dev/zero of=$swap_path count=$swap_size_megabytes bs=1M + chmod 0600 $swap_path + mkswap $swap_path + swapon $swap_path + else + echo "$swap_path already exists" + fi + echo "$swap_path swap swap defaults 0 0" >> /etc/fstab + inputs: + - name: swap_size_megabytes + description: Amount of swap space to allocate in megabytes + default: '4096' + - name: swap_path + description: Full path to location of swap file + default: '/swap' + +{% for role in roles %} + {{role.name}}SwapDeployment: + type: OS::Heat::SoftwareDeploymentGroup + properties: + config: {get_resource: SwapConfig} + servers: {get_param: [servers, {{role.name}}]} + input_values: + swap_size_megabytes: {get_param: swap_size_megabytes} + swap_path: {get_param: swap_path} + actions: ["CREATE"] +{% endfor %} diff --git a/extraconfig/all_nodes/swap.yaml b/extraconfig/all_nodes/swap.yaml deleted file mode 100644 index 5383ffc9..00000000 --- a/extraconfig/all_nodes/swap.yaml +++ /dev/null @@ -1,104 +0,0 @@ -heat_template_version: 2014-10-16 - -description: > - Extra config to add swap space to nodes. - -# Parameters passed from the parent template - note if you maintain -# out-of-tree templates they may require additional parameters if the -# in-tree templates add a new role. -parameters: - controller_servers: - type: json - compute_servers: - type: json - blockstorage_servers: - type: json - objectstorage_servers: - type: json - cephstorage_servers: - type: json - swap_size_megabytes: - type: string - description: Amount of swap space to allocate in megabytes - default: '4096' - swap_path: - type: string - description: Full path to location of swap file - default: '/swap' - - -resources: - - SwapConfig: - type: OS::Heat::SoftwareConfig - properties: - group: script - config: | - #!/bin/bash - set -eux - if [ ! -f $swap_path ]; then - dd if=/dev/zero of=$swap_path count=$swap_size_megabytes bs=1M - chmod 0600 $swap_path - mkswap $swap_path - swapon $swap_path - else - echo "$swap_path already exists" - fi - echo "$swap_path swap swap defaults 0 0" >> /etc/fstab - inputs: - - name: swap_size_megabytes - description: Amount of swap space to allocate in megabytes - default: '4096' - - name: swap_path - description: Full path to location of swap file - default: '/swap' - - ControllerSwapDeployment: - type: OS::Heat::SoftwareDeploymentGroup - properties: - config: {get_resource: SwapConfig} - servers: {get_param: controller_servers} - input_values: - swap_size_megabytes: {get_param: swap_size_megabytes} - swap_path: {get_param: swap_path} - actions: ["CREATE"] - - ComputeSwapDeployment: - type: OS::Heat::SoftwareDeploymentGroup - properties: - config: {get_resource: SwapConfig} - servers: {get_param: compute_servers} - input_values: - swap_size_megabytes: {get_param: swap_size_megabytes} - swap_path: {get_param: swap_path} - actions: ["CREATE"] - - BlockStorageSwapDeployment: - type: OS::Heat::SoftwareDeploymentGroup - properties: - config: {get_resource: SwapConfig} - servers: {get_param: blockstorage_servers} - input_values: - swap_size_megabytes: {get_param: swap_size_megabytes} - swap_path: {get_param: swap_path} - actions: ["CREATE"] - - ObjectStorageSwapDeployment: - type: OS::Heat::SoftwareDeploymentGroup - properties: - config: {get_resource: SwapConfig} - servers: {get_param: objectstorage_servers} - input_values: - swap_size_megabytes: {get_param: swap_size_megabytes} - swap_path: {get_param: swap_path} - actions: ["CREATE"] - - CephStorageSwapDeployment: - type: OS::Heat::SoftwareDeploymentGroup - properties: - config: {get_resource: SwapConfig} - servers: {get_param: cephstorage_servers} - input_values: - swap_size_megabytes: {get_param: swap_size_megabytes} - swap_path: {get_param: swap_path} - actions: ["CREATE"] diff --git a/extraconfig/tasks/major_upgrade_block_storage.sh b/extraconfig/tasks/major_upgrade_block_storage.sh index 07666245..39861826 100644 --- a/extraconfig/tasks/major_upgrade_block_storage.sh +++ b/extraconfig/tasks/major_upgrade_block_storage.sh @@ -4,5 +4,19 @@ # set -eu +# Special-case OVS for https://bugs.launchpad.net/tripleo/+bug/1635205 +if [[ -n $(rpm -q --scripts openvswitch | awk '/postuninstall/,/*/' | grep "systemctl.*try-restart") ]]; then + echo "Manual upgrade of openvswitch - restart in postun detected" + mkdir OVS_UPGRADE || true + pushd OVS_UPGRADE + echo "Attempting to downloading latest openvswitch with yumdownloader" + yumdownloader --resolve openvswitch + echo "Updating openvswitch with nopostun option" + rpm -U --replacepkgs --nopostun ./*.rpm + popd +else + echo "Skipping manual upgrade of openvswitch - no restart in postun detected" +fi + yum -y install python-zaqarclient # needed for os-collect-config yum -y -q update diff --git a/extraconfig/tasks/major_upgrade_ceilometer_wsgi_mitaka_newton.yaml b/extraconfig/tasks/major_upgrade_ceilometer_wsgi_mitaka_newton.yaml new file mode 100644 index 00000000..c87e6824 --- /dev/null +++ b/extraconfig/tasks/major_upgrade_ceilometer_wsgi_mitaka_newton.yaml @@ -0,0 +1,62 @@ +heat_template_version: 2014-10-16 + +description: > + Software-config for ceilometer configuration under httpd during upgrades + +parameters: + servers: + type: json + input_values: + type: json + description: input values for the software deployments +resources: + CeilometerWsgiMitakaNewtonPreUpgradeConfig: + type: OS::Heat::SoftwareConfig + properties: + group: puppet + config: + get_file: mitaka_to_newton_ceilometer_wsgi_upgrade.pp + + CeilometerWsgiMitakaNewtonUpgradeConfig: + type: OS::Heat::SoftwareConfig + properties: + group: script + config: + list_join: + - '' + - - "#!/bin/bash\n\nset -e\n\n" + - get_file: pacemaker_common_functions.sh + - get_file: major_upgrade_pacemaker_migrations.sh + - "disable_standalone_ceilometer_api\n\n" + + CeilometerWsgiMitakaNewtonPostUpgradeConfig: + type: OS::Heat::SoftwareConfig + properties: + group: script + config: | + #!/bin/bash + set -e + /usr/bin/systemctl reload httpd + + CeilometerWsgiMitakaNewtonPreUpgradeDeployment: + type: OS::Heat::SoftwareDeploymentGroup + properties: + name: CeilometerWsgiMitakaNewtonPreUpgradeDeployment + servers: {get_param: [servers, Controller]} + config: {get_resource: CeilometerWsgiMitakaNewtonPreUpgradeConfig} + + CeilometerWsgiMitakaNewtonUpgradeConfigDeployment: + type: OS::Heat::SoftwareDeploymentGroup + depends_on: CeilometerWsgiMitakaNewtonPreUpgradeDeployment + properties: + name: CeilometerWsgiMitakaNewtonUpgradeConfigDeployment + servers: {get_param: [servers, Controller]} + config: {get_resource: CeilometerWsgiMitakaNewtonUpgradeConfig} + + CeilometerWsgiMitakaNewtonPostUpgradeDeployment: + type: OS::Heat::SoftwareDeploymentGroup + depends_on: CeilometerWsgiMitakaNewtonUpgradeConfigDeployment + properties: + name: CeilometerWsgiMitakaNewtonPostUpgradeDeployment + servers: {get_param: [servers, Controller]} + config: {get_resource: CeilometerWsgiMitakaNewtonPostUpgradeConfig} diff --git a/extraconfig/tasks/major_upgrade_ceph_mon.sh b/extraconfig/tasks/major_upgrade_ceph_mon.sh index b76dd7c3..e0d160f1 100755 --- a/extraconfig/tasks/major_upgrade_ceph_mon.sh +++ b/extraconfig/tasks/major_upgrade_ceph_mon.sh @@ -5,7 +5,7 @@ set -o pipefail echo INFO: starting $(basename "$0") # Exit if not running -if ! pidof ceph-mon; then +if ! pidof ceph-mon &> /dev/null; then echo INFO: ceph-mon is not running, skipping exit 0 fi @@ -18,13 +18,13 @@ if ! [[ "$INSTALLED_VERSION" =~ ^0\.94.* ]]; then fi CEPH_STATUS=$(ceph health | awk '{print $1}') -if [ ${CEPH_STATUS} = HEALTH_ERR ]; do +if [ ${CEPH_STATUS} = HEALTH_ERR ]; then echo ERROR: Ceph cluster status is HEALTH_ERR, cannot be upgraded exit 1 fi # Useful when upgrading with OSDs num < replica size -if [ $ignore_ceph_upgrade_warnings != "true" ]; then +if [[ ${ignore_ceph_upgrade_warnings:-False} != [Tt]rue ]]; then timeout 300 bash -c "while [ ${CEPH_STATUS} != HEALTH_OK ]; do echo WARNING: Waiting for Ceph cluster status to go HEALTH_OK; sleep 30; @@ -44,7 +44,7 @@ timeout 60 bash -c "while kill -0 ${MON_PID} 2> /dev/null; do done" # Update to Jewel -yum -y -q update ceph-mon +yum -y -q update ceph-mon ceph # Restart/Exit if not on Jewel, only in that case we need the changes UPDATED_VERSION=$(ceph --version | awk '{print $3}') @@ -54,7 +54,7 @@ if [[ "$UPDATED_VERSION" =~ ^0\.94.* ]]; then elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then # RPM could own some of these but we can't take risks on the pre-existing files for d in /var/lib/ceph/mon /var/log/ceph /var/run/ceph /etc/ceph; do - chown -R ceph:ceph $d + chown -L -R ceph:ceph $d || echo WARNING: chown of $d failed done # Replay udev events with newer rules @@ -71,6 +71,10 @@ elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then sleep 10; done" + # if tunables become legacy, cluster status will be HEALTH_WARN causing + # upgrade to fail on following node + ceph osd crush tunables default + echo INFO: Ceph was upgraded to Jewel else echo ERROR: Ceph was upgraded to an unknown release, daemon is stopped, need manual intervention diff --git a/extraconfig/tasks/major_upgrade_ceph_storage.sh b/extraconfig/tasks/major_upgrade_ceph_storage.sh index 03a1c1c2..d84cad45 100644 --- a/extraconfig/tasks/major_upgrade_ceph_storage.sh +++ b/extraconfig/tasks/major_upgrade_ceph_storage.sh @@ -18,7 +18,7 @@ set -eu echo INFO: starting $(basename "$0") # Exit if not running -if ! pidof ceph-osd; then +if ! pidof ceph-osd &> /dev/null; then echo INFO: ceph-osd is not running, skipping exit 0 fi @@ -49,6 +49,20 @@ timeout 60 bash -c "while kill -0 ${OSD_PIDS} 2> /dev/null; do sleep 2; done" +# Special-case OVS for https://bugs.launchpad.net/tripleo/+bug/1635205 +if [[ -n $(rpm -q --scripts openvswitch | awk '/postuninstall/,/*/' | grep "systemctl.*try-restart") ]]; then + echo "Manual upgrade of openvswitch - restart in postun detected" + mkdir OVS_UPGRADE || true + pushd OVS_UPGRADE + echo "Attempting to downloading latest openvswitch with yumdownloader" + yumdownloader --resolve openvswitch + echo "Updating openvswitch with nopostun option" + rpm -U --replacepkgs --nopostun ./*.rpm + popd +else + echo "Skipping manual upgrade of openvswitch - no restart in postun detected" +fi + # Update (Ceph to Jewel) yum -y install python-zaqarclient # needed for os-collect-config yum -y update @@ -63,12 +77,22 @@ if [[ "$UPDATED_VERSION" =~ ^0\.94.* ]]; then elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then # RPM could own some of these but we can't take risks on the pre-existing files for d in /var/lib/ceph/osd /var/log/ceph /var/run/ceph /etc/ceph; do - chown -R ceph:ceph $d + chown -L -R ceph:ceph $d || echo WARNING: chown of $d failed done # Replay udev events with newer rules udevadm trigger && udevadm settle + # If on ext4, we need to enforce lower values for name and namespace len + # or ceph-osd will refuse to start, see: http://tracker.ceph.com/issues/16187 + for OSD_ID in $OSD_IDS; do + OSD_FS=$(findmnt -n -o FSTYPE -T /var/lib/ceph/osd/ceph-${OSD_ID}) + if [ ${OSD_FS} = ext4 ]; then + crudini --set /etc/ceph/ceph.conf global osd_max_object_name_len 256 + crudini --set /etc/ceph/ceph.conf global osd_max_object_namespace_len 64 + fi + done + # Enable systemd unit systemctl enable ceph-osd.target for OSD_ID in $OSD_IDS; do diff --git a/extraconfig/tasks/major_upgrade_check.sh b/extraconfig/tasks/major_upgrade_check.sh new file mode 100755 index 00000000..8bdff5e7 --- /dev/null +++ b/extraconfig/tasks/major_upgrade_check.sh @@ -0,0 +1,109 @@ +#!/bin/bash + +set -eu + +check_cluster() +{ + if pcs status 2>&1 | grep -E '(cluster is not currently running)|(OFFLINE:)'; then + echo_error "ERROR: upgrade cannot start with some cluster nodes being offline" + exit 1 + fi +} + +check_pcsd() +{ + if pcs status 2>&1 | grep -E 'Offline'; then + echo_error "ERROR: upgrade cannot start with some pcsd daemon offline" + exit 1 + fi +} + +mysql_need_update() +{ + # Shall we upgrade mysql data directory during the stack upgrade? + if [ "$mariadb_do_major_upgrade" = "auto" ]; then + ret=$(is_mysql_upgrade_needed) + if [ $ret = "1" ]; then + DO_MYSQL_UPGRADE=1 + else + DO_MYSQL_UPGRADE=0 + fi + echo "mysql upgrade required: $DO_MYSQL_UPGRADE" + elif [ "$mariadb_do_major_upgrade" = "no" ]; then + DO_MYSQL_UPGRADE=0 + else + DO_MYSQL_UPGRADE=1 + fi +} + +check_disk_for_mysql_dump() +{ + # Where to backup current database if mysql need to be upgraded + MYSQL_BACKUP_DIR=/var/tmp/mysql_upgrade_osp + MYSQL_TEMP_UPGRADE_BACKUP_DIR=/var/lib/mysql-temp-upgrade-backup + # Spare disk ratio for extra safety + MYSQL_BACKUP_SIZE_RATIO=1.2 + + mysql_need_update + + if [ "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid)" = "$(facter hostname)" ]; then + if [ $DO_MYSQL_UPGRADE -eq 1 ]; then + + if [ -d "$MYSQL_BACKUP_DIR" ]; then + echo_error "Error: $MYSQL_BACKUP_DIR exists already. Likely an upgrade failed previously" + exit 1 + fi + mkdir "$MYSQL_BACKUP_DIR" + if [ $? -ne 0 ]; then + echo_error "Error: could not create temporary backup directory $MYSQL_BACKUP_DIR" + exit 1 + fi + + # the /root/.my.cnf is needed because we set the mysql root + # password from liberty onwards + backup_flags="--defaults-extra-file=/root/.my.cnf -u root --flush-privileges --all-databases --single-transaction" + # While not ideal, this step allows us to calculate exactly how much space the dump + # will need. Our main goal here is avoiding any chance of corruption due to disk space + # exhaustion + backup_size=$(mysqldump $backup_flags 2>/dev/null | wc -c) + database_size=$(du -cb /var/lib/mysql | tail -1 | awk '{ print $1 }') + free_space=$(df -B1 --output=avail "$MYSQL_BACKUP_DIR" | tail -1) + + # we need at least space for a new mysql database + dump of the existing one, + # times a small factor for additional safety room + # note: bash doesn't do floating point math or floats in if statements, + # so use python to apply the ratio and cast it back to integer + required_space=$(python -c "from __future__ import print_function; print(\"%d\" % int((($database_size + $backup_size) * $MYSQL_BACKUP_SIZE_RATIO)))") + if [ $required_space -ge $free_space ]; then + echo_error "Error: not enough free space in $MYSQL_BACKUP_DIR ($required_space bytes required)" + exit 1 + fi + fi + fi +} + +check_python_rpm() +{ + # If for some reason rpm-python are missing we want to error out early enough + if ! rpm -q rpm-python &> /dev/null; then + echo_error "ERROR: upgrade cannot start without rpm-python installed" + exit 1 + fi +} + +check_clean_cluster() +{ + if pcs status | grep -q Stopped:; then + echo_error "ERROR: upgrade cannot start with stopped resources on the cluster. Make sure that all the resources are up and running." + exit 1 + fi +} + +check_galera_root_password() +{ + # BZ: 1357112 + if [ ! -e /root/.my.cnf ]; then + echo_error "ERROR: upgrade cannot be started, the galera password is missing. The overcloud needs update." + exit 1 + fi +} diff --git a/extraconfig/tasks/major_upgrade_compute.sh b/extraconfig/tasks/major_upgrade_compute.sh index 78628c8c..b0d42806 100644 --- a/extraconfig/tasks/major_upgrade_compute.sh +++ b/extraconfig/tasks/major_upgrade_compute.sh @@ -12,13 +12,33 @@ cat > $UPGRADE_SCRIPT << ENDOFCAT ### This file is automatically delivered to the compute nodes as part of the ### tripleo upgrades workflow +set -eu + # pin nova to kilo (messaging +-1) for the nova-compute service crudini --set /etc/nova/nova.conf upgrade_levels compute $upgrade_level_nova_compute +# Special-case OVS for https://bugs.launchpad.net/tripleo/+bug/1635205 +if [[ -n \$(rpm -q --scripts openvswitch | awk '/postuninstall/,/*/' | grep "systemctl.*try-restart") ]]; then + echo "Manual upgrade of openvswitch - restart in postun detected" + mkdir OVS_UPGRADE || true + pushd OVS_UPGRADE + echo "Attempting to downloading latest openvswitch with yumdownloader" + yumdownloader --resolve openvswitch + echo "Updating openvswitch with nopostun option" + rpm -U --replacepkgs --nopostun ./*.rpm + popd +else + echo "Skipping manual upgrade of openvswitch - no restart in postun detected" +fi + yum -y install python-zaqarclient # needed for os-collect-config yum -y update +# Due to bug#1640177 we need to restart compute agent +echo "Restarting openstack ceilometer agent compute" +systemctl restart openstack-ceilometer-compute + ENDOFCAT # ensure the permissions are OK diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh index 0b702630..739c01dd 100755 --- a/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh +++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh @@ -2,207 +2,35 @@ set -eu -cluster_sync_timeout=1800 - -if pcs status 2>&1 | grep -E '(cluster is not currently running)|(OFFLINE:)'; then - echo_error "ERROR: upgrade cannot start with some cluster nodes being offline" - exit 1 +check_cluster +check_pcsd +if [[ -n $(is_bootstrap_node) ]]; then + check_clean_cluster fi - +check_python_rpm +check_galera_root_password +check_disk_for_mysql_dump # We want to disable fencing during the cluster --stop as it might fence # nodes where a service fails to stop, which could be fatal during an upgrade # procedure. So we remember the stonith state. If it was enabled we reenable it # at the end of this script -STONITH_STATE=$(pcs property show stonith-enabled | grep "stonith-enabled" | awk '{ print $2 }') -pcs property set stonith-enabled=false - -# If for some reason rpm-python are missing we want to error out early enough -if ! rpm -q rpm-python &> /dev/null; then - echo_error "ERROR: upgrade cannot start without rpm-python installed" - exit 1 -fi - -# In case the mysql package is updated, the database on disk must be -# upgraded as well. This typically needs to happen during major -# version upgrades (e.g. 5.5 -> 5.6, 5.5 -> 10.1...) -# -# Because in-place upgrades are not supported across 2+ major versions -# (e.g. 5.5 -> 10.1), we rely on logical upgrades via dump/restore cycle -# https://bugzilla.redhat.com/show_bug.cgi?id=1341968 -# -# The default is to determine automatically if upgrade is needed based -# on mysql package versionning, but this can be overriden manually -# to support specific upgrade scenario - -# Where to backup current database if mysql need to be upgraded -MYSQL_BACKUP_DIR=/var/tmp/mysql_upgrade_osp -MYSQL_TEMP_UPGRADE_BACKUP_DIR=/var/lib/mysql-temp-upgrade-backup -# Spare disk ratio for extra safety -MYSQL_BACKUP_SIZE_RATIO=1.2 - -# Shall we upgrade mysql data directory during the stack upgrade? -if [ "$mariadb_do_major_upgrade" = "auto" ]; then - ret=$(is_mysql_upgrade_needed) - if [ $ret = "1" ]; then - DO_MYSQL_UPGRADE=1 - else - DO_MYSQL_UPGRADE=0 - fi - echo "mysql upgrade required: $DO_MYSQL_UPGRADE" -elif [ "$mariadb_do_major_upgrade" = "no" ]; then - DO_MYSQL_UPGRADE=0 -else - DO_MYSQL_UPGRADE=1 -fi - -if [ "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid)" = "$(facter hostname)" ]; then - if [ $DO_MYSQL_UPGRADE -eq 1 ]; then - if [ -d "$MYSQL_BACKUP_DIR" ]; then - echo_error "Error: $MYSQL_BACKUP_DIR exists already. Likely an upgrade failed previously" - exit 1 - fi - mkdir "$MYSQL_BACKUP_DIR" - if [ $? -ne 0 ]; then - echo_error "Error: could not create temporary backup directory $MYSQL_BACKUP_DIR" - exit 1 - fi - - # the /root/.my.cnf is needed because we set the mysql root - # password from liberty onwards - backup_flags="--defaults-extra-file=/root/.my.cnf -u root --flush-privileges --all-databases --single-transaction" - # While not ideal, this step allows us to calculate exactly how much space the dump - # will need. Our main goal here is avoiding any chance of corruption due to disk space - # exhaustion - backup_size=$(mysqldump $backup_flags 2>/dev/null | wc -c) - database_size=$(du -cb /var/lib/mysql | tail -1 | awk '{ print $1 }') - free_space=$(df -B1 --output=avail "$MYSQL_BACKUP_DIR" | tail -1) - - # we need at least space for a new mysql database + dump of the existing one, - # times a small factor for additional safety room - # note: bash doesn't do floating point math or floats in if statements, - # so use python to apply the ratio and cast it back to integer - required_space=$(python -c "from __future__ import print_function; print(\"%d\" % int((($database_size + $backup_size) * $MYSQL_BACKUP_SIZE_RATIO)))") - if [ $required_space -ge $free_space ]; then - echo_error "Error: not enough free space in $MYSQL_BACKUP_DIR ($required_space bytes required)" - exit 1 - fi - - mysqldump $backup_flags > "$MYSQL_BACKUP_DIR/openstack_database.sql" - cp -rdp /etc/my.cnf* "$MYSQL_BACKUP_DIR" - fi - - pcs resource disable httpd - check_resource httpd stopped 1800 - pcs resource disable openstack-core - check_resource openstack-core stopped 1800 - pcs resource disable redis - check_resource redis stopped 600 - pcs resource disable mongod - check_resource mongod stopped 600 - pcs resource disable rabbitmq - check_resource rabbitmq stopped 600 - pcs resource disable memcached - check_resource memcached stopped 600 - pcs resource disable galera - check_resource galera stopped 600 - # Disable all VIPs before stopping the cluster, so that pcs doesn't use one as a source address: - # https://bugzilla.redhat.com/show_bug.cgi?id=1330688 - for vip in $(pcs resource show | grep ocf::heartbeat:IPaddr2 | grep Started | awk '{ print $1 }'); do - pcs resource disable $vip - check_resource $vip stopped 60 - done - pcs cluster stop --all -fi - -# Swift isn't controled by pacemaker -systemctl_swift stop - -tstart=$(date +%s) -while systemctl is-active pacemaker; do - sleep 5 - tnow=$(date +%s) - if (( tnow-tstart > cluster_sync_timeout )) ; then - echo_error "ERROR: cluster shutdown timed out" - exit 1 - fi -done - -# The reason we do an sql dump *and* we move the old dir out of -# the way is because it gives us an extra level of safety in case -# something goes wrong during the upgrade. Once the restore is -# successful we go ahead and remove it. If the directory exists -# we bail out as it means the upgrade process had issues in the last -# run. -if [ $DO_MYSQL_UPGRADE -eq 1 ]; then - if [ -d $MYSQL_TEMP_UPGRADE_BACKUP_DIR ]; then - echo_error "ERROR: mysql backup dir already exist" - exit 1 +if [[ -n $(is_bootstrap_node) ]]; then + STONITH_STATE=$(pcs property show stonith-enabled | grep "stonith-enabled" | awk '{ print $2 }') + # We create this empty file if stonith was set to true so we can reenable stonith in step2 + rm -f /var/tmp/stonith-true + if [ $STONITH_STATE == "true" ]; then + touch /var/tmp/stonith-true fi - mv /var/lib/mysql $MYSQL_TEMP_UPGRADE_BACKUP_DIR -fi - -yum -y install python-zaqarclient # needed for os-collect-config -yum -y -q update - -# We need to ensure at least those two configuration settings, otherwise -# mariadb 10.1+ won't activate galera replication. -# wsrep_cluster_address must only be set though, its value does not -# matter because it's overriden by the galera resource agent. -cat >> /etc/my.cnf.d/galera.cnf <<EOF -[mysqld] -wsrep_on = ON -wsrep_cluster_address = gcomm://localhost -EOF - -if [ $DO_MYSQL_UPGRADE -eq 1 ]; then - # Scripts run via heat have no HOME variable set and this confuses - # mysqladmin - export HOME=/root - - mkdir /var/lib/mysql || /bin/true - chown mysql:mysql /var/lib/mysql - chmod 0755 /var/lib/mysql - restorecon -R /var/lib/mysql/ - mysql_install_db --datadir=/var/lib/mysql --user=mysql - chown -R mysql:mysql /var/lib/mysql/ - - if [ "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid)" = "$(facter hostname)" ]; then - mysqld_safe --wsrep-new-cluster & - # We have a populated /root/.my.cnf with root/password here so - # we need to temporarily rename it because the newly created - # db is empty and no root password is set - mv /root/.my.cnf /root/.my.cnf.temporary - timeout 60 sh -c 'while ! mysql -e "" &> /dev/null; do sleep 1; done' - mysql -u root < "$MYSQL_BACKUP_DIR/openstack_database.sql" - mv /root/.my.cnf.temporary /root/.my.cnf - mysqladmin -u root shutdown - # The import was successful so we may remove the folder - rm -r "$MYSQL_BACKUP_DIR" - fi -fi - -# If we reached here without error we can safely blow away the origin -# mysql dir from every controller - -# TODO: What if the upgrade fails on the bootstrap node, but not on -# this controller. Data may be lost. -if [ $DO_MYSQL_UPGRADE -eq 1 ]; then - rm -r $MYSQL_TEMP_UPGRADE_BACKUP_DIR + pcs property set stonith-enabled=false fi -# Let's reset the stonith back to true if it was true, before starting the cluster -if [ $STONITH_STATE == "true" ]; then - pcs -f /var/lib/pacemaker/cib/cib.xml property set stonith-enabled=true +# Migrate to HA NG and fix up rabbitmq queues +# We fix up the rabbitmq ha queues after the migration because it will +# restart the rabbitmq resource. Doing it after the migration means no other +# services will be restart as there are no other constraints +if [[ -n $(is_bootstrap_node) ]]; then + migrate_full_to_ng_ha + rabbitmq_mitaka_newton_upgrade fi -# Pin messages sent to compute nodes to kilo, these will be upgraded later -crudini --set /etc/nova/nova.conf upgrade_levels compute "$upgrade_level_nova_compute" -# https://bugzilla.redhat.com/show_bug.cgi?id=1284047 -# Change-Id: Ib3f6c12ff5471e1f017f28b16b1e6496a4a4b435 -crudini --set /etc/ceilometer/ceilometer.conf DEFAULT rpc_backend rabbit -# https://bugzilla.redhat.com/show_bug.cgi?id=1284058 -# Ifd1861e3df46fad0e44ff9b5cbd58711bbc87c97 Swift Ceilometer middleware no longer exists -crudini --set /etc/swift/proxy-server.conf pipeline:main pipeline "catch_errors healthcheck cache ratelimit tempurl formpost authtoken keystone staticweb proxy-logging proxy-server" -# LP: 1615035, required only for M/N upgrade. -crudini --set /etc/nova/nova.conf DEFAULT scheduler_host_manager host_manager diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh index bc708cce..7cc6735f 100755 --- a/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh +++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh @@ -2,70 +2,186 @@ set -eu -cluster_form_timeout=600 -cluster_settle_timeout=600 -galera_sync_timeout=600 +cluster_sync_timeout=1800 -if [ "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid)" = "$(facter hostname)" ]; then - pcs cluster start --all +# After migrating the cluster to HA-NG the services not under pacemaker's control +# are still up and running. We need to stop them explicitely otherwise during the yum +# upgrade the rpm %post sections will try to do a systemctl try-restart <service>, which +# is going to take a long time because rabbit is down. By having the service stopped +# systemctl try-restart is a noop +for service in $(services_to_migrate); do + manage_systemd_service stop "${service%%-clone}" + # So the reason for not reusing check_resource_systemd is that + # I have observed systemctl is-active returning unknown with at least + # one service that was stopped (See LP 1627254) + timeout=600 tstart=$(date +%s) - while pcs status 2>&1 | grep -E '(cluster is not currently running)|(OFFLINE:)'; do - sleep 5 - tnow=$(date +%s) - if (( tnow-tstart > cluster_form_timeout )) ; then - echo_error "ERROR: timed out forming the cluster" - exit 1 - fi + tend=$(( $tstart + $timeout )) + check_interval=3 + while (( $(date +%s) < $tend )); do + if [[ "$(systemctl is-active ${service%%-clone})" = "active" ]]; then + echo "$service still active, sleeping $check_interval seconds." + sleep $check_interval + else + # we do not care if it is inactive, unknown or failed as long as it is + # not running + break + fi + done +done - if ! timeout -k 10 $cluster_settle_timeout crm_resource --wait; then - echo_error "ERROR: timed out waiting for cluster to finish transition" - exit 1 +# In case the mysql package is updated, the database on disk must be +# upgraded as well. This typically needs to happen during major +# version upgrades (e.g. 5.5 -> 5.6, 5.5 -> 10.1...) +# +# Because in-place upgrades are not supported across 2+ major versions +# (e.g. 5.5 -> 10.1), we rely on logical upgrades via dump/restore cycle +# https://bugzilla.redhat.com/show_bug.cgi?id=1341968 +# +# The default is to determine automatically if upgrade is needed based +# on mysql package versionning, but this can be overriden manually +# to support specific upgrade scenario + +# Calling this function will set the DO_MYSQL_UPGRADE variable which is used +# later +mysql_need_update + +if [[ -n $(is_bootstrap_node) ]]; then + if [ $DO_MYSQL_UPGRADE -eq 1 ]; then + mysqldump $backup_flags > "$MYSQL_BACKUP_DIR/openstack_database.sql" + cp -rdp /etc/my.cnf* "$MYSQL_BACKUP_DIR" fi - for vip in $(pcs resource show | grep ocf::heartbeat:IPaddr2 | grep Stopped | awk '{ print $1 }'); do - pcs resource enable $vip - check_resource $vip started 60 + pcs resource disable redis + check_resource redis stopped 600 + pcs resource disable rabbitmq + check_resource rabbitmq stopped 600 + pcs resource disable galera + check_resource galera stopped 600 + pcs resource disable openstack-cinder-volume + check_resource openstack-cinder-volume stopped 600 + # Disable all VIPs before stopping the cluster, so that pcs doesn't use one as a source address: + # https://bugzilla.redhat.com/show_bug.cgi?id=1330688 + for vip in $(pcs resource show | grep ocf::heartbeat:IPaddr2 | grep Started | awk '{ print $1 }'); do + pcs resource disable $vip + check_resource $vip stopped 60 done + pcs cluster stop --all +fi - pcs resource enable galera - check_resource galera started 600 - pcs resource enable mongod - check_resource mongod started 600 - tstart=$(date +%s) - while ! clustercheck; do - sleep 5 - tnow=$(date +%s) - if (( tnow-tstart > galera_sync_timeout )) ; then - echo_error "ERROR galera sync timed out" - exit 1 - fi - done +# Swift isn't controlled by pacemaker +systemctl_swift stop - # Run all the db syncs - # TODO: check if this can be triggered in puppet and removed from here - ceilometer-dbsync --config-file=/etc/ceilometer/ceilometer.conf - cinder-manage db sync - glance-manage --config-file=/etc/glance/glance-registry.conf db_sync - heat-manage --config-file /etc/heat/heat.conf db_sync - keystone-manage db_sync - neutron-db-manage --config-file /etc/neutron/neutron.conf --config-file /etc/neutron/plugin.ini upgrade head - nova-manage db sync - nova-manage api_db sync - - pcs resource enable memcached - check_resource memcached started 600 - pcs resource enable rabbitmq - check_resource rabbitmq started 600 - pcs resource enable redis - check_resource redis started 600 - pcs resource enable openstack-core - check_resource openstack-core started 1800 - pcs resource enable httpd - check_resource httpd started 1800 +tstart=$(date +%s) +while systemctl is-active pacemaker; do + sleep 5 + tnow=$(date +%s) + if (( tnow-tstart > cluster_sync_timeout )) ; then + echo_error "ERROR: cluster shutdown timed out" + exit 1 + fi +done + +# The reason we do an sql dump *and* we move the old dir out of +# the way is because it gives us an extra level of safety in case +# something goes wrong during the upgrade. Once the restore is +# successful we go ahead and remove it. If the directory exists +# we bail out as it means the upgrade process had issues in the last +# run. +if [ $DO_MYSQL_UPGRADE -eq 1 ]; then + if [ -d $MYSQL_TEMP_UPGRADE_BACKUP_DIR ]; then + echo_error "ERROR: mysql backup dir already exist" + exit 1 + fi + mv /var/lib/mysql $MYSQL_TEMP_UPGRADE_BACKUP_DIR fi -# Swift isn't controled by heat -systemctl_swift start +# Special-case OVS for https://bugs.launchpad.net/tripleo/+bug/1635205 +if [[ -n $(rpm -q --scripts openvswitch | awk '/postuninstall/,/*/' | grep "systemctl.*try-restart") ]]; then + echo "Manual upgrade of openvswitch - restart in postun detected" + mkdir OVS_UPGRADE || true + pushd OVS_UPGRADE + echo "Attempting to downloading latest openvswitch with yumdownloader" + yumdownloader --resolve openvswitch + echo "Updating openvswitch with nopostun option" + rpm -U --replacepkgs --nopostun ./*.rpm + popd +else + echo "Skipping manual upgrade of openvswitch - no restart in postun detected" +fi + +yum -y install python-zaqarclient # needed for os-collect-config +yum -y -q update + +# We need to ensure at least those two configuration settings, otherwise +# mariadb 10.1+ won't activate galera replication. +# wsrep_cluster_address must only be set though, its value does not +# matter because it's overriden by the galera resource agent. +cat >> /etc/my.cnf.d/galera.cnf <<EOF +[mysqld] +wsrep_on = ON +wsrep_cluster_address = gcomm://localhost +EOF + +if [ $DO_MYSQL_UPGRADE -eq 1 ]; then + # Scripts run via heat have no HOME variable set and this confuses + # mysqladmin + export HOME=/root + + mkdir /var/lib/mysql || /bin/true + chown mysql:mysql /var/lib/mysql + chmod 0755 /var/lib/mysql + restorecon -R /var/lib/mysql/ + mysql_install_db --datadir=/var/lib/mysql --user=mysql + chown -R mysql:mysql /var/lib/mysql/ + + if [ "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid)" = "$(facter hostname)" ]; then + mysqld_safe --wsrep-new-cluster & + # We have a populated /root/.my.cnf with root/password here so + # we need to temporarily rename it because the newly created + # db is empty and no root password is set + mv /root/.my.cnf /root/.my.cnf.temporary + timeout 60 sh -c 'while ! mysql -e "" &> /dev/null; do sleep 1; done' + mysql -u root < "$MYSQL_BACKUP_DIR/openstack_database.sql" + mv /root/.my.cnf.temporary /root/.my.cnf + mysqladmin -u root shutdown + # The import was successful so we may remove the folder + rm -r "$MYSQL_BACKUP_DIR" + fi +fi + +# If we reached here without error we can safely blow away the origin +# mysql dir from every controller + +# TODO: What if the upgrade fails on the bootstrap node, but not on +# this controller. Data may be lost. +if [ $DO_MYSQL_UPGRADE -eq 1 ]; then + rm -r $MYSQL_TEMP_UPGRADE_BACKUP_DIR +fi + +# Let's reset the stonith back to true if it was true, before starting the cluster +if [[ -n $(is_bootstrap_node) ]]; then + if [ -f /var/tmp/stonith-true ]; then + pcs -f /var/lib/pacemaker/cib/cib.xml property set stonith-enabled=true + fi + rm -f /var/tmp/stonith-true +fi + +# Pin messages sent to compute nodes to kilo, these will be upgraded later +crudini --set /etc/nova/nova.conf upgrade_levels compute "$upgrade_level_nova_compute" +# https://bugzilla.redhat.com/show_bug.cgi?id=1284047 +# Change-Id: Ib3f6c12ff5471e1f017f28b16b1e6496a4a4b435 +crudini --set /etc/ceilometer/ceilometer.conf DEFAULT rpc_backend rabbit +# https://bugzilla.redhat.com/show_bug.cgi?id=1284058 +# Ifd1861e3df46fad0e44ff9b5cbd58711bbc87c97 Swift Ceilometer middleware no longer exists +crudini --set /etc/swift/proxy-server.conf pipeline:main pipeline "catch_errors healthcheck cache ratelimit tempurl formpost authtoken keystone staticweb proxy-logging proxy-server" +# LP: 1615035, required only for M/N upgrade. +crudini --set /etc/nova/nova.conf DEFAULT scheduler_host_manager host_manager +# LP: 1627450, required only for M/N upgrade +crudini --set /etc/nova/nova.conf DEFAULT scheduler_driver filter_scheduler + +crudini --set /etc/sahara/sahara.conf DEFAULT plugins ambari,cdh,mapr,vanilla,spark,storm + diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_3.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_3.sh new file mode 100755 index 00000000..37061512 --- /dev/null +++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_3.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +set -eu + +cluster_form_timeout=600 +cluster_settle_timeout=1800 +galera_sync_timeout=600 + +if [[ -n $(is_bootstrap_node) ]]; then + pcs cluster start --all + + tstart=$(date +%s) + while pcs status 2>&1 | grep -E '(cluster is not currently running)|(OFFLINE:)'; do + sleep 5 + tnow=$(date +%s) + if (( tnow-tstart > cluster_form_timeout )) ; then + echo_error "ERROR: timed out forming the cluster" + exit 1 + fi + done + + if ! timeout -k 10 $cluster_settle_timeout crm_resource --wait; then + echo_error "ERROR: timed out waiting for cluster to finish transition" + exit 1 + fi + + for vip in $(pcs resource show | grep ocf::heartbeat:IPaddr2 | grep Stopped | awk '{ print $1 }'); do + pcs resource enable $vip + check_resource_pacemaker $vip started 60 + done +fi + +start_or_enable_service galera +check_resource galera started 600 +start_or_enable_service redis +check_resource redis started 600 +# We need mongod which is now a systemd service up and running before calling +# ceilometer-dbsync. There is still a race here: mongod might not be up on all nodes +# so ceilometer-dbsync will fail a couple of times before that. As it retries indefinitely +# we should be good. +# Due to LP Bug https://bugs.launchpad.net/tripleo/+bug/1627254 am using systemctl directly atm +systemctl start mongod +check_resource mongod started 600 + +if [[ -n $(is_bootstrap_node) ]]; then + tstart=$(date +%s) + while ! clustercheck; do + sleep 5 + tnow=$(date +%s) + if (( tnow-tstart > galera_sync_timeout )) ; then + echo_error "ERROR galera sync timed out" + exit 1 + fi + done + + # Run all the db syncs + # TODO: check if this can be triggered in puppet and removed from here + ceilometer-dbsync --config-file=/etc/ceilometer/ceilometer.conf + cinder-manage db sync + glance-manage --config-file=/etc/glance/glance-registry.conf db_sync + heat-manage --config-file /etc/heat/heat.conf db_sync + keystone-manage db_sync + neutron-db-manage --config-file /etc/neutron/neutron.conf --config-file /etc/neutron/plugin.ini upgrade head + nova-manage db sync + nova-manage api_db sync + nova-manage db online_data_migrations + sahara-db-manage --config-file /etc/sahara/sahara.conf upgrade head +fi diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_4.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_4.sh new file mode 100755 index 00000000..d2cb9553 --- /dev/null +++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_4.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +set -eu + +start_or_enable_service rabbitmq +check_resource rabbitmq started 600 +start_or_enable_service redis +check_resource redis started 600 +start_or_enable_service openstack-cinder-volume +check_resource openstack-cinder-volume started 600 + +# start httpd so keystone is available for gnocchi +# upgrade to run. +systemctl start httpd + +# Swift isn't controled by pacemaker +systemctl_swift start diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_5.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_5.sh new file mode 100755 index 00000000..fa95f1f8 --- /dev/null +++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_5.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -eu + +if [[ -n $(is_bootstrap_node) ]]; then + # run gnocchi upgrade + gnocchi-upgrade +fi diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_6.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_6.sh new file mode 100755 index 00000000..d569084d --- /dev/null +++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_6.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +set -eu + +# We need to start the systemd services we explicitely stopped at step _1.sh +# FIXME: Should we let puppet during the convergence step do the service enabling or +# should we add it here? +services=$(services_to_migrate) +if [[ ${keep_sahara_services_on_upgrade} =~ [Ff]alse ]] ; then + services=${services%%openstack-sahara*} +fi +for service in $services; do + manage_systemd_service start "${service%%-clone}" + check_resource_systemd "${service%%-clone}" started 600 +done diff --git a/extraconfig/tasks/major_upgrade_object_storage.sh b/extraconfig/tasks/major_upgrade_object_storage.sh index 931f4f42..2667bb16 100644 --- a/extraconfig/tasks/major_upgrade_object_storage.sh +++ b/extraconfig/tasks/major_upgrade_object_storage.sh @@ -12,6 +12,7 @@ cat > $UPGRADE_SCRIPT << ENDOFCAT ### This file is automatically delivered to the swift-storage nodes as part of the ### tripleo upgrades workflow +set -eu function systemctl_swift { action=\$1 @@ -22,6 +23,19 @@ function systemctl_swift { done } +# Special-case OVS for https://bugs.launchpad.net/tripleo/+bug/1635205 +if [[ -n \$(rpm -q --scripts openvswitch | awk '/postuninstall/,/*/' | grep "systemctl.*try-restart") ]]; then + echo "Manual upgrade of openvswitch - restart in postun detected" + mkdir OVS_UPGRADE || true + pushd OVS_UPGRADE + echo "Attempting to downloading latest openvswitch with yumdownloader" + yumdownloader --resolve openvswitch + echo "Updating openvswitch with nopostun option" + rpm -U --replacepkgs --nopostun ./*.rpm + popd +else + echo "Skipping manual upgrade of openvswitch - no restart in postun detected" +fi systemctl_swift stop diff --git a/extraconfig/tasks/major_upgrade_pacemaker.yaml b/extraconfig/tasks/major_upgrade_pacemaker.yaml index 598d22d0..b0418a56 100644 --- a/extraconfig/tasks/major_upgrade_pacemaker.yaml +++ b/extraconfig/tasks/major_upgrade_pacemaker.yaml @@ -1,16 +1,8 @@ -heat_template_version: 2014-10-16 +heat_template_version: 2016-10-14 description: 'Upgrade for Pacemaker deployments' parameters: - controller_servers: - type: json - compute_servers: - type: json - blockstorage_servers: - type: json - objectstorage_servers: - type: json - cephstorage_servers: + servers: type: json input_values: type: json @@ -30,6 +22,11 @@ parameters: type: boolean default: false description: If enabled, Ceph upgrade will be forced even though cluster or PGs status is not clean + KeepSaharaServicesOnUpgrade: + type: boolean + default: true + description: Whether to keep Sahara services when upgrading controller nodes from mitaka to newton + resources: # TODO(jistr): for Mitaka->Newton upgrades and further we can use @@ -54,9 +51,10 @@ resources: CephMonUpgradeDeployment: type: OS::Heat::SoftwareDeploymentGroup properties: - servers: {get_param: controller_servers} + servers: {get_param: [servers, Controller]} config: {get_resource: CephMonUpgradeConfig} input_values: {get_param: input_values} + update_policy: batch_create: max_batch_size: 1 rolling_update: @@ -82,6 +80,7 @@ resources: params: MYSQL_MAJOR_UPGRADE: {get_param: MySqlMajorUpgrade} - get_file: pacemaker_common_functions.sh + - get_file: major_upgrade_check.sh - get_file: major_upgrade_pacemaker_migrations.sh - get_file: major_upgrade_controller_pacemaker_1.sh @@ -89,7 +88,7 @@ resources: type: OS::Heat::SoftwareDeploymentGroup depends_on: CephMonUpgradeDeployment properties: - servers: {get_param: controller_servers} + servers: {get_param: [servers, Controller]} config: {get_resource: ControllerPacemakerUpgradeConfig_Step1} input_values: {get_param: input_values} @@ -103,7 +102,7 @@ resources: BlockStorageUpgradeDeployment: type: OS::Heat::SoftwareDeploymentGroup properties: - servers: {get_param: blockstorage_servers} + servers: {get_param: [servers, BlockStorage]} config: {get_resource: BlockStorageUpgradeConfig} input_values: {get_param: input_values} @@ -114,7 +113,20 @@ resources: config: list_join: - '' - - - get_file: pacemaker_common_functions.sh + - - str_replace: + template: | + #!/bin/bash + upgrade_level_nova_compute='UPGRADE_LEVEL_NOVA_COMPUTE' + params: + UPGRADE_LEVEL_NOVA_COMPUTE: {get_param: UpgradeLevelNovaCompute} + - str_replace: + template: | + #!/bin/bash + mariadb_do_major_upgrade='MYSQL_MAJOR_UPGRADE' + params: + MYSQL_MAJOR_UPGRADE: {get_param: MySqlMajorUpgrade} + - get_file: pacemaker_common_functions.sh + - get_file: major_upgrade_check.sh - get_file: major_upgrade_pacemaker_migrations.sh - get_file: major_upgrade_controller_pacemaker_2.sh @@ -122,7 +134,88 @@ resources: type: OS::Heat::SoftwareDeploymentGroup depends_on: BlockStorageUpgradeDeployment properties: - servers: {get_param: controller_servers} + servers: {get_param: [servers, Controller]} config: {get_resource: ControllerPacemakerUpgradeConfig_Step2} input_values: {get_param: input_values} + ControllerPacemakerUpgradeConfig_Step3: + type: OS::Heat::SoftwareConfig + properties: + group: script + config: + list_join: + - '' + - - get_file: pacemaker_common_functions.sh + - get_file: major_upgrade_pacemaker_migrations.sh + - get_file: major_upgrade_controller_pacemaker_3.sh + + ControllerPacemakerUpgradeDeployment_Step3: + type: OS::Heat::SoftwareDeploymentGroup + depends_on: ControllerPacemakerUpgradeDeployment_Step2 + properties: + servers: {get_param: [servers, Controller]} + config: {get_resource: ControllerPacemakerUpgradeConfig_Step3} + input_values: {get_param: input_values} + + ControllerPacemakerUpgradeConfig_Step4: + type: OS::Heat::SoftwareConfig + properties: + group: script + config: + list_join: + - '' + - - get_file: pacemaker_common_functions.sh + - get_file: major_upgrade_pacemaker_migrations.sh + - get_file: major_upgrade_controller_pacemaker_4.sh + + ControllerPacemakerUpgradeDeployment_Step4: + type: OS::Heat::SoftwareDeploymentGroup + depends_on: ControllerPacemakerUpgradeDeployment_Step3 + properties: + servers: {get_param: [servers, Controller]} + config: {get_resource: ControllerPacemakerUpgradeConfig_Step4} + input_values: {get_param: input_values} + + ControllerPacemakerUpgradeConfig_Step5: + type: OS::Heat::SoftwareConfig + properties: + group: script + config: + list_join: + - '' + - - get_file: pacemaker_common_functions.sh + - get_file: major_upgrade_pacemaker_migrations.sh + - get_file: major_upgrade_controller_pacemaker_5.sh + + ControllerPacemakerUpgradeDeployment_Step5: + type: OS::Heat::SoftwareDeploymentGroup + depends_on: ControllerPacemakerUpgradeDeployment_Step4 + properties: + servers: {get_param: [servers, Controller]} + config: {get_resource: ControllerPacemakerUpgradeConfig_Step5} + input_values: {get_param: input_values} + + ControllerPacemakerUpgradeConfig_Step6: + type: OS::Heat::SoftwareConfig + properties: + group: script + config: + list_join: + - '' + - - str_replace: + template: | + #!/bin/bash + keep_sahara_services_on_upgrade='KEEP_SAHARA_SERVICES_ON_UPGRADE' + params: + KEEP_SAHARA_SERVICES_ON_UPGRADE: {get_param: KeepSaharaServicesOnUpgrade} + - get_file: pacemaker_common_functions.sh + - get_file: major_upgrade_pacemaker_migrations.sh + - get_file: major_upgrade_controller_pacemaker_6.sh + + ControllerPacemakerUpgradeDeployment_Step6: + type: OS::Heat::SoftwareDeploymentGroup + depends_on: ControllerPacemakerUpgradeDeployment_Step5 + properties: + servers: {get_param: [servers, Controller]} + config: {get_resource: ControllerPacemakerUpgradeConfig_Step6} + input_values: {get_param: input_values} diff --git a/extraconfig/tasks/major_upgrade_pacemaker_init.yaml b/extraconfig/tasks/major_upgrade_pacemaker_init.j2.yaml index 623549a0..f6aa3066 100644 --- a/extraconfig/tasks/major_upgrade_pacemaker_init.yaml +++ b/extraconfig/tasks/major_upgrade_pacemaker_init.j2.yaml @@ -3,15 +3,7 @@ description: 'Upgrade for Pacemaker deployments' parameters: - controller_servers: - type: json - compute_servers: - type: json - blockstorage_servers: - type: json - objectstorage_servers: - type: json - cephstorage_servers: + servers: type: json input_values: type: json @@ -43,45 +35,12 @@ resources: - "if [[ -f /etc/resolv.conf.save ]] ; then rm /etc/resolv.conf.save; fi\n\n" - get_param: UpgradeInitCommand - UpgradeInitControllerDeployment: - type: OS::Heat::SoftwareDeploymentGroup - properties: - servers: {get_param: controller_servers} - config: {get_resource: UpgradeInitConfig} - input_values: {get_param: input_values} - - UpgradeInitComputeDeployment: - type: OS::Heat::SoftwareDeploymentGroup - properties: - servers: {get_param: compute_servers} - config: {get_resource: UpgradeInitConfig} - input_values: {get_param: input_values} - - UpgradeInitBlockStorageDeployment: - type: OS::Heat::SoftwareDeploymentGroup - properties: - servers: {get_param: blockstorage_servers} - config: {get_resource: UpgradeInitConfig} - input_values: {get_param: input_values} - - UpgradeInitObjectStorageDeployment: - type: OS::Heat::SoftwareDeploymentGroup - properties: - servers: {get_param: objectstorage_servers} - config: {get_resource: UpgradeInitConfig} - input_values: {get_param: input_values} - - UpgradeInitCephStorageDeployment: - type: OS::Heat::SoftwareDeploymentGroup - properties: - servers: {get_param: cephstorage_servers} - config: {get_resource: UpgradeInitConfig} - input_values: {get_param: input_values} - # TODO(jistr): for Mitaka->Newton upgrades and further we can use # map_merge with input_values instead of feeding params into scripts # via str_replace on bash snippets + # FIXME(shardy) we have hard-coded per-role *ScriptConfig's here + # Would be better to have a common config for all roles ComputeDeliverUpgradeScriptConfig: type: OS::Heat::SoftwareConfig properties: @@ -97,35 +56,32 @@ resources: UPGRADE_LEVEL_NOVA_COMPUTE: {get_param: UpgradeLevelNovaCompute} - get_file: major_upgrade_compute.sh - ComputeDeliverUpgradeScriptDeployment: - type: OS::Heat::SoftwareDeploymentGroup - properties: - servers: {get_param: compute_servers} - config: {get_resource: ComputeDeliverUpgradeScriptConfig} - input_values: {get_param: input_values} - ObjectStorageDeliverUpgradeScriptConfig: type: OS::Heat::SoftwareConfig properties: group: script config: {get_file: major_upgrade_object_storage.sh} - ObjectStorageDeliverUpgradeScriptDeployment: - type: OS::Heat::SoftwareDeploymentGroup - properties: - servers: {get_param: objectstorage_servers} - config: {get_resource: ObjectStorageDeliverUpgradeScriptConfig} - input_values: {get_param: input_values} - CephStorageDeliverUpgradeScriptConfig: type: OS::Heat::SoftwareConfig properties: group: script config: {get_file: major_upgrade_ceph_storage.sh} - CephStorageDeliverUpgradeScriptDeployment: +{% for role in roles %} + UpgradeInit{{role.name}}Deployment: + type: OS::Heat::SoftwareDeploymentGroup + properties: + servers: {get_param: [servers, {{role.name}}]} + config: {get_resource: UpgradeInitConfig} + input_values: {get_param: input_values} + + {% if not role.name in ['Controller', 'BlockStorage'] %} + {{role.name}}DeliverUpgradeScriptDeployment: type: OS::Heat::SoftwareDeploymentGroup properties: - servers: {get_param: cephstorage_servers} - config: {get_resource: CephStorageDeliverUpgradeScriptConfig} + servers: {get_param: [servers, {{role.name}}]} + config: {get_resource: {{role.name}}DeliverUpgradeScriptConfig} input_values: {get_param: input_values} + {% endif %} +{% endfor %} diff --git a/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh b/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh index 7ed7012d..7c9083a4 100644 --- a/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh +++ b/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh @@ -56,3 +56,126 @@ function is_mysql_upgrade_needed { fi echo "1" } + +# This function returns the list of services to be migrated away from pacemaker +# and to systemd. The reason to have these services in a separate function is because +# this list is needed in three different places: major_upgrade_controller_pacemaker_{1,2} +# and in the function to migrate the cluster from full HA to HA NG +function services_to_migrate { + # The following PCMK resources the ones the we are going to delete + PCMK_RESOURCE_TODELETE=" + httpd-clone + memcached-clone + mongod-clone + neutron-dhcp-agent-clone + neutron-l3-agent-clone + neutron-metadata-agent-clone + neutron-netns-cleanup-clone + neutron-openvswitch-agent-clone + neutron-ovs-cleanup-clone + neutron-server-clone + openstack-aodh-evaluator-clone + openstack-aodh-listener-clone + openstack-aodh-notifier-clone + openstack-ceilometer-central-clone + openstack-ceilometer-collector-clone + openstack-ceilometer-notification-clone + openstack-cinder-api-clone + openstack-cinder-scheduler-clone + openstack-glance-api-clone + openstack-glance-registry-clone + openstack-gnocchi-metricd-clone + openstack-gnocchi-statsd-clone + openstack-heat-api-cfn-clone + openstack-heat-api-clone + openstack-heat-api-cloudwatch-clone + openstack-heat-engine-clone + openstack-nova-api-clone + openstack-nova-conductor-clone + openstack-nova-consoleauth-clone + openstack-nova-novncproxy-clone + openstack-nova-scheduler-clone + openstack-sahara-api-clone + openstack-sahara-engine-clone + " + echo $PCMK_RESOURCE_TODELETE +} + +# This function will migrate a mitaka system where all the resources are managed +# via pacemaker to a newton setup where only a few services will be managed by pacemaker +# On a high-level it will operate as follows: +# 1. Set the cluster in maintenance-mode so no start/stop action will actually take place +# during the conversion +# 2. Remove all the colocation constraints and then the ordering constraints, except the +# ones related to haproxy/VIPs which exist in Newton as well +# 3. Take the cluster out of maintenance-mode +# 4. Remove all the resources that won't be managed by pacemaker in newton. The +# outcome will be +# that they are stopped and removed from pacemakers control +# 5. Do a resource cleanup to make sure the cluster is in a clean state +function migrate_full_to_ng_ha { + if [[ -n $(pcmk_running) ]]; then + pcs property set maintenance-mode=true + + # First we go through all the colocation constraints (except the ones + # we want to keep, i.e. the haproxy/ip ones) and we remove those + COL_CONSTRAINTS=$(pcs config show | sed -n '/^Colocation Constraints:$/,/^$/p' | grep -v "Colocation Constraints:" | egrep -v "ip-.*haproxy" | awk '{print $NF}' | cut -f2 -d: |cut -f1 -d\)) + for constraint in $COL_CONSTRAINTS; do + log_debug "Deleting colocation constraint $constraint from CIB" + pcs constraint remove "$constraint" + done + + # Now we kill all the ordering constraints (except the haproxy/ip ones) + ORD_CONSTRAINTS=$(pcs config show | sed -n '/^Ordering Constraints:/,/^Colocation Constraints:$/p' | grep -v "Ordering Constraints:" | awk '{print $NF}' | cut -f2 -d: |cut -f1 -d\)) + for constraint in $ORD_CONSTRAINTS; do + log_debug "Deleting ordering constraint $constraint from CIB" + pcs constraint remove "$constraint" + done + # At this stage all the pacemaker resources are removed from the CIB. + # Once we remove the maintenance-mode those systemd resources will keep + # on running. They shall be systemd enabled via the puppet converge + # step later on + pcs property set maintenance-mode=false + + # At this stage there are no constraints whatsoever except the haproxy/ip ones + # which we want to keep. We now disable and then delete each resource + # that will move to systemd. + # We want the systemd resources be stopped before doing "yum update", + # that way "systemctl try-restart <service>" is no-op because the + # service was down already + PCS_STATUS_OUTPUT="$(pcs status)" + for resource in $(services_to_migrate) "delay-clone" "openstack-core-clone"; do + if echo "$PCS_STATUS_OUTPUT" | grep "$resource"; then + log_debug "Deleting $resource from the CIB" + if ! pcs resource disable "$resource" --wait=600; then + echo_error "ERROR: resource $resource failed to be disabled" + exit 1 + fi + pcs resource delete --force "$resource" + else + log_debug "Service $resource not found as a pacemaker resource, not trying to delete." + fi + done + + # We need to do a pcs resource cleanup here + crm_resource --wait to + # make sure the cluster is in a clean state before we stop everything, + # upgrade and restart everything + pcs resource cleanup + # We are making sure here that the cluster is stable before proceeding + if ! timeout -k 10 600 crm_resource --wait; then + echo_error "ERROR: cluster remained unstable after resource cleanup for more than 600 seconds, exiting." + exit 1 + fi + fi +} + +function disable_standalone_ceilometer_api { + if [[ -n $(is_bootstrap_node) ]]; then + if [[ -n $(is_pacemaker_managed openstack-ceilometer-api) ]]; then + # Disable pacemaker resources for ceilometer-api + manage_pacemaker_service disable openstack-ceilometer-api + check_resource_pacemaker openstack-ceilometer-api stopped 600 + pcs resource delete openstack-ceilometer-api --wait=600 + fi + fi +} diff --git a/extraconfig/tasks/mitaka_to_newton_aodh_data_migration.yaml b/extraconfig/tasks/mitaka_to_newton_aodh_data_migration.yaml index 9414ac19..b9a87d33 100644 --- a/extraconfig/tasks/mitaka_to_newton_aodh_data_migration.yaml +++ b/extraconfig/tasks/mitaka_to_newton_aodh_data_migration.yaml @@ -4,15 +4,7 @@ description: > Software-config for performing aodh data migration parameters: - controller_servers: - type: json - compute_servers: - type: json - blockstorage_servers: - type: json - objectstorage_servers: - type: json - cephstorage_servers: + servers: type: json input_values: type: json @@ -28,6 +20,6 @@ resources: AodhMysqlMigrationScriptDeployment: type: OS::Heat::SoftwareDeploymentGroup properties: - servers: {get_param: controller_servers} + servers: {get_param: [servers, Controller]} config: {get_resource: AodhMysqlMigrationScriptConfig} input_values: {get_param: input_values} diff --git a/extraconfig/tasks/mitaka_to_newton_ceilometer_wsgi_upgrade.pp b/extraconfig/tasks/mitaka_to_newton_ceilometer_wsgi_upgrade.pp new file mode 100644 index 00000000..1c376285 --- /dev/null +++ b/extraconfig/tasks/mitaka_to_newton_ceilometer_wsgi_upgrade.pp @@ -0,0 +1,97 @@ +# Copyright 2015 Red Hat, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +# This puppet manifest is to be used only during a Mitaka->Newton upgrade +# It configures ceilometer to be run under httpd but it makes sure to not +# restart any services. This snippet needs to be called before init as a +# pre upgrade migration. + +Service <| + tag == 'ceilometer-service' +|> { + hasrestart => true, + restart => '/bin/true', + start => '/bin/true', + stop => '/bin/true', +} + +if $::hostname == downcase(hiera('bootstrap_nodeid')) { + $pacemaker_master = true + $sync_db = true +} else { + $pacemaker_master = false + $sync_db = false +} + +include ::tripleo::packages + + +if str2bool(hiera('mongodb::server::ipv6', false)) { + $mongo_node_ips_with_port_prefixed = prefix(hiera('mongodb_node_ips'), '[') + $mongo_node_ips_with_port = suffix($mongo_node_ips_with_port_prefixed, ']:27017') +} else { + $mongo_node_ips_with_port = suffix(hiera('mongodb_node_ips'), ':27017') +} +$mongodb_replset = hiera('mongodb::server::replset') +$mongo_node_string = join($mongo_node_ips_with_port, ',') +$database_connection = "mongodb://${mongo_node_string}/ceilometer?replicaSet=${mongodb_replset}" + +include ::ceilometer + +class {'::ceilometer::db': + database_connection => $database_connection, +} + +if $sync_db { + include ::ceilometer::db::sync +} + +include ::ceilometer::config + +class { '::ceilometer::api': + enabled => true, + service_name => 'httpd', + keystone_password => hiera('ceilometer::keystone::auth::password'), + identity_uri => hiera('ceilometer::keystone::authtoken::auth_url'), + auth_uri => hiera('ceilometer::keystone::authtoken::auth_uri'), + keystone_tenant => hiera('ceilometer::keystone::authtoken::project_name'), +} + +class { '::apache' : + service_enable => false, + service_manage => true, + service_restart => '/bin/true', + purge_configs => false, + purge_vhost_dir => false, +} + +# To ensure existing ports are not overridden +class { '::aodh::wsgi::apache': + servername => $::hostname, + ssl => false, +} +class { '::gnocchi::wsgi::apache': + servername => $::hostname, + ssl => false, +} + +class { '::keystone::wsgi::apache': + servername => $::hostname, + ssl => false, +} +class { '::ceilometer::wsgi::apache': + servername => $::hostname, + ssl => false, +} diff --git a/extraconfig/tasks/pacemaker_common_functions.sh b/extraconfig/tasks/pacemaker_common_functions.sh index 7d794c97..2c7dfc35 100755 --- a/extraconfig/tasks/pacemaker_common_functions.sh +++ b/extraconfig/tasks/pacemaker_common_functions.sh @@ -2,61 +2,298 @@ set -eu -function check_resource { +DEBUG="true" # set false if the verbosity is a problem +SCRIPT_NAME=$(basename $0) +function log_debug { + if [[ $DEBUG = "true" ]]; then + echo "`date` $SCRIPT_NAME tripleo-upgrade $(facter hostname) $1" + fi +} + +function is_bootstrap_node { + if [ "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid)" = "$(facter hostname)" ]; then + log_debug "Node is bootstrap" + echo "true" + fi +} +function check_resource_pacemaker { if [ "$#" -ne 3 ]; then - echo_error "ERROR: check_resource function expects 3 parameters, $# given" - exit 1 + echo_error "ERROR: check_resource function expects 3 parameters, $# given" + exit 1 fi - service=$1 - state=$2 - timeout=$3 + local service=$1 + local state=$2 + local timeout=$3 + + if [[ -z $(is_bootstrap_node) ]] ; then + log_debug "Node isn't bootstrap, skipping check for $service to be $state here " + return + else + log_debug "Node is bootstrap checking $service to be $state here" + fi if [ "$state" = "stopped" ]; then - match_for_incomplete='Started' + match_for_incomplete='Started' else # started - match_for_incomplete='Stopped' + match_for_incomplete='Stopped' fi nodes_local=$(pcs status | grep ^Online | sed 's/.*\[ \(.*\) \]/\1/g' | sed 's/ /\|/g') if timeout -k 10 $timeout crm_resource --wait; then - node_states=$(pcs status --full | grep "$service" | grep -v Clone | { egrep "$nodes_local" || true; } ) - if echo "$node_states" | grep -q "$match_for_incomplete"; then - echo_error "ERROR: cluster finished transition but $service was not in $state state, exiting." - exit 1 - else - echo "$service has $state" - fi - else - echo_error "ERROR: cluster remained unstable for more than $timeout seconds, exiting." + node_states=$(pcs status --full | grep "$service" | grep -v Clone | { egrep "$nodes_local" || true; } ) + if echo "$node_states" | grep -q "$match_for_incomplete"; then + echo_error "ERROR: cluster finished transition but $service was not in $state state, exiting." exit 1 + else + echo "$service has $state" + fi + else + echo_error "ERROR: cluster remained unstable for more than $timeout seconds, exiting." + exit 1 + fi + +} + +function pcmk_running { + if [[ $(systemctl is-active pacemaker) = "active" ]] ; then + echo "true" + fi +} + +function is_systemd_unknown { + local service=$1 + if [[ $(systemctl is-active "$service") = "unknown" ]]; then + log_debug "$service found to be unkown to systemd" + echo "true" + fi +} + +function grep_is_cluster_controlled { + local service=$1 + if [[ -n $(systemctl status $service -l | grep Drop-In -A 5 | grep pacemaker) || + -n $(systemctl status $service -l | grep "Cluster Controlled $service") ]] ; then + log_debug "$service is pcmk managed from systemctl grep" + echo "true" + fi +} + + +function is_systemd_managed { + local service=$1 + #if we have pcmk check to see if it is managed there + if [[ -n $(pcmk_running) ]]; then + if [[ -z $(pcs status --full | grep $service) && -z $(is_systemd_unknown $service) ]] ; then + log_debug "$service found to be systemd managed from pcs status" + echo "true" + fi + else + # if it is "unknown" to systemd, then it is pacemaker managed + if [[ -n $(is_systemd_unknown $service) ]] ; then + return + elif [[ -z $(grep_is_cluster_controlled $service) ]] ; then + echo "true" + fi + fi +} + +function is_pacemaker_managed { + local service=$1 + #if we have pcmk check to see if it is managed there + if [[ -n $(pcmk_running) ]]; then + if [[ -n $(pcs status --full | grep $service) ]]; then + log_debug "$service found to be pcmk managed from pcs status" + echo "true" + fi + else + # if it is unknown to systemd, then it is pcmk managed + if [[ -n $(is_systemd_unknown $service) ]]; then + echo "true" + elif [[ -n $(grep_is_cluster_controlled $service) ]] ; then + echo "true" + fi + fi +} + +function is_managed { + local service=$1 + if [[ -n $(is_pacemaker_managed $service) || -n $(is_systemd_managed $service) ]]; then + echo "true" + fi +} + +function check_resource_systemd { + + if [ "$#" -ne 3 ]; then + echo_error "ERROR: check_resource function expects 3 parameters, $# given" + exit 1 fi + local service=$1 + local state=$2 + local timeout=$3 + local check_interval=3 + + if [ "$state" = "stopped" ]; then + match_for_incomplete='active' + else # started + match_for_incomplete='inactive' + fi + + log_debug "Going to check_resource_systemd for $service to be $state" + + #sanity check is systemd managed: + if [[ -z $(is_systemd_managed $service) ]]; then + echo "ERROR - $service not found to be systemd managed." + exit 1 + fi + + tstart=$(date +%s) + tend=$(( $tstart + $timeout )) + while (( $(date +%s) < $tend )); do + if [[ "$(systemctl is-active $service)" = $match_for_incomplete ]]; then + echo "$service not yet $state, sleeping $check_interval seconds." + sleep $check_interval + else + echo "$service is $state" + return + fi + done + + echo "Timed out waiting for $service to go to $state after $timeout seconds" + exit 1 +} + + +function check_resource { + local service=$1 + local pcmk_managed=$(is_pacemaker_managed $service) + local systemd_managed=$(is_systemd_managed $service) + + if [[ -n $pcmk_managed && -n $systemd_managed ]] ; then + log_debug "ERROR $service managed by both systemd and pcmk - SKIPPING" + return + fi + + if [[ -n $pcmk_managed ]]; then + check_resource_pacemaker $@ + return + elif [[ -n $systemd_managed ]]; then + check_resource_systemd $@ + return + fi + log_debug "ERROR cannot check_resource for $service, not managed here?" +} + +function manage_systemd_service { + local action=$1 + local service=$2 + log_debug "Going to systemctl $action $service" + systemctl $action $service +} + +function manage_pacemaker_service { + local action=$1 + local service=$2 + # not if pacemaker isn't running! + if [[ -z $(pcmk_running) ]]; then + echo "$(facter hostname) pacemaker not active, skipping $action $service here" + elif [[ -n $(is_bootstrap_node) ]]; then + log_debug "Going to pcs resource $action $service" + pcs resource $action $service + fi +} + +function stop_or_disable_service { + local service=$1 + local pcmk_managed=$(is_pacemaker_managed $service) + local systemd_managed=$(is_systemd_managed $service) + + if [[ -n $pcmk_managed && -n $systemd_managed ]] ; then + log_debug "Skipping stop_or_disable $service due to management conflict" + return + fi + + log_debug "Stopping or disabling $service" + if [[ -n $pcmk_managed ]]; then + manage_pacemaker_service disable $service + return + elif [[ -n $systemd_managed ]]; then + manage_systemd_service stop $service + return + fi + log_debug "ERROR: $service not managed here?" +} + +function start_or_enable_service { + local service=$1 + local pcmk_managed=$(is_pacemaker_managed $service) + local systemd_managed=$(is_systemd_managed $service) + + if [[ -n $pcmk_managed && -n $systemd_managed ]] ; then + log_debug "Skipping start_or_enable $service due to management conflict" + return + fi + + log_debug "Starting or enabling $service" + if [[ -n $pcmk_managed ]]; then + manage_pacemaker_service enable $service + return + elif [[ -n $systemd_managed ]]; then + manage_systemd_service start $service + return + fi + log_debug "ERROR $service not managed here?" +} + +function restart_service { + local service=$1 + local pcmk_managed=$(is_pacemaker_managed $service) + local systemd_managed=$(is_systemd_managed $service) + + if [[ -n $pcmk_managed && -n $systemd_managed ]] ; then + log_debug "ERROR $service managed by both systemd and pcmk - SKIPPING" + return + fi + + log_debug "Restarting $service" + if [[ -n $pcmk_managed ]]; then + manage_pacemaker_service restart $service + return + elif [[ -n $systemd_managed ]]; then + manage_systemd_service restart $service + return + fi + log_debug "ERROR $service not managed here?" } function echo_error { echo "$@" | tee /dev/fd2 } +# swift is a special case because it is/was never handled by pacemaker +# when stand-alone swift is used, only swift-proxy is running on controllers function systemctl_swift { services=( openstack-swift-account-auditor openstack-swift-account-reaper openstack-swift-account-replicator openstack-swift-account \ openstack-swift-container-auditor openstack-swift-container-replicator openstack-swift-container-updater openstack-swift-container \ openstack-swift-object-auditor openstack-swift-object-replicator openstack-swift-object-updater openstack-swift-object openstack-swift-proxy ) - action=$1 + local action=$1 case $action in stop) - services=$(systemctl | grep swift | grep running | awk '{print $1}') + services=$(systemctl | grep openstack-swift- | grep running | awk '{print $1}') ;; start) - enable_swift_storage=$(hiera -c /etc/puppet/hiera.yaml 'enable_swift_storage') + enable_swift_storage=$(hiera -c /etc/puppet/hiera.yaml tripleo::profile::base::swift::storage::enable_swift_storage) if [[ $enable_swift_storage != "true" ]]; then services=( openstack-swift-proxy ) fi ;; - *) services=() ;; # for safetly, should never happen + *) echo "Unknown action $action passed to systemctl_swift" + exit 1 + ;; # shouldn't ever happen... esac - for S in ${services[@]}; do - systemctl $action $S + for service in ${services[@]}; do + manage_systemd_service $action $service done } diff --git a/extraconfig/tasks/pacemaker_resource_restart.sh b/extraconfig/tasks/pacemaker_resource_restart.sh index fd1fd0dc..8500bcef 100755 --- a/extraconfig/tasks/pacemaker_resource_restart.sh +++ b/extraconfig/tasks/pacemaker_resource_restart.sh @@ -2,16 +2,16 @@ set -eux -pacemaker_status=$(systemctl is-active pacemaker) - # Run if pacemaker is running, we're the bootstrap node, # and we're updating the deployment (not creating). -if [ "$pacemaker_status" = "active" -a \ - "$(hiera bootstrap_nodeid)" = "$(facter hostname)" ]; then + +RESTART_FOLDER="/var/lib/tripleo/pacemaker-restarts" + +if [[ -d "$RESTART_FOLDER" && -n $(pcmk_running) && -n $(is_bootstrap_node) ]]; then TIMEOUT=600 - SERVICES_TO_RESTART="$(ls /var/lib/tripleo/pacemaker-restarts)" PCS_STATUS_OUTPUT="$(pcs status)" + SERVICES_TO_RESTART="$(ls $RESTART_FOLDER)" for service in $SERVICES_TO_RESTART; do if ! echo "$PCS_STATUS_OUTPUT" | grep $service; then @@ -23,7 +23,12 @@ if [ "$pacemaker_status" = "active" -a \ for service in $SERVICES_TO_RESTART; do echo "Restarting $service..." pcs resource restart --wait=$TIMEOUT $service - rm -f /var/lib/tripleo/pacemaker-restarts/$service + rm -f "$RESTART_FOLDER"/$service done fi + +haproxy_status=$(systemctl is-active haproxy) +if [ "$haproxy_status" = "active" ]; then + systemctl reload haproxy +fi diff --git a/extraconfig/tasks/yum_update.sh b/extraconfig/tasks/yum_update.sh index b045e5ea..4612f197 100755 --- a/extraconfig/tasks/yum_update.sh +++ b/extraconfig/tasks/yum_update.sh @@ -44,6 +44,25 @@ fi pacemaker_status=$(systemctl is-active pacemaker) +# Fix the redis/rabbit resource start/stop timeouts. See https://bugs.launchpad.net/tripleo/+bug/1633455 +# and https://bugs.launchpad.net/tripleo/+bug/1634851 +if [[ "$pacemaker_status" == "active" && \ + "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid)" = "$(facter hostname)" ]] ; then + if pcs resource show rabbitmq | grep -E "start.*timeout=100"; then + pcs resource update rabbitmq op start timeout=200s + fi + if pcs resource show rabbitmq | grep -E "stop.*timeout=90"; then + pcs resource update rabbitmq op stop timeout=200s + fi + if pcs resource show redis | grep -E "start.*timeout=120"; then + pcs resource update redis op start timeout=200s + fi + if pcs resource show redis | grep -E "stop.*timeout=120"; then + pcs resource update redis op stop timeout=200s + fi +fi + + if [[ "$pacemaker_status" == "active" ]] ; then echo "Pacemaker running, stopping cluster node and doing full package update" node_count=$(pcs status xml | grep -o "<nodes_configured.*/>" | grep -o 'number="[0-9]*"' | grep -o "[0-9]*") @@ -54,13 +73,28 @@ if [[ "$pacemaker_status" == "active" ]] ; then pcs cluster stop fi else - echo "Upgrading openstack-puppet-modules" + echo "Upgrading openstack-puppet-modules and its dependencies" yum -q -y update openstack-puppet-modules + yum deplist openstack-puppet-modules | awk '/dependency/{print $2}' | xargs yum -q -y update echo "Upgrading other packages is handled by config management tooling" echo -n "true" > $heat_outputs_path.update_managed_packages exit 0 fi +# Special-case OVS for https://bugs.launchpad.net/tripleo/+bug/1635205 +if [[ -n $(rpm -q --scripts openvswitch | awk '/postuninstall/,/*/' | grep "systemctl.*try-restart") ]]; then + echo "Manual upgrade of openvswitch - restart in postun detected" + mkdir OVS_UPGRADE || true + pushd OVS_UPGRADE + echo "Attempting to downloading latest openvswitch with yumdownloader" + yumdownloader --resolve openvswitch + echo "Updating openvswitch with nopostun option" + rpm -U --replacepkgs --nopostun ./*.rpm + popd +else + echo "Skipping manual upgrade of openvswitch - no restart in postun detected" +fi + command=${command:-update} full_command="yum -q -y $command $command_arguments" echo "Running: $full_command" |