From faed52cdedf1b067c3f39d49827cb843d55edd45 Mon Sep 17 00:00:00 2001 From: QiLiang Date: Fri, 14 Oct 2016 06:11:07 +0800 Subject: Add recovery scripts - add recovery shell scripts - add boot-recovery role to stop services which could leave the system boot pending - configuring the nfs mount in /etc/fstab causes the system boot to hang, so mount nfs during the ansible-playbook run instead - kill mysqld before mysql recovery, because a running mysqld may cause the mysql recovery to fail JIRA: COMPASS-474 Change-Id: I0f6f0ee935fbe3fbbe28a451a02decfb01a6165b Signed-off-by: QiLiang --- .../ansible/openstack/HA-ansible-multinodes.yml | 7 +++ .../openstack_mitaka/HA-ansible-multinodes.yml | 7 +++ .../openstack_mitaka/roles/glance/tasks/nfs.yml | 8 ++- .../HA-ansible-multinodes.yml | 7 +++ .../roles/glance/tasks/nfs.yml | 8 ++- .../HA-ansible-multinodes.yml | 7 +++ .../ansible/roles/boot-recovery/tasks/main.yml | 26 ++++++++ .../ansible/roles/boot-recovery/vars/Debian.yml | 14 +++++ .../ansible/roles/boot-recovery/vars/RedHat.yml | 15 +++++ .../ansible/roles/boot-recovery/vars/main.yml | 11 ++++ .../roles/controller-recovery/vars/Debian.yml | 1 + .../roles/controller-recovery/vars/RedHat.yml | 1 + .../database/tasks/mariadb_cluster_debian.yml | 8 +++ .../database/tasks/mariadb_cluster_redhat.yml | 8 +++ .../adapters/ansible/roles/glance/tasks/main.yml | 2 + deploy/adapters/ansible/roles/glance/tasks/nfs.yml | 8 ++- deploy/compass_vm.sh | 72 ++++++++++++++++++++++ deploy/host_virtual.sh | 13 ++++ deploy/launch.sh | 12 +++- deploy/network.sh | 41 ++++++++++++ deploy/recovery.sh | 40 ++++++++++++ deploy/template/power/ipmitool.tmpl | 30 +++++---- 22 files changed, 327 insertions(+), 19 deletions(-) create mode 100755 deploy/adapters/ansible/roles/boot-recovery/tasks/main.yml create mode 100755 deploy/adapters/ansible/roles/boot-recovery/vars/Debian.yml create mode 100755 deploy/adapters/ansible/roles/boot-recovery/vars/RedHat.yml create mode 100755 
deploy/adapters/ansible/roles/boot-recovery/vars/main.yml create mode 100644 deploy/recovery.sh (limited to 'deploy') diff --git a/deploy/adapters/ansible/openstack/HA-ansible-multinodes.yml b/deploy/adapters/ansible/openstack/HA-ansible-multinodes.yml index 7f61a1cf..95102d2b 100644 --- a/deploy/adapters/ansible/openstack/HA-ansible-multinodes.yml +++ b/deploy/adapters/ansible/openstack/HA-ansible-multinodes.yml @@ -233,6 +233,13 @@ roles: - ext-network +- hosts: controller + remote_user: root + accelerate: true + max_fail_percentage: 0 + roles: + - boot-recovery + - hosts: controller remote_user: root accelerate: true diff --git a/deploy/adapters/ansible/openstack_mitaka/HA-ansible-multinodes.yml b/deploy/adapters/ansible/openstack_mitaka/HA-ansible-multinodes.yml index 7ef467ee..c04445d8 100644 --- a/deploy/adapters/ansible/openstack_mitaka/HA-ansible-multinodes.yml +++ b/deploy/adapters/ansible/openstack_mitaka/HA-ansible-multinodes.yml @@ -242,6 +242,13 @@ roles: - tacker +- hosts: controller + remote_user: root + accelerate: true + max_fail_percentage: 0 + roles: + - boot-recovery + - hosts: controller remote_user: root accelerate: true diff --git a/deploy/adapters/ansible/openstack_mitaka/roles/glance/tasks/nfs.yml b/deploy/adapters/ansible/openstack_mitaka/roles/glance/tasks/nfs.yml index 07dfacdd..deec81f8 100644 --- a/deploy/adapters/ansible/openstack_mitaka/roles/glance/tasks/nfs.yml +++ b/deploy/adapters/ansible/openstack_mitaka/roles/glance/tasks/nfs.yml @@ -42,10 +42,14 @@ - name: get mount info command: mount register: mount_info + tags: + - recovery - name: get nfs server shell: awk -F'=' '/compass_server/ {print $2}' /etc/compass.conf register: ip_info + tags: + - recovery - name: restart host nfs service service: name={{ item }} state=restarted enabled=yes @@ -55,7 +59,9 @@ shell: | mount -t nfs -onfsvers=3 {{ ip_info.stdout_lines[0] }}:/opt/images /var/lib/glance/images sed -i '/\/var\/lib\/glance\/images/d' /etc/fstab - echo {{ 
ip_info.stdout_lines[0] }}:/opt/images /var/lib/glance/images/ nfs nfsvers=3 >> /etc/fstab + #echo {{ ip_info.stdout_lines[0] }}:/opt/images /var/lib/glance/images/ nfs nfsvers=3 >> /etc/fstab when: mount_info.stdout.find('images') == -1 retries: 5 delay: 3 + tags: + - recovery diff --git a/deploy/adapters/ansible/openstack_mitaka_xenial/HA-ansible-multinodes.yml b/deploy/adapters/ansible/openstack_mitaka_xenial/HA-ansible-multinodes.yml index ec4c53f4..ac31b682 100644 --- a/deploy/adapters/ansible/openstack_mitaka_xenial/HA-ansible-multinodes.yml +++ b/deploy/adapters/ansible/openstack_mitaka_xenial/HA-ansible-multinodes.yml @@ -242,6 +242,13 @@ roles: - ext-network +- hosts: controller + remote_user: root + accelerate: true + max_fail_percentage: 0 + roles: + - boot-recovery + - hosts: controller remote_user: root accelerate: true diff --git a/deploy/adapters/ansible/openstack_mitaka_xenial/roles/glance/tasks/nfs.yml b/deploy/adapters/ansible/openstack_mitaka_xenial/roles/glance/tasks/nfs.yml index 07dfacdd..deec81f8 100644 --- a/deploy/adapters/ansible/openstack_mitaka_xenial/roles/glance/tasks/nfs.yml +++ b/deploy/adapters/ansible/openstack_mitaka_xenial/roles/glance/tasks/nfs.yml @@ -42,10 +42,14 @@ - name: get mount info command: mount register: mount_info + tags: + - recovery - name: get nfs server shell: awk -F'=' '/compass_server/ {print $2}' /etc/compass.conf register: ip_info + tags: + - recovery - name: restart host nfs service service: name={{ item }} state=restarted enabled=yes @@ -55,7 +59,9 @@ shell: | mount -t nfs -onfsvers=3 {{ ip_info.stdout_lines[0] }}:/opt/images /var/lib/glance/images sed -i '/\/var\/lib\/glance\/images/d' /etc/fstab - echo {{ ip_info.stdout_lines[0] }}:/opt/images /var/lib/glance/images/ nfs nfsvers=3 >> /etc/fstab + #echo {{ ip_info.stdout_lines[0] }}:/opt/images /var/lib/glance/images/ nfs nfsvers=3 >> /etc/fstab when: mount_info.stdout.find('images') == -1 retries: 5 delay: 3 + tags: + - recovery diff --git 
a/deploy/adapters/ansible/openstack_newton_xenial/HA-ansible-multinodes.yml b/deploy/adapters/ansible/openstack_newton_xenial/HA-ansible-multinodes.yml index 3d5b0a1c..9e8ec15b 100644 --- a/deploy/adapters/ansible/openstack_newton_xenial/HA-ansible-multinodes.yml +++ b/deploy/adapters/ansible/openstack_newton_xenial/HA-ansible-multinodes.yml @@ -242,6 +242,13 @@ roles: - ext-network +- hosts: controller + remote_user: root + accelerate: true + max_fail_percentage: 0 + roles: + - boot-recovery + - hosts: controller remote_user: root accelerate: true diff --git a/deploy/adapters/ansible/roles/boot-recovery/tasks/main.yml b/deploy/adapters/ansible/roles/boot-recovery/tasks/main.yml new file mode 100755 index 00000000..67206bf6 --- /dev/null +++ b/deploy/adapters/ansible/roles/boot-recovery/tasks/main.yml @@ -0,0 +1,26 @@ +############################################################################## +# Copyright (c) 2016 HUAWEI TECHNOLOGIES CO.,LTD and others. +# +# All rights reserved. This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +############################################################################## +--- +- name: Register RECOVERY + set_fact: RECOVERY_ENV={{RECOVERY_ENV | default('False')}} + tags: + - recovery-stop-service + +- include_vars: "{{ ansible_os_family }}.yml" + when: RECOVERY_ENV + tags: + - recovery-stop-service + +- name: stop controller services + service: name={{ item }} state=stopped enabled=yes + with_items: controller_services | union(controller_services_noarch) + when: RECOVERY_ENV + tags: + - recovery-stop-service + diff --git a/deploy/adapters/ansible/roles/boot-recovery/vars/Debian.yml b/deploy/adapters/ansible/roles/boot-recovery/vars/Debian.yml new file mode 100755 index 00000000..084deebc --- /dev/null +++ 
b/deploy/adapters/ansible/roles/boot-recovery/vars/Debian.yml @@ -0,0 +1,14 @@ +############################################################################## +# Copyright (c) 2016 HUAWEI TECHNOLOGIES CO.,LTD and others. +# +# All rights reserved. This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +############################################################################## +--- +controller_services: + - cron + - aodh-expirer + - neutron-openvswitch-agent + - mysql diff --git a/deploy/adapters/ansible/roles/boot-recovery/vars/RedHat.yml b/deploy/adapters/ansible/roles/boot-recovery/vars/RedHat.yml new file mode 100755 index 00000000..c46f79c8 --- /dev/null +++ b/deploy/adapters/ansible/roles/boot-recovery/vars/RedHat.yml @@ -0,0 +1,15 @@ +############################################################################## +# Copyright (c) 2016 HUAWEI TECHNOLOGIES CO.,LTD and others. +# +# All rights reserved. This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +############################################################################## +--- +controller_services: + - cron + - neutron-openvswitch-agent + - openstack-aodh-expirer + - mysql + diff --git a/deploy/adapters/ansible/roles/boot-recovery/vars/main.yml b/deploy/adapters/ansible/roles/boot-recovery/vars/main.yml new file mode 100755 index 00000000..22af29f4 --- /dev/null +++ b/deploy/adapters/ansible/roles/boot-recovery/vars/main.yml @@ -0,0 +1,11 @@ +############################################################################## +# Copyright (c) 2016 HUAWEI TECHNOLOGIES CO.,LTD and others. +# +# All rights reserved. 
This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +############################################################################## +--- +controller_services_noarch: [] + diff --git a/deploy/adapters/ansible/roles/controller-recovery/vars/Debian.yml b/deploy/adapters/ansible/roles/controller-recovery/vars/Debian.yml index 34675f6b..62753413 100644 --- a/deploy/adapters/ansible/roles/controller-recovery/vars/Debian.yml +++ b/deploy/adapters/ansible/roles/controller-recovery/vars/Debian.yml @@ -37,4 +37,5 @@ controller_services: - aodh-notifier - aodh-evaluator - aodh-listener + - cron diff --git a/deploy/adapters/ansible/roles/controller-recovery/vars/RedHat.yml b/deploy/adapters/ansible/roles/controller-recovery/vars/RedHat.yml index 35c0a955..145acecd 100644 --- a/deploy/adapters/ansible/roles/controller-recovery/vars/RedHat.yml +++ b/deploy/adapters/ansible/roles/controller-recovery/vars/RedHat.yml @@ -36,4 +36,5 @@ controller_services: - openstack-aodh-notifier - openstack-aodh-evaluator - openstack-aodh-listener + - cron diff --git a/deploy/adapters/ansible/roles/database/tasks/mariadb_cluster_debian.yml b/deploy/adapters/ansible/roles/database/tasks/mariadb_cluster_debian.yml index 6b670312..f083a40f 100644 --- a/deploy/adapters/ansible/roles/database/tasks/mariadb_cluster_debian.yml +++ b/deploy/adapters/ansible/roles/database/tasks/mariadb_cluster_debian.yml @@ -7,6 +7,14 @@ # http://www.apache.org/licenses/LICENSE-2.0 ############################################################################## --- +- name: Register RECOVERY + set_fact: RECOVERY_ENV={{RECOVERY_ENV | default('False')}} + +- name: killall mysqld processes + shell: sudo killall -9 mysqld + when: RECOVERY_ENV + ignore_errors: True + - name: get cluster status shell: mysql --silent --skip-column-names -e 'SHOW STATUS LIKE 
"wsrep_evs_state"'|awk '{print $2}' register: cluster_status diff --git a/deploy/adapters/ansible/roles/database/tasks/mariadb_cluster_redhat.yml b/deploy/adapters/ansible/roles/database/tasks/mariadb_cluster_redhat.yml index da1b863c..cfd778f1 100644 --- a/deploy/adapters/ansible/roles/database/tasks/mariadb_cluster_redhat.yml +++ b/deploy/adapters/ansible/roles/database/tasks/mariadb_cluster_redhat.yml @@ -7,6 +7,14 @@ # http://www.apache.org/licenses/LICENSE-2.0 ############################################################################## --- +- name: Register RECOVERY + set_fact: RECOVERY_ENV={{RECOVERY_ENV | default('False')}} + +- name: killall mysqld processes + shell: sudo killall -9 mysqld + when: RECOVERY_ENV + ignore_errors: True + - name: get cluster status shell: mysql --silent --skip-column-names -e 'SHOW STATUS LIKE "wsrep_evs_state"'|awk '{print $2}' register: cluster_status diff --git a/deploy/adapters/ansible/roles/glance/tasks/main.yml b/deploy/adapters/ansible/roles/glance/tasks/main.yml index a78ba771..caece26c 100644 --- a/deploy/adapters/ansible/roles/glance/tasks/main.yml +++ b/deploy/adapters/ansible/roles/glance/tasks/main.yml @@ -8,6 +8,8 @@ ############################################################################## --- - include_vars: "{{ ansible_os_family }}.yml" + tags: + - recovery - include: glance_install.yml tags: diff --git a/deploy/adapters/ansible/roles/glance/tasks/nfs.yml b/deploy/adapters/ansible/roles/glance/tasks/nfs.yml index 7895c386..179229de 100644 --- a/deploy/adapters/ansible/roles/glance/tasks/nfs.yml +++ b/deploy/adapters/ansible/roles/glance/tasks/nfs.yml @@ -38,10 +38,14 @@ - name: get mount info command: mount register: mount_info + tags: + - recovery - name: get nfs server shell: awk -F'=' '/compass_server/ {print $2}' /etc/compass.conf register: ip_info + tags: + - recovery - name: restart host nfs service service: name={{ item }} state=restarted enabled=yes @@ -51,7 +55,9 @@ shell: | mount -t nfs 
-onfsvers=3 {{ ip_info.stdout_lines[0] }}:/opt/images /var/lib/glance/images sed -i '/\/var\/lib\/glance\/images/d' /etc/fstab - echo {{ ip_info.stdout_lines[0] }}:/opt/images /var/lib/glance/images/ nfs nfsvers=3 >> /etc/fstab + #echo {{ ip_info.stdout_lines[0] }}:/opt/images /var/lib/glance/images/ nfs nfsvers=3 >> /etc/fstab when: mount_info.stdout.find('images') == -1 retries: 5 delay: 3 + tags: + - recovery diff --git a/deploy/compass_vm.sh b/deploy/compass_vm.sh index dc391acc..7e2ce40b 100755 --- a/deploy/compass_vm.sh +++ b/deploy/compass_vm.sh @@ -151,3 +151,75 @@ function launch_compass() { set +e log_info "launch_compass exit" }
+
+# Start the existing "compass" libvirt domain and wait for it via wait_ok.
+function recover_compass() {
+    log_info "recover_compass enter"
+    sudo virsh start compass
+    if ! wait_ok 500; then
+        log_error "recover compass timeout"
+        exit 1
+    fi
+    log_info "recover_compass exit"
+}
+
+# Ping ansible group $2 from the compass server once a second until it
+# answers ("ok") or the retry counter reaches $1 ("fail"); result on stdout.
+function _check_hosts_reachable() {
+    local retry=0
+    while true; do
+        sleep 1
+        retry=$((retry + 1))
+        if [[ $retry -ge $1 ]]; then
+            # stderr, so a stdout-printing log_error cannot pollute $(...)
+            log_error "hosts boot time out" >&2
+            echo "fail"
+            return
+        fi
+        ssh $ssh_args root@$MGMT_IP "
+            cd /var/ansible/run/$ADAPTER_NAME'-'$CLUSTER_NAME;
+            ansible -i inventories/inventory.yml $2 -m ping
+        " > /dev/null
+        if [ $? == 0 ]; then
+            break
+        fi
+    done
+    echo "ok"
+}
+
+# Check the compute group ($1 tries), then the controller group (100 tries).
+# Prints "ok" only when both groups answered, otherwise "fail".
+function check_hosts_reachable() {
+    ret=$(_check_hosts_reachable $1 compute)
+    if [[ "$ret" == "fail" ]]; then
+        echo $ret
+        return
+    fi
+    ret=$(_check_hosts_reachable 100 controller)
+    echo $ret
+}
+
+# Run the playbook tasks tagged "recovery"; exit 1 if the playbook fails.
+function recover_hosts() {
+    ssh $ssh_args root@$MGMT_IP "
+        cd /var/ansible/run/$ADAPTER_NAME'-'$CLUSTER_NAME;
+        ansible-playbook \
+        -i inventories/inventory.yml HA-ansible-multinodes.yml \
+        -t recovery \
+        -e 'RECOVERY_ENV=True'
+    " || { log_error "recover hosts failed"; exit 1; }
+    echo "Recovery Complete!"
+}
+
+# Sleep 100s, run the tasks tagged "recovery-stop-service", then sleep 30s.
+function wait_controller_nodes_ok() {
+    sleep 100
+    ssh $ssh_args root@$MGMT_IP "
+        cd /var/ansible/run/$ADAPTER_NAME'-'$CLUSTER_NAME;
+        ansible-playbook \
+        -i inventories/inventory.yml HA-ansible-multinodes.yml \
+        -t recovery-stop-service \
+        -e 'RECOVERY_ENV=True'
+    "
+    sleep 30
+}
diff --git a/deploy/host_virtual.sh b/deploy/host_virtual.sh index 2fab2c9d..0a991f11 100755 --- a/deploy/host_virtual.sh +++ b/deploy/host_virtual.sh @@ -54,6 +54,19 @@ function launch_host_vms() { IFS=$old_ifs } +function recover_host_vms() { + old_ifs=$IFS + IFS=, + + for host in $HOSTNAMES; do + sudo virsh destroy $host + sleep 2 + sudo virsh start $host + sleep 2 + done + IFS=$old_ifs +} + function get_host_macs() { local mac_generator=${COMPASS_DIR}/deploy/mac_generator.sh local machines= diff --git a/deploy/launch.sh b/deploy/launch.sh index 976af3c0..6db9f362 100755 --- a/deploy/launch.sh +++ b/deploy/launch.sh @@ -12,7 +12,8 @@ WORK_DIR=$COMPASS_DIR/work/deploy mkdir -p $WORK_DIR/script -export DEPLOY_FIRST_TIME=${DEPLOY_FIRST_TIME-"true"} +export DEPLOY_FIRST_TIME=${DEPLOY_FIRST_TIME:-"true"} +export DEPLOY_RECOVERY=${DEPLOY_RECOVERY:-"false"} source ${COMPASS_DIR}/deploy/prepare.sh prepare_python_env @@ -31,9 +32,14 @@ source ${COMPASS_DIR}/deploy/compass_vm.sh source ${COMPASS_DIR}/deploy/deploy_host.sh ######################### main process -if [[ "$EXPANSION" == "false" ]] -then +if [[ "$DEPLOY_RECOVERY" == "true" ]]; then + source ${COMPASS_DIR}/deploy/recovery.sh + recover_cluster + exit 0 +fi + +if [[ "$EXPANSION" == "false" ]]; then print_logo if [[ ! 
-z $VIRT_NUMBER ]];then diff --git a/deploy/network.sh b/deploy/network.sh index 46b8c023..6c678222 100755 --- a/deploy/network.sh +++ b/deploy/network.sh @@ -29,6 +29,13 @@ function setup_bridge_net() sudo virsh net-start $net_name } +function recover_bridge_net() +{ + net_name=$1 + # $1 is the name of the libvirt network to start + sudo virsh net-start $net_name +} + function save_network_info() { sudo ovs-vsctl list-br |grep br-external @@ -69,6 +76,13 @@ function setup_bridge_external() python $COMPASS_DIR/deploy/setup_vnic.py } +function recover_bridge_external() +{ + sudo virsh net-start external + # re-run the vnic setup script once the external network is started + python $COMPASS_DIR/deploy/setup_vnic.py +} + function setup_nat_net() { net_name=$1 gw=$2 @@ -92,11 +106,20 @@ function setup_nat_net() { sudo virsh net-start $net_name } +function recover_nat_net() { + net_name=$1 + # $1 is the name of the libvirt network to start + sudo virsh net-start $net_name +} function setup_virtual_net() { setup_nat_net install $INSTALL_GW $INSTALL_MASK } +function recover_virtual_net() { + recover_nat_net install +} + function setup_baremetal_net() { if [[ -z $INSTALL_NIC ]]; then exit 1 @@ -104,6 +127,13 @@ function setup_baremetal_net() { setup_bridge_net install $INSTALL_NIC } +function recover_baremetal_net() { + if [[ -z $INSTALL_NIC ]]; then + exit 1 + fi + recover_bridge_net install +} + function setup_network_boot_scripts() { sudo cp $COMPASS_DIR/deploy/network.sh /usr/sbin/network_setup sudo chmod +777 /usr/sbin/network_setup @@ -134,3 +164,14 @@ function create_nets() { setup_network_boot_scripts } +function recover_nets() { + recover_nat_net mgmt + + # recover install network: $TYPE picks recover_virtual_net or recover_baremetal_net + recover_"$TYPE"_net + + # recover external network + recover_bridge_external + clear_forward_rejct_rules +} + diff --git a/deploy/recovery.sh b/deploy/recovery.sh new file mode 100644 index 00000000..db85848f --- /dev/null +++ b/deploy/recovery.sh @@ -0,0 +1,40 @@ +#!/bin/bash +############################################################################## +# Copyright (c) 2016 HUAWEI TECHNOLOGIES CO.,LTD and others. +# +# All rights reserved. 
This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +##############################################################################
+
+# Recover a deployed cluster: restart the networks and the compass VM, then
+# restart the hosts (at most MAX_RETRY_TIMES rounds) until ansible reaches
+# them, then run the recovery playbook steps. Exits 1 if hosts stay down.
+function recover_cluster() {
+    recover_nets
+    recover_compass
+    local attempt=0 ret=""
+    MAX_RETRY_TIMES=2
+    while [ $attempt -lt $MAX_RETRY_TIMES ]; do
+        attempt=$((attempt + 1))
+        if [[ -n "$VIRT_NUMBER" ]]; then
+            recover_host_vms
+        else
+            reboot_hosts
+        fi
+        ret=$(check_hosts_reachable 500)
+        if [[ "$ret" == "ok" ]]; then
+            break
+        fi
+    done
+    # Decide on the last check result, not the counter: a success in the
+    # final round also leaves the counter equal to MAX_RETRY_TIMES.
+    if [[ "$ret" != "ok" ]]; then
+        echo "Recovery Failure !!!"
+        exit 1
+    fi
+    wait_controller_nodes_ok
+    recover_hosts
+}
+
diff --git a/deploy/template/power/ipmitool.tmpl b/deploy/template/power/ipmitool.tmpl index a297e001..048e997a 100644 --- a/deploy/template/power/ipmitool.tmpl +++ b/deploy/template/power/ipmitool.tmpl @@ -40,19 +40,23 @@ for i in {1..5}; do fi done sleep 1 -for i in {1..5}; do - if ipmitool -I $interface -H $ipmiIp -U $ipmiUser -P $ipmiPass chassis bootdev pxe >/dev/null 2>&1 - then - break - elif [[ i -lt 5 ]] - then - sleep 1 - else - log_error "set $ipmiIp pxe fail" - exit 1 - fi -done -sleep 1 + +if [[ "\$DEPLOY_RECOVERY" != "true" ]]; then + for i in {1..5}; do + if ipmitool -I $interface -H $ipmiIp -U $ipmiUser -P $ipmiPass chassis bootdev pxe >/dev/null 2>&1 + then + break + elif [[ i -lt 5 ]] + then + sleep 1 + else + log_error "set $ipmiIp pxe fail" + exit 1 + fi + done + sleep 1 +fi + for i in {1..5}; do if ipmitool -I $interface -H $ipmiIp -U $ipmiUser -P $ipmiPass chassis power reset >/dev/null 2>&1 then -- cgit 1.2.3-korg