From a3df16776dd5d7eb0a60ca4c58cef9913eb1c5cb Mon Sep 17 00:00:00 2001 From: Giulio Fidente Date: Fri, 17 Feb 2017 16:49:58 +0100 Subject: Add checks in ansible upgrade tasks for CephMon and CephOSD Adds two checks, one for the CephMon and one for the CephOSD upgrade tasks borrowed from ceph-ansible. Change-Id: I0a0e60d277240130c6bd76a74ccc13354b87a30a Co-Authored-By: Sebastien Han --- extraconfig/tasks/major_upgrade_pacemaker.yaml | 4 --- puppet/services/ceph-mon.yaml | 43 +++++++++++++++------- puppet/services/ceph-osd.yaml | 50 ++++++++++++++++++++++---- puppet/services/ceph-rgw.yaml | 4 ++- 4 files changed, 78 insertions(+), 23 deletions(-) diff --git a/extraconfig/tasks/major_upgrade_pacemaker.yaml b/extraconfig/tasks/major_upgrade_pacemaker.yaml index 8c91027d..74d3be71 100644 --- a/extraconfig/tasks/major_upgrade_pacemaker.yaml +++ b/extraconfig/tasks/major_upgrade_pacemaker.yaml @@ -18,10 +18,6 @@ parameters: constraints: - allowed_values: ['auto', 'yes', 'no'] default: 'auto' - IgnoreCephUpgradeWarnings: - type: boolean - default: false - description: If enabled, Ceph upgrade will be forced even though cluster or PGs status is not clean KeepSaharaServicesOnUpgrade: type: boolean default: true diff --git a/puppet/services/ceph-mon.yaml b/puppet/services/ceph-mon.yaml index 1ce58335..d589ef89 100644 --- a/puppet/services/ceph-mon.yaml +++ b/puppet/services/ceph-mon.yaml @@ -59,6 +59,14 @@ parameters: } default: {} type: json + CephValidationRetries: + type: number + default: 5 + description: Number of retry attempts for Ceph validation + CephValidationDelay: + type: number + default: 10 + description: Interval (in seconds) in between validation checks MonitoringSubscriptionCephMon: default: 'overcloud-ceph-mon' type: string @@ -119,21 +127,32 @@ outputs: # rolling upgrade of all osd nodes in step1 - name: Check status tags: step0,validation - shell: ceph health | grep -qv HEALTH_ERR - # FIXME(shardy) I suspect we can use heat or ansible facts here instead? - - name: Get hostname + shell: ceph health | egrep -sq "HEALTH_OK|HEALTH_WARN" + - name: Stop CephMon tags: step0 - shell: hostname -s - register: mon_id - - name: Stop Ceph Mon + service: + name: ceph-mon@{{ ansible_hostname }} + state: stopped + - name: Update Ceph packages tags: step0 - service: name=ceph-mon@{{mon_id.stdout}} pattern=ceph-mon state=stopped - - name: Update ceph packages + yum: + name: ceph-mon + state: latest + - name: Start CephMon tags: step0 - yum: name=ceph-mon state=latest - - name: Start ceph-mon service - tags: step0 - service: name=ceph-mon@{{mon_id.stdout}} state=started + service: + name: ceph-mon@{{ ansible_hostname }} + state: started + # ceph-ansible + # https://github.com/ceph/ceph-ansible/blob/master/infrastructure-playbooks/rolling_update.yml#L149-L157 + - name: Wait for the monitor to join the quorum... + tags: step0,ceph_quorum_validation + shell: | + ceph -s | grep monmap | sed 's/.*quorum//' | egrep -sq {{ ansible_hostname }} + register: ceph_quorum_nodecheck + until: ceph_quorum_nodecheck.rc == 0 + retries: {get_param: CephValidationRetries} + delay: {get_param: CephValidationDelay} - name: ceph osd crush tunables default tags: step0 shell: ceph osd crush tunables default diff --git a/puppet/services/ceph-osd.yaml b/puppet/services/ceph-osd.yaml index 9bd83aab..a97fa116 100644 --- a/puppet/services/ceph-osd.yaml +++ b/puppet/services/ceph-osd.yaml @@ -21,6 +21,24 @@ parameters: MonitoringSubscriptionCephOsd: default: 'overcloud-ceph-osd' type: string + CephValidationRetries: + type: number + default: 40 + description: Number of retry attempts for Ceph validation + CephValidationDelay: + type: number + default: 30 + description: Interval (in seconds) in between validation checks + IgnoreCephUpgradeWarnings: + type: boolean + default: false + description: If enabled, Ceph upgrade will be forced even though cluster or PGs status is not clean + +parameter_groups: +- label: deprecated + description: Do not use deprecated params, they will be removed. + parameters: + - IgnoreCephUpgradeWarnings resources: CephBase: @@ -66,17 +84,37 @@ outputs: - name: ceph osd set noscrub tags: step1 command: ceph osd set noscrub - - name: Stop Ceph OSD + - name: Stop CephOSD tags: step1 - service: name=ceph-osd@{{ item }} state=stopped + service: + name: ceph-osd@{{ item }} + state: stopped with_items: "{{osd_ids.stdout.strip().split()}}" - - name: Update ceph OSD packages + - name: Update Ceph packages tags: step1 - yum: name=ceph-osd state=latest - - name: Start ceph-osd service + yum: + name: ceph-osd + state: latest + - name: Start CephOSD tags: step1 - service: name=ceph-osd@{{ item }} state=started + service: + name: ceph-osd@{{ item }} + state: started with_items: "{{osd_ids.stdout.strip().split()}}" + # with awk we are meant to check if $2 and $4 are *the same* but it returns 1 when + # they are, so the check is inverted to produce an useful exit code + - name: Wait for clean pgs... + tags: step1,ceph_pgs_clean_validation + vars: + ignore_warnings: {get_param: IgnoreCephUpgradeWarnings} + shell: | + ceph pg stat | awk '{exit($2!=$4)}' && ceph health | egrep -sq "HEALTH_OK|HEALTH_WARN" + register: ceph_pgs_healthcheck + until: ceph_pgs_healthcheck.rc == 0 + retries: {get_param: CephValidationRetries} + delay: {get_param: CephValidationDelay} + when: + - not ignore_warnings - name: ceph osd unset noout tags: step1 command: ceph osd unset noout diff --git a/puppet/services/ceph-rgw.yaml b/puppet/services/ceph-rgw.yaml index d7014e54..c5b29c7e 100644 --- a/puppet/services/ceph-rgw.yaml +++ b/puppet/services/ceph-rgw.yaml @@ -87,4 +87,6 @@ outputs: tags: step0,validation - name: Stop RGW instance tags: step1 - service: name=ceph-radosgw@{{rgw_id.stdout}} state=stopped + service: + name: ceph-radosgw@{{rgw_id.stdout}} + state: stopped -- cgit 1.2.3-korg