aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGiulio Fidente <gfidente@redhat.com>2017-02-17 16:49:58 +0100
committerGiulio Fidente <gfidente@redhat.com>2017-02-24 12:04:31 -0500
commita41c5ebcc23ffc1df2f3baa6dcc173e8ad6143bf (patch)
treebdc63bdd3739a51a3c0abb4fbfb6b7d8cf9236e7
parent38cbdc5424096de93c73116123f45436a35a6884 (diff)
Add checks in ansible upgrade tasks for CephMon and CephOSD
Adds two checks, one for the CephMon and one for the CephOSD upgrade tasks borrowed from ceph-ansible. Change-Id: I0a0e60d277240130c6bd76a74ccc13354b87a30a Co-Authored-By: Sebastien Han <seb@redhat.com> (cherry picked from commit a3df16776dd5d7eb0a60ca4c58cef9913eb1c5cb)
-rw-r--r--extraconfig/tasks/major_upgrade_pacemaker.yaml4
-rw-r--r--puppet/services/ceph-mon.yaml43
-rw-r--r--puppet/services/ceph-osd.yaml50
-rw-r--r--puppet/services/ceph-rgw.yaml4
4 files changed, 78 insertions, 23 deletions
diff --git a/extraconfig/tasks/major_upgrade_pacemaker.yaml b/extraconfig/tasks/major_upgrade_pacemaker.yaml
index 8c91027d..74d3be71 100644
--- a/extraconfig/tasks/major_upgrade_pacemaker.yaml
+++ b/extraconfig/tasks/major_upgrade_pacemaker.yaml
@@ -18,10 +18,6 @@ parameters:
constraints:
- allowed_values: ['auto', 'yes', 'no']
default: 'auto'
- IgnoreCephUpgradeWarnings:
- type: boolean
- default: false
- description: If enabled, Ceph upgrade will be forced even though cluster or PGs status is not clean
KeepSaharaServicesOnUpgrade:
type: boolean
default: true
diff --git a/puppet/services/ceph-mon.yaml b/puppet/services/ceph-mon.yaml
index 1ce58335..d589ef89 100644
--- a/puppet/services/ceph-mon.yaml
+++ b/puppet/services/ceph-mon.yaml
@@ -59,6 +59,14 @@ parameters:
}
default: {}
type: json
+ CephValidationRetries:
+ type: number
+ default: 5
+ description: Number of retry attempts for Ceph validation
+ CephValidationDelay:
+ type: number
+ default: 10
+ description: Interval (in seconds) in between validation checks
MonitoringSubscriptionCephMon:
default: 'overcloud-ceph-mon'
type: string
@@ -119,21 +127,32 @@ outputs:
# rolling upgrade of all osd nodes in step1
- name: Check status
tags: step0,validation
- shell: ceph health | grep -qv HEALTH_ERR
- # FIXME(shardy) I suspect we can use heat or ansible facts here instead?
- - name: Get hostname
+ shell: ceph health | egrep -sq "HEALTH_OK|HEALTH_WARN"
+ - name: Stop CephMon
tags: step0
- shell: hostname -s
- register: mon_id
- - name: Stop Ceph Mon
+ service:
+ name: ceph-mon@{{ ansible_hostname }}
+ state: stopped
+ - name: Update Ceph packages
tags: step0
- service: name=ceph-mon@{{mon_id.stdout}} pattern=ceph-mon state=stopped
- - name: Update ceph packages
+ yum:
+ name: ceph-mon
+ state: latest
+ - name: Start CephMon
tags: step0
- yum: name=ceph-mon state=latest
- - name: Start ceph-mon service
- tags: step0
- service: name=ceph-mon@{{mon_id.stdout}} state=started
+ service:
+ name: ceph-mon@{{ ansible_hostname }}
+ state: started
+ # ceph-ansible
+ # https://github.com/ceph/ceph-ansible/blob/master/infrastructure-playbooks/rolling_update.yml#L149-L157
+ - name: Wait for the monitor to join the quorum...
+ tags: step0,ceph_quorum_validation
+ shell: |
+ ceph -s | grep monmap | sed 's/.*quorum//' | egrep -sq {{ ansible_hostname }}
+ register: ceph_quorum_nodecheck
+ until: ceph_quorum_nodecheck.rc == 0
+ retries: {get_param: CephValidationRetries}
+ delay: {get_param: CephValidationDelay}
- name: ceph osd crush tunables default
tags: step0
shell: ceph osd crush tunables default
diff --git a/puppet/services/ceph-osd.yaml b/puppet/services/ceph-osd.yaml
index 9bd83aab..a97fa116 100644
--- a/puppet/services/ceph-osd.yaml
+++ b/puppet/services/ceph-osd.yaml
@@ -21,6 +21,24 @@ parameters:
MonitoringSubscriptionCephOsd:
default: 'overcloud-ceph-osd'
type: string
+ CephValidationRetries:
+ type: number
+ default: 40
+ description: Number of retry attempts for Ceph validation
+ CephValidationDelay:
+ type: number
+ default: 30
+ description: Interval (in seconds) in between validation checks
+ IgnoreCephUpgradeWarnings:
+ type: boolean
+ default: false
+ description: If enabled, Ceph upgrade will be forced even though cluster or PGs status is not clean
+
+parameter_groups:
+- label: deprecated
+ description: Do not use deprecated params, they will be removed.
+ parameters:
+ - IgnoreCephUpgradeWarnings
resources:
CephBase:
@@ -66,17 +84,37 @@ outputs:
- name: ceph osd set noscrub
tags: step1
command: ceph osd set noscrub
- - name: Stop Ceph OSD
+ - name: Stop CephOSD
tags: step1
- service: name=ceph-osd@{{ item }} state=stopped
+ service:
+ name: ceph-osd@{{ item }}
+ state: stopped
with_items: "{{osd_ids.stdout.strip().split()}}"
- - name: Update ceph OSD packages
+ - name: Update Ceph packages
tags: step1
- yum: name=ceph-osd state=latest
- - name: Start ceph-osd service
+ yum:
+ name: ceph-osd
+ state: latest
+ - name: Start CephOSD
tags: step1
- service: name=ceph-osd@{{ item }} state=started
+ service:
+ name: ceph-osd@{{ item }}
+ state: started
with_items: "{{osd_ids.stdout.strip().split()}}"
+ # with awk we are meant to check if $2 and $4 are *the same* but it returns 1 when
+ # they are, so the check is inverted to produce an useful exit code
+ - name: Wait for clean pgs...
+ tags: step1,ceph_pgs_clean_validation
+ vars:
+ ignore_warnings: {get_param: IgnoreCephUpgradeWarnings}
+ shell: |
+ ceph pg stat | awk '{exit($2!=$4)}' && ceph health | egrep -sq "HEALTH_OK|HEALTH_WARN"
+ register: ceph_pgs_healthcheck
+ until: ceph_pgs_healthcheck.rc == 0
+ retries: {get_param: CephValidationRetries}
+ delay: {get_param: CephValidationDelay}
+ when:
+ - not ignore_warnings
- name: ceph osd unset noout
tags: step1
command: ceph osd unset noout
diff --git a/puppet/services/ceph-rgw.yaml b/puppet/services/ceph-rgw.yaml
index d7014e54..c5b29c7e 100644
--- a/puppet/services/ceph-rgw.yaml
+++ b/puppet/services/ceph-rgw.yaml
@@ -87,4 +87,6 @@ outputs:
tags: step0,validation
- name: Stop RGW instance
tags: step1
- service: name=ceph-radosgw@{{rgw_id.stdout}} state=stopped
+ service:
+ name: ceph-radosgw@{{rgw_id.stdout}}
+ state: stopped