summaryrefslogtreecommitdiffstats
path: root/src/ceph/qa/tasks/dump_stuck.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/ceph/qa/tasks/dump_stuck.py')
-rw-r--r--src/ceph/qa/tasks/dump_stuck.py162
1 files changed, 0 insertions, 162 deletions
diff --git a/src/ceph/qa/tasks/dump_stuck.py b/src/ceph/qa/tasks/dump_stuck.py
deleted file mode 100644
index 39429d2..0000000
--- a/src/ceph/qa/tasks/dump_stuck.py
+++ /dev/null
@@ -1,162 +0,0 @@
-"""
-Dump_stuck command
-"""
-import logging
-import re
-import time
-
-import ceph_manager
-from teuthology import misc as teuthology
-
-
-log = logging.getLogger(__name__)
-
-def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10):
- """
- Do checks. Make sure get_stuck_pgs returns the right amount of information, then
- extract health information from the raw_cluster_cmd and compare the results with
- values passed in. This passes if all asserts pass.
-
- :param manager: Ceph manager
- :param num_inactive: number of inactive PGs that are stuck
- :param num_unclean: number of unclean PGs that are stuck
- :param num_stale: number of stale PGs that are stuck
- :param timeout: timeout value for get_stuck_pgs calls
- """
- inactive = manager.get_stuck_pgs('inactive', timeout)
- unclean = manager.get_stuck_pgs('unclean', timeout)
- stale = manager.get_stuck_pgs('stale', timeout)
- log.info('inactive %s / %d, unclean %s / %d, stale %s / %d',
- len(inactive), num_inactive,
- len(unclean), num_unclean,
- len(stale), num_stale)
- assert len(inactive) == num_inactive
- assert len(unclean) == num_unclean
- assert len(stale) == num_stale
-
-def task(ctx, config):
- """
- Test the dump_stuck command.
-
- :param ctx: Context
- :param config: Configuration
- """
- assert config is None, \
- 'dump_stuck requires no configuration'
- assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
- 'dump_stuck requires exactly 2 osds'
-
- timeout = 60
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
-
- manager = ceph_manager.CephManager(
- mon,
- ctx=ctx,
- logger=log.getChild('ceph_manager'),
- )
-
- manager.flush_pg_stats([0, 1])
- manager.wait_for_clean(timeout)
-
- manager.raw_cluster_cmd('tell', 'mon.0', 'injectargs', '--',
-# '--mon-osd-report-timeout 90',
- '--mon-pg-stuck-threshold 10')
-
- # all active+clean
- check_stuck(
- manager,
- num_inactive=0,
- num_unclean=0,
- num_stale=0,
- )
- num_pgs = manager.get_num_pgs()
-
- manager.mark_out_osd(0)
- time.sleep(timeout)
- manager.flush_pg_stats([1])
- manager.wait_for_recovery(timeout)
-
- # all active+clean+remapped
- check_stuck(
- manager,
- num_inactive=0,
- num_unclean=0,
- num_stale=0,
- )
-
- manager.mark_in_osd(0)
- manager.flush_pg_stats([0, 1])
- manager.wait_for_clean(timeout)
-
- # all active+clean
- check_stuck(
- manager,
- num_inactive=0,
- num_unclean=0,
- num_stale=0,
- )
-
- log.info('stopping first osd')
- manager.kill_osd(0)
- manager.mark_down_osd(0)
- manager.wait_for_active(timeout)
-
- log.info('waiting for all to be unclean')
- starttime = time.time()
- done = False
- while not done:
- try:
- check_stuck(
- manager,
- num_inactive=0,
- num_unclean=num_pgs,
- num_stale=0,
- )
- done = True
- except AssertionError:
- # wait up to 15 minutes to become unclean
- if time.time() - starttime > 900:
- raise
-
-
- log.info('stopping second osd')
- manager.kill_osd(1)
- manager.mark_down_osd(1)
-
- log.info('waiting for all to be stale')
- starttime = time.time()
- done = False
- while not done:
- try:
- check_stuck(
- manager,
- num_inactive=0,
- num_unclean=num_pgs,
- num_stale=num_pgs,
- )
- done = True
- except AssertionError:
- # wait up to 15 minutes to become stale
- if time.time() - starttime > 900:
- raise
-
- log.info('reviving')
- for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
- manager.revive_osd(id_)
- manager.mark_in_osd(id_)
- while True:
- try:
- manager.flush_pg_stats([0, 1])
- break
- except Exception:
- log.exception('osds must not be started yet, waiting...')
- time.sleep(1)
- manager.wait_for_clean(timeout)
-
- check_stuck(
- manager,
- num_inactive=0,
- num_unclean=0,
- num_stale=0,
- )