Diffstat (limited to 'src/ceph/qa/tasks/thrashosds.py')
-rw-r--r--  src/ceph/qa/tasks/thrashosds.py  204
1 file changed, 0 insertions, 204 deletions
diff --git a/src/ceph/qa/tasks/thrashosds.py b/src/ceph/qa/tasks/thrashosds.py
deleted file mode 100644
index 420b735..0000000
--- a/src/ceph/qa/tasks/thrashosds.py
+++ /dev/null
@@ -1,204 +0,0 @@
-"""
-Thrash -- Simulate random osd failures.
-"""
-import contextlib
-import logging
-import ceph_manager
-from teuthology import misc as teuthology
-
-
-log = logging.getLogger(__name__)
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- "Thrash" the OSDs by randomly marking them out/down (and then back
- in) until the task is ended. This loops, and every op_delay
- seconds it randomly chooses to add or remove an OSD (even odds)
- unless there are fewer than min_out OSDs out of the cluster, or
- more than min_in OSDs in the cluster.
-
- All commands are run on mon0 and it stops when __exit__ is called.
-
- The config is optional, and is a dict containing some or all of:
-
- cluster: (default 'ceph') the name of the cluster to thrash
-
- min_in: (default 4) the minimum number of OSDs to keep in the
- cluster
-
- min_out: (default 0) the minimum number of OSDs to keep out of the
- cluster
-
- op_delay: (5) the length of time to sleep between changing an
- OSD's status
-
- min_dead: (0) minimum number of osds to leave down/dead.
-
- max_dead: (0) maximum number of osds to leave down/dead before waiting
- for clean. This should probably be num_replicas - 1.
-
- clean_interval: (60) the approximate length of time to loop before
- waiting until the cluster goes clean. (In reality this is used
- to probabilistically choose when to wait, and the method used
- makes it closer to -- but not identical to -- the half-life.)
-
- scrub_interval: (-1) the approximate length of time to loop before
- waiting until a scrub is performed while cleaning. (In reality
- this is used to probabilistically choose when to wait, and it
- only applies to the cases where cleaning is being performed).
- -1 is used to indicate that no scrubbing will be done.
-
- chance_down: (0.4) the probability that the thrasher will mark an
- OSD down rather than marking it out. (The thrasher will not
- consider that OSD out of the cluster, since presently an OSD
- wrongly marked down will mark itself back up again.) This value
- can be either an integer (eg, 75) or a float probability (eg
- 0.75).
-
- chance_test_min_size: (0) chance to run test_pool_min_size,
- which:
- - kills all but one osd
- - waits
- - kills that osd
- - revives all other osds
- - verifies that the osds fully recover
-
- timeout: (360) the number of seconds to wait for the cluster
- to become clean after each cluster change. If this doesn't
- happen within the timeout, an exception will be raised.
-
- revive_timeout: (150) number of seconds to wait for an osd asok to
- appear after attempting to revive the osd
-
- thrash_primary_affinity: (true) randomly adjust primary-affinity
-
- chance_pgnum_grow: (0) chance to increase a pool's pg_num
- chance_pgpnum_fix: (0) chance to set a pool's pgp_num equal to its pg_num
- pool_grow_by: (10) amount to increase pgnum by
- max_pgs_per_pool_osd: (1200) don't expand pools past this size per osd
-
- pause_short: (3) duration of short pause
- pause_long: (80) duration of long pause
- pause_check_after: (50) assert osd down after this long
- chance_inject_pause_short: (1) chance of injecting short stall
- chance_inject_pause_long: (0) chance of injecting long stall
-
- clean_wait: (0) duration to wait before resuming thrashing once clean
-
- sighup_delay: (0.1) duration to delay between sending signal.SIGHUP to a
- random live osd
-
- powercycle: (false) whether to power cycle the node instead
- of just the osd process. Note that this assumes that a single
- osd is the only important process on the node.
-
- bdev_inject_crash: (0) seconds to delay while inducing a synthetic crash.
- The delay lets the BlockDevice "accept" more aio operations but blocks
- any flush, and then eventually crashes (losing some or all IOs). If 0,
- no bdev failure injection is enabled.
-
- bdev_inject_crash_probability: (.5) probability of doing a bdev failure
- injection crash vs a normal OSD kill.
-
- chance_test_backfill_full: (0) chance to simulate full disks stopping
- backfill
-
- chance_test_map_discontinuity: (0) chance to test map discontinuity
- map_discontinuity_sleep_time: (40) time to wait for map trims
-
- ceph_objectstore_tool: (true) whether to export/import a pg while an osd is down
- chance_move_pg: (1.0) chance of moving a pg if more than 1 osd is down (default 100%)
-
- optrack_toggle_delay: (2.0) duration to delay between toggling op tracker
- enablement on all osds
-
- dump_ops_enable: (true) continuously dump ops on all live osds
-
- noscrub_toggle_delay: (2.0) duration to delay between toggling noscrub
-
- disable_objectstore_tool_tests: (false) disable ceph_objectstore_tool based
- tests
-
- chance_thrash_cluster_full: .05
-
- chance_thrash_pg_upmap: 1.0
- chance_thrash_pg_upmap_items: 1.0
-
- example:
-
- tasks:
- - ceph:
- - thrashosds:
- cluster: ceph
- chance_down: 10
- op_delay: 3
- min_in: 1
- timeout: 600
- - interactive:
- """
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'thrashosds task only accepts a dict for configuration'
- # add default value for sighup_delay
- config['sighup_delay'] = config.get('sighup_delay', 0.1)
- # add default value for optrack_toggle_delay
- config['optrack_toggle_delay'] = config.get('optrack_toggle_delay', 2.0)
- # add default value for dump_ops_enable
- config['dump_ops_enable'] = config.get('dump_ops_enable', "true")
- # add default value for noscrub_toggle_delay
- config['noscrub_toggle_delay'] = config.get('noscrub_toggle_delay', 2.0)
- # add default value for random_eio
- config['random_eio'] = config.get('random_eio', 0.0)
-
- log.info("config is {config}".format(config=str(config)))
-
- overrides = ctx.config.get('overrides', {})
- log.info("overrides is {overrides}".format(overrides=str(overrides)))
- teuthology.deep_merge(config, overrides.get('thrashosds', {}))
- cluster = config.get('cluster', 'ceph')
-
- log.info("config is {config}".format(config=str(config)))
-
- if 'powercycle' in config:
-
- # sync everyone first to avoid collateral damage to / etc.
- log.info('Doing preliminary sync to avoid collateral damage...')
- ctx.cluster.run(args=['sync'])
-
- if 'ipmi_user' in ctx.teuthology_config:
- for remote in ctx.cluster.remotes.keys():
- log.debug('checking console status of %s' % remote.shortname)
- if not remote.console.check_status():
- log.warn('Failed to get console status for %s',
- remote.shortname)
-
- # check that all osd remotes have a valid console
- osds = ctx.cluster.only(teuthology.is_type('osd', cluster))
- for remote in osds.remotes.keys():
- if not remote.console.has_ipmi_credentials:
- raise Exception(
- 'IPMI console required for powercycling, '
- 'but not available on osd role: {r}'.format(
- r=remote.name))
-
- cluster_manager = ctx.managers[cluster]
- for f in ['powercycle', 'bdev_inject_crash']:
- if config.get(f):
- cluster_manager.config[f] = config.get(f)
-
- log.info('Beginning thrashosds...')
- thrash_proc = ceph_manager.Thrasher(
- cluster_manager,
- config,
- logger=log.getChild('thrasher')
- )
- try:
- yield
- finally:
- log.info('joining thrashosds')
- thrash_proc.do_join()
- cluster_manager.wait_for_all_osds_up()
- cluster_manager.flush_all_pg_stats()
- cluster_manager.wait_for_recovery(config.get('timeout', 360))
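
The loop described in the docstring above -- every op_delay seconds, choose to mark an OSD out or back in with even odds, while never dropping below min_in OSDs in or min_out OSDs out of the cluster -- can be modelled with a small standalone sketch. This is only an illustration of that policy, not the ceph_manager.Thrasher implementation; the function and variable names below are made up for the example:

    # Conceptual sketch only -- not the ceph_manager.Thrasher implementation.
    # Every iteration picks "mark out" or "mark back in" with even odds,
    # while never leaving fewer than min_in OSDs in or min_out OSDs out.
    import random
    import time


    def thrash_once(in_osds, out_osds, min_in=4, min_out=0):
        """Move one OSD between the 'in' and 'out' sets and return the move."""
        can_remove = len(in_osds) > min_in
        can_add = len(out_osds) > min_out
        if can_remove and (not can_add or random.random() < 0.5):
            osd = random.choice(sorted(in_osds))
            in_osds.remove(osd)
            out_osds.add(osd)
            return ('out', osd)
        if can_add:
            osd = random.choice(sorted(out_osds))
            out_osds.remove(osd)
            in_osds.add(osd)
            return ('in', osd)
        return None


    if __name__ == '__main__':
        in_osds, out_osds = set(range(6)), set()
        for _ in range(10):
            print(thrash_once(in_osds, out_osds, min_in=4, min_out=0))
            time.sleep(0.01)   # stands in for op_delay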
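
Similarly, the default-filling and override handling at the top of task() follows the usual teuthology pattern: fill per-option defaults into the task config, then deep-merge any overrides: {thrashosds: ...} section from the job YAML on top, with the overrides taking precedence. A rough self-contained sketch of that behaviour (the local deep_merge here is a simplified stand-in for teuthology.misc.deep_merge and only handles nested dicts):

    # Simplified stand-in for the default + overrides merging done in task().
    # teuthology.misc.deep_merge is the real helper; this local version only
    # handles nested dicts, which is enough to show the behaviour.
    def deep_merge(base, overrides):
        for key, value in overrides.items():
            if isinstance(value, dict) and isinstance(base.get(key), dict):
                deep_merge(base[key], value)
            else:
                base[key] = value
        return base


    config = {'chance_down': 10, 'op_delay': 3}    # from the thrashosds: task entry
    config.setdefault('sighup_delay', 0.1)         # defaults filled in by task()
    config.setdefault('optrack_toggle_delay', 2.0)
    overrides = {'thrashosds': {'op_delay': 5, 'timeout': 600}}  # overrides: section
    deep_merge(config, overrides.get('thrashosds', {}))
    print(config)   # op_delay comes out as 5; timeout 600 is added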