summaryrefslogtreecommitdiffstats
path: root/src/ceph/qa/tasks/mon_thrash.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/ceph/qa/tasks/mon_thrash.py')
-rw-r--r--src/ceph/qa/tasks/mon_thrash.py343
1 files changed, 0 insertions, 343 deletions
diff --git a/src/ceph/qa/tasks/mon_thrash.py b/src/ceph/qa/tasks/mon_thrash.py
deleted file mode 100644
index 0754bcd..0000000
--- a/src/ceph/qa/tasks/mon_thrash.py
+++ /dev/null
@@ -1,343 +0,0 @@
-"""
-Monitor thrash
-"""
-import logging
-import contextlib
-import ceph_manager
-import random
-import time
-import gevent
-import json
-import math
-from teuthology import misc as teuthology
-
-log = logging.getLogger(__name__)
-
def _get_mons(ctx):
    """
    Return the list of monitor ids for the given context.

    Each name reported by teuthology.get_mon_names() has its first
    len('mon.') characters removed before being returned.
    """
    prefix_len = len('mon.')
    mon_ids = []
    for full_name in teuthology.get_mon_names(ctx):
        mon_ids.append(full_name[prefix_len:])
    return mon_ids
-
class MonitorThrasher:
    """
    Background thrasher that repeatedly kills, freezes and revives monitors.

    How it works::

    - pick one monitor (or several, if 'thrash_many' is set)
    - optionally force a store sync on it, then kill it
    - optionally freeze (SIGSTOP) and unfreeze (SIGCONT) other monitors
    - wait for quorum to be formed (only if 'maintain_quorum' is set)
    - sleep for 'revive_delay' seconds
    - revive the killed monitor(s)
    - wait for quorum to be formed
    - optionally trigger a scrub
    - sleep for 'thrash_delay' seconds

    Options::

    seed                Seed to use on the RNG to reproduce a previous
                        behaviour (default: None; i.e., not set, in which
                        case the current time is used)
    revive_delay        Number of seconds to wait before reviving
                        the monitor (default: 10)
    thrash_delay        Number of seconds to wait in-between
                        test iterations (default: 0)
    store_thrash        Thrash monitor store before killing the monitor
                        being thrashed (default: False). The spelling
                        'thrash_store' is accepted as an alias.
    store_thrash_probability
                        Probability, in %, of thrashing a monitor's store
                        (default: 50). The spelling
                        'thrash_store_probability' is accepted as an alias.
    thrash_many         Thrash multiple monitors instead of just one. If
                        'maintain_quorum' is set to False, then we will
                        thrash up to as many monitors as there are
                        available. (default: False)
    maintain_quorum     Always maintain quorum, taking care on how many
                        monitors we kill during the thrashing. If we
                        happen to only have one or two monitors configured,
                        if this option is set to True, then we won't run
                        this task as we cannot guarantee maintenance of
                        quorum. Setting it to false however would allow the
                        task to run with as many as just one single monitor.
                        (default: True)
    freeze_mon_probability
                        How often to freeze a monitor that is not being
                        killed, in % (default: 10)
    freeze_mon_duration How many seconds to freeze the mon (default: 15)
    scrub               Scrub after each iteration (default: True)

    Note: if 'store_thrash' is set to True, then 'maintain_quorum' must also
          be set to True.

    For example::

    tasks:
    - ceph:
    - mon_thrash:
        revive_delay: 20
        thrash_delay: 1
        store_thrash: true
        store_thrash_probability: 40
        seed: 31337
        maintain_quorum: true
        thrash_many: true
    - ceph-fuse:
    - workunit:
        clients:
          all:
            - mon/workloadgen.sh
    """
    def __init__(self, ctx, manager, config, logger):
        """
        Parse the configuration and spawn the thrashing greenlet.

        :param ctx: teuthology context, used to enumerate the monitors
        :param manager: cluster manager used to kill, revive, signal and
                        query monitors
        :param config: dict of options (see the class docstring), or None
        :param logger: logger that receives this thrasher's messages
        :raises AssertionError: if no monitor can be killed with the given
                                configuration, or if store thrashing is
                                requested with an invalid combination of
                                options
        """
        self.ctx = ctx
        self.manager = manager
        self.manager.wait_for_clean()

        self.stopping = False
        self.logger = logger
        self.config = config

        if self.config is None:
            self.config = dict()

        # Test reproducibility: fall back to the current time when no seed
        # was supplied; the seed in use is logged by do_thrash().
        self.random_seed = self.config.get('seed', None)

        if self.random_seed is None:
            self.random_seed = int(time.time())

        self.rng = random.Random()
        self.rng.seed(int(self.random_seed))

        # Monitor thrashing
        self.revive_delay = float(self.config.get('revive_delay', 10.0))
        self.thrash_delay = float(self.config.get('thrash_delay', 0.0))

        self.thrash_many = self.config.get('thrash_many', False)
        self.maintain_quorum = self.config.get('maintain_quorum', True)

        self.scrub = self.config.get('scrub', True)

        self.freeze_mon_probability = float(
            self.config.get('freeze_mon_probability', 10))
        self.freeze_mon_duration = float(
            self.config.get('freeze_mon_duration', 15.0))

        assert self.max_killable() > 0, \
            'Unable to kill at least one monitor with the current config.'

        # Store thrashing. 'store_thrash*' is the spelling this class has
        # always read; 'thrash_store*' is accepted as well so that neither
        # spelling is silently ignored.
        self.store_thrash = self._get_option(
            ('store_thrash', 'thrash_store'), False)
        self.store_thrash_probability = int(self._get_option(
            ('store_thrash_probability', 'thrash_store_probability'), 50))
        if self.store_thrash:
            assert self.store_thrash_probability > 0, \
                'store_thrash is set, probability must be > 0'
            assert self.maintain_quorum, \
                'store_thrash = true must imply maintain_quorum = true'

        self.thread = gevent.spawn(self.do_thrash)

    def _get_option(self, names, default):
        """
        Return the value of the first key of ``names`` present in the
        config, or ``default`` if none of them is present.
        """
        for name in names:
            if name in self.config:
                return self.config[name]
        return default

    def log(self, x):
        """
        locally log info messages
        """
        self.logger.info(x)

    def do_join(self):
        """
        Ask the thrashing loop to stop and wait for its greenlet to finish.
        """
        self.stopping = True
        self.thread.get()

    def should_thrash_store(self):
        """
        If allowed, indicate that we should thrash a certain percentage of
        the time as determined by the store_thrash_probability value.
        """
        if not self.store_thrash:
            return False
        # NOTE: randrange(0, 101) yields 0..100 inclusive, so a probability
        # of 100 is "almost always" rather than "always". Kept as is so that
        # runs with a given seed stay reproducible.
        return self.rng.randrange(0, 101) < self.store_thrash_probability

    def thrash_store(self, mon):
        """
        Thrash the monitor specified.
        :param mon: monitor to thrash
        :raises AssertionError: if the forced sync reports a non-zero 'ret'
        """
        addr = self.ctx.ceph['ceph'].conf['mon.%s' % mon]['mon addr']
        self.log('thrashing mon.{id}@{addr} store'.format(id=mon, addr=addr))
        out = self.manager.raw_cluster_cmd('-m', addr, 'sync', 'force')
        j = json.loads(out)
        assert j['ret'] == 0, \
            'error forcing store sync on mon.{id}:\n{ret}'.format(
                id=mon, ret=out)

    def should_freeze_mon(self):
        """
        Indicate that we should freeze a certain percentage of the time
        as determined by the freeze_mon_probability value.
        """
        return self.rng.randrange(0, 101) < self.freeze_mon_probability

    def freeze_mon(self, mon):
        """
        Send STOP signal to freeze the monitor.
        """
        self.logger.info('Sending STOP to mon %s', mon)
        self.manager.signal_mon(mon, 19)  # STOP

    def unfreeze_mon(self, mon):
        """
        Send CONT signal to unfreeze the monitor.
        """
        self.logger.info('Sending CONT to mon %s', mon)
        self.manager.signal_mon(mon, 18)  # CONT

    def kill_mon(self, mon):
        """
        Kill the monitor specified
        """
        self.log('killing mon.{id}'.format(id=mon))
        self.manager.kill_mon(mon)

    def revive_mon(self, mon):
        """
        Revive the monitor specified
        """
        self.log('reviving mon.{id}'.format(id=mon))
        self.manager.revive_mon(mon)

    def max_killable(self):
        """
        Return the maximum number of monitors we can kill.

        With maintain_quorum set this is ceil(m/2) - 1 (never negative), so
        that a strict majority of the m monitors stays up; otherwise it is m.
        """
        m = len(_get_mons(self.ctx))
        if self.maintain_quorum:
            # math.ceil() returns a float on Python 2; convert so the result
            # can be used directly as a randrange() bound.
            return max(int(math.ceil(m / 2.0)) - 1, 0)
        return m

    def _assert_in_quorum(self, mons, quorum_size):
        """
        Assert that every monitor in ``mons`` is a leader or a peon and
        reports a quorum of exactly ``quorum_size`` members.
        """
        for m in mons:
            s = self.manager.get_mon_status(m)
            assert s['state'] == 'leader' or s['state'] == 'peon'
            assert len(s['quorum']) == quorum_size

    def _freeze_cycle(self, mons_to_freeze):
        """
        Freeze the given monitors, sleep for freeze_mon_duration seconds and
        unfreeze them again. Does nothing if the list is empty.
        """
        if not mons_to_freeze:
            return
        for mon in mons_to_freeze:
            self.freeze_mon(mon)
        self.log('waiting for {delay} secs to unfreeze mons'.format(
            delay=self.freeze_mon_duration))
        time.sleep(self.freeze_mon_duration)
        for mon in mons_to_freeze:
            self.unfreeze_mon(mon)

    def do_thrash(self):
        """
        Continuously loop and thrash the monitors until do_join() is called.
        """
        self.log('start thrashing')
        self.log('seed: {s}, revive delay: {r}, thrash delay: {t} '\
            'thrash many: {tm}, maintain quorum: {mq} '\
            'store thrash: {st}, probability: {stp} '\
            'freeze mon: prob {fp} duration {fd}'.format(
                s=self.random_seed, r=self.revive_delay, t=self.thrash_delay,
                tm=self.thrash_many, mq=self.maintain_quorum,
                st=self.store_thrash, stp=self.store_thrash_probability,
                fp=self.freeze_mon_probability, fd=self.freeze_mon_duration,
                ))

        while not self.stopping:
            mons = _get_mons(self.ctx)
            self.manager.wait_for_mon_quorum_size(len(mons))
            self.log('making sure all monitors are in the quorum')
            self._assert_in_quorum(mons, len(mons))

            # Only pick more than one victim when thrash_many was requested.
            if self.thrash_many:
                kill_up_to = self.rng.randrange(1, self.max_killable() + 1)
            else:
                kill_up_to = 1
            mons_to_kill = self.rng.sample(mons, kill_up_to)
            self.log('monitors to thrash: {m}'.format(m=mons_to_kill))

            # Monitors that are about to be killed are never frozen; the
            # short-circuit keeps the RNG untouched for them.
            mons_to_freeze = [
                mon for mon in mons
                if mon not in mons_to_kill and self.should_freeze_mon()
            ]
            self.log('monitors to freeze: {m}'.format(m=mons_to_freeze))

            for mon in mons_to_kill:
                self.log('thrashing mon.{m}'.format(m=mon))

                # we only thrash stores if we are maintaining quorum
                if self.should_thrash_store() and self.maintain_quorum:
                    self.thrash_store(mon)

                self.kill_mon(mon)

            self._freeze_cycle(mons_to_freeze)

            if self.maintain_quorum:
                remaining = len(mons) - len(mons_to_kill)
                self.manager.wait_for_mon_quorum_size(remaining)
                survivors = [m for m in mons if m not in mons_to_kill]
                self._assert_in_quorum(survivors, remaining)

            self.log('waiting for {delay} secs before reviving monitors'.format(
                delay=self.revive_delay))
            time.sleep(self.revive_delay)

            for mon in mons_to_kill:
                self.revive_mon(mon)
            # do more freezes
            self._freeze_cycle(mons_to_freeze)

            self.manager.wait_for_mon_quorum_size(len(mons))
            self._assert_in_quorum(mons, len(mons))

            if self.scrub:
                self.log('triggering scrub')
                try:
                    self.manager.raw_cluster_cmd('scrub')
                except Exception:
                    # Best effort: a failed scrub must not stop the thrasher.
                    self.logger.exception(
                        "Saw exception while triggering scrub")

            if self.thrash_delay > 0.0:
                self.log('waiting for {delay} secs before continuing thrashing'.format(
                    delay=self.thrash_delay))
                time.sleep(self.thrash_delay)
@contextlib.contextmanager
def task(ctx, config):
    """
    Stress test the monitor by thrashing them while another task/workunit
    is running.

    A MonitorThrasher is started before yielding to the nested tasks. On
    exit the thrasher is joined and we wait until all monitors are back
    in the quorum.

    Please refer to MonitorThrasher class for further information on the
    available options.

    :param ctx: teuthology context
    :param config: dict of MonitorThrasher options, or None for defaults
    :raises AssertionError: if config is not a dict or fewer than three
                            monitors are configured
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'mon_thrash task only accepts a dict for configuration'
    assert len(_get_mons(ctx)) > 2, \
        'mon_thrash task requires at least 3 monitors'
    log.info('Beginning mon_thrash...')
    first_mon = teuthology.get_first_mon(ctx, config)
    # Exactly one remote is expected to host the first monitor. keys() is
    # used instead of iterkeys() because it exists on Python 2 and 3 alike.
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )
    thrash_proc = MonitorThrasher(ctx,
        manager, config,
        logger=log.getChild('mon_thrasher'))
    try:
        log.debug('Yielding')
        yield
    finally:
        log.info('joining mon_thrasher')
        thrash_proc.do_join()
        mons = _get_mons(ctx)
        manager.wait_for_mon_quorum_size(len(mons))