Diffstat (limited to 'src/ceph/qa/tasks/mon_seesaw.py')
-rw-r--r--  src/ceph/qa/tasks/mon_seesaw.py  198
1 files changed, 0 insertions, 198 deletions
diff --git a/src/ceph/qa/tasks/mon_seesaw.py b/src/ceph/qa/tasks/mon_seesaw.py
deleted file mode 100644
index b101c0e..0000000
--- a/src/ceph/qa/tasks/mon_seesaw.py
+++ /dev/null
@@ -1,198 +0,0 @@
-from cStringIO import StringIO
-
-import contextlib
-import logging
-import random
-
-from teuthology import misc as teuthology
-from teuthology.orchestra import run
-
-from ceph_manager import CephManager, write_conf
-
-
-log = logging.getLogger(__name__)
-
-
-def _get_mons(ctx):
-    return [name[len('mon.'):] for name in teuthology.get_mon_names(ctx)]
-
-
-# teuthology prepares the monitor IPs (and ports) in get_mons(), so we can
-# enumerate all monitor ports ([6789..]) and find the next available one.
-def _get_next_port(ctx, ip, cluster):
-    # assuming we have only one cluster here.
-    used = []
-    for name in teuthology.get_mon_names(ctx, cluster):
-        addr = ctx.ceph[cluster].conf[name]['mon addr']
-        mon_ip, mon_port = addr.split(':')
-        if mon_ip != ip:
-            continue
-        used.append(int(mon_port))
-    port = 6789
-    used.sort()
-    for p in used:
-        if p != port:
-            break
-        port += 1
-    return port
-
-
-def _setup_mon(ctx, manager, remote, mon, name, data_path, conf_path):
-    # co-locate a new monitor on remote where an existing monitor is hosted
-    cluster = manager.cluster
-    remote.run(args=['sudo', 'mkdir', '-p', data_path])
-    keyring_path = '/etc/ceph/{cluster}.keyring'.format(
-        cluster=manager.cluster)
-    testdir = teuthology.get_testdir(ctx)
-    monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
-                                                   cluster=cluster)
-    manager.raw_cluster_cmd('mon', 'getmap', '-o', monmap_path)
-    if manager.controller != remote:
-        monmap = teuthology.get_file(manager.controller, monmap_path)
-        teuthology.write_file(remote, monmap_path, StringIO(monmap))
-    remote.run(
-        args=[
-            'sudo',
-            'ceph-mon',
-            '--cluster', cluster,
-            '--mkfs',
-            '-i', mon,
-            '--monmap', monmap_path,
-            '--keyring', keyring_path])
-    if manager.controller != remote:
-        teuthology.delete_file(remote, monmap_path)
-    # raw_cluster_cmd() is performed using sudo, so sudo here also.
-    teuthology.delete_file(manager.controller, monmap_path, sudo=True)
-    # update ceph.conf so that the ceph CLI is able to connect to the cluster
-    if conf_path:
-        ip = remote.ip_address
-        port = _get_next_port(ctx, ip, cluster)
-        mon_addr = '{ip}:{port}'.format(ip=ip, port=port)
-        ctx.ceph[cluster].conf[name] = {'mon addr': mon_addr}
-        write_conf(ctx, conf_path, cluster)
-
-
-def _teardown_mon(ctx, manager, remote, name, data_path, conf_path):
-    cluster = manager.cluster
-    del ctx.ceph[cluster].conf[name]
-    write_conf(ctx, conf_path, cluster)
-    remote.run(args=['sudo', 'rm', '-rf', data_path])
-
-
-@contextlib.contextmanager
-def _prepare_mon(ctx, manager, remote, mon):
-    cluster = manager.cluster
-    data_path = '/var/lib/ceph/mon/{cluster}-{id}'.format(
-        cluster=cluster, id=mon)
-    conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster)
-    name = 'mon.{0}'.format(mon)
-    _setup_mon(ctx, manager, remote, mon, name, data_path, conf_path)
-    yield
-    _teardown_mon(ctx, manager, remote, name,
-                  data_path, conf_path)
-
-
-# run_daemon() in ceph.py starts a herd of daemons of the same type, but
-# _run_daemon() starts only one instance.
-@contextlib.contextmanager
-def _run_daemon(ctx, remote, cluster, type_, id_):
-    testdir = teuthology.get_testdir(ctx)
-    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
-    daemon_signal = 'kill'
-    run_cmd = [
-        'sudo',
-        'adjust-ulimits',
-        'ceph-coverage',
-        coverage_dir,
-        'daemon-helper',
-        daemon_signal,
-    ]
-    run_cmd_tail = [
-        'ceph-%s' % (type_),
-        '-f',
-        '--cluster', cluster,
-        '-i', id_]
-    run_cmd.extend(run_cmd_tail)
-    ctx.daemons.add_daemon(remote, type_, id_,
-                           cluster=cluster,
-                           args=run_cmd,
-                           logger=log.getChild(type_),
-                           stdin=run.PIPE,
-                           wait=False)
-    daemon = ctx.daemons.get_daemon(type_, id_, cluster)
-    yield daemon
-    daemon.stop()
-
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- replace a monitor with a newly added one, and then revert this change
-
- How it works::
- 1. add a mon with specified id (mon.victim_prime)
- 2. wait for quorum
- 3. remove a monitor with specified id (mon.victim), mon.victim will commit
- suicide
- 4. wait for quorum
- 5. <yield>
- 5. add mon.a back, and start it
- 6. wait for quorum
- 7. remove mon.a_prime
-
- Options::
- victim the id of the mon to be removed (pick a random mon by default)
- replacer the id of the new mon (use "${victim}_prime" if not specified)
- """
-    first_mon = teuthology.get_first_mon(ctx, config)
-    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
-    manager = CephManager(mon, ctx=ctx, logger=log.getChild('ceph_manager'))
-
-    if config is None:
-        config = {}
-    assert isinstance(config, dict), \
-        "task mon_seesaw only supports a dictionary for configuration"
-    overrides = ctx.config.get('overrides', {})
-    teuthology.deep_merge(config, overrides.get('mon_seesaw', {}))
-    victim = config.get('victim', random.choice(_get_mons(ctx)))
-    replacer = config.get('replacer', '{0}_prime'.format(victim))
-    remote = manager.find_remote('mon', victim)
-    quorum = manager.get_mon_quorum()
-    cluster = manager.cluster
-    log.info('replacing {victim} with {replacer}'.format(victim=victim,
-                                                         replacer=replacer))
-    with _prepare_mon(ctx, manager, remote, replacer):
-        with _run_daemon(ctx, remote, cluster, 'mon', replacer):
-            # replacer will join the quorum automatically
-            manager.wait_for_mon_quorum_size(len(quorum) + 1, 10)
-            # if we don't remove the victim from the monmap, there is a chance
-            # that we leave the new joiner with a two-mon monmap; it will not
-            # be able to reach the other mon and will keep probing forever.
-            log.info('removing {mon}'.format(mon=victim))
-            manager.raw_cluster_cmd('mon', 'remove', victim)
-            manager.wait_for_mon_quorum_size(len(quorum), 10)
-            # the victim will commit suicide after being removed from the
-            # monmap, let's wait until it stops.
-            ctx.daemons.get_daemon('mon', victim, cluster).wait(10)
-            try:
-                # perform other tasks
-                yield
-            finally:
-                # bring the victim back online
-                # nuke the monstore of the victim, otherwise it will refuse to
-                # boot with the following message:
-                #
-                #   not in monmap and have been in a quorum before; must have
-                #   been removed
-                log.info('re-adding {mon}'.format(mon=victim))
-                data_path = '/var/lib/ceph/mon/{cluster}-{id}'.format(
-                    cluster=cluster, id=victim)
-                remote.run(args=['sudo', 'rm', '-rf', data_path])
-                name = 'mon.{0}'.format(victim)
-                _setup_mon(ctx, manager, remote, victim, name, data_path, None)
-                log.info('reviving {mon}'.format(mon=victim))
-                manager.revive_mon(victim)
-                manager.wait_for_mon_quorum_size(len(quorum) + 1, 10)
-                manager.raw_cluster_cmd('mon', 'remove', replacer)
-                manager.wait_for_mon_quorum_size(len(quorum), 10)
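
For reference, this task was driven from a teuthology job description rather than run directly. A minimal sketch of how it might have been wired into a job, assuming the usual YAML tasks-list layout; the install/ceph entries and the mon ids are illustrative, not taken from this file:

    tasks:
    - install:
    - ceph:
    - mon_seesaw:
        victim: a          # mon to remove; a random mon is picked if omitted
        replacer: a_prime  # id of the new mon; defaults to "${victim}_prime"

The victim and replacer keys correspond to config.get('victim', ...) and config.get('replacer', ...) in task(), and the same settings could also be merged in via an overrides.mon_seesaw section.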