diff options
| author | Qiaowei Ren <qiaowei.ren@intel.com> | 2018-03-01 14:38:11 +0800 |
|---|---|---|
| committer | Qiaowei Ren <qiaowei.ren@intel.com> | 2018-03-01 14:38:11 +0800 |
| commit | 7da45d65be36d36b880cc55c5036e96c24b53f00 (patch) | |
| tree | d4f944eb4f8f8de50a9a7584ffa408dc3a3185b2 /src/ceph/qa/tasks | |
| parent | 691462d09d0987b47e112d6ee8740375df3c51b2 (diff) | |
remove ceph code
This patch removes the initial ceph code due to a license issue.
Change-Id: I092d44f601cdf34aed92300fe13214925563081c
Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
Diffstat (limited to 'src/ceph/qa/tasks')
141 files changed, 0 insertions(+), 35401 deletions(-)
diff --git a/src/ceph/qa/tasks/__init__.py b/src/ceph/qa/tasks/__init__.py deleted file mode 100644 index 9a7949a..0000000 --- a/src/ceph/qa/tasks/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -import logging - -# Inherit teuthology's log level -teuthology_log = logging.getLogger('teuthology') -log = logging.getLogger(__name__) -log.setLevel(teuthology_log.level) diff --git a/src/ceph/qa/tasks/admin_socket.py b/src/ceph/qa/tasks/admin_socket.py deleted file mode 100644 index 3301372..0000000 --- a/src/ceph/qa/tasks/admin_socket.py +++ /dev/null @@ -1,199 +0,0 @@ -""" -Admin Socket task -- used in rados, powercycle, and smoke testing -""" -from cStringIO import StringIO - -import json -import logging -import os -import time - -from teuthology.orchestra import run -from teuthology import misc as teuthology -from teuthology.parallel import parallel -from teuthology.config import config as teuth_config - -log = logging.getLogger(__name__) - - -def task(ctx, config): - """ - Run an admin socket command, make sure the output is json, and run - a test program on it. The test program should read json from - stdin. This task succeeds if the test program exits with status 0. - - To run the same test on all clients:: - - tasks: - - ceph: - - rados: - - admin_socket: - all: - dump_requests: - test: http://example.com/script - - To restrict it to certain clients:: - - tasks: - - ceph: - - rados: [client.1] - - admin_socket: - client.1: - dump_requests: - test: http://example.com/script - - If an admin socket command has arguments, they can be specified as - a list:: - - tasks: - - ceph: - - rados: [client.0] - - admin_socket: - client.0: - dump_requests: - test: http://example.com/script - help: - test: http://example.com/test_help_version - args: [version] - - Note that there must be a ceph client with an admin socket running - before this task is run. The tests are parallelized at the client - level. Tests for a single client are run serially. - - :param ctx: Context - :param config: Configuration - """ - assert isinstance(config, dict), \ - 'admin_socket task requires a dict for configuration' - teuthology.replace_all_with_clients(ctx.cluster, config) - - with parallel() as ptask: - for client, tests in config.iteritems(): - ptask.spawn(_run_tests, ctx, client, tests) - - -def _socket_command(ctx, remote, socket_path, command, args): - """ - Run an admin socket command and return the result as a string. - - :param ctx: Context - :param remote: Remote site - :param socket_path: path to socket - :param command: command to be run remotely - :param args: command arguments - - :returns: output of command in json format - """ - json_fp = StringIO() - testdir = teuthology.get_testdir(ctx) - max_tries = 120 - while True: - proc = remote.run( - args=[ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'ceph', - '--admin-daemon', socket_path, - ] + command.split(' ') + args, - stdout=json_fp, - check_status=False, - ) - if proc.exitstatus == 0: - break - assert max_tries > 0 - max_tries -= 1 - log.info('ceph cli returned an error, command not registered yet?') - log.info('sleeping and retrying ...') - time.sleep(1) - out = json_fp.getvalue() - json_fp.close() - log.debug('admin socket command %s returned %s', command, out) - return json.loads(out) - -def _run_tests(ctx, client, tests): - """ - Create a temp directory and wait for a client socket to be created. - For each test, copy the executable locally and run the test. - Remove temp directory when finished. 
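The retry loop in `_socket_command` above is the heart of this task: keep invoking the command through the daemon's admin socket until the daemon has registered it, then decode the JSON reply. Below is a minimal local sketch of that loop, using `subprocess` directly instead of teuthology's remote runner and dropping the `sudo`/`adjust-ulimits`/`ceph-coverage` wrappers; `poll_admin_socket` is a name chosen here for illustration, not part of the task.

```python
import json
import subprocess
import time

def poll_admin_socket(socket_path, command, args=(), max_tries=120):
    """Retry an admin-socket command until the daemon registers it,
    then return the decoded JSON output (mirrors _socket_command)."""
    cmd = ['ceph', '--admin-daemon', socket_path] + command.split(' ') + list(args)
    while True:
        proc = subprocess.run(cmd, capture_output=True, text=True)
        if proc.returncode == 0:
            return json.loads(proc.stdout)
        if max_tries <= 0:
            raise RuntimeError('admin socket command never succeeded: %r' % (cmd,))
        max_tries -= 1
        time.sleep(1)  # command may not be registered yet; sleep and retry

# e.g. poll_admin_socket('/var/run/ceph/ceph-client.0.asok', 'dump_requests')
```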
- - :param ctx: Context - :param client: client machine to run the test - :param tests: list of tests to run - """ - testdir = teuthology.get_testdir(ctx) - log.debug('Running admin socket tests on %s', client) - (remote,) = ctx.cluster.only(client).remotes.iterkeys() - socket_path = '/var/run/ceph/ceph-{name}.asok'.format(name=client) - overrides = ctx.config.get('overrides', {}).get('admin_socket', {}) - - try: - tmp_dir = os.path.join( - testdir, - 'admin_socket_{client}'.format(client=client), - ) - remote.run( - args=[ - 'mkdir', - '--', - tmp_dir, - run.Raw('&&'), - # wait for client process to create the socket - 'while', 'test', '!', '-e', socket_path, run.Raw(';'), - 'do', 'sleep', '1', run.Raw(';'), 'done', - ], - ) - - for command, config in tests.iteritems(): - if config is None: - config = {} - teuthology.deep_merge(config, overrides) - log.debug('Testing %s with config %s', command, str(config)) - - test_path = None - if 'test' in config: - # hack: the git_url is always ceph-ci or ceph - git_url = teuth_config.get_ceph_git_url() - repo_name = 'ceph.git' - if git_url.count('ceph-ci'): - repo_name = 'ceph-ci.git' - url = config['test'].format( - branch=config.get('branch', 'master'), - repo=repo_name, - ) - test_path = os.path.join(tmp_dir, command) - remote.run( - args=[ - 'wget', - '-q', - '-O', - test_path, - '--', - url, - run.Raw('&&'), - 'chmod', - 'u=rx', - '--', - test_path, - ], - ) - - args = config.get('args', []) - assert isinstance(args, list), \ - 'admin socket command args must be a list' - sock_out = _socket_command(ctx, remote, socket_path, command, args) - if test_path is not None: - remote.run( - args=[ - test_path, - ], - stdin=json.dumps(sock_out), - ) - - finally: - remote.run( - args=[ - 'rm', '-rf', '--', tmp_dir, - ], - ) diff --git a/src/ceph/qa/tasks/autotest.py b/src/ceph/qa/tasks/autotest.py deleted file mode 100644 index efa9721..0000000 --- a/src/ceph/qa/tasks/autotest.py +++ /dev/null @@ -1,166 +0,0 @@ -""" -Run an autotest test on the ceph cluster. -""" -import json -import logging -import os - -from teuthology import misc as teuthology -from teuthology.parallel import parallel -from teuthology.orchestra import run - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Run an autotest test on the ceph cluster. - - Only autotest client tests are supported. - - The config is a mapping from role name to list of tests to run on - that client. - - For example:: - - tasks: - - ceph: - - ceph-fuse: [client.0, client.1] - - autotest: - client.0: [dbench] - client.1: [bonnie] - - You can also specify a list of tests to run on all clients:: - - tasks: - - ceph: - - ceph-fuse: - - autotest: - all: [dbench] - """ - assert isinstance(config, dict) - config = teuthology.replace_all_with_clients(ctx.cluster, config) - log.info('Setting up autotest...') - testdir = teuthology.get_testdir(ctx) - with parallel() as p: - for role in config.iterkeys(): - (remote,) = ctx.cluster.only(role).remotes.keys() - p.spawn(_download, testdir, remote) - - log.info('Making a separate scratch dir for every client...') - for role in config.iterkeys(): - assert isinstance(role, basestring) - PREFIX = 'client.' 
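The tail of `_run_tests` fetches the per-command test program and pipes the admin-socket JSON into it on stdin; exit status 0 means the test passed. A hedged standalone sketch of that contract follows, run locally with `urllib`/`subprocess` rather than via `remote.run`; `run_socket_test` and the `/tmp` workdir are illustrative only.

```python
import json
import os
import stat
import subprocess
import urllib.request

def run_socket_test(test_url, sock_out, workdir='/tmp'):
    """Download a test executable and feed it the admin-socket JSON on
    stdin; a non-zero exit raises, which is the task's failure condition."""
    test_path = os.path.join(workdir, os.path.basename(test_url))
    urllib.request.urlretrieve(test_url, test_path)                   # like the wget above
    os.chmod(test_path, os.stat(test_path).st_mode | stat.S_IXUSR)    # make it executable
    subprocess.run([test_path], input=json.dumps(sock_out), text=True, check=True)
```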
- assert role.startswith(PREFIX) - id_ = role[len(PREFIX):] - (remote,) = ctx.cluster.only(role).remotes.iterkeys() - mnt = os.path.join(testdir, 'mnt.{id}'.format(id=id_)) - scratch = os.path.join(mnt, 'client.{id}'.format(id=id_)) - remote.run( - args=[ - 'sudo', - 'install', - '-d', - '-m', '0755', - '--owner={user}'.format(user='ubuntu'), #TODO - '--', - scratch, - ], - ) - - with parallel() as p: - for role, tests in config.iteritems(): - (remote,) = ctx.cluster.only(role).remotes.keys() - p.spawn(_run_tests, testdir, remote, role, tests) - -def _download(testdir, remote): - """ - Download. Does not explicitly support muliple tasks in a single run. - """ - remote.run( - args=[ - # explicitly does not support multiple autotest tasks - # in a single run; the result archival would conflict - 'mkdir', '{tdir}/archive/autotest'.format(tdir=testdir), - run.Raw('&&'), - 'mkdir', '{tdir}/autotest'.format(tdir=testdir), - run.Raw('&&'), - 'wget', - '-nv', - '--no-check-certificate', - 'https://github.com/ceph/autotest/tarball/ceph', - '-O-', - run.Raw('|'), - 'tar', - '-C', '{tdir}/autotest'.format(tdir=testdir), - '-x', - '-z', - '-f-', - '--strip-components=1', - ], - ) - -def _run_tests(testdir, remote, role, tests): - """ - Spawned to run test on remote site - """ - assert isinstance(role, basestring) - PREFIX = 'client.' - assert role.startswith(PREFIX) - id_ = role[len(PREFIX):] - mnt = os.path.join(testdir, 'mnt.{id}'.format(id=id_)) - scratch = os.path.join(mnt, 'client.{id}'.format(id=id_)) - - assert isinstance(tests, list) - for idx, testname in enumerate(tests): - log.info('Running autotest client test #%d: %s...', idx, testname) - - tag = 'client.{id}.num{idx}.{testname}'.format( - idx=idx, - testname=testname, - id=id_, - ) - control = '{tdir}/control.{tag}'.format(tdir=testdir, tag=tag) - teuthology.write_file( - remote=remote, - path=control, - data='import json; data=json.loads({data!r}); job.run_test(**data)'.format( - data=json.dumps(dict( - url=testname, - dir=scratch, - # TODO perhaps tag - # results will be in {testdir}/autotest/client/results/dbench - # or {testdir}/autotest/client/results/dbench.{tag} - )), - ), - ) - remote.run( - args=[ - '{tdir}/autotest/client/bin/autotest'.format(tdir=testdir), - '--verbose', - '--harness=simple', - '--tag={tag}'.format(tag=tag), - control, - run.Raw('3>&1'), - ], - ) - - remote.run( - args=[ - 'rm', '-rf', '--', control, - ], - ) - - remote.run( - args=[ - 'mv', - '--', - '{tdir}/autotest/client/results/{tag}'.format(tdir=testdir, tag=tag), - '{tdir}/archive/autotest/{tag}'.format(tdir=testdir, tag=tag), - ], - ) - - remote.run( - args=[ - 'rm', '-rf', '--', '{tdir}/autotest'.format(tdir=testdir), - ], - ) diff --git a/src/ceph/qa/tasks/aver.py b/src/ceph/qa/tasks/aver.py deleted file mode 100644 index 79ee18c..0000000 --- a/src/ceph/qa/tasks/aver.py +++ /dev/null @@ -1,67 +0,0 @@ -""" -Aver wrapper task -""" -import contextlib -import logging -from subprocess import check_call, Popen, PIPE - -log = logging.getLogger(__name__) - - -@contextlib.contextmanager -def task(ctx, config): - """ - Execute an aver assertion - - Parameters: - - input: file containing data referred to by the assertions. 
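For each autotest test, `_run_tests` writes a one-line control file that embeds the test parameters as JSON and hands them to autotest's `job.run_test()`. A small sketch of just that string construction (the helper name and example values are illustrative):

```python
import json

def autotest_control(testname, scratch_dir):
    """Build the one-line autotest control file used above: it decodes
    the embedded JSON and passes it as keyword args to job.run_test()."""
    data = json.dumps({'url': testname, 'dir': scratch_dir})
    return 'import json; data=json.loads({data!r}); job.run_test(**data)'.format(data=data)

# autotest_control('dbench', '/tmp/mnt.0/client.0')
# -> "import json; data=json.loads('{\"url\": \"dbench\", ...}'); job.run_test(**data)"
```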
File name is - relative to the job's archive path - validations: list of validations in the Aver language - - Example: - - aver: - input: bench_output.csv - validations: - - expect performance(alg='ceph') > performance(alg='raw') - - for size > 3 expect avg_throughput > 2000 - """ - log.info('Beginning aver...') - assert isinstance(config, dict), 'expecting dictionary for configuration' - - if 'input' not in config: - raise Exception("Expecting 'input' option") - if len(config.get('validations', [])) < 1: - raise Exception("Expecting at least one entry in 'validations'") - - url = ('https://github.com/ivotron/aver/releases/download/' - 'v0.3.0/aver-linux-amd64.tar.bz2') - - aver_path = ctx.archive + '/aver' - - # download binary - check_call(['wget', '-O', aver_path + '.tbz', url]) - check_call(['tar', 'xfj', aver_path + '.tbz', '-C', ctx.archive]) - - # print version - process = Popen([aver_path, '-v'], stdout=PIPE) - log.info(process.communicate()[0]) - - # validate - for validation in config['validations']: - cmd = (aver_path + ' -s -i ' + (ctx.archive + '/' + config['input']) + - ' "' + validation + '"') - log.info("executing: " + cmd) - process = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True) - (stdout, stderr) = process.communicate() - if stderr: - log.info('aver stderr: ' + stderr) - log.info('aver result: ' + stdout) - if stdout.strip(' \t\n\r') != 'true': - raise Exception('Failed validation: ' + validation) - - try: - yield - finally: - log.info('Removing aver binary...') - check_call(['rm', aver_path, aver_path + '.tbz']) diff --git a/src/ceph/qa/tasks/blktrace.py b/src/ceph/qa/tasks/blktrace.py deleted file mode 100644 index 96aaf50..0000000 --- a/src/ceph/qa/tasks/blktrace.py +++ /dev/null @@ -1,96 +0,0 @@ -""" -Run blktrace program through teuthology -""" -import contextlib -import logging - -from teuthology import misc as teuthology -from teuthology import contextutil -from teuthology.orchestra import run - -log = logging.getLogger(__name__) -blktrace = '/usr/sbin/blktrace' -daemon_signal = 'term' - -@contextlib.contextmanager -def setup(ctx, config): - """ - Setup all the remotes - """ - osds = ctx.cluster.only(teuthology.is_type('osd', config['cluster'])) - log_dir = '{tdir}/archive/performance/blktrace'.format(tdir=teuthology.get_testdir(ctx)) - - for remote, roles_for_host in osds.remotes.iteritems(): - log.info('Creating %s on %s' % (log_dir, remote.name)) - remote.run( - args=['mkdir', '-p', '-m0755', '--', log_dir], - wait=False, - ) - yield - -@contextlib.contextmanager -def execute(ctx, config): - """ - Run the blktrace program on remote machines. 
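The aver task's validation loop runs the downloaded `aver` binary once per assertion and treats anything other than a literal `true` on stdout as a failure. Here is a sketch of that loop, assuming an `aver` binary is already available; passing the arguments as a list sidesteps the shell quoting the original needs with `shell=True`.

```python
import subprocess

def check_validations(aver_path, input_file, validations):
    """Run each Aver assertion against the input file; the binary prints
    'true' on stdout when the assertion holds."""
    for validation in validations:
        proc = subprocess.run([aver_path, '-s', '-i', input_file, validation],
                              capture_output=True, text=True)
        if proc.stderr:
            print('aver stderr:', proc.stderr)
        print('aver result:', proc.stdout)
        if proc.stdout.strip() != 'true':
            raise Exception('Failed validation: ' + validation)

# check_validations('./aver', 'bench_output.csv',
#                   ["expect performance(alg='ceph') > performance(alg='raw')"])
```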
- """ - procs = [] - testdir = teuthology.get_testdir(ctx) - log_dir = '{tdir}/archive/performance/blktrace'.format(tdir=testdir) - - osds = ctx.cluster.only(teuthology.is_type('osd')) - for remote, roles_for_host in osds.remotes.iteritems(): - roles_to_devs = ctx.disk_config.remote_to_roles_to_dev[remote] - for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', - config['cluster']): - if roles_to_devs.get(role): - dev = roles_to_devs[role] - log.info("running blktrace on %s: %s" % (remote.name, dev)) - - proc = remote.run( - args=[ - 'cd', - log_dir, - run.Raw(';'), - 'daemon-helper', - daemon_signal, - 'sudo', - blktrace, - '-o', - dev.rsplit("/", 1)[1], - '-d', - dev, - ], - wait=False, - stdin=run.PIPE, - ) - procs.append(proc) - try: - yield - finally: - osds = ctx.cluster.only(teuthology.is_type('osd')) - log.info('stopping blktrace processs') - for proc in procs: - proc.stdin.close() - -@contextlib.contextmanager -def task(ctx, config): - """ - Usage: - blktrace: - - or: - blktrace: - cluster: backup - - Runs blktrace on all osds in the specified cluster (the 'ceph' cluster by - default). - """ - if config is None: - config = {} - config['cluster'] = config.get('cluster', 'ceph') - - with contextutil.nested( - lambda: setup(ctx=ctx, config=config), - lambda: execute(ctx=ctx, config=config), - ): - yield diff --git a/src/ceph/qa/tasks/boto.cfg.template b/src/ceph/qa/tasks/boto.cfg.template deleted file mode 100644 index cdfe887..0000000 --- a/src/ceph/qa/tasks/boto.cfg.template +++ /dev/null @@ -1,2 +0,0 @@ -[Boto] -http_socket_timeout = {idle_timeout} diff --git a/src/ceph/qa/tasks/calamari_nosetests.py b/src/ceph/qa/tasks/calamari_nosetests.py deleted file mode 100644 index c6bbaf3..0000000 --- a/src/ceph/qa/tasks/calamari_nosetests.py +++ /dev/null @@ -1,289 +0,0 @@ -import contextlib -import logging -import os -import textwrap -import yaml - -from cStringIO import StringIO -from teuthology import contextutil -from teuthology import misc -from teuthology import packaging -from teuthology.orchestra import run - -log = logging.getLogger(__name__) - -# extra stuff we need to do our job here -EXTRA_PKGS = [ - 'git', -] - -# stuff that would be in a devmode install, but should be -# installed in the system for running nosetests against -# a production install. 
-EXTRA_NOSETEST_PKGS = [ - 'python-psutil', - 'python-mock', -] - - -def find_client0(cluster): - ''' Find remote that has client.0 role, or None ''' - for rem, roles in cluster.remotes.iteritems(): - if 'client.0' in roles: - return rem - return None - - -def pip(remote, package, venv=None, uninstall=False, force=False): - ''' {un}install a package with pip, possibly in a virtualenv ''' - if venv: - pip = os.path.join(venv, 'bin', 'pip') - args = ['sudo', pip] - else: - args = ['sudo', 'pip'] - - if uninstall: - args.extend(['uninstall', '-y']) - else: - args.append('install') - if force: - args.append('-I') - - args.append(package) - remote.run(args=args) - - -@contextlib.contextmanager -def install_epel(remote): - ''' install a disabled-by-default epel repo config file ''' - remove = False - try: - if remote.os.package_type == 'deb': - yield - else: - remove = True - distromajor = remote.os.version.split('.')[0] - - repofiledata = textwrap.dedent(''' - [epel] - name=epel{version} - metalink=http://mirrors.fedoraproject.org/metalink?repo=epel-{version}&arch=$basearch - enabled=0 - gpgcheck=0 - ''').format(version=distromajor) - - misc.create_file(remote, '/etc/yum.repos.d/epel.repo', - data=repofiledata, sudo=True) - remote.run(args='sudo yum clean all') - yield - - finally: - if remove: - misc.delete_file(remote, '/etc/yum.repos.d/epel.repo', sudo=True) - - -def enable_epel(remote, enable=True): - ''' enable/disable the epel repo ''' - args = 'sudo sed -i'.split() - if enable: - args.extend(['s/enabled=0/enabled=1/']) - else: - args.extend(['s/enabled=1/enabled=0/']) - args.extend(['/etc/yum.repos.d/epel.repo']) - - remote.run(args=args) - remote.run(args='sudo yum clean all') - - -@contextlib.contextmanager -def install_extra_pkgs(client): - ''' Install EXTRA_PKGS ''' - try: - for pkg in EXTRA_PKGS: - packaging.install_package(pkg, client) - yield - - finally: - for pkg in EXTRA_PKGS: - packaging.remove_package(pkg, client) - - -@contextlib.contextmanager -def clone_calamari(config, client): - ''' clone calamari source into current directory on remote ''' - branch = config.get('calamari_branch', 'master') - url = config.get('calamari_giturl', 'git://github.com/ceph/calamari') - try: - out = StringIO() - # ensure branch is present (clone -b will succeed even if - # the branch doesn't exist, falling back to master) - client.run( - args='git ls-remote %s %s' % (url, branch), - stdout=out, - label='check for calamari branch %s existence' % branch - ) - if len(out.getvalue()) == 0: - raise RuntimeError("Calamari branch %s doesn't exist" % branch) - client.run(args='git clone -b %s %s' % (branch, url)) - yield - finally: - # sudo python setup.py develop may have left some root files around - client.run(args='sudo rm -rf calamari') - - -@contextlib.contextmanager -def write_info_yaml(cluster, client): - ''' write info.yaml to client for nosetests ''' - try: - info = { - 'cluster': { - rem.name: {'roles': roles} - for rem, roles in cluster.remotes.iteritems() - } - } - misc.create_file(client, 'calamari/info.yaml', - data=yaml.safe_dump(info, default_flow_style=False)) - yield - finally: - misc.delete_file(client, 'calamari/info.yaml') - - -@contextlib.contextmanager -def write_test_conf(client): - ''' write calamari/tests/test.conf to client for nosetests ''' - try: - testconf = textwrap.dedent(''' - [testing] - - calamari_control = external - ceph_control = external - bootstrap = False - api_username = admin - api_password = admin - embedded_timeout_factor = 1 - external_timeout_factor = 3 - 
external_cluster_path = info.yaml - ''') - misc.create_file(client, 'calamari/tests/test.conf', data=testconf) - yield - - finally: - misc.delete_file(client, 'calamari/tests/test.conf') - - -@contextlib.contextmanager -def prepare_nosetest_env(client): - try: - # extra dependencies that would be in the devmode venv - if client.os.package_type == 'rpm': - enable_epel(client, enable=True) - for package in EXTRA_NOSETEST_PKGS: - packaging.install_package(package, client) - if client.os.package_type == 'rpm': - enable_epel(client, enable=False) - - # install nose itself into the calamari venv, force it in case it's - # already installed in the system, so we can invoke it by path without - # fear that it's not present - pip(client, 'nose', venv='/opt/calamari/venv', force=True) - - # install a later version of requests into the venv as well - # (for precise) - pip(client, 'requests', venv='/opt/calamari/venv', force=True) - - # link (setup.py develop) calamari/rest-api into the production venv - # because production does not include calamari_rest.management, needed - # for test_rest_api.py's ApiIntrospection - args = 'cd calamari/rest-api'.split() + [run.Raw(';')] + \ - 'sudo /opt/calamari/venv/bin/python setup.py develop'.split() - client.run(args=args) - - # because, at least in Python 2.6/Centos, site.py uses - # 'os.path.exists()' to process .pth file entries, and exists() uses - # access(2) to check for existence, all the paths leading up to - # $HOME/calamari/rest-api need to be searchable by all users of - # the package, which will include the WSGI/Django app, running - # as the Apache user. So make them all world-read-and-execute. - args = 'sudo chmod a+x'.split() + \ - ['.', './calamari', './calamari/rest-api'] - client.run(args=args) - - # make one dummy request just to get the WSGI app to do - # all its log creation here, before the chmod below (I'm - # looking at you, graphite -- /var/log/calamari/info.log and - # /var/log/calamari/exception.log) - client.run(args='wget -q -O /dev/null http://localhost') - - # /var/log/calamari/* is root-or-apache write-only - client.run(args='sudo chmod a+w /var/log/calamari/*') - - yield - - finally: - args = 'cd calamari/rest-api'.split() + [run.Raw(';')] + \ - 'sudo /opt/calamari/venv/bin/python setup.py develop -u'.split() - client.run(args=args) - for pkg in ('nose', 'requests'): - pip(client, pkg, venv='/opt/calamari/venv', uninstall=True) - for package in EXTRA_NOSETEST_PKGS: - packaging.remove_package(package, client) - - -@contextlib.contextmanager -def run_nosetests(client): - ''' Actually run the tests ''' - args = [ - 'cd', - 'calamari', - run.Raw(';'), - 'CALAMARI_CONFIG=/etc/calamari/calamari.conf', - '/opt/calamari/venv/bin/nosetests', - '-v', - 'tests/', - ] - client.run(args=args) - yield - - -@contextlib.contextmanager -def task(ctx, config): - """ - Run Calamari tests against an instance set up by 'calamari_server'. 
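`write_info_yaml` serializes the cluster topology into the `info.yaml` that the Calamari nosetests read via `external_cluster_path`. A minimal sketch of that serialization, using plain hostname-to-roles mappings instead of teuthology remote objects; the example hostnames are made up.

```python
import yaml

def cluster_info_yaml(remotes):
    """Render the info.yaml consumed by the tests: each remote's name
    mapped to the roles it carries."""
    info = {'cluster': {name: {'roles': roles} for name, roles in remotes.items()}}
    return yaml.safe_dump(info, default_flow_style=False)

# print(cluster_info_yaml({'mira001': ['mon.a', 'client.0'],
#                          'mira002': ['osd.0', 'osd.1']}))
```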
- - -- clone the Calamari source into $HOME (see options) - -- write calamari/info.yaml describing the cluster - -- write calamari/tests/test.conf containing - 'external' for calamari_control and ceph_control - 'bootstrap = False' to disable test bootstrapping (installing minions) - no api_url necessary (inferred from client.0) - 'external_cluster_path = info.yaml' - -- modify the production Calamari install to allow test runs: - install nose in the venv - install EXTRA_NOSETEST_PKGS - link in, with setup.py develop, calamari_rest (for ApiIntrospection) - -- set CALAMARI_CONFIG to point to /etc/calamari/calamari.conf - -- nosetests -v tests/ - - Options are: - calamari_giturl: url from which to git clone calamari - (default: git://github.com/ceph/calamari) - calamari_branch: git branch of calamari to check out - (default: master) - - Note: the tests must find a clean cluster, so don't forget to - set the crush default type appropriately, or install min_size OSD hosts - """ - client0 = find_client0(ctx.cluster) - if client0 is None: - raise RuntimeError("must have client.0 role") - - with contextutil.nested( - lambda: install_epel(client0), - lambda: install_extra_pkgs(client0), - lambda: clone_calamari(config, client0), - lambda: write_info_yaml(ctx.cluster, client0), - lambda: write_test_conf(client0), - lambda: prepare_nosetest_env(client0), - lambda: run_nosetests(client0), - ): - yield diff --git a/src/ceph/qa/tasks/calamari_setup.py b/src/ceph/qa/tasks/calamari_setup.py deleted file mode 100644 index 8ef404f..0000000 --- a/src/ceph/qa/tasks/calamari_setup.py +++ /dev/null @@ -1,467 +0,0 @@ -""" -Calamari setup task -""" -import contextlib -import logging -import os -import requests -import shutil -import webbrowser - -from cStringIO import StringIO -from teuthology.orchestra import run -from teuthology import contextutil -from teuthology import misc - -log = logging.getLogger(__name__) - - -DEFAULTS = { - 'version': 'v0.80.9', - 'test_image': None, - 'start_browser': False, - 'email': 'x@y.com', - 'no_epel': True, - 'calamari_user': 'admin', - 'calamari_password': 'admin', -} - - -@contextlib.contextmanager -def task(ctx, config): - """ - Do the setup of a calamari server. - - - calamari_setup: - version: 'v80.1' - test_image: <path to tarball or iso> - - Options are (see DEFAULTS above): - - version -- ceph version we are testing against - test_image -- Can be an HTTP URL, in which case fetch from this - http path; can also be local path - start_browser -- If True, start a browser. To be used by runs that will - bring up a browser quickly for human use. Set to False - for overnight suites that are testing for problems in - the installation itself - email -- email address for the user - no_epel -- indicates if we should remove epel files prior to yum - installations. 
- calamari_user -- user name to log into gui - calamari_password -- calamari user password - """ - local_config = DEFAULTS - local_config.update(config) - config = local_config - cal_svr = None - for remote_, roles in ctx.cluster.remotes.items(): - if 'client.0' in roles: - cal_svr = remote_ - break - if not cal_svr: - raise RuntimeError('client.0 not found in roles') - with contextutil.nested( - lambda: adjust_yum_repos(ctx, cal_svr, config['no_epel']), - lambda: calamari_install(config, cal_svr), - lambda: ceph_install(ctx, cal_svr), - # do it again because ceph-deploy installed epel for centos - lambda: remove_epel(ctx, config['no_epel']), - lambda: calamari_connect(ctx, cal_svr), - lambda: browser(config['start_browser'], cal_svr.hostname), - ): - yield - - -@contextlib.contextmanager -def adjust_yum_repos(ctx, cal_svr, no_epel): - """ - For each remote machine, fix the repos if yum is used. - """ - ice_distro = str(cal_svr.os) - if ice_distro.startswith('rhel') or ice_distro.startswith('centos'): - if no_epel: - for remote in ctx.cluster.remotes: - fix_yum_repos(remote, ice_distro) - try: - yield - finally: - if ice_distro.startswith('rhel') or ice_distro.startswith('centos'): - if no_epel: - for remote in ctx.cluster.remotes: - restore_yum_repos(remote) - - -def restore_yum_repos(remote): - """ - Copy the old saved repo back in. - """ - if remote.run(args=['sudo', 'rm', '-rf', '/etc/yum.repos.d']).exitstatus: - return False - if remote.run(args=['sudo', 'mv', '/etc/yum.repos.d.old', - '/etc/yum.repos.d']).exitstatus: - return False - - -def fix_yum_repos(remote, distro): - """ - For yum calamari installations, the repos.d directory should only - contain a repo file named rhel<version-number>.repo - """ - if distro.startswith('centos'): - # hack alert: detour: install lttng for ceph - # this works because epel is preinstalled on the vpms - # this is not a generic solution - # this is here solely to test the one-off 1.3.0 release for centos6 - remote.run(args="sudo yum -y install lttng-tools") - cmds = [ - 'sudo mkdir /etc/yum.repos.d.old'.split(), - ['sudo', 'cp', run.Raw('/etc/yum.repos.d/*'), - '/etc/yum.repos.d.old'], - ['sudo', 'rm', run.Raw('/etc/yum.repos.d/epel*')], - ] - for cmd in cmds: - if remote.run(args=cmd).exitstatus: - return False - else: - cmds = [ - 'sudo mv /etc/yum.repos.d /etc/yum.repos.d.old'.split(), - 'sudo mkdir /etc/yum.repos.d'.split(), - ] - for cmd in cmds: - if remote.run(args=cmd).exitstatus: - return False - - # map "distroversion" from Remote.os to a tuple of - # (repo title, repo name descriptor, apt-mirror repo path chunk) - yum_repo_params = { - 'rhel 6.4': ('rhel6-server', 'RHEL', 'rhel6repo-server'), - 'rhel 6.5': ('rhel6-server', 'RHEL', 'rhel6repo-server'), - 'rhel 7.0': ('rhel7-server', 'RHEL', 'rhel7repo/server'), - } - repotitle, reponame, path = yum_repo_params[distro] - repopath = '/etc/yum.repos.d/%s.repo' % repotitle - # TO DO: Make this data configurable too - repo_contents = '\n'.join( - ('[%s]' % repotitle, - 'name=%s $releasever - $basearch' % reponame, - 'baseurl=http://apt-mirror.front.sepia.ceph.com/' + path, - 'gpgcheck=0', - 'enabled=1') - ) - misc.sudo_write_file(remote, repopath, repo_contents) - cmds = [ - 'sudo yum clean all'.split(), - 'sudo yum makecache'.split(), - ] - for cmd in cmds: - if remote.run(args=cmd).exitstatus: - return False - return True - - -@contextlib.contextmanager -def remove_epel(ctx, no_epel): - """ - just remove epel. 
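calamari_setup starts by layering the task's YAML options over `DEFAULTS`. A small sketch of that merge follows; unlike the original (which calls `update` on the module-level dict itself), it copies first so repeated calls do not leak settings between runs. The subset of defaults shown is taken from the code above.

```python
DEFAULTS = {
    'version': 'v0.80.9',
    'test_image': None,
    'start_browser': False,
    'no_epel': True,
    'calamari_user': 'admin',
    'calamari_password': 'admin',
}

def effective_config(config):
    """Overlay user-supplied task options on the defaults."""
    merged = dict(DEFAULTS)          # copy, don't mutate the shared dict
    merged.update(config or {})
    return merged

# effective_config({'test_image': 'http://example.test/ice.tar.gz'})['no_epel']  -> True
```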
No undo; assumed that it's used after - adjust_yum_repos, and relies on its state-save/restore. - """ - if no_epel: - for remote in ctx.cluster.remotes: - if remote.os.name.startswith('centos'): - remote.run(args=[ - 'sudo', 'rm', '-f', run.Raw('/etc/yum.repos.d/epel*') - ]) - try: - yield - finally: - pass - - -def get_iceball_with_http(url, destdir): - ''' - Copy iceball with http to destdir. Try both .tar.gz and .iso. - ''' - # stream=True means we don't download until copyfileobj below, - # and don't need a temp file - r = requests.get(url, stream=True) - if not r.ok: - raise RuntimeError("Failed to download %s", str(url)) - filename = os.path.join(destdir, url.split('/')[-1]) - with open(filename, 'w') as f: - shutil.copyfileobj(r.raw, f) - log.info('saved %s as %s' % (url, filename)) - return filename - - -@contextlib.contextmanager -def calamari_install(config, cal_svr): - """ - Install calamari - - The steps here are: - -- Get the iceball, locally or from http - -- Copy the iceball to the calamari server, and untar/mount it. - -- Run ice-setup on the calamari server. - -- Run calamari-ctl initialize. - """ - client_id = str(cal_svr) - at_loc = client_id.find('@') - if at_loc > 0: - client_id = client_id[at_loc + 1:] - - test_image = config['test_image'] - - if not test_image: - raise RuntimeError('Must supply test image') - log.info('calamari test image: %s' % test_image) - delete_iceball = False - - if test_image.startswith('http'): - iceball_file = get_iceball_with_http(test_image, '/tmp') - delete_iceball = True - else: - iceball_file = test_image - - remote_iceball_file = os.path.join('/tmp', os.path.split(iceball_file)[1]) - cal_svr.put_file(iceball_file, remote_iceball_file) - if iceball_file.endswith('.tar.gz'): # XXX specify tar/iso in config? - icetype = 'tarball' - elif iceball_file.endswith('.iso'): - icetype = 'iso' - else: - raise RuntimeError('Can''t handle iceball {0}'.format(iceball_file)) - - if icetype == 'tarball': - ret = cal_svr.run(args=['gunzip', run.Raw('<'), remote_iceball_file, - run.Raw('|'), 'tar', 'xvf', run.Raw('-')]) - if ret.exitstatus: - raise RuntimeError('remote iceball untar failed') - elif icetype == 'iso': - mountpoint = '/mnt/' # XXX create? - ret = cal_svr.run( - args=['sudo', 'mount', '-o', 'loop', '-r', - remote_iceball_file, mountpoint] - ) - - # install ice_setup package - args = { - 'deb': 'sudo dpkg -i /mnt/ice-setup*deb', - 'rpm': 'sudo yum -y localinstall /mnt/ice_setup*rpm' - }.get(cal_svr.system_type, None) - if not args: - raise RuntimeError('{0}: unknown system type'.format(cal_svr)) - ret = cal_svr.run(args=args) - if ret.exitstatus: - raise RuntimeError('ice_setup package install failed') - - # Run ice_setup - icesetdata = 'yes\n\n%s\nhttp\n' % client_id - ice_in = StringIO(icesetdata) - ice_out = StringIO() - if icetype == 'tarball': - args = 'sudo python ice_setup.py' - else: - args = 'sudo ice_setup -d /mnt' - ret = cal_svr.run(args=args, stdin=ice_in, stdout=ice_out) - log.debug(ice_out.getvalue()) - if ret.exitstatus: - raise RuntimeError('ice_setup failed') - - # Run calamari-ctl initialize. 
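`get_iceball_with_http` streams the installer tarball or ISO straight to disk so a multi-hundred-megabyte download is never held in memory. A hedged restatement of that helper: it opens the file in binary mode (the Python 2 original uses text mode) and assumes the `requests` package, which the task already imports.

```python
import os
import shutil
import requests

def fetch_iceball(url, destdir):
    """Stream a large download to destdir and return the local path."""
    r = requests.get(url, stream=True)        # stream=True: no full in-memory copy
    if not r.ok:
        raise RuntimeError('Failed to download %s' % url)
    filename = os.path.join(destdir, url.split('/')[-1])
    with open(filename, 'wb') as f:
        shutil.copyfileobj(r.raw, f)
    return filename
```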
- icesetdata = '%s\n%s\n%s\n%s\n' % ( - config['calamari_user'], - config['email'], - config['calamari_password'], - config['calamari_password'], - ) - ice_in = StringIO(icesetdata) - ret = cal_svr.run(args=['sudo', 'calamari-ctl', 'initialize'], - stdin=ice_in, stdout=ice_out) - log.debug(ice_out.getvalue()) - if ret.exitstatus: - raise RuntimeError('calamari-ctl initialize failed') - try: - yield - finally: - log.info('Cleaning up after Calamari installation') - if icetype == 'iso': - cal_svr.run(args=['sudo', 'umount', mountpoint]) - if delete_iceball: - os.unlink(iceball_file) - - -@contextlib.contextmanager -def ceph_install(ctx, cal_svr): - """ - Install ceph if ceph was not previously installed by teuthology. This - code tests the case where calamari is installed on a brand new system. - """ - loc_inst = False - if 'install' not in [x.keys()[0] for x in ctx.config['tasks']]: - loc_inst = True - ret = deploy_ceph(ctx, cal_svr) - if ret: - raise RuntimeError('ceph installs failed') - try: - yield - finally: - if loc_inst: - if not undeploy_ceph(ctx, cal_svr): - log.error('Cleanup of Ceph installed by Calamari-setup failed') - - -def deploy_ceph(ctx, cal_svr): - """ - Perform the ceph-deploy actions needed to bring up a Ceph cluster. This - test is needed to check the ceph-deploy that comes with the calamari - package. - """ - osd_to_name = {} - all_machines = set() - all_mons = set() - all_osds = set() - - # collect which remotes are osds and which are mons - for remote in ctx.cluster.remotes: - all_machines.add(remote.shortname) - roles = ctx.cluster.remotes[remote] - for role in roles: - daemon_type, number = role.split('.') - if daemon_type == 'osd': - all_osds.add(remote.shortname) - osd_to_name[number] = remote.shortname - if daemon_type == 'mon': - all_mons.add(remote.shortname) - - # figure out whether we're in "1.3+" mode: prior to 1.3, there was - # only one Ceph repo, and it was all installed on every Ceph host. - # with 1.3, we've split that into MON and OSD repos (in order to - # be able to separately track subscriptions per-node). This - # requires new switches to ceph-deploy to select which locally-served - # repo is connected to which cluster host. - # - # (TODO: A further issue is that the installation/setup may not have - # created local repos at all, but that is the subject of a future - # change.) - - r = cal_svr.run(args='/usr/bin/test -d /mnt/MON', check_status=False) - use_install_repo = (r.returncode == 0) - - # pre-1.3: - # ceph-deploy new <all_mons> - # ceph-deploy install <all_machines> - # ceph-deploy mon create-initial - # - # 1.3 and later: - # ceph-deploy new <all_mons> - # ceph-deploy install --repo --release=ceph-mon <all_mons> - # ceph-deploy install <all_mons> - # ceph-deploy install --repo --release=ceph-osd <all_osds> - # ceph-deploy install <all_osds> - # ceph-deploy mon create-initial - # - # one might think the install <all_mons> and install <all_osds> - # commands would need --mon and --osd, but #12147 has not yet - # made it into RHCS 1.3.0; since the package split also hasn't - # landed, we can avoid using the flag and avoid the bug. 
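`deploy_ceph` first walks the cluster roles to work out which hosts are monitors and which are OSDs before composing the `ceph-deploy` command lines. A standalone sketch of that classification step, using plain hostname-to-roles mappings; `collect_roles` is an illustrative name.

```python
def collect_roles(remotes):
    """Split roles such as 'mon.a' or 'osd.2' into the host sets that
    the ceph-deploy commands are built from."""
    mons, osds, osd_to_name = set(), set(), {}
    for host, roles in remotes.items():
        for role in roles:
            daemon_type, number = role.split('.')
            if daemon_type == 'osd':
                osds.add(host)
                osd_to_name[number] = host
            elif daemon_type == 'mon':
                mons.add(host)
    return mons, osds, osd_to_name

# collect_roles({'mira001': ['mon.a', 'osd.0'], 'mira002': ['osd.1']})
# -> ({'mira001'}, {'mira001', 'mira002'}, {'0': 'mira001', '1': 'mira002'})
```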
- - cmds = ['ceph-deploy new ' + ' '.join(all_mons)] - - if use_install_repo: - cmds.append('ceph-deploy repo ceph-mon ' + - ' '.join(all_mons)) - cmds.append('ceph-deploy install --no-adjust-repos --mon ' + - ' '.join(all_mons)) - cmds.append('ceph-deploy repo ceph-osd ' + - ' '.join(all_osds)) - cmds.append('ceph-deploy install --no-adjust-repos --osd ' + - ' '.join(all_osds)) - # We tell users to use `hostname` in our docs. Do the same here. - cmds.append('ceph-deploy install --no-adjust-repos --cli `hostname`') - else: - cmds.append('ceph-deploy install ' + ' '.join(all_machines)) - - cmds.append('ceph-deploy mon create-initial') - - for cmd in cmds: - cal_svr.run(args=cmd).exitstatus - - disk_labels = '_dcba' - # NEEDS WORK assumes disks start with vd (need to check this somewhere) - for cmd_pts in [['disk', 'zap'], ['osd', 'prepare'], ['osd', 'activate']]: - mach_osd_cnt = {} - for osdn in osd_to_name: - osd_mac = osd_to_name[osdn] - mach_osd_cnt[osd_mac] = mach_osd_cnt.get(osd_mac, 0) + 1 - arg_list = ['ceph-deploy'] - arg_list.extend(cmd_pts) - disk_id = '%s:vd%s' % (osd_to_name[osdn], - disk_labels[mach_osd_cnt[osd_mac]]) - if 'activate' in cmd_pts: - disk_id += '1' - arg_list.append(disk_id) - cal_svr.run(args=arg_list).exitstatus - - -def undeploy_ceph(ctx, cal_svr): - """ - Cleanup deployment of ceph. - """ - all_machines = [] - ret = True - for remote in ctx.cluster.remotes: - roles = ctx.cluster.remotes[remote] - if ( - not any('osd' in role for role in roles) and - not any('mon' in role for role in roles) - ): - continue - ret &= remote.run( - args=['sudo', 'stop', 'ceph-all', run.Raw('||'), - 'sudo', 'service', 'ceph', 'stop'] - ).exitstatus - all_machines.append(remote.shortname) - all_machines = set(all_machines) - cmd1 = ['ceph-deploy', 'uninstall'] - cmd1.extend(all_machines) - ret &= cal_svr.run(args=cmd1).exitstatus - cmd2 = ['ceph-deploy', 'purge'] - cmd2.extend(all_machines) - ret &= cal_svr.run(args=cmd2).exitstatus - for remote in ctx.cluster.remotes: - ret &= remote.run(args=['sudo', 'rm', '-rf', - '.ssh/known_hosts']).exitstatus - return ret - - -@contextlib.contextmanager -def calamari_connect(ctx, cal_svr): - """ - Connect calamari to the ceph nodes. - """ - connects = ['ceph-deploy', 'calamari', 'connect'] - for machine_info in ctx.cluster.remotes: - if 'client.0' not in ctx.cluster.remotes[machine_info]: - connects.append(machine_info.shortname) - ret = cal_svr.run(args=connects) - if ret.exitstatus: - raise RuntimeError('calamari connect failed') - try: - yield - finally: - log.info('Calamari test terminating') - - -@contextlib.contextmanager -def browser(start_browser, web_page): - """ - Bring up a browser, if wanted. - """ - if start_browser: - webbrowser.open('http://%s' % web_page) - try: - yield - finally: - if start_browser: - log.info('Web browser support terminating') diff --git a/src/ceph/qa/tasks/ceph.py b/src/ceph/qa/tasks/ceph.py deleted file mode 100644 index 72f2653..0000000 --- a/src/ceph/qa/tasks/ceph.py +++ /dev/null @@ -1,1688 +0,0 @@ -""" -Ceph cluster task. - -Handle the setup, starting, and clean-up of a Ceph cluster. 
-""" -from cStringIO import StringIO - -import argparse -import contextlib -import errno -import logging -import os -import json -import time -import gevent -import socket - -from paramiko import SSHException -from ceph_manager import CephManager, write_conf -from tasks.cephfs.filesystem import Filesystem -from teuthology import misc as teuthology -from teuthology import contextutil -from teuthology import exceptions -from teuthology.orchestra import run -import ceph_client as cclient -from teuthology.orchestra.daemon import DaemonGroup - -CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw'] - -log = logging.getLogger(__name__) - - -def generate_caps(type_): - """ - Each call will return the next capability for each system type - (essentially a subset of possible role values). Valid types are osd, - mds and client. - """ - defaults = dict( - osd=dict( - mon='allow *', - mgr='allow *', - osd='allow *', - ), - mgr=dict( - mon='allow profile mgr', - osd='allow *', - mds='allow *', - ), - mds=dict( - mon='allow *', - mgr='allow *', - osd='allow *', - mds='allow', - ), - client=dict( - mon='allow rw', - mgr='allow r', - osd='allow rwx', - mds='allow', - ), - ) - for subsystem, capability in defaults[type_].items(): - yield '--cap' - yield subsystem - yield capability - - -@contextlib.contextmanager -def ceph_log(ctx, config): - """ - Create /var/log/ceph log directory that is open to everyone. - Add valgrind and profiling-logger directories. - - :param ctx: Context - :param config: Configuration - """ - log.info('Making ceph log dir writeable by non-root...') - run.wait( - ctx.cluster.run( - args=[ - 'sudo', - 'chmod', - '777', - '/var/log/ceph', - ], - wait=False, - ) - ) - log.info('Disabling ceph logrotate...') - run.wait( - ctx.cluster.run( - args=[ - 'sudo', - 'rm', '-f', '--', - '/etc/logrotate.d/ceph', - ], - wait=False, - ) - ) - log.info('Creating extra log directories...') - run.wait( - ctx.cluster.run( - args=[ - 'sudo', - 'install', '-d', '-m0777', '--', - '/var/log/ceph/valgrind', - '/var/log/ceph/profiling-logger', - ], - wait=False, - ) - ) - - class Rotater(object): - stop_event = gevent.event.Event() - - def invoke_logrotate(self): - # 1) install ceph-test.conf in /etc/logrotate.d - # 2) continuously loop over logrotate invocation with ceph-test.conf - while not self.stop_event.is_set(): - self.stop_event.wait(timeout=30) - try: - run.wait( - ctx.cluster.run( - args=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf' - ], - wait=False, - ) - ) - except exceptions.ConnectionLostError as e: - # Some tests may power off nodes during test, in which - # case we will see connection errors that we should ignore. - log.debug("Missed logrotate, node '{0}' is offline".format( - e.node)) - except EOFError as e: - # Paramiko sometimes raises this when it fails to - # connect to a node during open_session. As with - # ConnectionLostError, we ignore this because nodes - # are allowed to get power cycled during tests. 
- log.debug("Missed logrotate, EOFError") - except SSHException as e: - log.debug("Missed logrotate, SSHException") - except socket.error as e: - if e.errno == errno.EHOSTUNREACH: - log.debug("Missed logrotate, host unreachable") - else: - raise - - def begin(self): - self.thread = gevent.spawn(self.invoke_logrotate) - - def end(self): - self.stop_event.set() - self.thread.get() - - def write_rotate_conf(ctx, daemons): - testdir = teuthology.get_testdir(ctx) - rotate_conf_path = os.path.join(os.path.dirname(__file__), 'logrotate.conf') - with file(rotate_conf_path, 'rb') as f: - conf = "" - for daemon, size in daemons.iteritems(): - log.info('writing logrotate stanza for {daemon}'.format(daemon=daemon)) - conf += f.read().format(daemon_type=daemon, max_size=size) - f.seek(0, 0) - - for remote in ctx.cluster.remotes.iterkeys(): - teuthology.write_file(remote=remote, - path='{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir), - data=StringIO(conf) - ) - remote.run( - args=[ - 'sudo', - 'mv', - '{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir), - '/etc/logrotate.d/ceph-test.conf', - run.Raw('&&'), - 'sudo', - 'chmod', - '0644', - '/etc/logrotate.d/ceph-test.conf', - run.Raw('&&'), - 'sudo', - 'chown', - 'root.root', - '/etc/logrotate.d/ceph-test.conf' - ] - ) - remote.chcon('/etc/logrotate.d/ceph-test.conf', - 'system_u:object_r:etc_t:s0') - - if ctx.config.get('log-rotate'): - daemons = ctx.config.get('log-rotate') - log.info('Setting up log rotation with ' + str(daemons)) - write_rotate_conf(ctx, daemons) - logrotater = Rotater() - logrotater.begin() - try: - yield - - finally: - if ctx.config.get('log-rotate'): - log.info('Shutting down logrotate') - logrotater.end() - ctx.cluster.run( - args=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf' - ] - ) - if ctx.archive is not None and \ - not (ctx.config.get('archive-on-error') and ctx.summary['success']): - # and logs - log.info('Compressing logs...') - run.wait( - ctx.cluster.run( - args=[ - 'sudo', - 'find', - '/var/log/ceph', - '-name', - '*.log', - '-print0', - run.Raw('|'), - 'sudo', - 'xargs', - '-0', - '--no-run-if-empty', - '--', - 'gzip', - '--', - ], - wait=False, - ), - ) - - log.info('Archiving logs...') - path = os.path.join(ctx.archive, 'remote') - os.makedirs(path) - for remote in ctx.cluster.remotes.iterkeys(): - sub = os.path.join(path, remote.shortname) - os.makedirs(sub) - teuthology.pull_directory(remote, '/var/log/ceph', - os.path.join(sub, 'log')) - - -def assign_devs(roles, devs): - """ - Create a dictionary of devs indexed by roles - - :param roles: List of roles - :param devs: Corresponding list of devices. - :returns: Dictionary of devs indexed by roles. - """ - return dict(zip(roles, devs)) - - -@contextlib.contextmanager -def valgrind_post(ctx, config): - """ - After the tests run, look throught all the valgrind logs. Exceptions are raised - if textual errors occured in the logs, or if valgrind exceptions were detected in - the logs. 
- - :param ctx: Context - :param config: Configuration - """ - try: - yield - finally: - lookup_procs = list() - log.info('Checking for errors in any valgrind logs...') - for remote in ctx.cluster.remotes.iterkeys(): - # look at valgrind logs for each node - proc = remote.run( - args=[ - 'sudo', - 'zgrep', - '<kind>', - run.Raw('/var/log/ceph/valgrind/*'), - '/dev/null', # include a second file so that we always get a filename prefix on the output - run.Raw('|'), - 'sort', - run.Raw('|'), - 'uniq', - ], - wait=False, - check_status=False, - stdout=StringIO(), - ) - lookup_procs.append((proc, remote)) - - valgrind_exception = None - for (proc, remote) in lookup_procs: - proc.wait() - out = proc.stdout.getvalue() - for line in out.split('\n'): - if line == '': - continue - try: - (file, kind) = line.split(':') - except Exception: - log.error('failed to split line %s', line) - raise - log.debug('file %s kind %s', file, kind) - if (file.find('mds') >= 0) and kind.find('Lost') > 0: - continue - log.error('saw valgrind issue %s in %s', kind, file) - valgrind_exception = Exception('saw valgrind issues') - - if config.get('expect_valgrind_errors'): - if not valgrind_exception: - raise Exception('expected valgrind issues and found none') - else: - if valgrind_exception: - raise valgrind_exception - - -@contextlib.contextmanager -def crush_setup(ctx, config): - cluster_name = config['cluster'] - first_mon = teuthology.get_first_mon(ctx, config, cluster_name) - (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - profile = config.get('crush_tunables', 'default') - log.info('Setting crush tunables to %s', profile) - mon_remote.run( - args=['sudo', 'ceph', '--cluster', cluster_name, - 'osd', 'crush', 'tunables', profile]) - yield - - -@contextlib.contextmanager -def create_rbd_pool(ctx, config): - cluster_name = config['cluster'] - first_mon = teuthology.get_first_mon(ctx, config, cluster_name) - (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys() - log.info('Waiting for OSDs to come up') - teuthology.wait_until_osds_up( - ctx, - cluster=ctx.cluster, - remote=mon_remote, - ceph_cluster=cluster_name, - ) - if config.get('create_rbd_pool', True): - log.info('Creating RBD pool') - mon_remote.run( - args=['sudo', 'ceph', '--cluster', cluster_name, - 'osd', 'pool', 'create', 'rbd', '8']) - mon_remote.run( - args=[ - 'sudo', 'ceph', '--cluster', cluster_name, - 'osd', 'pool', 'application', 'enable', - 'rbd', 'rbd', '--yes-i-really-mean-it' - ], - check_status=False) - yield - -@contextlib.contextmanager -def cephfs_setup(ctx, config): - cluster_name = config['cluster'] - testdir = teuthology.get_testdir(ctx) - coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir) - - first_mon = teuthology.get_first_mon(ctx, config, cluster_name) - (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys() - mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name)) - # If there are any MDSs, then create a filesystem for them to use - # Do this last because requires mon cluster to be up and running - if mdss.remotes: - log.info('Setting up CephFS filesystem...') - - fs = Filesystem(ctx, name='cephfs', create=True, - ec_profile=config.get('cephfs_ec_profile', None)) - - is_active_mds = lambda role: 'mds.' 
in role and not role.endswith('-s') and '-s-' not in role - all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles] - num_active = len([r for r in all_roles if is_active_mds(r)]) - - fs.set_max_mds(num_active) - fs.set_allow_dirfrags(True) - - yield - - -@contextlib.contextmanager -def cluster(ctx, config): - """ - Handle the creation and removal of a ceph cluster. - - On startup: - Create directories needed for the cluster. - Create remote journals for all osds. - Create and set keyring. - Copy the monmap to tht test systems. - Setup mon nodes. - Setup mds nodes. - Mkfs osd nodes. - Add keyring information to monmaps - Mkfs mon nodes. - - On exit: - If errors occured, extract a failure message and store in ctx.summary. - Unmount all test files and temporary journaling files. - Save the monitor information and archive all ceph logs. - Cleanup the keyring setup, and remove all monitor map and data files left over. - - :param ctx: Context - :param config: Configuration - """ - if ctx.config.get('use_existing_cluster', False) is True: - log.info("'use_existing_cluster' is true; skipping cluster creation") - yield - - testdir = teuthology.get_testdir(ctx) - cluster_name = config['cluster'] - data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir, cluster=cluster_name) - log.info('Creating ceph cluster %s...', cluster_name) - run.wait( - ctx.cluster.run( - args=[ - 'install', '-d', '-m0755', '--', - data_dir, - ], - wait=False, - ) - ) - - run.wait( - ctx.cluster.run( - args=[ - 'sudo', - 'install', '-d', '-m0777', '--', '/var/run/ceph', - ], - wait=False, - ) - ) - - devs_to_clean = {} - remote_to_roles_to_devs = {} - remote_to_roles_to_journals = {} - osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name)) - for remote, roles_for_host in osds.remotes.iteritems(): - devs = teuthology.get_scratch_devices(remote) - roles_to_devs = {} - roles_to_journals = {} - if config.get('fs'): - log.info('fs option selected, checking for scratch devs') - log.info('found devs: %s' % (str(devs),)) - devs_id_map = teuthology.get_wwn_id_map(remote, devs) - iddevs = devs_id_map.values() - roles_to_devs = assign_devs( - teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs - ) - if len(roles_to_devs) < len(iddevs): - iddevs = iddevs[len(roles_to_devs):] - devs_to_clean[remote] = [] - - if config.get('block_journal'): - log.info('block journal enabled') - roles_to_journals = assign_devs( - teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs - ) - log.info('journal map: %s', roles_to_journals) - - if config.get('tmpfs_journal'): - log.info('tmpfs journal enabled') - roles_to_journals = {} - remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt']) - for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name): - tmpfs = '/mnt/' + role - roles_to_journals[role] = tmpfs - remote.run(args=['truncate', '-s', '1500M', tmpfs]) - log.info('journal map: %s', roles_to_journals) - - log.info('dev map: %s' % (str(roles_to_devs),)) - remote_to_roles_to_devs[remote] = roles_to_devs - remote_to_roles_to_journals[remote] = roles_to_journals - - log.info('Generating config...') - remotes_and_roles = ctx.cluster.remotes.items() - roles = [role_list for (remote, role_list) in remotes_and_roles] - ips = [host for (host, port) in - (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)] - conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips, cluster=cluster_name) - for remote, 
roles_to_journals in remote_to_roles_to_journals.iteritems(): - for role, journal in roles_to_journals.iteritems(): - name = teuthology.ceph_role(role) - if name not in conf: - conf[name] = {} - conf[name]['osd journal'] = journal - for section, keys in config['conf'].iteritems(): - for key, value in keys.iteritems(): - log.info("[%s] %s = %s" % (section, key, value)) - if section not in conf: - conf[section] = {} - conf[section][key] = value - - if config.get('tmpfs_journal'): - conf['journal dio'] = False - - if not hasattr(ctx, 'ceph'): - ctx.ceph = {} - ctx.ceph[cluster_name] = argparse.Namespace() - ctx.ceph[cluster_name].conf = conf - - default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name) - keyring_path = config.get('keyring_path', default_keyring) - - coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir) - - firstmon = teuthology.get_first_mon(ctx, config, cluster_name) - - log.info('Setting up %s...' % firstmon) - ctx.cluster.only(firstmon).run( - args=[ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - coverage_dir, - 'ceph-authtool', - '--create-keyring', - keyring_path, - ], - ) - ctx.cluster.only(firstmon).run( - args=[ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - coverage_dir, - 'ceph-authtool', - '--gen-key', - '--name=mon.', - keyring_path, - ], - ) - ctx.cluster.only(firstmon).run( - args=[ - 'sudo', - 'chmod', - '0644', - keyring_path, - ], - ) - (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys() - monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir, - cluster=cluster_name) - fsid = teuthology.create_simple_monmap( - ctx, - remote=mon0_remote, - conf=conf, - path=monmap_path, - ) - if not 'global' in conf: - conf['global'] = {} - conf['global']['fsid'] = fsid - - default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name) - conf_path = config.get('conf_path', default_conf_path) - log.info('Writing %s for FSID %s...' % (conf_path, fsid)) - write_conf(ctx, conf_path, cluster_name) - - log.info('Creating admin key on %s...' 
% firstmon) - ctx.cluster.only(firstmon).run( - args=[ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - coverage_dir, - 'ceph-authtool', - '--gen-key', - '--name=client.admin', - '--set-uid=0', - '--cap', 'mon', 'allow *', - '--cap', 'osd', 'allow *', - '--cap', 'mds', 'allow *', - '--cap', 'mgr', 'allow *', - keyring_path, - ], - ) - - log.info('Copying monmap to all nodes...') - keyring = teuthology.get_file( - remote=mon0_remote, - path=keyring_path, - ) - monmap = teuthology.get_file( - remote=mon0_remote, - path=monmap_path, - ) - - for rem in ctx.cluster.remotes.iterkeys(): - # copy mon key and initial monmap - log.info('Sending monmap to node {remote}'.format(remote=rem)) - teuthology.sudo_write_file( - remote=rem, - path=keyring_path, - data=keyring, - perms='0644' - ) - teuthology.write_file( - remote=rem, - path=monmap_path, - data=monmap, - ) - - log.info('Setting up mon nodes...') - mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name)) - - if not config.get('skip_mgr_daemons', False): - log.info('Setting up mgr nodes...') - mgrs = ctx.cluster.only(teuthology.is_type('mgr', cluster_name)) - for remote, roles_for_host in mgrs.remotes.iteritems(): - for role in teuthology.cluster_roles_of_type(roles_for_host, 'mgr', - cluster_name): - _, _, id_ = teuthology.split_role(role) - mgr_dir = '/var/lib/ceph/mgr/{cluster}-{id}'.format( - cluster=cluster_name, - id=id_, - ) - remote.run( - args=[ - 'sudo', - 'mkdir', - '-p', - mgr_dir, - run.Raw('&&'), - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - coverage_dir, - 'ceph-authtool', - '--create-keyring', - '--gen-key', - '--name=mgr.{id}'.format(id=id_), - mgr_dir + '/keyring', - ], - ) - - log.info('Setting up mds nodes...') - mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name)) - for remote, roles_for_host in mdss.remotes.iteritems(): - for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds', - cluster_name): - _, _, id_ = teuthology.split_role(role) - mds_dir = '/var/lib/ceph/mds/{cluster}-{id}'.format( - cluster=cluster_name, - id=id_, - ) - remote.run( - args=[ - 'sudo', - 'mkdir', - '-p', - mds_dir, - run.Raw('&&'), - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - coverage_dir, - 'ceph-authtool', - '--create-keyring', - '--gen-key', - '--name=mds.{id}'.format(id=id_), - mds_dir + '/keyring', - ], - ) - - cclient.create_keyring(ctx, cluster_name) - log.info('Running mkfs on osd nodes...') - - if not hasattr(ctx, 'disk_config'): - ctx.disk_config = argparse.Namespace() - if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'): - ctx.disk_config.remote_to_roles_to_dev = {} - if not hasattr(ctx.disk_config, 'remote_to_roles_to_journals'): - ctx.disk_config.remote_to_roles_to_journals = {} - if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'): - ctx.disk_config.remote_to_roles_to_dev_mount_options = {} - if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'): - ctx.disk_config.remote_to_roles_to_dev_fstype = {} - - teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev, remote_to_roles_to_devs) - teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_journals, remote_to_roles_to_journals) - - log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev))) - for remote, roles_for_host in osds.remotes.iteritems(): - roles_to_devs = remote_to_roles_to_devs[remote] - roles_to_journals = remote_to_roles_to_journals[remote] - - for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name): - _, _, id_ 
= teuthology.split_role(role) - mnt_point = '/var/lib/ceph/osd/{cluster}-{id}'.format(cluster=cluster_name, id=id_) - remote.run( - args=[ - 'sudo', - 'mkdir', - '-p', - mnt_point, - ]) - log.info(str(roles_to_devs)) - log.info(str(roles_to_journals)) - log.info(role) - if roles_to_devs.get(role): - dev = roles_to_devs[role] - fs = config.get('fs') - package = None - mkfs_options = config.get('mkfs_options') - mount_options = config.get('mount_options') - if fs == 'btrfs': - # package = 'btrfs-tools' - if mount_options is None: - mount_options = ['noatime', 'user_subvol_rm_allowed'] - if mkfs_options is None: - mkfs_options = ['-m', 'single', - '-l', '32768', - '-n', '32768'] - if fs == 'xfs': - # package = 'xfsprogs' - if mount_options is None: - mount_options = ['noatime'] - if mkfs_options is None: - mkfs_options = ['-f', '-i', 'size=2048'] - if fs == 'ext4' or fs == 'ext3': - if mount_options is None: - mount_options = ['noatime', 'user_xattr'] - - if mount_options is None: - mount_options = [] - if mkfs_options is None: - mkfs_options = [] - mkfs = ['mkfs.%s' % fs] + mkfs_options - log.info('%s on %s on %s' % (mkfs, dev, remote)) - if package is not None: - remote.run( - args=[ - 'sudo', - 'apt-get', 'install', '-y', package - ], - stdout=StringIO(), - ) - - try: - remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev]) - except run.CommandFailedError: - # Newer btfs-tools doesn't prompt for overwrite, use -f - if '-f' not in mount_options: - mkfs_options.append('-f') - mkfs = ['mkfs.%s' % fs] + mkfs_options - log.info('%s on %s on %s' % (mkfs, dev, remote)) - remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev]) - - log.info('mount %s on %s -o %s' % (dev, remote, - ','.join(mount_options))) - remote.run( - args=[ - 'sudo', - 'mount', - '-t', fs, - '-o', ','.join(mount_options), - dev, - mnt_point, - ] - ) - remote.run( - args=[ - 'sudo', '/sbin/restorecon', mnt_point, - ], - check_status=False, - ) - if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options: - ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {} - ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options - if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype: - ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {} - ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs - devs_to_clean[remote].append(mnt_point) - - for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name): - _, _, id_ = teuthology.split_role(role) - remote.run( - args=[ - 'sudo', - 'MALLOC_CHECK_=3', - 'adjust-ulimits', - 'ceph-coverage', - coverage_dir, - 'ceph-osd', - '--cluster', - cluster_name, - '--mkfs', - '--mkkey', - '-i', id_, - '--monmap', monmap_path, - ], - ) - - log.info('Reading keys from all nodes...') - keys_fp = StringIO() - keys = [] - for remote, roles_for_host in ctx.cluster.remotes.iteritems(): - for type_ in ['mgr', 'mds', 'osd']: - if type_ == 'mgr' and config.get('skip_mgr_daemons', False): - continue - for role in teuthology.cluster_roles_of_type(roles_for_host, type_, cluster_name): - _, _, id_ = teuthology.split_role(role) - data = teuthology.get_file( - remote=remote, - path='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format( - type=type_, - id=id_, - cluster=cluster_name, - ), - sudo=True, - ) - keys.append((type_, id_, data)) - keys_fp.write(data) - for remote, roles_for_host in ctx.cluster.remotes.iteritems(): - for role in teuthology.cluster_roles_of_type(roles_for_host, 'client', cluster_name): - _, 
_, id_ = teuthology.split_role(role) - data = teuthology.get_file( - remote=remote, - path='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_, cluster=cluster_name) - ) - keys.append(('client', id_, data)) - keys_fp.write(data) - - log.info('Adding keys to all mons...') - writes = mons.run( - args=[ - 'sudo', 'tee', '-a', - keyring_path, - ], - stdin=run.PIPE, - wait=False, - stdout=StringIO(), - ) - keys_fp.seek(0) - teuthology.feed_many_stdins_and_close(keys_fp, writes) - run.wait(writes) - for type_, id_, data in keys: - run.wait( - mons.run( - args=[ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - coverage_dir, - 'ceph-authtool', - keyring_path, - '--name={type}.{id}'.format( - type=type_, - id=id_, - ), - ] + list(generate_caps(type_)), - wait=False, - ), - ) - - log.info('Running mkfs on mon nodes...') - for remote, roles_for_host in mons.remotes.iteritems(): - for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon', cluster_name): - _, _, id_ = teuthology.split_role(role) - remote.run( - args=[ - 'sudo', - 'mkdir', - '-p', - '/var/lib/ceph/mon/{cluster}-{id}'.format(id=id_, cluster=cluster_name), - ], - ) - remote.run( - args=[ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - coverage_dir, - 'ceph-mon', - '--cluster', cluster_name, - '--mkfs', - '-i', id_, - '--monmap', monmap_path, - '--keyring', keyring_path, - ], - ) - - run.wait( - mons.run( - args=[ - 'rm', - '--', - monmap_path, - ], - wait=False, - ), - ) - - try: - yield - except Exception: - # we need to know this below - ctx.summary['success'] = False - raise - finally: - (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys() - - log.info('Checking cluster log for badness...') - - def first_in_ceph_log(pattern, excludes): - """ - Find the first occurence of the pattern specified in the Ceph log, - Returns None if none found. - - :param pattern: Pattern scanned for. - :param excludes: Patterns to ignore. 
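
The OSD filesystem setup earlier in this hunk fills in mkfs and mount options only when the test config leaves them unset. A minimal standalone sketch of that selection follows; the option values are copied from the removed code, while the helper name is purely illustrative::

    FS_DEFAULTS = {
        'btrfs': (['-m', 'single', '-l', '32768', '-n', '32768'],
                  ['noatime', 'user_subvol_rm_allowed']),
        'xfs':   (['-f', '-i', 'size=2048'], ['noatime']),
        'ext4':  ([], ['noatime', 'user_xattr']),
        'ext3':  ([], ['noatime', 'user_xattr']),
    }

    def pick_fs_options(fs, mkfs_options=None, mount_options=None):
        """Fill in defaults only for options the test config did not set."""
        default_mkfs, default_mount = FS_DEFAULTS.get(fs, ([], []))
        if mkfs_options is None:
            mkfs_options = list(default_mkfs)
        if mount_options is None:
            mount_options = list(default_mount)
        return mkfs_options, mount_options

    if __name__ == '__main__':
        print(pick_fs_options('xfs'))                         # defaults for xfs
        print(pick_fs_options('btrfs', mkfs_options=['-f']))  # explicit mkfs opts kept
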
- :return: First line of text (or None if not found) - """ - args = [ - 'sudo', - 'egrep', pattern, - '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name), - ] - for exclude in excludes: - args.extend([run.Raw('|'), 'egrep', '-v', exclude]) - args.extend([ - run.Raw('|'), 'head', '-n', '1', - ]) - r = mon0_remote.run( - stdout=StringIO(), - args=args, - ) - stdout = r.stdout.getvalue() - if stdout != '': - return stdout - return None - - if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]', - config['log_whitelist']) is not None: - log.warning('Found errors (ERR|WRN|SEC) in cluster log') - ctx.summary['success'] = False - # use the most severe problem as the failure reason - if 'failure_reason' not in ctx.summary: - for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']: - match = first_in_ceph_log(pattern, config['log_whitelist']) - if match is not None: - ctx.summary['failure_reason'] = \ - '"{match}" in cluster log'.format( - match=match.rstrip('\n'), - ) - break - - for remote, dirs in devs_to_clean.iteritems(): - for dir_ in dirs: - log.info('Unmounting %s on %s' % (dir_, remote)) - try: - remote.run( - args=[ - 'sync', - run.Raw('&&'), - 'sudo', - 'umount', - '-f', - dir_ - ] - ) - except Exception as e: - remote.run(args=[ - 'sudo', - run.Raw('PATH=/usr/sbin:$PATH'), - 'lsof', - run.Raw(';'), - 'ps', 'auxf', - ]) - raise e - - if config.get('tmpfs_journal'): - log.info('tmpfs journal enabled - unmounting tmpfs at /mnt') - for remote, roles_for_host in osds.remotes.iteritems(): - remote.run( - args=['sudo', 'umount', '-f', '/mnt'], - check_status=False, - ) - - if ctx.archive is not None and \ - not (ctx.config.get('archive-on-error') and ctx.summary['success']): - - # archive mon data, too - log.info('Archiving mon data...') - path = os.path.join(ctx.archive, 'data') - try: - os.makedirs(path) - except OSError as e: - if e.errno == errno.EEXIST: - pass - else: - raise - for remote, roles in mons.remotes.iteritems(): - for role in roles: - is_mon = teuthology.is_type('mon', cluster_name) - if is_mon(role): - _, _, id_ = teuthology.split_role(role) - mon_dir = '/var/lib/ceph/mon/' + \ - '{0}-{1}'.format(cluster_name, id_) - teuthology.pull_directory_tarball( - remote, - mon_dir, - path + '/' + role + '.tgz') - - log.info('Cleaning ceph cluster...') - run.wait( - ctx.cluster.run( - args=[ - 'sudo', - 'rm', - '-rf', - '--', - conf_path, - keyring_path, - data_dir, - monmap_path, - run.Raw('{tdir}/../*.pid'.format(tdir=testdir)), - ], - wait=False, - ), - ) - - -def osd_scrub_pgs(ctx, config): - """ - Scrub pgs when we exit. - - First make sure all pgs are active and clean. - Next scrub all osds. - Then periodically check until all pgs have scrub time stamps that - indicate the last scrub completed. Time out if no progess is made - here after two minutes. 
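
The log check above greps the cluster log for ERR, WRN and SEC entries, drops anything matched by the log-whitelist, and records the most severe surviving line as the failure reason. A pure-Python sketch of the same filtering and severity ordering; the sample lines and whitelist are invented for the demo::

    import re

    def first_in_log(lines, pattern, excludes):
        """Return the first line matching pattern and no exclude regex."""
        pat = re.compile(pattern)
        excl = [re.compile(e) for e in excludes]
        for line in lines:
            if pat.search(line) and not any(e.search(line) for e in excl):
                return line.rstrip('\n')
        return None

    def failure_reason(lines, whitelist):
        # severity order mirrors the removed code: SEC, then ERR, then WRN
        for pattern in (r'\[SEC\]', r'\[ERR\]', r'\[WRN\]'):
            match = first_in_log(lines, pattern, whitelist)
            if match is not None:
                return '"{0}" in cluster log'.format(match)
        return None

    if __name__ == '__main__':
        sample = [
            'cluster [WRN] slow request on osd.3',
            'cluster [ERR] scrub mismatch on pg 1.0',
        ]
        print(failure_reason(sample, whitelist=[r'slow request']))
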
- """ - retries = 40 - delays = 20 - cluster_name = config['cluster'] - manager = ctx.managers[cluster_name] - all_clean = False - for _ in range(0, retries): - stats = manager.get_pg_stats() - bad = [stat['pgid'] for stat in stats if 'active+clean' not in stat['state']] - if not bad: - all_clean = True - break - log.info( - "Waiting for all PGs to be active and clean, waiting on %s" % bad) - time.sleep(delays) - if not all_clean: - raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.") - check_time_now = time.localtime() - time.sleep(1) - all_roles = teuthology.all_roles(ctx.cluster) - for role in teuthology.cluster_roles_of_type(all_roles, 'osd', cluster_name): - log.info("Scrubbing {osd}".format(osd=role)) - _, _, id_ = teuthology.split_role(role) - # allow this to fail; in certain cases the OSD might not be up - # at this point. we will catch all pgs below. - try: - manager.raw_cluster_cmd('osd', 'deep-scrub', id_) - except run.CommandFailedError: - pass - prev_good = 0 - gap_cnt = 0 - loop = True - while loop: - stats = manager.get_pg_stats() - timez = [(stat['pgid'],stat['last_scrub_stamp']) for stat in stats] - loop = False - thiscnt = 0 - for (pgid, tmval) in timez: - pgtm = time.strptime(tmval[0:tmval.find('.')], '%Y-%m-%d %H:%M:%S') - if pgtm > check_time_now: - thiscnt += 1 - else: - log.info('pgid %s last_scrub_stamp %s %s <= %s', pgid, tmval, pgtm, check_time_now) - loop = True - if thiscnt > prev_good: - prev_good = thiscnt - gap_cnt = 0 - else: - gap_cnt += 1 - if gap_cnt % 6 == 0: - for (pgid, tmval) in timez: - # re-request scrub every so often in case the earlier - # request was missed. do not do it everytime because - # the scrub may be in progress or not reported yet and - # we will starve progress. - manager.raw_cluster_cmd('pg', 'deep-scrub', pgid) - if gap_cnt > retries: - raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.') - if loop: - log.info('Still waiting for all pgs to be scrubbed.') - time.sleep(delays) - - -@contextlib.contextmanager -def run_daemon(ctx, config, type_): - """ - Run daemons for a role type. Handle the startup and termination of a a daemon. - On startup -- set coverages, cpu_profile, valgrind values for all remotes, - and a max_mds value for one mds. - On cleanup -- Stop all existing daemons of this type. - - :param ctx: Context - :param config: Configuration - :paran type_: Role type - """ - cluster_name = config['cluster'] - log.info('Starting %s daemons in cluster %s...', type_, cluster_name) - testdir = teuthology.get_testdir(ctx) - daemons = ctx.cluster.only(teuthology.is_type(type_, cluster_name)) - - # check whether any daemons if this type are configured - if daemons is None: - return - coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir) - - daemon_signal = 'kill' - if config.get('coverage') or config.get('valgrind') is not None: - daemon_signal = 'term' - - # create osds in order. (this only matters for pre-luminous, which might - # be hammer, which doesn't take an id_ argument to legacy 'osd create'). 
- osd_uuids = {} - for remote, roles_for_host in daemons.remotes.iteritems(): - is_type_ = teuthology.is_type(type_, cluster_name) - for role in roles_for_host: - if not is_type_(role): - continue - _, _, id_ = teuthology.split_role(role) - - - if type_ == 'osd': - datadir='/var/lib/ceph/osd/{cluster}-{id}'.format( - cluster=cluster_name, id=id_) - osd_uuid = teuthology.get_file( - remote=remote, - path=datadir + '/fsid', - sudo=True, - ).strip() - osd_uuids[id_] = osd_uuid - for osd_id in range(len(osd_uuids)): - id_ = str(osd_id) - osd_uuid = osd_uuids.get(id_) - try: - remote.run( - args=[ - 'sudo', 'ceph', '--cluster', cluster_name, - 'osd', 'new', osd_uuid, id_, - ] - ) - except: - # fallback to pre-luminous (hammer or jewel) - remote.run( - args=[ - 'sudo', 'ceph', '--cluster', cluster_name, - 'osd', 'create', osd_uuid, - ] - ) - if config.get('add_osds_to_crush'): - remote.run( - args=[ - 'sudo', 'ceph', '--cluster', cluster_name, - 'osd', 'crush', 'create-or-move', 'osd.' + id_, - '1.0', 'host=localhost', 'root=default', - ] - ) - - for remote, roles_for_host in daemons.remotes.iteritems(): - is_type_ = teuthology.is_type(type_, cluster_name) - for role in roles_for_host: - if not is_type_(role): - continue - _, _, id_ = teuthology.split_role(role) - - run_cmd = [ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - coverage_dir, - 'daemon-helper', - daemon_signal, - ] - run_cmd_tail = [ - 'ceph-%s' % (type_), - '-f', - '--cluster', cluster_name, - '-i', id_] - - if type_ in config.get('cpu_profile', []): - profile_path = '/var/log/ceph/profiling-logger/%s.prof' % (role) - run_cmd.extend(['env', 'CPUPROFILE=%s' % profile_path]) - - if config.get('valgrind') is not None: - valgrind_args = None - if type_ in config['valgrind']: - valgrind_args = config['valgrind'][type_] - if role in config['valgrind']: - valgrind_args = config['valgrind'][role] - run_cmd = teuthology.get_valgrind_args(testdir, role, - run_cmd, - valgrind_args) - - run_cmd.extend(run_cmd_tail) - - # always register mgr; don't necessarily start - ctx.daemons.register_daemon( - remote, type_, id_, - cluster=cluster_name, - args=run_cmd, - logger=log.getChild(role), - stdin=run.PIPE, - wait=False - ) - if type_ != 'mgr' or not config.get('skip_mgr_daemons', False): - role = cluster_name + '.' + type_ - ctx.daemons.get_daemon(type_, id_, cluster_name).restart() - - try: - yield - finally: - teuthology.stop_daemons_of_type(ctx, type_, cluster_name) - - -def healthy(ctx, config): - """ - Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK. 
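
The OSD creation loop above prefers the luminous-style 'osd new <uuid> <id>' and falls back to the legacy 'osd create <uuid>' when the monitor rejects it. A small sketch of that fallback; the run argument stands in for remote.run, and the demo fakes a pre-luminous monitor rather than talking to a cluster::

    def create_osd(run, cluster, osd_uuid, osd_id):
        try:
            # luminous and later: 'osd new' takes the uuid plus an explicit id
            run(['sudo', 'ceph', '--cluster', cluster, 'osd', 'new', osd_uuid, osd_id])
        except Exception:
            # hammer/jewel fallback: legacy 'osd create' allocates the id itself
            run(['sudo', 'ceph', '--cluster', cluster, 'osd', 'create', osd_uuid])

    if __name__ == '__main__':
        attempts = []
        def fake_run(cmd):
            attempts.append(' '.join(cmd))
            if 'new' in cmd:
                raise RuntimeError('unrecognized command')  # simulate an old mon
        create_osd(fake_run, 'ceph', 'de305d54-75b4-431b-adb2-eb6b9e546014', '0')
        print(attempts)
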
- - :param ctx: Context - :param config: Configuration - """ - config = config if isinstance(config, dict) else dict() - cluster_name = config.get('cluster', 'ceph') - log.info('Waiting until %s daemons up and pgs clean...', cluster_name) - manager = ctx.managers[cluster_name] - try: - manager.wait_for_mgr_available(timeout=30) - except (run.CommandFailedError, AssertionError) as e: - log.info('ignoring mgr wait error, probably testing upgrade: %s', e) - - firstmon = teuthology.get_first_mon(ctx, config, cluster_name) - (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys() - teuthology.wait_until_osds_up( - ctx, - cluster=ctx.cluster, - remote=mon0_remote, - ceph_cluster=cluster_name, - ) - - try: - manager.flush_all_pg_stats() - except (run.CommandFailedError, Exception) as e: - log.info('ignoring flush pg stats error, probably testing upgrade: %s', e) - manager.wait_for_clean() - - log.info('Waiting until ceph cluster %s is healthy...', cluster_name) - teuthology.wait_until_healthy( - ctx, - remote=mon0_remote, - ceph_cluster=cluster_name, - ) - - if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes: - # Some MDSs exist, wait for them to be healthy - ceph_fs = Filesystem(ctx) # TODO: make Filesystem cluster-aware - ceph_fs.wait_for_daemons(timeout=300) - - -def wait_for_osds_up(ctx, config): - """ - Wait for all osd's to come up. - - :param ctx: Context - :param config: Configuration - """ - log.info('Waiting until ceph osds are all up...') - cluster_name = config.get('cluster', 'ceph') - firstmon = teuthology.get_first_mon(ctx, config, cluster_name) - (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys() - teuthology.wait_until_osds_up( - ctx, - cluster=ctx.cluster, - remote=mon0_remote - ) - - -def wait_for_mon_quorum(ctx, config): - """ - Check renote ceph status until all monitors are up. - - :param ctx: Context - :param config: Configuration - """ - if isinstance(config, dict): - mons = config['daemons'] - cluster_name = config.get('cluster', 'ceph') - else: - assert isinstance(config, list) - mons = config - cluster_name = 'ceph' - firstmon = teuthology.get_first_mon(ctx, config, cluster_name) - (remote,) = ctx.cluster.only(firstmon).remotes.keys() - with contextutil.safe_while(sleep=10, tries=60, - action='wait for monitor quorum') as proceed: - while proceed(): - r = remote.run( - args=[ - 'sudo', - 'ceph', - 'quorum_status', - ], - stdout=StringIO(), - logger=log.getChild('quorum_status'), - ) - j = json.loads(r.stdout.getvalue()) - q = j.get('quorum_names', []) - log.debug('Quorum: %s', q) - if sorted(q) == sorted(mons): - break - - -def created_pool(ctx, config): - """ - Add new pools to the dictionary of pools that the ceph-manager - knows about. 
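
wait_for_mon_quorum() above polls 'ceph quorum_status' until the reported quorum_names match the expected monitor ids, ignoring order. A minimal sketch of that loop; get_status stands in for running the command and parsing its JSON::

    import time

    def wait_for_quorum(get_status, expected, tries=60, sleep=10):
        for _ in range(tries):
            quorum = get_status().get('quorum_names', [])
            if sorted(quorum) == sorted(expected):
                return quorum
            time.sleep(sleep)
        raise RuntimeError('monitors %s never formed quorum' % expected)

    if __name__ == '__main__':
        answers = iter([{'quorum_names': ['a']}, {'quorum_names': ['a', 'b']}])
        print(wait_for_quorum(lambda: next(answers), ['b', 'a'], sleep=0))
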
- """ - for new_pool in config: - if new_pool not in ctx.managers['ceph'].pools: - ctx.managers['ceph'].pools[new_pool] = ctx.managers['ceph'].get_pool_property( - new_pool, 'pg_num') - - -@contextlib.contextmanager -def restart(ctx, config): - """ - restart ceph daemons - - For example:: - tasks: - - ceph.restart: [all] - - For example:: - tasks: - - ceph.restart: [osd.0, mon.1, mds.*] - - or:: - - tasks: - - ceph.restart: - daemons: [osd.0, mon.1] - wait-for-healthy: false - wait-for-osds-up: true - - :param ctx: Context - :param config: Configuration - """ - if config is None: - config = {} - elif isinstance(config, list): - config = {'daemons': config} - - daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True) - clusters = set() - for role in daemons: - cluster, type_, id_ = teuthology.split_role(role) - ctx.daemons.get_daemon(type_, id_, cluster).restart() - clusters.add(cluster) - - manager = ctx.managers['ceph'] - for dmon in daemons: - if '.' in dmon: - dm_parts = dmon.split('.') - if dm_parts[1].isdigit(): - if dm_parts[0] == 'osd': - manager.mark_down_osd(int(dm_parts[1])) - - if config.get('wait-for-healthy', True): - for cluster in clusters: - healthy(ctx=ctx, config=dict(cluster=cluster)) - if config.get('wait-for-osds-up', False): - for cluster in clusters: - wait_for_osds_up(ctx=ctx, config=dict(cluster=cluster)) - yield - - -@contextlib.contextmanager -def stop(ctx, config): - """ - Stop ceph daemons - - For example:: - tasks: - - ceph.stop: [mds.*] - - tasks: - - ceph.stop: [osd.0, osd.2] - - tasks: - - ceph.stop: - daemons: [osd.0, osd.2] - - """ - if config is None: - config = {} - elif isinstance(config, list): - config = {'daemons': config} - - daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True) - for role in daemons: - cluster, type_, id_ = teuthology.split_role(role) - ctx.daemons.get_daemon(type_, id_, cluster).stop() - - yield - - -@contextlib.contextmanager -def wait_for_failure(ctx, config): - """ - Wait for a failure of a ceph daemon - - For example:: - tasks: - - ceph.wait_for_failure: [mds.*] - - tasks: - - ceph.wait_for_failure: [osd.0, osd.2] - - tasks: - - ceph.wait_for_failure: - daemons: [osd.0, osd.2] - - """ - if config is None: - config = {} - elif isinstance(config, list): - config = {'daemons': config} - - daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True) - for role in daemons: - cluster, type_, id_ = teuthology.split_role(role) - try: - ctx.daemons.get_daemon(type_, id_, cluster).wait() - except: - log.info('Saw expected daemon failure. Continuing.') - pass - else: - raise RuntimeError('daemon %s did not fail' % role) - - yield - - -def validate_config(ctx, config): - """ - Perform some simple validation on task configuration. - Raises exceptions.ConfigError if an error is found. - """ - # check for osds from multiple clusters on the same host - for remote, roles_for_host in ctx.cluster.remotes.items(): - last_cluster = None - last_role = None - for role in roles_for_host: - role_cluster, role_type, _ = teuthology.split_role(role) - if role_type != 'osd': - continue - if last_cluster and last_cluster != role_cluster: - msg = "Host should not have osds (%s and %s) from multiple clusters" % ( - last_role, role) - raise exceptions.ConfigError(msg) - last_cluster = role_cluster - last_role = role - - -@contextlib.contextmanager -def task(ctx, config): - """ - Set up and tear down a Ceph cluster. 
- - For example:: - - tasks: - - ceph: - - interactive: - - You can also specify what branch to run:: - - tasks: - - ceph: - branch: foo - - Or a tag:: - - tasks: - - ceph: - tag: v0.42.13 - - Or a sha1:: - - tasks: - - ceph: - sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed - - Or a local source dir:: - - tasks: - - ceph: - path: /home/sage/ceph - - To capture code coverage data, use:: - - tasks: - - ceph: - coverage: true - - To use btrfs, ext4, or xfs on the target's scratch disks, use:: - - tasks: - - ceph: - fs: xfs - mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1] - mount_options: [nobarrier, inode64] - - Note, this will cause the task to check the /scratch_devs file on each node - for available devices. If no such file is found, /dev/sdb will be used. - - To run some daemons under valgrind, include their names - and the tool/args to use in a valgrind section:: - - tasks: - - ceph: - valgrind: - mds.1: --tool=memcheck - osd.1: [--tool=memcheck, --leak-check=no] - - Those nodes which are using memcheck or valgrind will get - checked for bad results. - - To adjust or modify config options, use:: - - tasks: - - ceph: - conf: - section: - key: value - - For example:: - - tasks: - - ceph: - conf: - mds.0: - some option: value - other key: other value - client.0: - debug client: 10 - debug ms: 1 - - By default, the cluster log is checked for errors and warnings, - and the run marked failed if any appear. You can ignore log - entries by giving a list of egrep compatible regexes, i.e.: - - tasks: - - ceph: - log-whitelist: ['foo.*bar', 'bad message'] - - To run multiple ceph clusters, use multiple ceph tasks, and roles - with a cluster name prefix, e.g. cluster1.client.0. Roles with no - cluster use the default cluster name, 'ceph'. OSDs from separate - clusters must be on separate hosts. Clients and non-osd daemons - from multiple clusters may be colocated. 
For each cluster, add an - instance of the ceph task with the cluster name specified, e.g.:: - - roles: - - [mon.a, osd.0, osd.1] - - [backup.mon.a, backup.osd.0, backup.osd.1] - - [client.0, backup.client.0] - tasks: - - ceph: - cluster: ceph - - ceph: - cluster: backup - - :param ctx: Context - :param config: Configuration - - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - "task ceph only supports a dictionary for configuration" - - overrides = ctx.config.get('overrides', {}) - teuthology.deep_merge(config, overrides.get('ceph', {})) - - first_ceph_cluster = False - if not hasattr(ctx, 'daemons'): - first_ceph_cluster = True - ctx.daemons = DaemonGroup() - - testdir = teuthology.get_testdir(ctx) - if config.get('coverage'): - coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir) - log.info('Creating coverage directory...') - run.wait( - ctx.cluster.run( - args=[ - 'install', '-d', '-m0755', '--', - coverage_dir, - ], - wait=False, - ) - ) - - if 'cluster' not in config: - config['cluster'] = 'ceph' - - validate_config(ctx, config) - - subtasks = [] - if first_ceph_cluster: - # these tasks handle general log setup and parsing on all hosts, - # so they should only be run once - subtasks = [ - lambda: ceph_log(ctx=ctx, config=None), - lambda: valgrind_post(ctx=ctx, config=config), - ] - - subtasks += [ - lambda: cluster(ctx=ctx, config=dict( - conf=config.get('conf', {}), - fs=config.get('fs', 'xfs'), - mkfs_options=config.get('mkfs_options', None), - mount_options=config.get('mount_options', None), - block_journal=config.get('block_journal', None), - tmpfs_journal=config.get('tmpfs_journal', None), - skip_mgr_daemons=config.get('skip_mgr_daemons', False), - log_whitelist=config.get('log-whitelist', []), - cpu_profile=set(config.get('cpu_profile', []),), - cluster=config['cluster'], - )), - lambda: run_daemon(ctx=ctx, config=config, type_='mon'), - lambda: run_daemon(ctx=ctx, config=config, type_='mgr'), - lambda: crush_setup(ctx=ctx, config=config), - lambda: run_daemon(ctx=ctx, config=config, type_='osd'), - lambda: create_rbd_pool(ctx=ctx, config=config), - lambda: cephfs_setup(ctx=ctx, config=config), - lambda: run_daemon(ctx=ctx, config=config, type_='mds'), - ] - - with contextutil.nested(*subtasks): - first_mon = teuthology.get_first_mon(ctx, config, config['cluster']) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - if not hasattr(ctx, 'managers'): - ctx.managers = {} - ctx.managers[config['cluster']] = CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager.' + config['cluster']), - cluster=config['cluster'], - ) - - try: - if config.get('wait-for-healthy', True): - healthy(ctx=ctx, config=dict(cluster=config['cluster'])) - - yield - finally: - if config.get('wait-for-scrub', True): - osd_scrub_pgs(ctx, config) - - # stop logging health to clog during shutdown, or else we generate - # a bunch of scary messages unrelated to our actual run. 
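
The task() body above builds the cluster by composing small context managers (cluster, mon, mgr, osd, mds) and relies on them being entered in order and torn down in reverse, which contextutil.nested(*subtasks) provides in teuthology. A minimal sketch of that property, using contextlib.ExitStack in place of contextutil.nested::

    from contextlib import contextmanager, ExitStack

    @contextmanager
    def subtask(name, log):
        log.append('start ' + name)
        try:
            yield
        finally:
            log.append('stop ' + name)

    if __name__ == '__main__':
        events = []
        subtasks = [lambda: subtask('cluster', events),
                    lambda: subtask('mon', events),
                    lambda: subtask('osd', events)]
        with ExitStack() as stack:
            for make in subtasks:
                stack.enter_context(make())      # enter in listed order
            events.append('run the workload')
        print(events)                            # teardown happens in reverse
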
- firstmon = teuthology.get_first_mon(ctx, config, config['cluster']) - (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys() - mon0_remote.run( - args=[ - 'sudo', - 'ceph', - '--cluster', config['cluster'], - 'tell', - 'mon.*', - 'injectargs', - '--', - '--no-mon-health-to-clog', - ] - ) diff --git a/src/ceph/qa/tasks/ceph_client.py b/src/ceph/qa/tasks/ceph_client.py deleted file mode 100644 index 3ca90b7..0000000 --- a/src/ceph/qa/tasks/ceph_client.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -Set up client keyring -""" -import logging - -from teuthology import misc as teuthology -from teuthology.orchestra import run - -log = logging.getLogger(__name__) - -def create_keyring(ctx, cluster_name): - """ - Set up key ring on remote sites - """ - log.info('Setting up client nodes...') - clients = ctx.cluster.only(teuthology.is_type('client', cluster_name)) - testdir = teuthology.get_testdir(ctx) - coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir) - for remote, roles_for_host in clients.remotes.iteritems(): - for role in teuthology.cluster_roles_of_type(roles_for_host, 'client', - cluster_name): - name = teuthology.ceph_role(role) - client_keyring = '/etc/ceph/{0}.{1}.keyring'.format(cluster_name, name) - remote.run( - args=[ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - coverage_dir, - 'ceph-authtool', - '--create-keyring', - '--gen-key', - # TODO this --name= is not really obeyed, all unknown "types" are munged to "client" - '--name={name}'.format(name=name), - client_keyring, - run.Raw('&&'), - 'sudo', - 'chmod', - '0644', - client_keyring, - ], - ) diff --git a/src/ceph/qa/tasks/ceph_deploy.py b/src/ceph/qa/tasks/ceph_deploy.py deleted file mode 100644 index 38fbe43..0000000 --- a/src/ceph/qa/tasks/ceph_deploy.py +++ /dev/null @@ -1,862 +0,0 @@ -""" -Execute ceph-deploy as a task -""" -from cStringIO import StringIO - -import contextlib -import os -import time -import logging -import traceback - -from teuthology import misc as teuthology -from teuthology import contextutil -from teuthology.config import config as teuth_config -from teuthology.task import install as install_fn -from teuthology.orchestra import run -from tasks.cephfs.filesystem import Filesystem -from teuthology.misc import wait_until_healthy - -log = logging.getLogger(__name__) - - -@contextlib.contextmanager -def download_ceph_deploy(ctx, config): - """ - Downloads ceph-deploy from the ceph.com git mirror and (by default) - switches to the master branch. If the `ceph-deploy-branch` is specified, it - will use that instead. The `bootstrap` script is ran, with the argument - obtained from `python_version`, if specified. 
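
ceph_client.py above creates one keyring per client with ceph-authtool and then relaxes its permissions. A sketch of the same two steps as plain command lists, with the adjust-ulimits and ceph-coverage wrappers stripped; only flags already used in the removed code appear here::

    def client_keyring_cmds(cluster, client_name):
        keyring = '/etc/ceph/{0}.{1}.keyring'.format(cluster, client_name)
        return [
            ['sudo', 'ceph-authtool', '--create-keyring', '--gen-key',
             '--name={0}'.format(client_name), keyring],
            ['sudo', 'chmod', '0644', keyring],   # world-readable, as in the task
        ]

    if __name__ == '__main__':
        for cmd in client_keyring_cmds('ceph', 'client.0'):
            print(' '.join(cmd))
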
- """ - # use mon.a for ceph_admin - (ceph_admin,) = ctx.cluster.only('mon.a').remotes.iterkeys() - - try: - py_ver = str(config['python_version']) - except KeyError: - pass - else: - supported_versions = ['2', '3'] - if py_ver not in supported_versions: - raise ValueError("python_version must be: {}, not {}".format( - ' or '.join(supported_versions), py_ver - )) - - log.info("Installing Python") - system_type = teuthology.get_system_type(ceph_admin) - - if system_type == 'rpm': - package = 'python34' if py_ver == '3' else 'python' - ctx.cluster.run(args=[ - 'sudo', 'yum', '-y', 'install', - package, 'python-virtualenv' - ]) - else: - package = 'python3' if py_ver == '3' else 'python' - ctx.cluster.run(args=[ - 'sudo', 'apt-get', '-y', '--force-yes', 'install', - package, 'python-virtualenv' - ]) - - log.info('Downloading ceph-deploy...') - testdir = teuthology.get_testdir(ctx) - ceph_deploy_branch = config.get('ceph-deploy-branch', 'master') - - ceph_admin.run( - args=[ - 'git', 'clone', '-b', ceph_deploy_branch, - teuth_config.ceph_git_base_url + 'ceph-deploy.git', - '{tdir}/ceph-deploy'.format(tdir=testdir), - ], - ) - args = [ - 'cd', - '{tdir}/ceph-deploy'.format(tdir=testdir), - run.Raw('&&'), - './bootstrap', - ] - try: - args.append(str(config['python_version'])) - except KeyError: - pass - ceph_admin.run(args=args) - - try: - yield - finally: - log.info('Removing ceph-deploy ...') - ceph_admin.run( - args=[ - 'rm', - '-rf', - '{tdir}/ceph-deploy'.format(tdir=testdir), - ], - ) - - -def is_healthy(ctx, config): - """Wait until a Ceph cluster is healthy.""" - testdir = teuthology.get_testdir(ctx) - ceph_admin = teuthology.get_first_mon(ctx, config) - (remote,) = ctx.cluster.only(ceph_admin).remotes.keys() - max_tries = 90 # 90 tries * 10 secs --> 15 minutes - tries = 0 - while True: - tries += 1 - if tries >= max_tries: - msg = "ceph health was unable to get 'HEALTH_OK' after waiting 15 minutes" - remote.run( - args=[ - 'cd', - '{tdir}'.format(tdir=testdir), - run.Raw('&&'), - 'sudo', 'ceph', - 'report', - ], - ) - raise RuntimeError(msg) - - r = remote.run( - args=[ - 'cd', - '{tdir}'.format(tdir=testdir), - run.Raw('&&'), - 'sudo', 'ceph', - 'health', - ], - stdout=StringIO(), - logger=log.getChild('health'), - ) - out = r.stdout.getvalue() - log.info('Ceph health: %s', out.rstrip('\n')) - if out.split(None, 1)[0] == 'HEALTH_OK': - break - time.sleep(10) - - -def get_nodes_using_role(ctx, target_role): - """ - Extract the names of nodes that match a given role from a cluster, and modify the - cluster's service IDs to match the resulting node-based naming scheme that ceph-deploy - uses, such that if "mon.a" is on host "foo23", it'll be renamed to "mon.foo23". - """ - - # Nodes containing a service of the specified role - nodes_of_interest = [] - - # Prepare a modified version of cluster.remotes with ceph-deploy-ized names - modified_remotes = {} - ceph_deploy_mapped = dict() - for _remote, roles_for_host in ctx.cluster.remotes.iteritems(): - modified_remotes[_remote] = [] - for svc_id in roles_for_host: - if svc_id.startswith("{0}.".format(target_role)): - fqdn = str(_remote).split('@')[-1] - nodename = str(str(_remote).split('.')[0]).split('@')[1] - if target_role == 'mon': - nodes_of_interest.append(fqdn) - else: - nodes_of_interest.append(nodename) - mapped_role = "{0}.{1}".format(target_role, nodename) - modified_remotes[_remote].append(mapped_role) - # keep dict of mapped role for later use by tasks - # eg. 
mon.a => mon.node1 - ceph_deploy_mapped[svc_id] = mapped_role - else: - modified_remotes[_remote].append(svc_id) - - ctx.cluster.remotes = modified_remotes - ctx.cluster.mapped_role = ceph_deploy_mapped - - return nodes_of_interest - - -def get_dev_for_osd(ctx, config): - """Get a list of all osd device names.""" - osd_devs = [] - for remote, roles_for_host in ctx.cluster.remotes.iteritems(): - host = remote.name.split('@')[-1] - shortname = host.split('.')[0] - devs = teuthology.get_scratch_devices(remote) - num_osd_per_host = list( - teuthology.roles_of_type( - roles_for_host, 'osd')) - num_osds = len(num_osd_per_host) - if config.get('separate_journal_disk') is not None: - num_devs_reqd = 2 * num_osds - assert num_devs_reqd <= len( - devs), 'fewer data and journal disks than required ' + shortname - for dindex in range(0, num_devs_reqd, 2): - jd_index = dindex + 1 - dev_short = devs[dindex].split('/')[-1] - jdev_short = devs[jd_index].split('/')[-1] - osd_devs.append((shortname, dev_short, jdev_short)) - else: - assert num_osds <= len(devs), 'fewer disks than osds ' + shortname - for dev in devs[:num_osds]: - dev_short = dev.split('/')[-1] - osd_devs.append((shortname, dev_short)) - return osd_devs - - -def get_all_nodes(ctx, config): - """Return a string of node names separated by blanks""" - nodelist = [] - for t, k in ctx.config['targets'].iteritems(): - host = t.split('@')[-1] - simple_host = host.split('.')[0] - nodelist.append(simple_host) - nodelist = " ".join(nodelist) - return nodelist - - -@contextlib.contextmanager -def build_ceph_cluster(ctx, config): - """Build a ceph cluster""" - - # Expect to find ceph_admin on the first mon by ID, same place that the download task - # puts it. Remember this here, because subsequently IDs will change from those in - # the test config to those that ceph-deploy invents. 
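
get_dev_for_osd() above consumes two scratch devices per OSD (data plus journal) when separate_journal_disk is set, and one otherwise, asserting that enough devices exist. A standalone sketch of that allocation; the device names in the demo are examples::

    def assign_devices(devs, num_osds, separate_journal=False):
        if separate_journal:
            needed = 2 * num_osds
            assert needed <= len(devs), 'fewer data and journal disks than required'
            return [(devs[i], devs[i + 1]) for i in range(0, needed, 2)]
        assert num_osds <= len(devs), 'fewer disks than osds'
        return [(dev, None) for dev in devs[:num_osds]]

    if __name__ == '__main__':
        devs = ['/dev/sdb', '/dev/sdc', '/dev/sdd', '/dev/sde']
        print(assign_devices(devs, 2, separate_journal=True))
        print(assign_devices(devs, 3))
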
- - (ceph_admin,) = ctx.cluster.only('mon.a').remotes.iterkeys() - - def execute_ceph_deploy(cmd): - """Remotely execute a ceph_deploy command""" - return ceph_admin.run( - args=[ - 'cd', - '{tdir}/ceph-deploy'.format(tdir=testdir), - run.Raw('&&'), - run.Raw(cmd), - ], - check_status=False, - ).exitstatus - - try: - log.info('Building ceph cluster using ceph-deploy...') - testdir = teuthology.get_testdir(ctx) - ceph_branch = None - if config.get('branch') is not None: - cbranch = config.get('branch') - for var, val in cbranch.iteritems(): - ceph_branch = '--{var}={val}'.format(var=var, val=val) - all_nodes = get_all_nodes(ctx, config) - mds_nodes = get_nodes_using_role(ctx, 'mds') - mds_nodes = " ".join(mds_nodes) - mon_node = get_nodes_using_role(ctx, 'mon') - mon_nodes = " ".join(mon_node) - # skip mgr based on config item - # this is needed when test uses latest code to install old ceph - # versions - skip_mgr = config.get('skip-mgr', False) - if not skip_mgr: - mgr_nodes = get_nodes_using_role(ctx, 'mgr') - mgr_nodes = " ".join(mgr_nodes) - new_mon = './ceph-deploy new' + " " + mon_nodes - if not skip_mgr: - mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes - mon_hostname = mon_nodes.split(' ')[0] - mon_hostname = str(mon_hostname) - gather_keys = './ceph-deploy gatherkeys' + " " + mon_hostname - deploy_mds = './ceph-deploy mds create' + " " + mds_nodes - no_of_osds = 0 - - if mon_nodes is None: - raise RuntimeError("no monitor nodes in the config file") - - estatus_new = execute_ceph_deploy(new_mon) - if estatus_new != 0: - raise RuntimeError("ceph-deploy: new command failed") - - log.info('adding config inputs...') - testdir = teuthology.get_testdir(ctx) - conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir) - - if config.get('conf') is not None: - confp = config.get('conf') - for section, keys in confp.iteritems(): - lines = '[{section}]\n'.format(section=section) - teuthology.append_lines_to_file(ceph_admin, conf_path, lines, - sudo=True) - for key, value in keys.iteritems(): - log.info("[%s] %s = %s" % (section, key, value)) - lines = '{key} = {value}\n'.format(key=key, value=value) - teuthology.append_lines_to_file( - ceph_admin, conf_path, lines, sudo=True) - - # install ceph - dev_branch = ctx.config['branch'] - branch = '--dev={branch}'.format(branch=dev_branch) - if ceph_branch: - option = ceph_branch - else: - option = branch - install_nodes = './ceph-deploy install ' + option + " " + all_nodes - estatus_install = execute_ceph_deploy(install_nodes) - if estatus_install != 0: - raise RuntimeError("ceph-deploy: Failed to install ceph") - # install ceph-test package too - install_nodes2 = './ceph-deploy install --tests ' + option + \ - " " + all_nodes - estatus_install = execute_ceph_deploy(install_nodes2) - if estatus_install != 0: - raise RuntimeError("ceph-deploy: Failed to install ceph-test") - - mon_create_nodes = './ceph-deploy mon create-initial' - # If the following fails, it is OK, it might just be that the monitors - # are taking way more than a minute/monitor to form quorum, so lets - # try the next block which will wait up to 15 minutes to gatherkeys. 
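
execute_ceph_deploy() above changes into the ceph-deploy checkout, runs the given command and hands back the exit status so the caller decides whether to raise. A local sketch of the same wrapper with subprocess in place of a remote; it assumes a POSIX shell and that workdir exists::

    import subprocess

    def execute_ceph_deploy(workdir, cmd):
        """Run cmd from workdir and return its exit status."""
        return subprocess.call('cd {0} && {1}'.format(workdir, cmd), shell=True)

    def new_cluster(workdir, mon_nodes):
        status = execute_ceph_deploy(workdir,
                                     './ceph-deploy new ' + ' '.join(mon_nodes))
        if status != 0:
            raise RuntimeError('ceph-deploy: new command failed')

    if __name__ == '__main__':
        # harmless demo: 'true' always succeeds, so this prints 0
        print(execute_ceph_deploy('/tmp', 'true'))
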
- execute_ceph_deploy(mon_create_nodes) - - # create-keys is explicit now - # http://tracker.ceph.com/issues/16036 - mons = ctx.cluster.only(teuthology.is_type('mon')) - for remote in mons.remotes.iterkeys(): - remote.run(args=['sudo', 'ceph-create-keys', '--cluster', 'ceph', - '--id', remote.shortname]) - - estatus_gather = execute_ceph_deploy(gather_keys) - - if not skip_mgr: - execute_ceph_deploy(mgr_create) - - if mds_nodes: - estatus_mds = execute_ceph_deploy(deploy_mds) - if estatus_mds != 0: - raise RuntimeError("ceph-deploy: Failed to deploy mds") - - if config.get('test_mon_destroy') is not None: - for d in range(1, len(mon_node)): - mon_destroy_nodes = './ceph-deploy mon destroy' + \ - " " + mon_node[d] - estatus_mon_d = execute_ceph_deploy(mon_destroy_nodes) - if estatus_mon_d != 0: - raise RuntimeError("ceph-deploy: Failed to delete monitor") - - node_dev_list = get_dev_for_osd(ctx, config) - for d in node_dev_list: - node = d[0] - for disk in d[1:]: - zap = './ceph-deploy disk zap ' + node + ':' + disk - estatus = execute_ceph_deploy(zap) - if estatus != 0: - raise RuntimeError("ceph-deploy: Failed to zap osds") - osd_create_cmd = './ceph-deploy osd create ' - # first check for filestore, default is bluestore with ceph-deploy - if config.get('filestore') is not None: - osd_create_cmd += '--filestore ' - elif config.get('bluestore') is not None: - osd_create_cmd += '--bluestore ' - if config.get('dmcrypt') is not None: - osd_create_cmd += '--dmcrypt ' - osd_create_cmd += ":".join(d) - estatus_osd = execute_ceph_deploy(osd_create_cmd) - if estatus_osd == 0: - log.info('successfully created osd') - no_of_osds += 1 - else: - raise RuntimeError("ceph-deploy: Failed to create osds") - - if config.get('wait-for-healthy', True) and no_of_osds >= 2: - is_healthy(ctx=ctx, config=None) - - log.info('Setting up client nodes...') - conf_path = '/etc/ceph/ceph.conf' - admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring' - first_mon = teuthology.get_first_mon(ctx, config) - (mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys() - conf_data = teuthology.get_file( - remote=mon0_remote, - path=conf_path, - sudo=True, - ) - admin_keyring = teuthology.get_file( - remote=mon0_remote, - path=admin_keyring_path, - sudo=True, - ) - - clients = ctx.cluster.only(teuthology.is_type('client')) - for remot, roles_for_host in clients.remotes.iteritems(): - for id_ in teuthology.roles_of_type(roles_for_host, 'client'): - client_keyring = \ - '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_) - mon0_remote.run( - args=[ - 'cd', - '{tdir}'.format(tdir=testdir), - run.Raw('&&'), - 'sudo', 'bash', '-c', - run.Raw('"'), 'ceph', - 'auth', - 'get-or-create', - 'client.{id}'.format(id=id_), - 'mds', 'allow', - 'mon', 'allow *', - 'osd', 'allow *', - run.Raw('>'), - client_keyring, - run.Raw('"'), - ], - ) - key_data = teuthology.get_file( - remote=mon0_remote, - path=client_keyring, - sudo=True, - ) - teuthology.sudo_write_file( - remote=remot, - path=client_keyring, - data=key_data, - perms='0644' - ) - teuthology.sudo_write_file( - remote=remot, - path=admin_keyring_path, - data=admin_keyring, - perms='0644' - ) - teuthology.sudo_write_file( - remote=remot, - path=conf_path, - data=conf_data, - perms='0644' - ) - - if mds_nodes: - log.info('Configuring CephFS...') - Filesystem(ctx, create=True) - elif not config.get('only_mon'): - raise RuntimeError( - "The cluster is NOT operational due to insufficient OSDs") - yield - - except Exception: - log.info( - "Error encountered, logging exception before 
tearing down ceph-deploy") - log.info(traceback.format_exc()) - raise - finally: - if config.get('keep_running'): - return - log.info('Stopping ceph...') - ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'), - 'sudo', 'service', 'ceph', 'stop', run.Raw('||'), - 'sudo', 'systemctl', 'stop', 'ceph.target']) - - # Are you really not running anymore? - # try first with the init tooling - # ignoring the status so this becomes informational only - ctx.cluster.run( - args=[ - 'sudo', 'status', 'ceph-all', run.Raw('||'), - 'sudo', 'service', 'ceph', 'status', run.Raw('||'), - 'sudo', 'systemctl', 'status', 'ceph.target'], - check_status=False) - - # and now just check for the processes themselves, as if upstart/sysvinit - # is lying to us. Ignore errors if the grep fails - ctx.cluster.run(args=['sudo', 'ps', 'aux', run.Raw('|'), - 'grep', '-v', 'grep', run.Raw('|'), - 'grep', 'ceph'], check_status=False) - - if ctx.archive is not None: - # archive mon data, too - log.info('Archiving mon data...') - path = os.path.join(ctx.archive, 'data') - os.makedirs(path) - mons = ctx.cluster.only(teuthology.is_type('mon')) - for remote, roles in mons.remotes.iteritems(): - for role in roles: - if role.startswith('mon.'): - teuthology.pull_directory_tarball( - remote, - '/var/lib/ceph/mon', - path + '/' + role + '.tgz') - - log.info('Compressing logs...') - run.wait( - ctx.cluster.run( - args=[ - 'sudo', - 'find', - '/var/log/ceph', - '-name', - '*.log', - '-print0', - run.Raw('|'), - 'sudo', - 'xargs', - '-0', - '--no-run-if-empty', - '--', - 'gzip', - '--', - ], - wait=False, - ), - ) - - log.info('Archiving logs...') - path = os.path.join(ctx.archive, 'remote') - os.makedirs(path) - for remote in ctx.cluster.remotes.iterkeys(): - sub = os.path.join(path, remote.shortname) - os.makedirs(sub) - teuthology.pull_directory(remote, '/var/log/ceph', - os.path.join(sub, 'log')) - - # Prevent these from being undefined if the try block fails - all_nodes = get_all_nodes(ctx, config) - purge_nodes = './ceph-deploy purge' + " " + all_nodes - purgedata_nodes = './ceph-deploy purgedata' + " " + all_nodes - - log.info('Purging package...') - execute_ceph_deploy(purge_nodes) - log.info('Purging data...') - execute_ceph_deploy(purgedata_nodes) - - -@contextlib.contextmanager -def cli_test(ctx, config): - """ - ceph-deploy cli to exercise most commonly use cli's and ensure - all commands works and also startup the init system. 
- - """ - log.info('Ceph-deploy Test') - if config is None: - config = {} - test_branch = '' - conf_dir = teuthology.get_testdir(ctx) + "/cdtest" - - def execute_cdeploy(admin, cmd, path): - """Execute ceph-deploy commands """ - """Either use git path or repo path """ - args = ['cd', conf_dir, run.Raw(';')] - if path: - args.append('{path}/ceph-deploy/ceph-deploy'.format(path=path)) - else: - args.append('ceph-deploy') - args.append(run.Raw(cmd)) - ec = admin.run(args=args, check_status=False).exitstatus - if ec != 0: - raise RuntimeError( - "failed during ceph-deploy cmd: {cmd} , ec={ec}".format(cmd=cmd, ec=ec)) - - if config.get('rhbuild'): - path = None - else: - path = teuthology.get_testdir(ctx) - # test on branch from config eg: wip-* , master or next etc - # packages for all distro's should exist for wip* - if ctx.config.get('branch'): - branch = ctx.config.get('branch') - test_branch = ' --dev={branch} '.format(branch=branch) - mons = ctx.cluster.only(teuthology.is_type('mon')) - for node, role in mons.remotes.iteritems(): - admin = node - admin.run(args=['mkdir', conf_dir], check_status=False) - nodename = admin.shortname - system_type = teuthology.get_system_type(admin) - if config.get('rhbuild'): - admin.run(args=['sudo', 'yum', 'install', 'ceph-deploy', '-y']) - log.info('system type is %s', system_type) - osds = ctx.cluster.only(teuthology.is_type('osd')) - - for remote, roles in osds.remotes.iteritems(): - devs = teuthology.get_scratch_devices(remote) - log.info("roles %s", roles) - if (len(devs) < 3): - log.error( - 'Test needs minimum of 3 devices, only found %s', - str(devs)) - raise RuntimeError("Needs minimum of 3 devices ") - - conf_path = '{conf_dir}/ceph.conf'.format(conf_dir=conf_dir) - new_cmd = 'new ' + nodename - execute_cdeploy(admin, new_cmd, path) - if config.get('conf') is not None: - confp = config.get('conf') - for section, keys in confp.iteritems(): - lines = '[{section}]\n'.format(section=section) - teuthology.append_lines_to_file(admin, conf_path, lines, - sudo=True) - for key, value in keys.iteritems(): - log.info("[%s] %s = %s" % (section, key, value)) - lines = '{key} = {value}\n'.format(key=key, value=value) - teuthology.append_lines_to_file(admin, conf_path, lines, - sudo=True) - new_mon_install = 'install {branch} --mon '.format( - branch=test_branch) + nodename - new_mgr_install = 'install {branch} --mgr '.format( - branch=test_branch) + nodename - new_osd_install = 'install {branch} --osd '.format( - branch=test_branch) + nodename - new_admin = 'install {branch} --cli '.format(branch=test_branch) + nodename - create_initial = 'mon create-initial ' - # either use create-keys or push command - push_keys = 'admin ' + nodename - execute_cdeploy(admin, new_mon_install, path) - execute_cdeploy(admin, new_mgr_install, path) - execute_cdeploy(admin, new_osd_install, path) - execute_cdeploy(admin, new_admin, path) - execute_cdeploy(admin, create_initial, path) - execute_cdeploy(admin, push_keys, path) - - for i in range(3): - zap_disk = 'disk zap ' + "{n}:{d}".format(n=nodename, d=devs[i]) - prepare = 'osd prepare ' + "{n}:{d}".format(n=nodename, d=devs[i]) - execute_cdeploy(admin, zap_disk, path) - execute_cdeploy(admin, prepare, path) - - log.info("list files for debugging purpose to check file permissions") - admin.run(args=['ls', run.Raw('-lt'), conf_dir]) - remote.run(args=['sudo', 'ceph', '-s'], check_status=False) - r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO()) - out = r.stdout.getvalue() - log.info('Ceph health: %s', 
out.rstrip('\n')) - log.info("Waiting for cluster to become healthy") - with contextutil.safe_while(sleep=10, tries=6, - action='check health') as proceed: - while proceed(): - r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO()) - out = r.stdout.getvalue() - if (out.split(None, 1)[0] == 'HEALTH_OK'): - break - rgw_install = 'install {branch} --rgw {node}'.format( - branch=test_branch, - node=nodename, - ) - rgw_create = 'rgw create ' + nodename - execute_cdeploy(admin, rgw_install, path) - execute_cdeploy(admin, rgw_create, path) - log.info('All ceph-deploy cli tests passed') - try: - yield - finally: - log.info("cleaning up") - ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'), - 'sudo', 'service', 'ceph', 'stop', run.Raw('||'), - 'sudo', 'systemctl', 'stop', 'ceph.target'], - check_status=False) - time.sleep(4) - for i in range(3): - umount_dev = "{d}1".format(d=devs[i]) - r = remote.run(args=['sudo', 'umount', run.Raw(umount_dev)]) - cmd = 'purge ' + nodename - execute_cdeploy(admin, cmd, path) - cmd = 'purgedata ' + nodename - execute_cdeploy(admin, cmd, path) - log.info("Removing temporary dir") - admin.run( - args=[ - 'rm', - run.Raw('-rf'), - run.Raw(conf_dir)], - check_status=False) - if config.get('rhbuild'): - admin.run(args=['sudo', 'yum', 'remove', 'ceph-deploy', '-y']) - - -@contextlib.contextmanager -def single_node_test(ctx, config): - """ - - ceph-deploy.single_node_test: null - - #rhbuild testing - - ceph-deploy.single_node_test: - rhbuild: 1.2.3 - - """ - log.info("Testing ceph-deploy on single node") - if config is None: - config = {} - overrides = ctx.config.get('overrides', {}) - teuthology.deep_merge(config, overrides.get('ceph-deploy', {})) - - if config.get('rhbuild'): - log.info("RH Build, Skip Download") - with contextutil.nested( - lambda: cli_test(ctx=ctx, config=config), - ): - yield - else: - with contextutil.nested( - lambda: install_fn.ship_utilities(ctx=ctx, config=None), - lambda: download_ceph_deploy(ctx=ctx, config=config), - lambda: cli_test(ctx=ctx, config=config), - ): - yield - - -@contextlib.contextmanager -def upgrade(ctx, config): - """ - Upgrade using ceph-deploy - eg: - ceph-deploy.upgrade: - # to upgrade to specific branch, use - branch: - stable: jewel - # to setup mgr node, use - setup-mgr-node: True - # to wait for cluster to be healthy after all upgrade, use - wait-for-healthy: True - role: (upgrades the below roles serially) - mon.a - mon.b - osd.0 - """ - roles = config.get('roles') - # get the roles that are mapped as per ceph-deploy - # roles are mapped for mon/mds eg: mon.a => mon.host_short_name - mapped_role = ctx.cluster.mapped_role - if config.get('branch'): - branch = config.get('branch') - (var, val) = branch.items()[0] - ceph_branch = '--{var}={val}'.format(var=var, val=val) - else: - # default to master - ceph_branch = '--dev=master' - # get the node used for initial deployment which is mon.a - mon_a = mapped_role.get('mon.a') - (ceph_admin,) = ctx.cluster.only(mon_a).remotes.iterkeys() - testdir = teuthology.get_testdir(ctx) - cmd = './ceph-deploy install ' + ceph_branch - for role in roles: - # check if this role is mapped (mon or mds) - if mapped_role.get(role): - role = mapped_role.get(role) - remotes_and_roles = ctx.cluster.only(role).remotes - for remote, roles in remotes_and_roles.iteritems(): - nodename = remote.shortname - cmd = cmd + ' ' + nodename - log.info("Upgrading ceph on %s", nodename) - ceph_admin.run( - args=[ - 'cd', - '{tdir}/ceph-deploy'.format(tdir=testdir), - run.Raw('&&'), - 
run.Raw(cmd), - ], - ) - # restart all ceph services, ideally upgrade should but it does not - remote.run( - args=[ - 'sudo', 'systemctl', 'restart', 'ceph.target' - ] - ) - ceph_admin.run(args=['sudo', 'ceph', '-s']) - - # workaround for http://tracker.ceph.com/issues/20950 - # write the correct mgr key to disk - if config.get('setup-mgr-node', None): - mons = ctx.cluster.only(teuthology.is_type('mon')) - for remote, roles in mons.remotes.iteritems(): - remote.run( - args=[ - run.Raw('sudo ceph auth get client.bootstrap-mgr'), - run.Raw('|'), - run.Raw('sudo tee'), - run.Raw('/var/lib/ceph/bootstrap-mgr/ceph.keyring') - ] - ) - - if config.get('setup-mgr-node', None): - mgr_nodes = get_nodes_using_role(ctx, 'mgr') - mgr_nodes = " ".join(mgr_nodes) - mgr_install = './ceph-deploy install --mgr ' + ceph_branch + " " + mgr_nodes - mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes - # install mgr - ceph_admin.run( - args=[ - 'cd', - '{tdir}/ceph-deploy'.format(tdir=testdir), - run.Raw('&&'), - run.Raw(mgr_install), - ], - ) - # create mgr - ceph_admin.run( - args=[ - 'cd', - '{tdir}/ceph-deploy'.format(tdir=testdir), - run.Raw('&&'), - run.Raw(mgr_create), - ], - ) - ceph_admin.run(args=['sudo', 'ceph', '-s']) - if config.get('wait-for-healthy', None): - wait_until_healthy(ctx, ceph_admin, use_sudo=True) - yield - - -@contextlib.contextmanager -def task(ctx, config): - """ - Set up and tear down a Ceph cluster. - - For example:: - - tasks: - - install: - extras: yes - - ssh_keys: - - ceph-deploy: - branch: - stable: bobtail - mon_initial_members: 1 - ceph-deploy-branch: my-ceph-deploy-branch - only_mon: true - keep_running: true - # either choose bluestore or filestore, default is bluestore - bluestore: True - # or - filestore: True - # skip install of mgr for old release using below flag - skip-mgr: True ( default is False ) - - tasks: - - install: - extras: yes - - ssh_keys: - - ceph-deploy: - branch: - dev: master - conf: - mon: - debug mon = 20 - - tasks: - - install: - extras: yes - - ssh_keys: - - ceph-deploy: - branch: - testing: - dmcrypt: yes - separate_journal_disk: yes - - """ - if config is None: - config = {} - - assert isinstance(config, dict), \ - "task ceph-deploy only supports a dictionary for configuration" - - overrides = ctx.config.get('overrides', {}) - teuthology.deep_merge(config, overrides.get('ceph-deploy', {})) - - if config.get('branch') is not None: - assert isinstance( - config['branch'], dict), 'branch must be a dictionary' - - log.info('task ceph-deploy with config ' + str(config)) - - with contextutil.nested( - lambda: install_fn.ship_utilities(ctx=ctx, config=None), - lambda: download_ceph_deploy(ctx=ctx, config=config), - lambda: build_ceph_cluster(ctx=ctx, config=config), - ): - yield diff --git a/src/ceph/qa/tasks/ceph_fuse.py b/src/ceph/qa/tasks/ceph_fuse.py deleted file mode 100644 index c9d8354..0000000 --- a/src/ceph/qa/tasks/ceph_fuse.py +++ /dev/null @@ -1,145 +0,0 @@ -""" -Ceph FUSE client task -""" - -import contextlib -import logging - -from teuthology import misc as teuthology -from cephfs.fuse_mount import FuseMount - -log = logging.getLogger(__name__) - - -def get_client_configs(ctx, config): - """ - Get a map of the configuration for each FUSE client in the configuration by - combining the configuration of the current task with any global overrides. 
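
The get_client_configs() helper that this hunk goes on to define treats a missing config as "all clients", turns a bare list into a dict of None entries, and then layers the ceph-fuse overrides on top. A simplified sketch of that flow; deep_merge here is a reduced stand-in for teuthology.deep_merge::

    def deep_merge(base, overrides):
        for key, value in overrides.items():
            if isinstance(value, dict) and isinstance(base.get(key), dict):
                deep_merge(base[key], value)
            else:
                base[key] = value
        return base

    def client_configs(config, all_client_ids, overrides=None):
        if config is None:
            config = dict(('client.{0}'.format(i), None) for i in all_client_ids)
        elif isinstance(config, list):
            config = dict((name, None) for name in config)
        return deep_merge(config, overrides or {})

    if __name__ == '__main__':
        print(client_configs(None, ['0', '1'],
                             overrides={'client.0': {'mount_wait': 60}}))
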
- - :param ctx: Context instance - :param config: configuration for this task - :return: dict of client name to config or to None - """ - if config is None: - config = dict(('client.{id}'.format(id=id_), None) - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')) - elif isinstance(config, list): - config = dict((name, None) for name in config) - - overrides = ctx.config.get('overrides', {}) - teuthology.deep_merge(config, overrides.get('ceph-fuse', {})) - - return config - - -@contextlib.contextmanager -def task(ctx, config): - """ - Mount/unmount a ``ceph-fuse`` client. - - The config is optional and defaults to mounting on all clients. If - a config is given, it is expected to be a list of clients to do - this operation on. This lets you e.g. set up one client with - ``ceph-fuse`` and another with ``kclient``. - - Example that mounts all clients:: - - tasks: - - ceph: - - ceph-fuse: - - interactive: - - Example that uses both ``kclient` and ``ceph-fuse``:: - - tasks: - - ceph: - - ceph-fuse: [client.0] - - kclient: [client.1] - - interactive: - - Example that enables valgrind: - - tasks: - - ceph: - - ceph-fuse: - client.0: - valgrind: [--tool=memcheck, --leak-check=full, --show-reachable=yes] - - interactive: - - Example that stops an already-mounted client: - - :: - - tasks: - - ceph: - - ceph-fuse: [client.0] - - ... do something that requires the FS mounted ... - - ceph-fuse: - client.0: - mounted: false - - ... do something that requires the FS unmounted ... - - Example that adds more generous wait time for mount (for virtual machines): - - tasks: - - ceph: - - ceph-fuse: - client.0: - mount_wait: 60 # default is 0, do not wait before checking /sys/ - mount_timeout: 120 # default is 30, give up if /sys/ is not populated - - interactive: - - :param ctx: Context - :param config: Configuration - """ - log.info('Mounting ceph-fuse clients...') - - testdir = teuthology.get_testdir(ctx) - config = get_client_configs(ctx, config) - - # List clients we will configure mounts for, default is all clients - clients = list(teuthology.get_clients(ctx=ctx, roles=filter(lambda x: 'client.' in x, config.keys()))) - - all_mounts = getattr(ctx, 'mounts', {}) - mounted_by_me = {} - - # Construct any new FuseMount instances - for id_, remote in clients: - client_config = config.get("client.%s" % id_) - if client_config is None: - client_config = {} - - if id_ not in all_mounts: - fuse_mount = FuseMount(client_config, testdir, id_, remote) - all_mounts[id_] = fuse_mount - else: - # Catch bad configs where someone has e.g. 
tried to use ceph-fuse and kcephfs for the same client - assert isinstance(all_mounts[id_], FuseMount) - - if not config.get("disabled", False) and client_config.get('mounted', True): - mounted_by_me[id_] = all_mounts[id_] - - ctx.mounts = all_mounts - - # Mount any clients we have been asked to (default to mount all) - for mount in mounted_by_me.values(): - mount.mount() - - for mount in mounted_by_me.values(): - mount.wait_until_mounted() - - # Umount any pre-existing clients that we have not been asked to mount - for client_id in set(all_mounts.keys()) - set(mounted_by_me.keys()): - mount = all_mounts[client_id] - if mount.is_mounted(): - mount.umount_wait() - - try: - yield all_mounts - finally: - log.info('Unmounting ceph-fuse clients...') - - for mount in mounted_by_me.values(): - # Conditional because an inner context might have umounted it - if mount.is_mounted(): - mount.umount_wait() diff --git a/src/ceph/qa/tasks/ceph_manager.py b/src/ceph/qa/tasks/ceph_manager.py deleted file mode 100644 index 5a89f23..0000000 --- a/src/ceph/qa/tasks/ceph_manager.py +++ /dev/null @@ -1,2592 +0,0 @@ -""" -ceph manager -- Thrasher and CephManager objects -""" -from cStringIO import StringIO -from functools import wraps -import contextlib -import random -import signal -import time -import gevent -import base64 -import json -import logging -import threading -import traceback -import os -from teuthology import misc as teuthology -from tasks.scrub import Scrubber -from util.rados import cmd_erasure_code_profile -from util import get_remote -from teuthology.contextutil import safe_while -from teuthology.orchestra.remote import Remote -from teuthology.orchestra import run -from teuthology.exceptions import CommandFailedError - -try: - from subprocess import DEVNULL # py3k -except ImportError: - DEVNULL = open(os.devnull, 'r+') - -DEFAULT_CONF_PATH = '/etc/ceph/ceph.conf' - -log = logging.getLogger(__name__) - - -def write_conf(ctx, conf_path=DEFAULT_CONF_PATH, cluster='ceph'): - conf_fp = StringIO() - ctx.ceph[cluster].conf.write(conf_fp) - conf_fp.seek(0) - writes = ctx.cluster.run( - args=[ - 'sudo', 'mkdir', '-p', '/etc/ceph', run.Raw('&&'), - 'sudo', 'chmod', '0755', '/etc/ceph', run.Raw('&&'), - 'sudo', 'python', - '-c', - ('import shutil, sys; ' - 'shutil.copyfileobj(sys.stdin, file(sys.argv[1], "wb"))'), - conf_path, - run.Raw('&&'), - 'sudo', 'chmod', '0644', conf_path, - ], - stdin=run.PIPE, - wait=False) - teuthology.feed_many_stdins_and_close(conf_fp, writes) - run.wait(writes) - - -def mount_osd_data(ctx, remote, cluster, osd): - """ - Mount a remote OSD - - :param ctx: Context - :param remote: Remote site - :param cluster: name of ceph cluster - :param osd: Osd name - """ - log.debug('Mounting data for osd.{o} on {r}'.format(o=osd, r=remote)) - role = "{0}.osd.{1}".format(cluster, osd) - alt_role = role if cluster != 'ceph' else "osd.{0}".format(osd) - if remote in ctx.disk_config.remote_to_roles_to_dev: - if alt_role in ctx.disk_config.remote_to_roles_to_dev[remote]: - role = alt_role - if role not in ctx.disk_config.remote_to_roles_to_dev[remote]: - return - dev = ctx.disk_config.remote_to_roles_to_dev[remote][role] - mount_options = ctx.disk_config.\ - remote_to_roles_to_dev_mount_options[remote][role] - fstype = ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] - mnt = os.path.join('/var/lib/ceph/osd', '{0}-{1}'.format(cluster, osd)) - - log.info('Mounting osd.{o}: dev: {n}, cluster: {c}' - 'mountpoint: {p}, type: {t}, options: {v}'.format( - o=osd, n=remote.name, p=mnt, 
t=fstype, v=mount_options, - c=cluster)) - - remote.run( - args=[ - 'sudo', - 'mount', - '-t', fstype, - '-o', ','.join(mount_options), - dev, - mnt, - ] - ) - - -class Thrasher: - """ - Object used to thrash Ceph - """ - def __init__(self, manager, config, logger=None): - self.ceph_manager = manager - self.cluster = manager.cluster - self.ceph_manager.wait_for_clean() - osd_status = self.ceph_manager.get_osd_status() - self.in_osds = osd_status['in'] - self.live_osds = osd_status['live'] - self.out_osds = osd_status['out'] - self.dead_osds = osd_status['dead'] - self.stopping = False - self.logger = logger - self.config = config - self.revive_timeout = self.config.get("revive_timeout", 360) - self.pools_to_fix_pgp_num = set() - if self.config.get('powercycle'): - self.revive_timeout += 120 - self.clean_wait = self.config.get('clean_wait', 0) - self.minin = self.config.get("min_in", 4) - self.chance_move_pg = self.config.get('chance_move_pg', 1.0) - self.sighup_delay = self.config.get('sighup_delay') - self.optrack_toggle_delay = self.config.get('optrack_toggle_delay') - self.dump_ops_enable = self.config.get('dump_ops_enable') - self.noscrub_toggle_delay = self.config.get('noscrub_toggle_delay') - self.chance_thrash_cluster_full = self.config.get('chance_thrash_cluster_full', .05) - self.chance_thrash_pg_upmap = self.config.get('chance_thrash_pg_upmap', 1.0) - self.chance_thrash_pg_upmap_items = self.config.get('chance_thrash_pg_upmap', 1.0) - self.random_eio = self.config.get('random_eio') - self.chance_force_recovery = self.config.get('chance_force_recovery', 0.3) - - num_osds = self.in_osds + self.out_osds - self.max_pgs = self.config.get("max_pgs_per_pool_osd", 1200) * num_osds - if self.logger is not None: - self.log = lambda x: self.logger.info(x) - else: - def tmp(x): - """ - Implement log behavior - """ - print x - self.log = tmp - if self.config is None: - self.config = dict() - # prevent monitor from auto-marking things out while thrasher runs - # try both old and new tell syntax, in case we are testing old code - self.saved_options = [] - # assuming that the default settings do not vary from one daemon to - # another - first_mon = teuthology.get_first_mon(manager.ctx, self.config).split('.') - opts = [('mon', 'mon_osd_down_out_interval', 0)] - for service, opt, new_value in opts: - old_value = manager.get_config(first_mon[0], - first_mon[1], - opt) - self.saved_options.append((service, opt, old_value)) - self._set_config(service, '*', opt, new_value) - # initialize ceph_objectstore_tool property - must be done before - # do_thrash is spawned - http://tracker.ceph.com/issues/18799 - if (self.config.get('powercycle') or - not self.cmd_exists_on_osds("ceph-objectstore-tool") or - self.config.get('disable_objectstore_tool_tests', False)): - self.ceph_objectstore_tool = False - self.test_rm_past_intervals = False - if self.config.get('powercycle'): - self.log("Unable to test ceph-objectstore-tool, " - "powercycle testing") - else: - self.log("Unable to test ceph-objectstore-tool, " - "not available on all OSD nodes") - else: - self.ceph_objectstore_tool = \ - self.config.get('ceph_objectstore_tool', True) - self.test_rm_past_intervals = \ - self.config.get('test_rm_past_intervals', True) - # spawn do_thrash - self.thread = gevent.spawn(self.do_thrash) - if self.sighup_delay: - self.sighup_thread = gevent.spawn(self.do_sighup) - if self.optrack_toggle_delay: - self.optrack_toggle_thread = gevent.spawn(self.do_optrack_toggle) - if self.dump_ops_enable == "true": - self.dump_ops_thread 
= gevent.spawn(self.do_dump_ops) - if self.noscrub_toggle_delay: - self.noscrub_toggle_thread = gevent.spawn(self.do_noscrub_toggle) - - def _set_config(self, service_type, service_id, name, value): - opt_arg = '--{name} {value}'.format(name=name, value=value) - whom = '.'.join([service_type, service_id]) - self.ceph_manager.raw_cluster_cmd('--', 'tell', whom, - 'injectargs', opt_arg) - - - def cmd_exists_on_osds(self, cmd): - allremotes = self.ceph_manager.ctx.cluster.only(\ - teuthology.is_type('osd', self.cluster)).remotes.keys() - allremotes = list(set(allremotes)) - for remote in allremotes: - proc = remote.run(args=['type', cmd], wait=True, - check_status=False, stdout=StringIO(), - stderr=StringIO()) - if proc.exitstatus != 0: - return False; - return True; - - def kill_osd(self, osd=None, mark_down=False, mark_out=False): - """ - :param osd: Osd to be killed. - :mark_down: Mark down if true. - :mark_out: Mark out if true. - """ - if osd is None: - osd = random.choice(self.live_osds) - self.log("Killing osd %s, live_osds are %s" % (str(osd), - str(self.live_osds))) - self.live_osds.remove(osd) - self.dead_osds.append(osd) - self.ceph_manager.kill_osd(osd) - if mark_down: - self.ceph_manager.mark_down_osd(osd) - if mark_out and osd in self.in_osds: - self.out_osd(osd) - if self.ceph_objectstore_tool: - self.log("Testing ceph-objectstore-tool on down osd") - remote = self.ceph_manager.find_remote('osd', osd) - FSPATH = self.ceph_manager.get_filepath() - JPATH = os.path.join(FSPATH, "journal") - exp_osd = imp_osd = osd - exp_remote = imp_remote = remote - # If an older osd is available we'll move a pg from there - if (len(self.dead_osds) > 1 and - random.random() < self.chance_move_pg): - exp_osd = random.choice(self.dead_osds[:-1]) - exp_remote = self.ceph_manager.find_remote('osd', exp_osd) - if ('keyvaluestore_backend' in - self.ceph_manager.ctx.ceph[self.cluster].conf['osd']): - prefix = ("sudo adjust-ulimits ceph-objectstore-tool " - "--data-path {fpath} --journal-path {jpath} " - "--type keyvaluestore " - "--log-file=" - "/var/log/ceph/objectstore_tool.\\$pid.log ". - format(fpath=FSPATH, jpath=JPATH)) - else: - prefix = ("sudo adjust-ulimits ceph-objectstore-tool " - "--data-path {fpath} --journal-path {jpath} " - "--log-file=" - "/var/log/ceph/objectstore_tool.\\$pid.log ". - format(fpath=FSPATH, jpath=JPATH)) - cmd = (prefix + "--op list-pgs").format(id=exp_osd) - - # ceph-objectstore-tool might be temporarily absent during an - # upgrade - see http://tracker.ceph.com/issues/18014 - with safe_while(sleep=15, tries=40, action="type ceph-objectstore-tool") as proceed: - while proceed(): - proc = exp_remote.run(args=['type', 'ceph-objectstore-tool'], - wait=True, check_status=False, stdout=StringIO(), - stderr=StringIO()) - if proc.exitstatus == 0: - break - log.debug("ceph-objectstore-tool binary not present, trying again") - - # ceph-objectstore-tool might bogusly fail with "OSD has the store locked" - # see http://tracker.ceph.com/issues/19556 - with safe_while(sleep=15, tries=40, action="ceph-objectstore-tool --op list-pgs") as proceed: - while proceed(): - proc = exp_remote.run(args=cmd, wait=True, - check_status=False, - stdout=StringIO(), stderr=StringIO()) - if proc.exitstatus == 0: - break - elif proc.exitstatus == 1 and proc.stderr == "OSD has the store locked": - continue - else: - raise Exception("ceph-objectstore-tool: " - "exp list-pgs failure with status {ret}". 
- format(ret=proc.exitstatus)) - - pgs = proc.stdout.getvalue().split('\n')[:-1] - if len(pgs) == 0: - self.log("No PGs found for osd.{osd}".format(osd=exp_osd)) - return - pg = random.choice(pgs) - exp_path = teuthology.get_testdir(self.ceph_manager.ctx) - exp_path = os.path.join(exp_path, '{0}.data'.format(self.cluster)) - exp_path = os.path.join(exp_path, - "exp.{pg}.{id}".format( - pg=pg, - id=exp_osd)) - # export - # Can't use new export-remove op since this is part of upgrade testing - cmd = prefix + "--op export --pgid {pg} --file {file}" - cmd = cmd.format(id=exp_osd, pg=pg, file=exp_path) - proc = exp_remote.run(args=cmd) - if proc.exitstatus: - raise Exception("ceph-objectstore-tool: " - "export failure with status {ret}". - format(ret=proc.exitstatus)) - # remove - cmd = prefix + "--force --op remove --pgid {pg}" - cmd = cmd.format(id=exp_osd, pg=pg) - proc = exp_remote.run(args=cmd) - if proc.exitstatus: - raise Exception("ceph-objectstore-tool: " - "remove failure with status {ret}". - format(ret=proc.exitstatus)) - # If there are at least 2 dead osds we might move the pg - if exp_osd != imp_osd: - # If pg isn't already on this osd, then we will move it there - cmd = (prefix + "--op list-pgs").format(id=imp_osd) - proc = imp_remote.run(args=cmd, wait=True, - check_status=False, stdout=StringIO()) - if proc.exitstatus: - raise Exception("ceph-objectstore-tool: " - "imp list-pgs failure with status {ret}". - format(ret=proc.exitstatus)) - pgs = proc.stdout.getvalue().split('\n')[:-1] - if pg not in pgs: - self.log("Moving pg {pg} from osd.{fosd} to osd.{tosd}". - format(pg=pg, fosd=exp_osd, tosd=imp_osd)) - if imp_remote != exp_remote: - # Copy export file to the other machine - self.log("Transfer export file from {srem} to {trem}". - format(srem=exp_remote, trem=imp_remote)) - tmpexport = Remote.get_file(exp_remote, exp_path) - Remote.put_file(imp_remote, tmpexport, exp_path) - os.remove(tmpexport) - else: - # Can't move the pg after all - imp_osd = exp_osd - imp_remote = exp_remote - # import - cmd = (prefix + "--op import --file {file}") - cmd = cmd.format(id=imp_osd, file=exp_path) - proc = imp_remote.run(args=cmd, wait=True, check_status=False, - stderr=StringIO()) - if proc.exitstatus == 1: - bogosity = "The OSD you are using is older than the exported PG" - if bogosity in proc.stderr.getvalue(): - self.log("OSD older than exported PG" - "...ignored") - elif proc.exitstatus == 10: - self.log("Pool went away before processing an import" - "...ignored") - elif proc.exitstatus == 11: - self.log("Attempt to import an incompatible export" - "...ignored") - elif proc.exitstatus: - raise Exception("ceph-objectstore-tool: " - "import failure with status {ret}". 
- format(ret=proc.exitstatus)) - cmd = "rm -f {file}".format(file=exp_path) - exp_remote.run(args=cmd) - if imp_remote != exp_remote: - imp_remote.run(args=cmd) - - # apply low split settings to each pool - for pool in self.ceph_manager.list_pools(): - no_sudo_prefix = prefix[5:] - cmd = ("CEPH_ARGS='--filestore-merge-threshold 1 " - "--filestore-split-multiple 1' sudo -E " - + no_sudo_prefix + "--op apply-layout-settings --pool " + pool).format(id=osd) - proc = remote.run(args=cmd, wait=True, check_status=False, stderr=StringIO()) - output = proc.stderr.getvalue() - if 'Couldn\'t find pool' in output: - continue - if proc.exitstatus: - raise Exception("ceph-objectstore-tool apply-layout-settings" - " failed with {status}".format(status=proc.exitstatus)) - - def rm_past_intervals(self, osd=None): - """ - :param osd: Osd to find pg to remove past intervals - """ - if self.test_rm_past_intervals: - if osd is None: - osd = random.choice(self.dead_osds) - self.log("Use ceph_objectstore_tool to remove past intervals") - remote = self.ceph_manager.find_remote('osd', osd) - FSPATH = self.ceph_manager.get_filepath() - JPATH = os.path.join(FSPATH, "journal") - if ('keyvaluestore_backend' in - self.ceph_manager.ctx.ceph[self.cluster].conf['osd']): - prefix = ("sudo adjust-ulimits ceph-objectstore-tool " - "--data-path {fpath} --journal-path {jpath} " - "--type keyvaluestore " - "--log-file=" - "/var/log/ceph/objectstore_tool.\\$pid.log ". - format(fpath=FSPATH, jpath=JPATH)) - else: - prefix = ("sudo adjust-ulimits ceph-objectstore-tool " - "--data-path {fpath} --journal-path {jpath} " - "--log-file=" - "/var/log/ceph/objectstore_tool.\\$pid.log ". - format(fpath=FSPATH, jpath=JPATH)) - cmd = (prefix + "--op list-pgs").format(id=osd) - proc = remote.run(args=cmd, wait=True, - check_status=False, stdout=StringIO()) - if proc.exitstatus: - raise Exception("ceph_objectstore_tool: " - "exp list-pgs failure with status {ret}". - format(ret=proc.exitstatus)) - pgs = proc.stdout.getvalue().split('\n')[:-1] - if len(pgs) == 0: - self.log("No PGs found for osd.{osd}".format(osd=osd)) - return - pg = random.choice(pgs) - cmd = (prefix + "--op rm-past-intervals --pgid {pg}").\ - format(id=osd, pg=pg) - proc = remote.run(args=cmd) - if proc.exitstatus: - raise Exception("ceph_objectstore_tool: " - "rm-past-intervals failure with status {ret}". - format(ret=proc.exitstatus)) - - def blackhole_kill_osd(self, osd=None): - """ - If all else fails, kill the osd. - :param osd: Osd to be killed. - """ - if osd is None: - osd = random.choice(self.live_osds) - self.log("Blackholing and then killing osd %s, live_osds are %s" % - (str(osd), str(self.live_osds))) - self.live_osds.remove(osd) - self.dead_osds.append(osd) - self.ceph_manager.blackhole_kill_osd(osd) - - def revive_osd(self, osd=None, skip_admin_check=False): - """ - Revive the osd. - :param osd: Osd to be revived. 
- """ - if osd is None: - osd = random.choice(self.dead_osds) - self.log("Reviving osd %s" % (str(osd),)) - self.ceph_manager.revive_osd( - osd, - self.revive_timeout, - skip_admin_check=skip_admin_check) - self.dead_osds.remove(osd) - self.live_osds.append(osd) - if self.random_eio > 0 and osd is self.rerrosd: - self.ceph_manager.raw_cluster_cmd('tell', 'osd.'+str(self.rerrosd), - 'injectargs', '--', '--filestore_debug_random_read_err='+str(self.random_eio)) - self.ceph_manager.raw_cluster_cmd('tell', 'osd.'+str(self.rerrosd), - 'injectargs', '--', '--bluestore_debug_random_read_err='+str(self.random_eio)) - - - def out_osd(self, osd=None): - """ - Mark the osd out - :param osd: Osd to be marked. - """ - if osd is None: - osd = random.choice(self.in_osds) - self.log("Removing osd %s, in_osds are: %s" % - (str(osd), str(self.in_osds))) - self.ceph_manager.mark_out_osd(osd) - self.in_osds.remove(osd) - self.out_osds.append(osd) - - def in_osd(self, osd=None): - """ - Mark the osd out - :param osd: Osd to be marked. - """ - if osd is None: - osd = random.choice(self.out_osds) - if osd in self.dead_osds: - return self.revive_osd(osd) - self.log("Adding osd %s" % (str(osd),)) - self.out_osds.remove(osd) - self.in_osds.append(osd) - self.ceph_manager.mark_in_osd(osd) - self.log("Added osd %s" % (str(osd),)) - - def reweight_osd_or_by_util(self, osd=None): - """ - Reweight an osd that is in - :param osd: Osd to be marked. - """ - if osd is not None or random.choice([True, False]): - if osd is None: - osd = random.choice(self.in_osds) - val = random.uniform(.1, 1.0) - self.log("Reweighting osd %s to %s" % (str(osd), str(val))) - self.ceph_manager.raw_cluster_cmd('osd', 'reweight', - str(osd), str(val)) - else: - # do it several times, the option space is large - for i in range(5): - options = { - 'max_change': random.choice(['0.05', '1.0', '3.0']), - 'overage': random.choice(['110', '1000']), - 'type': random.choice([ - 'reweight-by-utilization', - 'test-reweight-by-utilization']), - } - self.log("Reweighting by: %s"%(str(options),)) - self.ceph_manager.raw_cluster_cmd( - 'osd', - options['type'], - options['overage'], - options['max_change']) - - def primary_affinity(self, osd=None): - if osd is None: - osd = random.choice(self.in_osds) - if random.random() >= .5: - pa = random.random() - elif random.random() >= .5: - pa = 1 - else: - pa = 0 - self.log('Setting osd %s primary_affinity to %f' % (str(osd), pa)) - self.ceph_manager.raw_cluster_cmd('osd', 'primary-affinity', - str(osd), str(pa)) - - def thrash_cluster_full(self): - """ - Set and unset cluster full condition - """ - self.log('Setting full ratio to .001') - self.ceph_manager.raw_cluster_cmd('osd', 'set-full-ratio', '.001') - time.sleep(1) - self.log('Setting full ratio back to .95') - self.ceph_manager.raw_cluster_cmd('osd', 'set-full-ratio', '.95') - - def thrash_pg_upmap(self): - """ - Install or remove random pg_upmap entries in OSDMap - """ - from random import shuffle - out = self.ceph_manager.raw_cluster_cmd('osd', 'dump', '-f', 'json-pretty') - j = json.loads(out) - self.log('j is %s' % j) - try: - if random.random() >= .3: - pgs = self.ceph_manager.get_pg_stats() - pg = random.choice(pgs) - pgid = str(pg['pgid']) - poolid = int(pgid.split('.')[0]) - sizes = [x['size'] for x in j['pools'] if x['pool'] == poolid] - if len(sizes) == 0: - return - n = sizes[0] - osds = self.in_osds + self.out_osds - shuffle(osds) - osds = osds[0:n] - self.log('Setting %s to %s' % (pgid, osds)) - cmd = ['osd', 'pg-upmap', pgid] + [str(x) for x in 
osds] - self.log('cmd %s' % cmd) - self.ceph_manager.raw_cluster_cmd(*cmd) - else: - m = j['pg_upmap'] - if len(m) > 0: - shuffle(m) - pg = m[0]['pgid'] - self.log('Clearing pg_upmap on %s' % pg) - self.ceph_manager.raw_cluster_cmd( - 'osd', - 'rm-pg-upmap', - pg) - else: - self.log('No pg_upmap entries; doing nothing') - except CommandFailedError: - self.log('Failed to rm-pg-upmap, ignoring') - - def thrash_pg_upmap_items(self): - """ - Install or remove random pg_upmap_items entries in OSDMap - """ - from random import shuffle - out = self.ceph_manager.raw_cluster_cmd('osd', 'dump', '-f', 'json-pretty') - j = json.loads(out) - self.log('j is %s' % j) - try: - if random.random() >= .3: - pgs = self.ceph_manager.get_pg_stats() - pg = random.choice(pgs) - pgid = str(pg['pgid']) - poolid = int(pgid.split('.')[0]) - sizes = [x['size'] for x in j['pools'] if x['pool'] == poolid] - if len(sizes) == 0: - return - n = sizes[0] - osds = self.in_osds + self.out_osds - shuffle(osds) - osds = osds[0:n*2] - self.log('Setting %s to %s' % (pgid, osds)) - cmd = ['osd', 'pg-upmap-items', pgid] + [str(x) for x in osds] - self.log('cmd %s' % cmd) - self.ceph_manager.raw_cluster_cmd(*cmd) - else: - m = j['pg_upmap_items'] - if len(m) > 0: - shuffle(m) - pg = m[0]['pgid'] - self.log('Clearing pg_upmap on %s' % pg) - self.ceph_manager.raw_cluster_cmd( - 'osd', - 'rm-pg-upmap-items', - pg) - else: - self.log('No pg_upmap entries; doing nothing') - except CommandFailedError: - self.log('Failed to rm-pg-upmap-items, ignoring') - - def force_recovery(self): - """ - Force recovery on some of PGs - """ - backfill = random.random() >= 0.5 - j = self.ceph_manager.get_pgids_to_force(backfill) - if j: - if backfill: - self.ceph_manager.raw_cluster_cmd('pg', 'force-backfill', *j) - else: - self.ceph_manager.raw_cluster_cmd('pg', 'force-recovery', *j) - - def cancel_force_recovery(self): - """ - Force recovery on some of PGs - """ - backfill = random.random() >= 0.5 - j = self.ceph_manager.get_pgids_to_cancel_force(backfill) - if j: - if backfill: - self.ceph_manager.raw_cluster_cmd('pg', 'cancel-force-backfill', *j) - else: - self.ceph_manager.raw_cluster_cmd('pg', 'cancel-force-recovery', *j) - - def force_cancel_recovery(self): - """ - Force or cancel forcing recovery - """ - if random.random() >= 0.4: - self.force_recovery() - else: - self.cancel_force_recovery() - - def all_up(self): - """ - Make sure all osds are up and not out. - """ - while len(self.dead_osds) > 0: - self.log("reviving osd") - self.revive_osd() - while len(self.out_osds) > 0: - self.log("inning osd") - self.in_osd() - - def all_up_in(self): - """ - Make sure all osds are up and fully in. 
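[Editorial sketch] The two pg-upmap thrashers above follow the same selection pattern: pick a random PG, look up its pool's replica count in the parsed osd dump, then map it to that many randomly chosen OSDs (twice as many for pg-upmap-items, which stores from/to pairs). A minimal standalone sketch of just that selection step, assuming j is the parsed output of `osd dump -f json` and pg_stats the list returned by get_pg_stats(), as used above:

import random

def pick_upmap_target(j, pg_stats, candidate_osds, items=False):
    # Pick a random PG and find its pool's size in the osd dump.
    pg = random.choice(pg_stats)
    pgid = str(pg['pgid'])
    poolid = int(pgid.split('.')[0])
    sizes = [p['size'] for p in j['pools'] if p['pool'] == poolid]
    if not sizes:
        return None
    n = sizes[0] * 2 if items else sizes[0]
    # Shuffle the candidates and take the first n as the new mapping.
    osds = list(candidate_osds)
    random.shuffle(osds)
    return ['osd', 'pg-upmap-items' if items else 'pg-upmap', pgid] + \
        [str(x) for x in osds[:n]]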
- """ - self.all_up(); - for osd in self.live_osds: - self.ceph_manager.raw_cluster_cmd('osd', 'reweight', - str(osd), str(1)) - self.ceph_manager.raw_cluster_cmd('osd', 'primary-affinity', - str(osd), str(1)) - - def do_join(self): - """ - Break out of this Ceph loop - """ - self.stopping = True - self.thread.get() - if self.sighup_delay: - self.log("joining the do_sighup greenlet") - self.sighup_thread.get() - if self.optrack_toggle_delay: - self.log("joining the do_optrack_toggle greenlet") - self.optrack_toggle_thread.join() - if self.dump_ops_enable == "true": - self.log("joining the do_dump_ops greenlet") - self.dump_ops_thread.join() - if self.noscrub_toggle_delay: - self.log("joining the do_noscrub_toggle greenlet") - self.noscrub_toggle_thread.join() - - def grow_pool(self): - """ - Increase the size of the pool - """ - pool = self.ceph_manager.get_pool() - orig_pg_num = self.ceph_manager.get_pool_pg_num(pool) - self.log("Growing pool %s" % (pool,)) - if self.ceph_manager.expand_pool(pool, - self.config.get('pool_grow_by', 10), - self.max_pgs): - self.pools_to_fix_pgp_num.add(pool) - - def fix_pgp_num(self, pool=None): - """ - Fix number of pgs in pool. - """ - if pool is None: - pool = self.ceph_manager.get_pool() - force = False - else: - force = True - self.log("fixing pg num pool %s" % (pool,)) - if self.ceph_manager.set_pool_pgpnum(pool, force): - self.pools_to_fix_pgp_num.discard(pool) - - def test_pool_min_size(self): - """ - Kill and revive all osds except one. - """ - self.log("test_pool_min_size") - self.all_up() - self.ceph_manager.wait_for_recovery( - timeout=self.config.get('timeout') - ) - the_one = random.choice(self.in_osds) - self.log("Killing everyone but %s", the_one) - to_kill = filter(lambda x: x != the_one, self.in_osds) - [self.kill_osd(i) for i in to_kill] - [self.out_osd(i) for i in to_kill] - time.sleep(self.config.get("test_pool_min_size_time", 10)) - self.log("Killing %s" % (the_one,)) - self.kill_osd(the_one) - self.out_osd(the_one) - self.log("Reviving everyone but %s" % (the_one,)) - [self.revive_osd(i) for i in to_kill] - [self.in_osd(i) for i in to_kill] - self.log("Revived everyone but %s" % (the_one,)) - self.log("Waiting for clean") - self.ceph_manager.wait_for_recovery( - timeout=self.config.get('timeout') - ) - - def inject_pause(self, conf_key, duration, check_after, should_be_down): - """ - Pause injection testing. Check for osd being down when finished. - """ - the_one = random.choice(self.live_osds) - self.log("inject_pause on {osd}".format(osd=the_one)) - self.log( - "Testing {key} pause injection for duration {duration}".format( - key=conf_key, - duration=duration - )) - self.log( - "Checking after {after}, should_be_down={shouldbedown}".format( - after=check_after, - shouldbedown=should_be_down - )) - self.ceph_manager.set_config(the_one, **{conf_key: duration}) - if not should_be_down: - return - time.sleep(check_after) - status = self.ceph_manager.get_osd_status() - assert the_one in status['down'] - time.sleep(duration - check_after + 20) - status = self.ceph_manager.get_osd_status() - assert not the_one in status['down'] - - def test_backfill_full(self): - """ - Test backfills stopping when the replica fills up. - - First, use injectfull admin command to simulate a now full - osd by setting it to 0 on all of the OSDs. - - Second, on a random subset, set - osd_debug_skip_full_check_in_backfill_reservation to force - the more complicated check in do_scan to be exercised. - - Then, verify that all backfillings stop. 
- """ - self.log("injecting backfill full") - for i in self.live_osds: - self.ceph_manager.set_config( - i, - osd_debug_skip_full_check_in_backfill_reservation= - random.choice(['false', 'true'])) - self.ceph_manager.osd_admin_socket(i, command=['injectfull', 'backfillfull'], - check_status=True, timeout=30, stdout=DEVNULL) - for i in range(30): - status = self.ceph_manager.compile_pg_status() - if 'backfilling' not in status.keys(): - break - self.log( - "waiting for {still_going} backfillings".format( - still_going=status.get('backfilling'))) - time.sleep(1) - assert('backfilling' not in self.ceph_manager.compile_pg_status().keys()) - for i in self.live_osds: - self.ceph_manager.set_config( - i, - osd_debug_skip_full_check_in_backfill_reservation='false') - self.ceph_manager.osd_admin_socket(i, command=['injectfull', 'none'], - check_status=True, timeout=30, stdout=DEVNULL) - - def test_map_discontinuity(self): - """ - 1) Allows the osds to recover - 2) kills an osd - 3) allows the remaining osds to recover - 4) waits for some time - 5) revives the osd - This sequence should cause the revived osd to have to handle - a map gap since the mons would have trimmed - """ - while len(self.in_osds) < (self.minin + 1): - self.in_osd() - self.log("Waiting for recovery") - self.ceph_manager.wait_for_all_osds_up( - timeout=self.config.get('timeout') - ) - # now we wait 20s for the pg status to change, if it takes longer, - # the test *should* fail! - time.sleep(20) - self.ceph_manager.wait_for_clean( - timeout=self.config.get('timeout') - ) - - # now we wait 20s for the backfill replicas to hear about the clean - time.sleep(20) - self.log("Recovered, killing an osd") - self.kill_osd(mark_down=True, mark_out=True) - self.log("Waiting for clean again") - self.ceph_manager.wait_for_clean( - timeout=self.config.get('timeout') - ) - self.log("Waiting for trim") - time.sleep(int(self.config.get("map_discontinuity_sleep_time", 40))) - self.revive_osd() - - def choose_action(self): - """ - Random action selector. 
- """ - chance_down = self.config.get('chance_down', 0.4) - chance_test_min_size = self.config.get('chance_test_min_size', 0) - chance_test_backfill_full = \ - self.config.get('chance_test_backfill_full', 0) - if isinstance(chance_down, int): - chance_down = float(chance_down) / 100 - minin = self.minin - minout = self.config.get("min_out", 0) - minlive = self.config.get("min_live", 2) - mindead = self.config.get("min_dead", 0) - - self.log('choose_action: min_in %d min_out ' - '%d min_live %d min_dead %d' % - (minin, minout, minlive, mindead)) - actions = [] - if len(self.in_osds) > minin: - actions.append((self.out_osd, 1.0,)) - if len(self.live_osds) > minlive and chance_down > 0: - actions.append((self.kill_osd, chance_down,)) - if len(self.dead_osds) > 1: - actions.append((self.rm_past_intervals, 1.0,)) - if len(self.out_osds) > minout: - actions.append((self.in_osd, 1.7,)) - if len(self.dead_osds) > mindead: - actions.append((self.revive_osd, 1.0,)) - if self.config.get('thrash_primary_affinity', True): - actions.append((self.primary_affinity, 1.0,)) - actions.append((self.reweight_osd_or_by_util, - self.config.get('reweight_osd', .5),)) - actions.append((self.grow_pool, - self.config.get('chance_pgnum_grow', 0),)) - actions.append((self.fix_pgp_num, - self.config.get('chance_pgpnum_fix', 0),)) - actions.append((self.test_pool_min_size, - chance_test_min_size,)) - actions.append((self.test_backfill_full, - chance_test_backfill_full,)) - if self.chance_thrash_cluster_full > 0: - actions.append((self.thrash_cluster_full, self.chance_thrash_cluster_full,)) - if self.chance_thrash_pg_upmap > 0: - actions.append((self.thrash_pg_upmap, self.chance_thrash_pg_upmap,)) - if self.chance_thrash_pg_upmap_items > 0: - actions.append((self.thrash_pg_upmap_items, self.chance_thrash_pg_upmap_items,)) - if self.chance_force_recovery > 0: - actions.append((self.force_cancel_recovery, self.chance_force_recovery)) - - for key in ['heartbeat_inject_failure', 'filestore_inject_stall']: - for scenario in [ - (lambda: - self.inject_pause(key, - self.config.get('pause_short', 3), - 0, - False), - self.config.get('chance_inject_pause_short', 1),), - (lambda: - self.inject_pause(key, - self.config.get('pause_long', 80), - self.config.get('pause_check_after', 70), - True), - self.config.get('chance_inject_pause_long', 0),)]: - actions.append(scenario) - - total = sum([y for (x, y) in actions]) - val = random.uniform(0, total) - for (action, prob) in actions: - if val < prob: - return action - val -= prob - return None - - def log_exc(func): - @wraps(func) - def wrapper(self): - try: - return func(self) - except: - self.log(traceback.format_exc()) - raise - return wrapper - - @log_exc - def do_sighup(self): - """ - Loops and sends signal.SIGHUP to a random live osd. - - Loop delay is controlled by the config value sighup_delay. - """ - delay = float(self.sighup_delay) - self.log("starting do_sighup with a delay of {0}".format(delay)) - while not self.stopping: - osd = random.choice(self.live_osds) - self.ceph_manager.signal_osd(osd, signal.SIGHUP, silent=True) - time.sleep(delay) - - @log_exc - def do_optrack_toggle(self): - """ - Loops and toggle op tracking to all osds. - - Loop delay is controlled by the config value optrack_toggle_delay. 
- """ - delay = float(self.optrack_toggle_delay) - osd_state = "true" - self.log("starting do_optrack_toggle with a delay of {0}".format(delay)) - while not self.stopping: - if osd_state == "true": - osd_state = "false" - else: - osd_state = "true" - self.ceph_manager.raw_cluster_cmd_result('tell', 'osd.*', - 'injectargs', '--osd_enable_op_tracker=%s' % osd_state) - gevent.sleep(delay) - - @log_exc - def do_dump_ops(self): - """ - Loops and does op dumps on all osds - """ - self.log("starting do_dump_ops") - while not self.stopping: - for osd in self.live_osds: - # Ignore errors because live_osds is in flux - self.ceph_manager.osd_admin_socket(osd, command=['dump_ops_in_flight'], - check_status=False, timeout=30, stdout=DEVNULL) - self.ceph_manager.osd_admin_socket(osd, command=['dump_blocked_ops'], - check_status=False, timeout=30, stdout=DEVNULL) - self.ceph_manager.osd_admin_socket(osd, command=['dump_historic_ops'], - check_status=False, timeout=30, stdout=DEVNULL) - gevent.sleep(0) - - @log_exc - def do_noscrub_toggle(self): - """ - Loops and toggle noscrub flags - - Loop delay is controlled by the config value noscrub_toggle_delay. - """ - delay = float(self.noscrub_toggle_delay) - scrub_state = "none" - self.log("starting do_noscrub_toggle with a delay of {0}".format(delay)) - while not self.stopping: - if scrub_state == "none": - self.ceph_manager.raw_cluster_cmd('osd', 'set', 'noscrub') - scrub_state = "noscrub" - elif scrub_state == "noscrub": - self.ceph_manager.raw_cluster_cmd('osd', 'set', 'nodeep-scrub') - scrub_state = "both" - elif scrub_state == "both": - self.ceph_manager.raw_cluster_cmd('osd', 'unset', 'noscrub') - scrub_state = "nodeep-scrub" - else: - self.ceph_manager.raw_cluster_cmd('osd', 'unset', 'nodeep-scrub') - scrub_state = "none" - gevent.sleep(delay) - self.ceph_manager.raw_cluster_cmd('osd', 'unset', 'noscrub') - self.ceph_manager.raw_cluster_cmd('osd', 'unset', 'nodeep-scrub') - - @log_exc - def do_thrash(self): - """ - Loop to select random actions to thrash ceph manager with. 
- """ - cleanint = self.config.get("clean_interval", 60) - scrubint = self.config.get("scrub_interval", -1) - maxdead = self.config.get("max_dead", 0) - delay = self.config.get("op_delay", 5) - self.rerrosd = self.live_osds[0] - if self.random_eio > 0: - self.ceph_manager.raw_cluster_cmd('tell', 'osd.'+str(self.rerrosd), - 'injectargs', '--', '--filestore_debug_random_read_err='+str(self.random_eio)) - self.ceph_manager.raw_cluster_cmd('tell', 'osd.'+str(self.rerrosd), - 'injectargs', '--', '--bluestore_debug_random_read_err='+str(self.random_eio)) - self.log("starting do_thrash") - while not self.stopping: - to_log = [str(x) for x in ["in_osds: ", self.in_osds, - "out_osds: ", self.out_osds, - "dead_osds: ", self.dead_osds, - "live_osds: ", self.live_osds]] - self.log(" ".join(to_log)) - if random.uniform(0, 1) < (float(delay) / cleanint): - while len(self.dead_osds) > maxdead: - self.revive_osd() - for osd in self.in_osds: - self.ceph_manager.raw_cluster_cmd('osd', 'reweight', - str(osd), str(1)) - if random.uniform(0, 1) < float( - self.config.get('chance_test_map_discontinuity', 0)): - self.test_map_discontinuity() - else: - self.ceph_manager.wait_for_recovery( - timeout=self.config.get('timeout') - ) - time.sleep(self.clean_wait) - if scrubint > 0: - if random.uniform(0, 1) < (float(delay) / scrubint): - self.log('Scrubbing while thrashing being performed') - Scrubber(self.ceph_manager, self.config) - self.choose_action()() - time.sleep(delay) - self.all_up() - if self.random_eio > 0: - self.ceph_manager.raw_cluster_cmd('tell', 'osd.'+str(self.rerrosd), - 'injectargs', '--', '--filestore_debug_random_read_err=0.0') - self.ceph_manager.raw_cluster_cmd('tell', 'osd.'+str(self.rerrosd), - 'injectargs', '--', '--bluestore_debug_random_read_err=0.0') - for pool in list(self.pools_to_fix_pgp_num): - if self.ceph_manager.get_pool_pg_num(pool) > 0: - self.fix_pgp_num(pool) - self.pools_to_fix_pgp_num.clear() - for service, opt, saved_value in self.saved_options: - self._set_config(service, '*', opt, saved_value) - self.saved_options = [] - self.all_up_in() - - -class ObjectStoreTool: - - def __init__(self, manager, pool, **kwargs): - self.manager = manager - self.pool = pool - self.osd = kwargs.get('osd', None) - self.object_name = kwargs.get('object_name', None) - self.do_revive = kwargs.get('do_revive', True) - if self.osd and self.pool and self.object_name: - if self.osd == "primary": - self.osd = self.manager.get_object_primary(self.pool, - self.object_name) - assert self.osd - if self.object_name: - self.pgid = self.manager.get_object_pg_with_shard(self.pool, - self.object_name, - self.osd) - self.remote = self.manager.ctx.\ - cluster.only('osd.{o}'.format(o=self.osd)).remotes.keys()[0] - path = self.manager.get_filepath().format(id=self.osd) - self.paths = ("--data-path {path} --journal-path {path}/journal". - format(path=path)) - - def build_cmd(self, options, args, stdin): - lines = [] - if self.object_name: - lines.append("object=$(sudo adjust-ulimits ceph-objectstore-tool " - "{paths} --pgid {pgid} --op list |" - "grep '\"oid\":\"{name}\"')". - format(paths=self.paths, - pgid=self.pgid, - name=self.object_name)) - args = '"$object" ' + args - options += " --pgid {pgid}".format(pgid=self.pgid) - cmd = ("sudo adjust-ulimits ceph-objectstore-tool {paths} {options} {args}". - format(paths=self.paths, - args=args, - options=options)) - if stdin: - cmd = ("echo {payload} | base64 --decode | {cmd}". 
- format(payload=base64.encode(stdin), - cmd=cmd)) - lines.append(cmd) - return "\n".join(lines) - - def run(self, options, args, stdin=None, stdout=None): - if stdout is None: - stdout = StringIO() - self.manager.kill_osd(self.osd) - cmd = self.build_cmd(options, args, stdin) - self.manager.log(cmd) - try: - proc = self.remote.run(args=['bash', '-e', '-x', '-c', cmd], - check_status=False, - stdout=stdout, - stderr=StringIO()) - proc.wait() - if proc.exitstatus != 0: - self.manager.log("failed with " + str(proc.exitstatus)) - error = proc.stdout.getvalue() + " " + proc.stderr.getvalue() - raise Exception(error) - finally: - if self.do_revive: - self.manager.revive_osd(self.osd) - self.manager.wait_till_osd_is_up(self.osd, 300) - - -class CephManager: - """ - Ceph manager object. - Contains several local functions that form a bulk of this module. - - Note: this class has nothing to do with the Ceph daemon (ceph-mgr) of - the same name. - """ - - REPLICATED_POOL = 1 - ERASURE_CODED_POOL = 3 - - def __init__(self, controller, ctx=None, config=None, logger=None, - cluster='ceph'): - self.lock = threading.RLock() - self.ctx = ctx - self.config = config - self.controller = controller - self.next_pool_id = 0 - self.cluster = cluster - if (logger): - self.log = lambda x: logger.info(x) - else: - def tmp(x): - """ - implement log behavior. - """ - print x - self.log = tmp - if self.config is None: - self.config = dict() - pools = self.list_pools() - self.pools = {} - for pool in pools: - # we may race with a pool deletion; ignore failures here - try: - self.pools[pool] = self.get_pool_property(pool, 'pg_num') - except CommandFailedError: - self.log('Failed to get pg_num from pool %s, ignoring' % pool) - - def raw_cluster_cmd(self, *args): - """ - Start ceph on a raw cluster. Return count - """ - testdir = teuthology.get_testdir(self.ctx) - ceph_args = [ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'timeout', - '120', - 'ceph', - '--cluster', - self.cluster, - ] - ceph_args.extend(args) - proc = self.controller.run( - args=ceph_args, - stdout=StringIO(), - ) - return proc.stdout.getvalue() - - def raw_cluster_cmd_result(self, *args): - """ - Start ceph on a cluster. Return success or failure information. - """ - testdir = teuthology.get_testdir(self.ctx) - ceph_args = [ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'timeout', - '120', - 'ceph', - '--cluster', - self.cluster, - ] - ceph_args.extend(args) - proc = self.controller.run( - args=ceph_args, - check_status=False, - ) - return proc.exitstatus - - def run_ceph_w(self): - """ - Execute "ceph -w" in the background with stdout connected to a StringIO, - and return the RemoteProcess. - """ - return self.controller.run( - args=["sudo", - "daemon-helper", - "kill", - "ceph", - '--cluster', - self.cluster, - "-w"], - wait=False, stdout=StringIO(), stdin=run.PIPE) - - def flush_pg_stats(self, osds, no_wait=None, wait_for_mon=300): - """ - Flush pg stats from a list of OSD ids, ensuring they are reflected - all the way to the monitor. Luminous and later only. - - :param osds: list of OSDs to flush - :param no_wait: list of OSDs not to wait for seq id. by default, we - wait for all specified osds, but some of them could be - moved out of osdmap, so we cannot get their updated - stat seq from monitor anymore. in that case, you need - to pass a blacklist. - :param wait_for_mon: wait for mon to be synced with mgr. 0 to disable - it. 
(5 min by default) - """ - seq = {osd: self.raw_cluster_cmd('tell', 'osd.%d' % osd, 'flush_pg_stats') - for osd in osds} - if not wait_for_mon: - return - if no_wait is None: - no_wait = [] - for osd, need in seq.iteritems(): - if osd in no_wait: - continue - got = 0 - while wait_for_mon > 0: - got = self.raw_cluster_cmd('osd', 'last-stat-seq', 'osd.%d' % osd) - self.log('need seq {need} got {got} for osd.{osd}'.format( - need=need, got=got, osd=osd)) - if got >= need: - break - A_WHILE = 1 - time.sleep(A_WHILE) - wait_for_mon -= A_WHILE - else: - raise Exception('timed out waiting for mon to be updated with ' - 'osd.{osd}: {got} < {need}'. - format(osd=osd, got=got, need=need)) - - def flush_all_pg_stats(self): - self.flush_pg_stats(range(len(self.get_osd_dump()))) - - def do_rados(self, remote, cmd, check_status=True): - """ - Execute a remote rados command. - """ - testdir = teuthology.get_testdir(self.ctx) - pre = [ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'rados', - '--cluster', - self.cluster, - ] - pre.extend(cmd) - proc = remote.run( - args=pre, - wait=True, - check_status=check_status - ) - return proc - - def rados_write_objects(self, pool, num_objects, size, - timelimit, threads, cleanup=False): - """ - Write rados objects - Threads not used yet. - """ - args = [ - '-p', pool, - '--num-objects', num_objects, - '-b', size, - 'bench', timelimit, - 'write' - ] - if not cleanup: - args.append('--no-cleanup') - return self.do_rados(self.controller, map(str, args)) - - def do_put(self, pool, obj, fname, namespace=None): - """ - Implement rados put operation - """ - args = ['-p', pool] - if namespace is not None: - args += ['-N', namespace] - args += [ - 'put', - obj, - fname - ] - return self.do_rados( - self.controller, - args, - check_status=False - ).exitstatus - - def do_get(self, pool, obj, fname='/dev/null', namespace=None): - """ - Implement rados get operation - """ - args = ['-p', pool] - if namespace is not None: - args += ['-N', namespace] - args += [ - 'get', - obj, - fname - ] - return self.do_rados( - self.controller, - args, - check_status=False - ).exitstatus - - def do_rm(self, pool, obj, namespace=None): - """ - Implement rados rm operation - """ - args = ['-p', pool] - if namespace is not None: - args += ['-N', namespace] - args += [ - 'rm', - obj - ] - return self.do_rados( - self.controller, - args, - check_status=False - ).exitstatus - - def osd_admin_socket(self, osd_id, command, check_status=True, timeout=0, stdout=None): - if stdout is None: - stdout = StringIO() - return self.admin_socket('osd', osd_id, command, check_status, timeout, stdout) - - def find_remote(self, service_type, service_id): - """ - Get the Remote for the host where a particular service runs. - - :param service_type: 'mds', 'osd', 'client' - :param service_id: The second part of a role, e.g. 
'0' for - the role 'client.0' - :return: a Remote instance for the host where the - requested role is placed - """ - return get_remote(self.ctx, self.cluster, - service_type, service_id) - - def admin_socket(self, service_type, service_id, - command, check_status=True, timeout=0, stdout=None): - """ - Remotely start up ceph specifying the admin socket - :param command: a list of words to use as the command - to the admin socket - """ - if stdout is None: - stdout = StringIO() - testdir = teuthology.get_testdir(self.ctx) - remote = self.find_remote(service_type, service_id) - args = [ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'timeout', - str(timeout), - 'ceph', - '--cluster', - self.cluster, - '--admin-daemon', - '/var/run/ceph/{cluster}-{type}.{id}.asok'.format( - cluster=self.cluster, - type=service_type, - id=service_id), - ] - args.extend(command) - return remote.run( - args=args, - stdout=stdout, - wait=True, - check_status=check_status - ) - - def objectstore_tool(self, pool, options, args, **kwargs): - return ObjectStoreTool(self, pool, **kwargs).run(options, args) - - def get_pgid(self, pool, pgnum): - """ - :param pool: pool name - :param pgnum: pg number - :returns: a string representing this pg. - """ - poolnum = self.get_pool_num(pool) - pg_str = "{poolnum}.{pgnum}".format( - poolnum=poolnum, - pgnum=pgnum) - return pg_str - - def get_pg_replica(self, pool, pgnum): - """ - get replica for pool, pgnum (e.g. (data, 0)->0 - """ - pg_str = self.get_pgid(pool, pgnum) - output = self.raw_cluster_cmd("pg", "map", pg_str, '--format=json') - j = json.loads('\n'.join(output.split('\n')[1:])) - return int(j['acting'][-1]) - assert False - - def wait_for_pg_stats(func): - # both osd_mon_report_interval_min and mgr_stats_period are 5 seconds - # by default, and take the faulty injection in ms into consideration, - # 12 seconds are more than enough - delays = [1, 1, 2, 3, 5, 8, 13] - @wraps(func) - def wrapper(self, *args, **kwargs): - exc = None - for delay in delays: - try: - return func(self, *args, **kwargs) - except AssertionError as e: - time.sleep(delay) - exc = e - raise exc - return wrapper - - def get_pg_primary(self, pool, pgnum): - """ - get primary for pool, pgnum (e.g. (data, 0)->0 - """ - pg_str = self.get_pgid(pool, pgnum) - output = self.raw_cluster_cmd("pg", "map", pg_str, '--format=json') - j = json.loads('\n'.join(output.split('\n')[1:])) - return int(j['acting'][0]) - assert False - - def get_pool_num(self, pool): - """ - get number for pool (e.g., data -> 2) - """ - return int(self.get_pool_dump(pool)['pool']) - - def list_pools(self): - """ - list all pool names - """ - osd_dump = self.get_osd_dump_json() - self.log(osd_dump['pools']) - return [str(i['pool_name']) for i in osd_dump['pools']] - - def clear_pools(self): - """ - remove all pools - """ - [self.remove_pool(i) for i in self.list_pools()] - - def kick_recovery_wq(self, osdnum): - """ - Run kick_recovery_wq on cluster. - """ - return self.raw_cluster_cmd( - 'tell', "osd.%d" % (int(osdnum),), - 'debug', - 'kick_recovery_wq', - '0') - - def wait_run_admin_socket(self, service_type, - service_id, args=['version'], timeout=75, stdout=None): - """ - If osd_admin_socket call suceeds, return. Otherwise wait - five seconds and try again. 
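[Editorial sketch] wait_run_admin_socket, a little further down, is the usual poll-until-the-probe-succeeds loop: run the admin-socket command, return on success, otherwise sleep five seconds and retry until the accumulated wait exceeds the timeout. A generic sketch of the same pattern, with probe standing in for any callable that returns a truthy result on success:

import time

def wait_until(probe, timeout=75, interval=5):
    # Re-run probe() every `interval` seconds; return its result as soon as it
    # succeeds, or raise once the accumulated wait exceeds `timeout`.
    waited = 0
    while True:
        result = probe()
        if result:
            return result
        waited += interval
        if waited > timeout:
            raise Exception('timed out after %d seconds' % waited)
        time.sleep(interval)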
- """ - if stdout is None: - stdout = StringIO() - tries = 0 - while True: - proc = self.admin_socket(service_type, service_id, - args, check_status=False, stdout=stdout) - if proc.exitstatus is 0: - return proc - else: - tries += 1 - if (tries * 5) > timeout: - raise Exception('timed out waiting for admin_socket ' - 'to appear after {type}.{id} restart'. - format(type=service_type, - id=service_id)) - self.log("waiting on admin_socket for {type}-{id}, " - "{command}".format(type=service_type, - id=service_id, - command=args)) - time.sleep(5) - - def get_pool_dump(self, pool): - """ - get the osd dump part of a pool - """ - osd_dump = self.get_osd_dump_json() - for i in osd_dump['pools']: - if i['pool_name'] == pool: - return i - assert False - - def get_config(self, service_type, service_id, name): - """ - :param node: like 'mon.a' - :param name: the option name - """ - proc = self.wait_run_admin_socket(service_type, service_id, - ['config', 'show']) - j = json.loads(proc.stdout.getvalue()) - return j[name] - - def set_config(self, osdnum, **argdict): - """ - :param osdnum: osd number - :param argdict: dictionary containing values to set. - """ - for k, v in argdict.iteritems(): - self.wait_run_admin_socket( - 'osd', osdnum, - ['config', 'set', str(k), str(v)]) - - def raw_cluster_status(self): - """ - Get status from cluster - """ - status = self.raw_cluster_cmd('status', '--format=json-pretty') - return json.loads(status) - - def raw_osd_status(self): - """ - Get osd status from cluster - """ - return self.raw_cluster_cmd('osd', 'dump') - - def get_osd_status(self): - """ - Get osd statuses sorted by states that the osds are in. - """ - osd_lines = filter( - lambda x: x.startswith('osd.') and (("up" in x) or ("down" in x)), - self.raw_osd_status().split('\n')) - self.log(osd_lines) - in_osds = [int(i[4:].split()[0]) - for i in filter(lambda x: " in " in x, osd_lines)] - out_osds = [int(i[4:].split()[0]) - for i in filter(lambda x: " out " in x, osd_lines)] - up_osds = [int(i[4:].split()[0]) - for i in filter(lambda x: " up " in x, osd_lines)] - down_osds = [int(i[4:].split()[0]) - for i in filter(lambda x: " down " in x, osd_lines)] - dead_osds = [int(x.id_) - for x in filter(lambda x: - not x.running(), - self.ctx.daemons. - iter_daemons_of_role('osd', self.cluster))] - live_osds = [int(x.id_) for x in - filter(lambda x: - x.running(), - self.ctx.daemons.iter_daemons_of_role('osd', - self.cluster))] - return {'in': in_osds, 'out': out_osds, 'up': up_osds, - 'down': down_osds, 'dead': dead_osds, 'live': live_osds, - 'raw': osd_lines} - - def get_num_pgs(self): - """ - Check cluster status for the number of pgs - """ - status = self.raw_cluster_status() - self.log(status) - return status['pgmap']['num_pgs'] - - def create_erasure_code_profile(self, profile_name, profile): - """ - Create an erasure code profile name that can be used as a parameter - when creating an erasure coded pool. - """ - with self.lock: - args = cmd_erasure_code_profile(profile_name, profile) - self.raw_cluster_cmd(*args) - - def create_pool_with_unique_name(self, pg_num=16, - erasure_code_profile_name=None, - min_size=None, - erasure_code_use_overwrites=False): - """ - Create a pool named unique_pool_X where X is unique. 
- """ - name = "" - with self.lock: - name = "unique_pool_%s" % (str(self.next_pool_id),) - self.next_pool_id += 1 - self.create_pool( - name, - pg_num, - erasure_code_profile_name=erasure_code_profile_name, - min_size=min_size, - erasure_code_use_overwrites=erasure_code_use_overwrites) - return name - - @contextlib.contextmanager - def pool(self, pool_name, pg_num=16, erasure_code_profile_name=None): - self.create_pool(pool_name, pg_num, erasure_code_profile_name) - yield - self.remove_pool(pool_name) - - def create_pool(self, pool_name, pg_num=16, - erasure_code_profile_name=None, - min_size=None, - erasure_code_use_overwrites=False): - """ - Create a pool named from the pool_name parameter. - :param pool_name: name of the pool being created. - :param pg_num: initial number of pgs. - :param erasure_code_profile_name: if set and !None create an - erasure coded pool using the profile - :param erasure_code_use_overwrites: if true, allow overwrites - """ - with self.lock: - assert isinstance(pool_name, basestring) - assert isinstance(pg_num, int) - assert pool_name not in self.pools - self.log("creating pool_name %s" % (pool_name,)) - if erasure_code_profile_name: - self.raw_cluster_cmd('osd', 'pool', 'create', - pool_name, str(pg_num), str(pg_num), - 'erasure', erasure_code_profile_name) - else: - self.raw_cluster_cmd('osd', 'pool', 'create', - pool_name, str(pg_num)) - if min_size is not None: - self.raw_cluster_cmd( - 'osd', 'pool', 'set', pool_name, - 'min_size', - str(min_size)) - if erasure_code_use_overwrites: - self.raw_cluster_cmd( - 'osd', 'pool', 'set', pool_name, - 'allow_ec_overwrites', - 'true') - self.raw_cluster_cmd( - 'osd', 'pool', 'application', 'enable', - pool_name, 'rados', '--yes-i-really-mean-it', - run.Raw('||'), 'true') - self.pools[pool_name] = pg_num - time.sleep(1) - - def add_pool_snap(self, pool_name, snap_name): - """ - Add pool snapshot - :param pool_name: name of pool to snapshot - :param snap_name: name of snapshot to take - """ - self.raw_cluster_cmd('osd', 'pool', 'mksnap', - str(pool_name), str(snap_name)) - - def remove_pool_snap(self, pool_name, snap_name): - """ - Remove pool snapshot - :param pool_name: name of pool to snapshot - :param snap_name: name of snapshot to remove - """ - self.raw_cluster_cmd('osd', 'pool', 'rmsnap', - str(pool_name), str(snap_name)) - - def remove_pool(self, pool_name): - """ - Remove the indicated pool - :param pool_name: Pool to be removed - """ - with self.lock: - assert isinstance(pool_name, basestring) - assert pool_name in self.pools - self.log("removing pool_name %s" % (pool_name,)) - del self.pools[pool_name] - self.do_rados(self.controller, - ['rmpool', pool_name, pool_name, - "--yes-i-really-really-mean-it"]) - - def get_pool(self): - """ - Pick a random pool - """ - with self.lock: - return random.choice(self.pools.keys()) - - def get_pool_pg_num(self, pool_name): - """ - Return the number of pgs in the pool specified. - """ - with self.lock: - assert isinstance(pool_name, basestring) - if pool_name in self.pools: - return self.pools[pool_name] - return 0 - - def get_pool_property(self, pool_name, prop): - """ - :param pool_name: pool - :param prop: property to be checked. - :returns: property as an int value. 
- """ - with self.lock: - assert isinstance(pool_name, basestring) - assert isinstance(prop, basestring) - output = self.raw_cluster_cmd( - 'osd', - 'pool', - 'get', - pool_name, - prop) - return int(output.split()[1]) - - def set_pool_property(self, pool_name, prop, val): - """ - :param pool_name: pool - :param prop: property to be set. - :param val: value to set. - - This routine retries if set operation fails. - """ - with self.lock: - assert isinstance(pool_name, basestring) - assert isinstance(prop, basestring) - assert isinstance(val, int) - tries = 0 - while True: - r = self.raw_cluster_cmd_result( - 'osd', - 'pool', - 'set', - pool_name, - prop, - str(val)) - if r != 11: # EAGAIN - break - tries += 1 - if tries > 50: - raise Exception('timed out getting EAGAIN ' - 'when setting pool property %s %s = %s' % - (pool_name, prop, val)) - self.log('got EAGAIN setting pool property, ' - 'waiting a few seconds...') - time.sleep(2) - - def expand_pool(self, pool_name, by, max_pgs): - """ - Increase the number of pgs in a pool - """ - with self.lock: - assert isinstance(pool_name, basestring) - assert isinstance(by, int) - assert pool_name in self.pools - if self.get_num_creating() > 0: - return False - if (self.pools[pool_name] + by) > max_pgs: - return False - self.log("increase pool size by %d" % (by,)) - new_pg_num = self.pools[pool_name] + by - self.set_pool_property(pool_name, "pg_num", new_pg_num) - self.pools[pool_name] = new_pg_num - return True - - def set_pool_pgpnum(self, pool_name, force): - """ - Set pgpnum property of pool_name pool. - """ - with self.lock: - assert isinstance(pool_name, basestring) - assert pool_name in self.pools - if not force and self.get_num_creating() > 0: - return False - self.set_pool_property(pool_name, 'pgp_num', self.pools[pool_name]) - return True - - def list_pg_missing(self, pgid): - """ - return list of missing pgs with the id specified - """ - r = None - offset = {} - while True: - out = self.raw_cluster_cmd('--', 'pg', pgid, 'list_missing', - json.dumps(offset)) - j = json.loads(out) - if r is None: - r = j - else: - r['objects'].extend(j['objects']) - if not 'more' in j: - break - if j['more'] == 0: - break - offset = j['objects'][-1]['oid'] - if 'more' in r: - del r['more'] - return r - - def get_pg_stats(self): - """ - Dump the cluster and get pg stats - """ - out = self.raw_cluster_cmd('pg', 'dump', '--format=json') - j = json.loads('\n'.join(out.split('\n')[1:])) - return j['pg_stats'] - - def get_pgids_to_force(self, backfill): - """ - Return the randomized list of PGs that can have their recovery/backfill forced - """ - j = self.get_pg_stats(); - pgids = [] - if backfill: - wanted = ['degraded', 'backfilling', 'backfill_wait'] - else: - wanted = ['recovering', 'degraded', 'recovery_wait'] - for pg in j: - status = pg['state'].split('+') - for t in wanted: - if random.random() > 0.5 and not ('forced_backfill' in status or 'forced_recovery' in status) and t in status: - pgids.append(pg['pgid']) - break - return pgids - - def get_pgids_to_cancel_force(self, backfill): - """ - Return the randomized list of PGs whose recovery/backfill priority is forced - """ - j = self.get_pg_stats(); - pgids = [] - if backfill: - wanted = 'forced_backfill' - else: - wanted = 'forced_recovery' - for pg in j: - status = pg['state'].split('+') - if wanted in status and random.random() > 0.5: - pgids.append(pg['pgid']) - return pgids - - def compile_pg_status(self): - """ - Return a histogram of pg state values - """ - ret = {} - j = self.get_pg_stats() - for 
pg in j: - for status in pg['state'].split('+'): - if status not in ret: - ret[status] = 0 - ret[status] += 1 - return ret - - @wait_for_pg_stats - def with_pg_state(self, pool, pgnum, check): - pgstr = self.get_pgid(pool, pgnum) - stats = self.get_single_pg_stats(pgstr) - assert(check(stats['state'])) - - @wait_for_pg_stats - def with_pg(self, pool, pgnum, check): - pgstr = self.get_pgid(pool, pgnum) - stats = self.get_single_pg_stats(pgstr) - return check(stats) - - def get_last_scrub_stamp(self, pool, pgnum): - """ - Get the timestamp of the last scrub. - """ - stats = self.get_single_pg_stats(self.get_pgid(pool, pgnum)) - return stats["last_scrub_stamp"] - - def do_pg_scrub(self, pool, pgnum, stype): - """ - Scrub pg and wait for scrubbing to finish - """ - init = self.get_last_scrub_stamp(pool, pgnum) - RESEND_TIMEOUT = 120 # Must be a multiple of SLEEP_TIME - FATAL_TIMEOUT = RESEND_TIMEOUT * 3 - SLEEP_TIME = 10 - timer = 0 - while init == self.get_last_scrub_stamp(pool, pgnum): - assert timer < FATAL_TIMEOUT, "fatal timeout trying to " + stype - self.log("waiting for scrub type %s" % (stype,)) - if (timer % RESEND_TIMEOUT) == 0: - self.raw_cluster_cmd('pg', stype, self.get_pgid(pool, pgnum)) - # The first time in this loop is the actual request - if timer != 0 and stype == "repair": - self.log("WARNING: Resubmitted a non-idempotent repair") - time.sleep(SLEEP_TIME) - timer += SLEEP_TIME - - def wait_snap_trimming_complete(self, pool): - """ - Wait for snap trimming on pool to end - """ - POLL_PERIOD = 10 - FATAL_TIMEOUT = 600 - start = time.time() - poolnum = self.get_pool_num(pool) - poolnumstr = "%s." % (poolnum,) - while (True): - now = time.time() - if (now - start) > FATAL_TIMEOUT: - assert (now - start) < FATAL_TIMEOUT, \ - 'failed to complete snap trimming before timeout' - all_stats = self.get_pg_stats() - trimming = False - for pg in all_stats: - if (poolnumstr in pg['pgid']) and ('snaptrim' in pg['state']): - self.log("pg {pg} in trimming, state: {state}".format( - pg=pg['pgid'], - state=pg['state'])) - trimming = True - if not trimming: - break - self.log("{pool} still trimming, waiting".format(pool=pool)) - time.sleep(POLL_PERIOD) - - def get_single_pg_stats(self, pgid): - """ - Return pg for the pgid specified. 
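[Editorial sketch] compile_pg_status above tallies how many PGs carry each state token; since PG states are '+'-joined strings such as 'active+clean', the histogram is just a count over the split tokens. An equivalent sketch using collections.Counter:

from collections import Counter

def pg_state_histogram(pg_stats):
    # pg_stats is the list returned by get_pg_stats(); each entry's 'state'
    # is a '+'-joined string such as 'active+clean+scrubbing'.
    counts = Counter()
    for pg in pg_stats:
        counts.update(pg['state'].split('+'))
    return dict(counts)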
- """ - all_stats = self.get_pg_stats() - - for pg in all_stats: - if pg['pgid'] == pgid: - return pg - - return None - - def get_object_pg_with_shard(self, pool, name, osdid): - """ - """ - pool_dump = self.get_pool_dump(pool) - object_map = self.get_object_map(pool, name) - if pool_dump["type"] == CephManager.ERASURE_CODED_POOL: - shard = object_map['acting'].index(osdid) - return "{pgid}s{shard}".format(pgid=object_map['pgid'], - shard=shard) - else: - return object_map['pgid'] - - def get_object_primary(self, pool, name): - """ - """ - object_map = self.get_object_map(pool, name) - return object_map['acting_primary'] - - def get_object_map(self, pool, name): - """ - osd map --format=json converted to a python object - :returns: the python object - """ - out = self.raw_cluster_cmd('--format=json', 'osd', 'map', pool, name) - return json.loads('\n'.join(out.split('\n')[1:])) - - def get_osd_dump_json(self): - """ - osd dump --format=json converted to a python object - :returns: the python object - """ - out = self.raw_cluster_cmd('osd', 'dump', '--format=json') - return json.loads('\n'.join(out.split('\n')[1:])) - - def get_osd_dump(self): - """ - Dump osds - :returns: all osds - """ - return self.get_osd_dump_json()['osds'] - - def get_mgr_dump(self): - out = self.raw_cluster_cmd('mgr', 'dump', '--format=json') - return json.loads(out) - - def get_stuck_pgs(self, type_, threshold): - """ - :returns: stuck pg information from the cluster - """ - out = self.raw_cluster_cmd('pg', 'dump_stuck', type_, str(threshold), - '--format=json') - return json.loads(out) - - def get_num_unfound_objects(self): - """ - Check cluster status to get the number of unfound objects - """ - status = self.raw_cluster_status() - self.log(status) - return status['pgmap'].get('unfound_objects', 0) - - def get_num_creating(self): - """ - Find the number of pgs in creating mode. - """ - pgs = self.get_pg_stats() - num = 0 - for pg in pgs: - if 'creating' in pg['state']: - num += 1 - return num - - def get_num_active_clean(self): - """ - Find the number of active and clean pgs. - """ - pgs = self.get_pg_stats() - num = 0 - for pg in pgs: - if (pg['state'].count('active') and - pg['state'].count('clean') and - not pg['state'].count('stale')): - num += 1 - return num - - def get_num_active_recovered(self): - """ - Find the number of active and recovered pgs. - """ - pgs = self.get_pg_stats() - num = 0 - for pg in pgs: - if (pg['state'].count('active') and - not pg['state'].count('recover') and - not pg['state'].count('backfilling') and - not pg['state'].count('stale')): - num += 1 - return num - - def get_is_making_recovery_progress(self): - """ - Return whether there is recovery progress discernable in the - raw cluster status - """ - status = self.raw_cluster_status() - kps = status['pgmap'].get('recovering_keys_per_sec', 0) - bps = status['pgmap'].get('recovering_bytes_per_sec', 0) - ops = status['pgmap'].get('recovering_objects_per_sec', 0) - return kps > 0 or bps > 0 or ops > 0 - - def get_num_active(self): - """ - Find the number of active pgs. - """ - pgs = self.get_pg_stats() - num = 0 - for pg in pgs: - if pg['state'].count('active') and not pg['state'].count('stale'): - num += 1 - return num - - def get_num_down(self): - """ - Find the number of pgs that are down. 
- """ - pgs = self.get_pg_stats() - num = 0 - for pg in pgs: - if ((pg['state'].count('down') and not - pg['state'].count('stale')) or - (pg['state'].count('incomplete') and not - pg['state'].count('stale'))): - num += 1 - return num - - def get_num_active_down(self): - """ - Find the number of pgs that are either active or down. - """ - pgs = self.get_pg_stats() - num = 0 - for pg in pgs: - if ((pg['state'].count('active') and not - pg['state'].count('stale')) or - (pg['state'].count('down') and not - pg['state'].count('stale')) or - (pg['state'].count('incomplete') and not - pg['state'].count('stale'))): - num += 1 - return num - - def is_clean(self): - """ - True if all pgs are clean - """ - return self.get_num_active_clean() == self.get_num_pgs() - - def is_recovered(self): - """ - True if all pgs have recovered - """ - return self.get_num_active_recovered() == self.get_num_pgs() - - def is_active_or_down(self): - """ - True if all pgs are active or down - """ - return self.get_num_active_down() == self.get_num_pgs() - - def wait_for_clean(self, timeout=None): - """ - Returns true when all pgs are clean. - """ - self.log("waiting for clean") - start = time.time() - num_active_clean = self.get_num_active_clean() - while not self.is_clean(): - if timeout is not None: - if self.get_is_making_recovery_progress(): - self.log("making progress, resetting timeout") - start = time.time() - else: - self.log("no progress seen, keeping timeout for now") - if time.time() - start >= timeout: - self.log('dumping pgs') - out = self.raw_cluster_cmd('pg', 'dump') - self.log(out) - assert time.time() - start < timeout, \ - 'failed to become clean before timeout expired' - cur_active_clean = self.get_num_active_clean() - if cur_active_clean != num_active_clean: - start = time.time() - num_active_clean = cur_active_clean - time.sleep(3) - self.log("clean!") - - def are_all_osds_up(self): - """ - Returns true if all osds are up. - """ - x = self.get_osd_dump() - return (len(x) == sum([(y['up'] > 0) for y in x])) - - def wait_for_all_osds_up(self, timeout=None): - """ - When this exits, either the timeout has expired, or all - osds are up. - """ - self.log("waiting for all up") - start = time.time() - while not self.are_all_osds_up(): - if timeout is not None: - assert time.time() - start < timeout, \ - 'timeout expired in wait_for_all_osds_up' - time.sleep(3) - self.log("all up!") - - def pool_exists(self, pool): - if pool in self.list_pools(): - return True - return False - - def wait_for_pool(self, pool, timeout=300): - """ - Wait for a pool to exist - """ - self.log('waiting for pool %s to exist' % pool) - start = time.time() - while not self.pool_exists(pool): - if timeout is not None: - assert time.time() - start < timeout, \ - 'timeout expired in wait_for_pool' - time.sleep(3) - - def wait_for_pools(self, pools): - for pool in pools: - self.wait_for_pool(pool) - - def is_mgr_available(self): - x = self.get_mgr_dump() - return x.get('available', False) - - def wait_for_mgr_available(self, timeout=None): - self.log("waiting for mgr available") - start = time.time() - while not self.is_mgr_available(): - if timeout is not None: - assert time.time() - start < timeout, \ - 'timeout expired in wait_for_mgr_available' - time.sleep(3) - self.log("mgr available!") - - def wait_for_recovery(self, timeout=None): - """ - Check peering. When this exists, we have recovered. 
- """ - self.log("waiting for recovery to complete") - start = time.time() - num_active_recovered = self.get_num_active_recovered() - while not self.is_recovered(): - now = time.time() - if timeout is not None: - if self.get_is_making_recovery_progress(): - self.log("making progress, resetting timeout") - start = time.time() - else: - self.log("no progress seen, keeping timeout for now") - if now - start >= timeout: - if self.is_recovered(): - break - self.log('dumping pgs') - out = self.raw_cluster_cmd('pg', 'dump') - self.log(out) - assert now - start < timeout, \ - 'failed to recover before timeout expired' - cur_active_recovered = self.get_num_active_recovered() - if cur_active_recovered != num_active_recovered: - start = time.time() - num_active_recovered = cur_active_recovered - time.sleep(3) - self.log("recovered!") - - def wait_for_active(self, timeout=None): - """ - Check peering. When this exists, we are definitely active - """ - self.log("waiting for peering to complete") - start = time.time() - num_active = self.get_num_active() - while not self.is_active(): - if timeout is not None: - if time.time() - start >= timeout: - self.log('dumping pgs') - out = self.raw_cluster_cmd('pg', 'dump') - self.log(out) - assert time.time() - start < timeout, \ - 'failed to recover before timeout expired' - cur_active = self.get_num_active() - if cur_active != num_active: - start = time.time() - num_active = cur_active - time.sleep(3) - self.log("active!") - - def wait_for_active_or_down(self, timeout=None): - """ - Check peering. When this exists, we are definitely either - active or down - """ - self.log("waiting for peering to complete or become blocked") - start = time.time() - num_active_down = self.get_num_active_down() - while not self.is_active_or_down(): - if timeout is not None: - if time.time() - start >= timeout: - self.log('dumping pgs') - out = self.raw_cluster_cmd('pg', 'dump') - self.log(out) - assert time.time() - start < timeout, \ - 'failed to recover before timeout expired' - cur_active_down = self.get_num_active_down() - if cur_active_down != num_active_down: - start = time.time() - num_active_down = cur_active_down - time.sleep(3) - self.log("active or down!") - - def osd_is_up(self, osd): - """ - Wrapper for osd check - """ - osds = self.get_osd_dump() - return osds[osd]['up'] > 0 - - def wait_till_osd_is_up(self, osd, timeout=None): - """ - Loop waiting for osd. - """ - self.log('waiting for osd.%d to be up' % osd) - start = time.time() - while not self.osd_is_up(osd): - if timeout is not None: - assert time.time() - start < timeout, \ - 'osd.%d failed to come up before timeout expired' % osd - time.sleep(3) - self.log('osd.%d is up' % osd) - - def is_active(self): - """ - Wrapper to check if all pgs are active - """ - return self.get_num_active() == self.get_num_pgs() - - def wait_till_active(self, timeout=None): - """ - Wait until all pgs are active. - """ - self.log("waiting till active") - start = time.time() - while not self.is_active(): - if timeout is not None: - if time.time() - start >= timeout: - self.log('dumping pgs') - out = self.raw_cluster_cmd('pg', 'dump') - self.log(out) - assert time.time() - start < timeout, \ - 'failed to become active before timeout expired' - time.sleep(3) - self.log("active!") - - def wait_till_pg_convergence(self, timeout=None): - start = time.time() - old_stats = None - active_osds = [osd['osd'] for osd in self.get_osd_dump() - if osd['in'] and osd['up']] - while True: - # strictly speaking, no need to wait for mon. 
but due to the - # "ms inject socket failures" setting, the osdmap could be delayed, - # so mgr is likely to ignore the pg-stat messages with pgs serving - # newly created pools which is not yet known by mgr. so, to make sure - # the mgr is updated with the latest pg-stats, waiting for mon/mgr is - # necessary. - self.flush_pg_stats(active_osds) - new_stats = dict((stat['pgid'], stat['state']) - for stat in self.get_pg_stats()) - if old_stats == new_stats: - return old_stats - if timeout is not None: - assert time.time() - start < timeout, \ - 'failed to reach convergence before %d secs' % timeout - old_stats = new_stats - # longer than mgr_stats_period - time.sleep(5 + 1) - - def mark_out_osd(self, osd): - """ - Wrapper to mark osd out. - """ - self.raw_cluster_cmd('osd', 'out', str(osd)) - - def kill_osd(self, osd): - """ - Kill osds by either power cycling (if indicated by the config) - or by stopping. - """ - if self.config.get('powercycle'): - remote = self.find_remote('osd', osd) - self.log('kill_osd on osd.{o} ' - 'doing powercycle of {s}'.format(o=osd, s=remote.name)) - self._assert_ipmi(remote) - remote.console.power_off() - elif self.config.get('bdev_inject_crash') and self.config.get('bdev_inject_crash_probability'): - if random.uniform(0, 1) < self.config.get('bdev_inject_crash_probability', .5): - self.raw_cluster_cmd( - '--', 'tell', 'osd.%d' % osd, - 'injectargs', - '--bdev-inject-crash %d' % self.config.get('bdev_inject_crash'), - ) - try: - self.ctx.daemons.get_daemon('osd', osd, self.cluster).wait() - except: - pass - else: - raise RuntimeError('osd.%s did not fail' % osd) - else: - self.ctx.daemons.get_daemon('osd', osd, self.cluster).stop() - else: - self.ctx.daemons.get_daemon('osd', osd, self.cluster).stop() - - @staticmethod - def _assert_ipmi(remote): - assert remote.console.has_ipmi_credentials, ( - "powercycling requested but RemoteConsole is not " - "initialized. Check ipmi config.") - - def blackhole_kill_osd(self, osd): - """ - Stop osd if nothing else works. - """ - self.raw_cluster_cmd('--', 'tell', 'osd.%d' % osd, - 'injectargs', - '--objectstore-blackhole') - time.sleep(2) - self.ctx.daemons.get_daemon('osd', osd, self.cluster).stop() - - def revive_osd(self, osd, timeout=360, skip_admin_check=False): - """ - Revive osds by either power cycling (if indicated by the config) - or by restarting. - """ - if self.config.get('powercycle'): - remote = self.find_remote('osd', osd) - self.log('kill_osd on osd.{o} doing powercycle of {s}'. - format(o=osd, s=remote.name)) - self._assert_ipmi(remote) - remote.console.power_on() - if not remote.console.check_status(300): - raise Exception('Failed to revive osd.{o} via ipmi'. - format(o=osd)) - teuthology.reconnect(self.ctx, 60, [remote]) - mount_osd_data(self.ctx, remote, self.cluster, str(osd)) - self.make_admin_daemon_dir(remote) - self.ctx.daemons.get_daemon('osd', osd, self.cluster).reset() - self.ctx.daemons.get_daemon('osd', osd, self.cluster).restart() - - if not skip_admin_check: - # wait for dump_ops_in_flight; this command doesn't appear - # until after the signal handler is installed and it is safe - # to stop the osd again without making valgrind leak checks - # unhappy. see #5924. 
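
# Illustrative sketch (not from the removed file): wait_till_pg_convergence()
# above declares the cluster converged once two consecutive snapshots of
# pgid -> state are identical. A minimal standalone version of that pattern,
# assuming only a callable that returns a comparable snapshot; the helper
# name and defaults are made up for illustration.
import time

def wait_for_convergence(sample_fn, timeout=300, interval=6):
    """Poll sample_fn until two consecutive samples are equal, or time out."""
    start = time.time()
    previous = None
    while True:
        current = sample_fn()
        if previous is not None and previous == current:
            return current
        if time.time() - start >= timeout:
            raise RuntimeError('no convergence within %d seconds' % timeout)
        previous = current
        time.sleep(interval)
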
- self.wait_run_admin_socket('osd', osd, - args=['dump_ops_in_flight'], - timeout=timeout, stdout=DEVNULL) - - def mark_down_osd(self, osd): - """ - Cluster command wrapper - """ - self.raw_cluster_cmd('osd', 'down', str(osd)) - - def mark_in_osd(self, osd): - """ - Cluster command wrapper - """ - self.raw_cluster_cmd('osd', 'in', str(osd)) - - def signal_osd(self, osd, sig, silent=False): - """ - Wrapper to local get_daemon call which sends the given - signal to the given osd. - """ - self.ctx.daemons.get_daemon('osd', osd, - self.cluster).signal(sig, silent=silent) - - ## monitors - def signal_mon(self, mon, sig, silent=False): - """ - Wrapper to local get_deamon call - """ - self.ctx.daemons.get_daemon('mon', mon, - self.cluster).signal(sig, silent=silent) - - def kill_mon(self, mon): - """ - Kill the monitor by either power cycling (if the config says so), - or by doing a stop. - """ - if self.config.get('powercycle'): - remote = self.find_remote('mon', mon) - self.log('kill_mon on mon.{m} doing powercycle of {s}'. - format(m=mon, s=remote.name)) - self._assert_ipmi(remote) - remote.console.power_off() - else: - self.ctx.daemons.get_daemon('mon', mon, self.cluster).stop() - - def revive_mon(self, mon): - """ - Restart by either power cycling (if the config says so), - or by doing a normal restart. - """ - if self.config.get('powercycle'): - remote = self.find_remote('mon', mon) - self.log('revive_mon on mon.{m} doing powercycle of {s}'. - format(m=mon, s=remote.name)) - self._assert_ipmi(remote) - remote.console.power_on() - self.make_admin_daemon_dir(remote) - self.ctx.daemons.get_daemon('mon', mon, self.cluster).restart() - - def revive_mgr(self, mgr): - """ - Restart by either power cycling (if the config says so), - or by doing a normal restart. - """ - if self.config.get('powercycle'): - remote = self.find_remote('mgr', mgr) - self.log('revive_mgr on mgr.{m} doing powercycle of {s}'. - format(m=mgr, s=remote.name)) - self._assert_ipmi(remote) - remote.console.power_on() - self.make_admin_daemon_dir(remote) - self.ctx.daemons.get_daemon('mgr', mgr, self.cluster).restart() - - def get_mon_status(self, mon): - """ - Extract all the monitor status information from the cluster - """ - addr = self.ctx.ceph[self.cluster].conf['mon.%s' % mon]['mon addr'] - out = self.raw_cluster_cmd('-m', addr, 'mon_status') - return json.loads(out) - - def get_mon_quorum(self): - """ - Extract monitor quorum information from the cluster - """ - out = self.raw_cluster_cmd('quorum_status') - j = json.loads(out) - self.log('quorum_status is %s' % out) - return j['quorum'] - - def wait_for_mon_quorum_size(self, size, timeout=300): - """ - Loop until quorum size is reached. - """ - self.log('waiting for quorum size %d' % size) - start = time.time() - while not len(self.get_mon_quorum()) == size: - if timeout is not None: - assert time.time() - start < timeout, \ - ('failed to reach quorum size %d ' - 'before timeout expired' % size) - time.sleep(3) - self.log("quorum is size %d" % size) - - def get_mon_health(self, debug=False): - """ - Extract all the monitor health information. - """ - out = self.raw_cluster_cmd('health', '--format=json') - if debug: - self.log('health:\n{h}'.format(h=out)) - return json.loads(out) - - def get_mds_status(self, mds): - """ - Run cluster commands for the mds in order to get mds information - """ - out = self.raw_cluster_cmd('mds', 'dump', '--format=json') - j = json.loads(' '.join(out.splitlines()[1:])) - # collate; for dup ids, larger gid wins. 
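
# Illustrative sketch (not from the removed file): get_mon_quorum() and
# get_mon_health() above follow the same pattern -- ask the ceph CLI for
# JSON output and parse it. A minimal standalone version, assuming a working
# `ceph` binary and client keyring on the host; ceph_json() is a made-up
# helper name, not part of the removed CephManager API.
import json
import subprocess

def ceph_json(*args):
    """Run a ceph CLI command with --format=json and return the parsed result."""
    out = subprocess.check_output(['ceph'] + list(args) + ['--format=json'])
    return json.loads(out)

# e.g. quorum = ceph_json('quorum_status')['quorum']
#      health = ceph_json('health')
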
- for info in j['info'].itervalues(): - if info['name'] == mds: - return info - return None - - def get_filepath(self): - """ - Return path to osd data with {id} needing to be replaced - """ - return '/var/lib/ceph/osd/' + self.cluster + '-{id}' - - def make_admin_daemon_dir(self, remote): - """ - Create /var/run/ceph directory on remote site. - - :param ctx: Context - :param remote: Remote site - """ - remote.run(args=['sudo', - 'install', '-d', '-m0777', '--', '/var/run/ceph', ], ) - - -def utility_task(name): - """ - Generate ceph_manager subtask corresponding to ceph_manager - method name - """ - def task(ctx, config): - if config is None: - config = {} - args = config.get('args', []) - kwargs = config.get('kwargs', {}) - cluster = config.get('cluster', 'ceph') - fn = getattr(ctx.managers[cluster], name) - fn(*args, **kwargs) - return task - -revive_osd = utility_task("revive_osd") -revive_mon = utility_task("revive_mon") -kill_osd = utility_task("kill_osd") -kill_mon = utility_task("kill_mon") -create_pool = utility_task("create_pool") -remove_pool = utility_task("remove_pool") -wait_for_clean = utility_task("wait_for_clean") -flush_all_pg_stats = utility_task("flush_all_pg_stats") -set_pool_property = utility_task("set_pool_property") -do_pg_scrub = utility_task("do_pg_scrub") -wait_for_pool = utility_task("wait_for_pool") -wait_for_pools = utility_task("wait_for_pools") diff --git a/src/ceph/qa/tasks/ceph_objectstore_tool.py b/src/ceph/qa/tasks/ceph_objectstore_tool.py deleted file mode 100644 index 9125773..0000000 --- a/src/ceph/qa/tasks/ceph_objectstore_tool.py +++ /dev/null @@ -1,670 +0,0 @@ -""" -ceph_objectstore_tool - Simple test of ceph-objectstore-tool utility -""" -from cStringIO import StringIO -import contextlib -import logging -import ceph_manager -from teuthology import misc as teuthology -import time -import os -import string -from teuthology.orchestra import run -import sys -import tempfile -import json -from util.rados import (rados, create_replicated_pool, create_ec_pool) -# from util.rados import (rados, create_ec_pool, -# create_replicated_pool, -# create_cache_pool) - -log = logging.getLogger(__name__) - -# Should get cluster name "ceph" from somewhere -# and normal path from osd_data and osd_journal in conf -FSPATH = "/var/lib/ceph/osd/ceph-{id}" -JPATH = "/var/lib/ceph/osd/ceph-{id}/journal" - - -def cod_setup_local_data(log, ctx, NUM_OBJECTS, DATADIR, - BASE_NAME, DATALINECOUNT): - objects = range(1, NUM_OBJECTS + 1) - for i in objects: - NAME = BASE_NAME + "{num}".format(num=i) - LOCALNAME = os.path.join(DATADIR, NAME) - - dataline = range(DATALINECOUNT) - fd = open(LOCALNAME, "w") - data = "This is the data for " + NAME + "\n" - for _ in dataline: - fd.write(data) - fd.close() - - -def cod_setup_remote_data(log, ctx, remote, NUM_OBJECTS, DATADIR, - BASE_NAME, DATALINECOUNT): - - objects = range(1, NUM_OBJECTS + 1) - for i in objects: - NAME = BASE_NAME + "{num}".format(num=i) - DDNAME = os.path.join(DATADIR, NAME) - - remote.run(args=['rm', '-f', DDNAME]) - - dataline = range(DATALINECOUNT) - data = "This is the data for " + NAME + "\n" - DATA = "" - for _ in dataline: - DATA += data - teuthology.write_file(remote, DDNAME, DATA) - - -def cod_setup(log, ctx, remote, NUM_OBJECTS, DATADIR, - BASE_NAME, DATALINECOUNT, POOL, db, ec): - ERRORS = 0 - log.info("Creating {objs} objects in pool".format(objs=NUM_OBJECTS)) - - objects = range(1, NUM_OBJECTS + 1) - for i in objects: - NAME = BASE_NAME + "{num}".format(num=i) - DDNAME = os.path.join(DATADIR, NAME) - - 
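
# Illustrative sketch (not from the removed file): the utility_task() factory
# above turns any CephManager method name into a teuthology task by looking
# the method up with getattr() and forwarding args/kwargs taken from the
# task's config. A simplified standalone version; Manager is a stand-in
# class used only for this example.
class Manager(object):
    def wait_for_clean(self, timeout=None):
        print('waiting for clean (timeout=%s)' % timeout)

def make_utility_task(name):
    def task(manager, config=None):
        config = config or {}
        fn = getattr(manager, name)
        fn(*config.get('args', []), **config.get('kwargs', {}))
    return task

wait_for_clean = make_utility_task('wait_for_clean')
wait_for_clean(Manager(), {'kwargs': {'timeout': 300}})
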
proc = rados(ctx, remote, ['-p', POOL, 'put', NAME, DDNAME], - wait=False) - # proc = remote.run(args=['rados', '-p', POOL, 'put', NAME, DDNAME]) - ret = proc.wait() - if ret != 0: - log.critical("Rados put failed with status {ret}". - format(ret=proc.exitstatus)) - sys.exit(1) - - db[NAME] = {} - - keys = range(i) - db[NAME]["xattr"] = {} - for k in keys: - if k == 0: - continue - mykey = "key{i}-{k}".format(i=i, k=k) - myval = "val{i}-{k}".format(i=i, k=k) - proc = remote.run(args=['rados', '-p', POOL, 'setxattr', - NAME, mykey, myval]) - ret = proc.wait() - if ret != 0: - log.error("setxattr failed with {ret}".format(ret=ret)) - ERRORS += 1 - db[NAME]["xattr"][mykey] = myval - - # Erasure coded pools don't support omap - if ec: - continue - - # Create omap header in all objects but REPobject1 - if i != 1: - myhdr = "hdr{i}".format(i=i) - proc = remote.run(args=['rados', '-p', POOL, 'setomapheader', - NAME, myhdr]) - ret = proc.wait() - if ret != 0: - log.critical("setomapheader failed with {ret}".format(ret=ret)) - ERRORS += 1 - db[NAME]["omapheader"] = myhdr - - db[NAME]["omap"] = {} - for k in keys: - if k == 0: - continue - mykey = "okey{i}-{k}".format(i=i, k=k) - myval = "oval{i}-{k}".format(i=i, k=k) - proc = remote.run(args=['rados', '-p', POOL, 'setomapval', - NAME, mykey, myval]) - ret = proc.wait() - if ret != 0: - log.critical("setomapval failed with {ret}".format(ret=ret)) - db[NAME]["omap"][mykey] = myval - - return ERRORS - - -def get_lines(filename): - tmpfd = open(filename, "r") - line = True - lines = [] - while line: - line = tmpfd.readline().rstrip('\n') - if line: - lines += [line] - tmpfd.close() - os.unlink(filename) - return lines - - -@contextlib.contextmanager -def task(ctx, config): - """ - Run ceph_objectstore_tool test - - The config should be as follows:: - - ceph_objectstore_tool: - objects: 20 # <number of objects> - pgnum: 12 - """ - - if config is None: - config = {} - assert isinstance(config, dict), \ - 'ceph_objectstore_tool task only accepts a dict for configuration' - - log.info('Beginning ceph_objectstore_tool...') - - log.debug(config) - log.debug(ctx) - clients = ctx.cluster.only(teuthology.is_type('client')) - assert len(clients.remotes) > 0, 'Must specify at least 1 client' - (cli_remote, _) = clients.remotes.popitem() - log.debug(cli_remote) - - # clients = dict(teuthology.get_clients(ctx=ctx, roles=config.keys())) - # client = clients.popitem() - # log.info(client) - osds = ctx.cluster.only(teuthology.is_type('osd')) - log.info("OSDS") - log.info(osds) - log.info(osds.remotes) - - manager = ctx.managers['ceph'] - while (len(manager.get_osd_status()['up']) != - len(manager.get_osd_status()['raw'])): - time.sleep(10) - while (len(manager.get_osd_status()['in']) != - len(manager.get_osd_status()['up'])): - time.sleep(10) - manager.raw_cluster_cmd('osd', 'set', 'noout') - manager.raw_cluster_cmd('osd', 'set', 'nodown') - - PGNUM = config.get('pgnum', 12) - log.info("pgnum: {num}".format(num=PGNUM)) - - ERRORS = 0 - - REP_POOL = "rep_pool" - REP_NAME = "REPobject" - create_replicated_pool(cli_remote, REP_POOL, PGNUM) - ERRORS += test_objectstore(ctx, config, cli_remote, REP_POOL, REP_NAME) - - EC_POOL = "ec_pool" - EC_NAME = "ECobject" - create_ec_pool(cli_remote, EC_POOL, 'default', PGNUM) - ERRORS += test_objectstore(ctx, config, cli_remote, - EC_POOL, EC_NAME, ec=True) - - if ERRORS == 0: - log.info("TEST PASSED") - else: - log.error("TEST FAILED WITH {errcount} ERRORS".format(errcount=ERRORS)) - - assert ERRORS == 0 - - try: - yield - finally: 
- log.info('Ending ceph_objectstore_tool') - - -def test_objectstore(ctx, config, cli_remote, REP_POOL, REP_NAME, ec=False): - manager = ctx.managers['ceph'] - - osds = ctx.cluster.only(teuthology.is_type('osd')) - - TEUTHDIR = teuthology.get_testdir(ctx) - DATADIR = os.path.join(TEUTHDIR, "ceph.data") - DATALINECOUNT = 10000 - ERRORS = 0 - NUM_OBJECTS = config.get('objects', 10) - log.info("objects: {num}".format(num=NUM_OBJECTS)) - - pool_dump = manager.get_pool_dump(REP_POOL) - REPID = pool_dump['pool'] - - log.debug("repid={num}".format(num=REPID)) - - db = {} - - LOCALDIR = tempfile.mkdtemp("cod") - - cod_setup_local_data(log, ctx, NUM_OBJECTS, LOCALDIR, - REP_NAME, DATALINECOUNT) - allremote = [] - allremote.append(cli_remote) - allremote += osds.remotes.keys() - allremote = list(set(allremote)) - for remote in allremote: - cod_setup_remote_data(log, ctx, remote, NUM_OBJECTS, DATADIR, - REP_NAME, DATALINECOUNT) - - ERRORS += cod_setup(log, ctx, cli_remote, NUM_OBJECTS, DATADIR, - REP_NAME, DATALINECOUNT, REP_POOL, db, ec) - - pgs = {} - for stats in manager.get_pg_stats(): - if stats["pgid"].find(str(REPID) + ".") != 0: - continue - if pool_dump["type"] == ceph_manager.CephManager.REPLICATED_POOL: - for osd in stats["acting"]: - pgs.setdefault(osd, []).append(stats["pgid"]) - elif pool_dump["type"] == ceph_manager.CephManager.ERASURE_CODED_POOL: - shard = 0 - for osd in stats["acting"]: - pgs.setdefault(osd, []).append("{pgid}s{shard}". - format(pgid=stats["pgid"], - shard=shard)) - shard += 1 - else: - raise Exception("{pool} has an unexpected type {type}". - format(pool=REP_POOL, type=pool_dump["type"])) - - log.info(pgs) - log.info(db) - - for osd in manager.get_osd_status()['up']: - manager.kill_osd(osd) - time.sleep(5) - - pgswithobjects = set() - objsinpg = {} - - # Test --op list and generate json for all objects - log.info("Test --op list by generating json for all objects") - prefix = ("sudo ceph-objectstore-tool " - "--data-path {fpath} " - "--journal-path {jpath} ").format(fpath=FSPATH, jpath=JPATH) - for remote in osds.remotes.iterkeys(): - log.debug(remote) - log.debug(osds.remotes[remote]) - for role in osds.remotes[remote]: - if string.find(role, "osd.") != 0: - continue - osdid = int(role.split('.')[1]) - log.info("process osd.{id} on {remote}". - format(id=osdid, remote=remote)) - cmd = (prefix + "--op list").format(id=osdid) - proc = remote.run(args=cmd.split(), check_status=False, - stdout=StringIO()) - if proc.exitstatus != 0: - log.error("Bad exit status {ret} from --op list request". - format(ret=proc.exitstatus)) - ERRORS += 1 - else: - for pgline in proc.stdout.getvalue().splitlines(): - if not pgline: - continue - (pg, obj) = json.loads(pgline) - name = obj['oid'] - if name in db: - pgswithobjects.add(pg) - objsinpg.setdefault(pg, []).append(name) - db[name].setdefault("pg2json", - {})[pg] = json.dumps(obj) - - log.info(db) - log.info(pgswithobjects) - log.info(objsinpg) - - if pool_dump["type"] == ceph_manager.CephManager.REPLICATED_POOL: - # Test get-bytes - log.info("Test get-bytes and set-bytes") - for basename in db.keys(): - file = os.path.join(DATADIR, basename) - GETNAME = os.path.join(DATADIR, "get") - SETNAME = os.path.join(DATADIR, "set") - - for remote in osds.remotes.iterkeys(): - for role in osds.remotes[remote]: - if string.find(role, "osd.") != 0: - continue - osdid = int(role.split('.')[1]) - if osdid not in pgs: - continue - - for pg, JSON in db[basename]["pg2json"].iteritems(): - if pg in pgs[osdid]: - cmd = ((prefix + "--pgid {pg}"). 
- format(id=osdid, pg=pg).split()) - cmd.append(run.Raw("'{json}'".format(json=JSON))) - cmd += ("get-bytes {fname}". - format(fname=GETNAME).split()) - proc = remote.run(args=cmd, check_status=False) - if proc.exitstatus != 0: - remote.run(args="rm -f {getfile}". - format(getfile=GETNAME).split()) - log.error("Bad exit status {ret}". - format(ret=proc.exitstatus)) - ERRORS += 1 - continue - cmd = ("diff -q {file} {getfile}". - format(file=file, getfile=GETNAME)) - proc = remote.run(args=cmd.split()) - if proc.exitstatus != 0: - log.error("Data from get-bytes differ") - # log.debug("Got:") - # cat_file(logging.DEBUG, GETNAME) - # log.debug("Expected:") - # cat_file(logging.DEBUG, file) - ERRORS += 1 - remote.run(args="rm -f {getfile}". - format(getfile=GETNAME).split()) - - data = ("put-bytes going into {file}\n". - format(file=file)) - teuthology.write_file(remote, SETNAME, data) - cmd = ((prefix + "--pgid {pg}"). - format(id=osdid, pg=pg).split()) - cmd.append(run.Raw("'{json}'".format(json=JSON))) - cmd += ("set-bytes {fname}". - format(fname=SETNAME).split()) - proc = remote.run(args=cmd, check_status=False) - proc.wait() - if proc.exitstatus != 0: - log.info("set-bytes failed for object {obj} " - "in pg {pg} osd.{id} ret={ret}". - format(obj=basename, pg=pg, - id=osdid, ret=proc.exitstatus)) - ERRORS += 1 - - cmd = ((prefix + "--pgid {pg}"). - format(id=osdid, pg=pg).split()) - cmd.append(run.Raw("'{json}'".format(json=JSON))) - cmd += "get-bytes -".split() - proc = remote.run(args=cmd, check_status=False, - stdout=StringIO()) - proc.wait() - if proc.exitstatus != 0: - log.error("get-bytes after " - "set-bytes ret={ret}". - format(ret=proc.exitstatus)) - ERRORS += 1 - else: - if data != proc.stdout.getvalue(): - log.error("Data inconsistent after " - "set-bytes, got:") - log.error(proc.stdout.getvalue()) - ERRORS += 1 - - cmd = ((prefix + "--pgid {pg}"). - format(id=osdid, pg=pg).split()) - cmd.append(run.Raw("'{json}'".format(json=JSON))) - cmd += ("set-bytes {fname}". - format(fname=file).split()) - proc = remote.run(args=cmd, check_status=False) - proc.wait() - if proc.exitstatus != 0: - log.info("set-bytes failed for object {obj} " - "in pg {pg} osd.{id} ret={ret}". - format(obj=basename, pg=pg, - id=osdid, ret=proc.exitstatus)) - ERRORS += 1 - - log.info("Test list-attrs get-attr") - for basename in db.keys(): - file = os.path.join(DATADIR, basename) - GETNAME = os.path.join(DATADIR, "get") - SETNAME = os.path.join(DATADIR, "set") - - for remote in osds.remotes.iterkeys(): - for role in osds.remotes[remote]: - if string.find(role, "osd.") != 0: - continue - osdid = int(role.split('.')[1]) - if osdid not in pgs: - continue - - for pg, JSON in db[basename]["pg2json"].iteritems(): - if pg in pgs[osdid]: - cmd = ((prefix + "--pgid {pg}"). - format(id=osdid, pg=pg).split()) - cmd.append(run.Raw("'{json}'".format(json=JSON))) - cmd += ["list-attrs"] - proc = remote.run(args=cmd, check_status=False, - stdout=StringIO(), stderr=StringIO()) - proc.wait() - if proc.exitstatus != 0: - log.error("Bad exit status {ret}". - format(ret=proc.exitstatus)) - ERRORS += 1 - continue - keys = proc.stdout.getvalue().split() - values = dict(db[basename]["xattr"]) - - for key in keys: - if (key == "_" or - key == "snapset" or - key == "hinfo_key"): - continue - key = key.strip("_") - if key not in values: - log.error("The key {key} should be present". - format(key=key)) - ERRORS += 1 - continue - exp = values.pop(key) - cmd = ((prefix + "--pgid {pg}"). 
- format(id=osdid, pg=pg).split()) - cmd.append(run.Raw("'{json}'".format(json=JSON))) - cmd += ("get-attr {key}". - format(key="_" + key).split()) - proc = remote.run(args=cmd, check_status=False, - stdout=StringIO()) - proc.wait() - if proc.exitstatus != 0: - log.error("get-attr failed with {ret}". - format(ret=proc.exitstatus)) - ERRORS += 1 - continue - val = proc.stdout.getvalue() - if exp != val: - log.error("For key {key} got value {got} " - "instead of {expected}". - format(key=key, got=val, - expected=exp)) - ERRORS += 1 - if "hinfo_key" in keys: - cmd_prefix = prefix.format(id=osdid) - cmd = """ - expected=$({prefix} --pgid {pg} '{json}' get-attr {key} | base64) - echo placeholder | {prefix} --pgid {pg} '{json}' set-attr {key} - - test $({prefix} --pgid {pg} '{json}' get-attr {key}) = placeholder - echo $expected | base64 --decode | \ - {prefix} --pgid {pg} '{json}' set-attr {key} - - test $({prefix} --pgid {pg} '{json}' get-attr {key} | base64) = $expected - """.format(prefix=cmd_prefix, pg=pg, json=JSON, - key="hinfo_key") - log.debug(cmd) - proc = remote.run(args=['bash', '-e', '-x', - '-c', cmd], - check_status=False, - stdout=StringIO(), - stderr=StringIO()) - proc.wait() - if proc.exitstatus != 0: - log.error("failed with " + - str(proc.exitstatus)) - log.error(proc.stdout.getvalue() + " " + - proc.stderr.getvalue()) - ERRORS += 1 - - if len(values) != 0: - log.error("Not all keys found, remaining keys:") - log.error(values) - - log.info("Test pg info") - for remote in osds.remotes.iterkeys(): - for role in osds.remotes[remote]: - if string.find(role, "osd.") != 0: - continue - osdid = int(role.split('.')[1]) - if osdid not in pgs: - continue - - for pg in pgs[osdid]: - cmd = ((prefix + "--op info --pgid {pg}"). - format(id=osdid, pg=pg).split()) - proc = remote.run(args=cmd, check_status=False, - stdout=StringIO()) - proc.wait() - if proc.exitstatus != 0: - log.error("Failure of --op info command with {ret}". - format(proc.exitstatus)) - ERRORS += 1 - continue - info = proc.stdout.getvalue() - if not str(pg) in info: - log.error("Bad data from info: {info}".format(info=info)) - ERRORS += 1 - - log.info("Test pg logging") - for remote in osds.remotes.iterkeys(): - for role in osds.remotes[remote]: - if string.find(role, "osd.") != 0: - continue - osdid = int(role.split('.')[1]) - if osdid not in pgs: - continue - - for pg in pgs[osdid]: - cmd = ((prefix + "--op log --pgid {pg}"). - format(id=osdid, pg=pg).split()) - proc = remote.run(args=cmd, check_status=False, - stdout=StringIO()) - proc.wait() - if proc.exitstatus != 0: - log.error("Getting log failed for pg {pg} " - "from osd.{id} with {ret}". - format(pg=pg, id=osdid, ret=proc.exitstatus)) - ERRORS += 1 - continue - HASOBJ = pg in pgswithobjects - MODOBJ = "modify" in proc.stdout.getvalue() - if HASOBJ != MODOBJ: - log.error("Bad log for pg {pg} from osd.{id}". - format(pg=pg, id=osdid)) - MSG = (HASOBJ and [""] or ["NOT "])[0] - log.error("Log should {msg}have a modify entry". - format(msg=MSG)) - ERRORS += 1 - - log.info("Test pg export") - EXP_ERRORS = 0 - for remote in osds.remotes.iterkeys(): - for role in osds.remotes[remote]: - if string.find(role, "osd.") != 0: - continue - osdid = int(role.split('.')[1]) - if osdid not in pgs: - continue - - for pg in pgs[osdid]: - fpath = os.path.join(DATADIR, "osd{id}.{pg}". - format(id=osdid, pg=pg)) - - cmd = ((prefix + "--op export --pgid {pg} --file {file}"). 
- format(id=osdid, pg=pg, file=fpath)) - proc = remote.run(args=cmd, check_status=False, - stdout=StringIO()) - proc.wait() - if proc.exitstatus != 0: - log.error("Exporting failed for pg {pg} " - "on osd.{id} with {ret}". - format(pg=pg, id=osdid, ret=proc.exitstatus)) - EXP_ERRORS += 1 - - ERRORS += EXP_ERRORS - - log.info("Test pg removal") - RM_ERRORS = 0 - for remote in osds.remotes.iterkeys(): - for role in osds.remotes[remote]: - if string.find(role, "osd.") != 0: - continue - osdid = int(role.split('.')[1]) - if osdid not in pgs: - continue - - for pg in pgs[osdid]: - cmd = ((prefix + "--force --op remove --pgid {pg}"). - format(pg=pg, id=osdid)) - proc = remote.run(args=cmd, check_status=False, - stdout=StringIO()) - proc.wait() - if proc.exitstatus != 0: - log.error("Removing failed for pg {pg} " - "on osd.{id} with {ret}". - format(pg=pg, id=osdid, ret=proc.exitstatus)) - RM_ERRORS += 1 - - ERRORS += RM_ERRORS - - IMP_ERRORS = 0 - if EXP_ERRORS == 0 and RM_ERRORS == 0: - log.info("Test pg import") - - for remote in osds.remotes.iterkeys(): - for role in osds.remotes[remote]: - if string.find(role, "osd.") != 0: - continue - osdid = int(role.split('.')[1]) - if osdid not in pgs: - continue - - for pg in pgs[osdid]: - fpath = os.path.join(DATADIR, "osd{id}.{pg}". - format(id=osdid, pg=pg)) - - cmd = ((prefix + "--op import --file {file}"). - format(id=osdid, file=fpath)) - proc = remote.run(args=cmd, check_status=False, - stdout=StringIO()) - proc.wait() - if proc.exitstatus != 0: - log.error("Import failed from {file} with {ret}". - format(file=fpath, ret=proc.exitstatus)) - IMP_ERRORS += 1 - else: - log.warning("SKIPPING IMPORT TESTS DUE TO PREVIOUS FAILURES") - - ERRORS += IMP_ERRORS - - if EXP_ERRORS == 0 and RM_ERRORS == 0 and IMP_ERRORS == 0: - log.info("Restarting OSDs....") - # They are still look to be up because of setting nodown - for osd in manager.get_osd_status()['up']: - manager.revive_osd(osd) - # Wait for health? - time.sleep(5) - # Let scrub after test runs verify consistency of all copies - log.info("Verify replicated import data") - objects = range(1, NUM_OBJECTS + 1) - for i in objects: - NAME = REP_NAME + "{num}".format(num=i) - TESTNAME = os.path.join(DATADIR, "gettest") - REFNAME = os.path.join(DATADIR, NAME) - - proc = rados(ctx, cli_remote, - ['-p', REP_POOL, 'get', NAME, TESTNAME], wait=False) - - ret = proc.wait() - if ret != 0: - log.error("After import, rados get failed with {ret}". - format(ret=proc.exitstatus)) - ERRORS += 1 - continue - - cmd = "diff -q {gettest} {ref}".format(gettest=TESTNAME, - ref=REFNAME) - proc = cli_remote.run(args=cmd, check_status=False) - proc.wait() - if proc.exitstatus != 0: - log.error("Data comparison failed for {obj}".format(obj=NAME)) - ERRORS += 1 - - return ERRORS diff --git a/src/ceph/qa/tasks/ceph_test_case.py b/src/ceph/qa/tasks/ceph_test_case.py deleted file mode 100644 index 5767df4..0000000 --- a/src/ceph/qa/tasks/ceph_test_case.py +++ /dev/null @@ -1,150 +0,0 @@ - -import unittest -import time -import logging - -from teuthology.orchestra.run import CommandFailedError - -log = logging.getLogger(__name__) - - -class CephTestCase(unittest.TestCase): - """ - For test tasks that want to define a structured set of - tests implemented in python. Subclass this with appropriate - helpers for the subsystem you're testing. 
- """ - - # Environment references - mounts = None - fs = None - recovery_fs = None - ceph_cluster = None - mds_cluster = None - mgr_cluster = None - ctx = None - - mon_manager = None - - def setUp(self): - self.ceph_cluster.mon_manager.raw_cluster_cmd("log", - "Starting test {0}".format(self.id())) - - def tearDown(self): - self.ceph_cluster.mon_manager.raw_cluster_cmd("log", - "Ended test {0}".format(self.id())) - - def assert_cluster_log(self, expected_pattern, invert_match=False, timeout=10): - """ - Context manager. Assert that during execution, or up to 5 seconds later, - the Ceph cluster log emits a message matching the expected pattern. - - :param expected_pattern: a string that you expect to see in the log output - """ - - ceph_manager = self.ceph_cluster.mon_manager - - class ContextManager(object): - def match(self): - found = expected_pattern in self.watcher_process.stdout.getvalue() - if invert_match: - return not found - - return found - - def __enter__(self): - self.watcher_process = ceph_manager.run_ceph_w() - - def __exit__(self, exc_type, exc_val, exc_tb): - if not self.watcher_process.finished: - # Check if we got an early match, wait a bit if we didn't - if self.match(): - return - else: - log.debug("No log hits yet, waiting...") - # Default monc tick interval is 10s, so wait that long and - # then some grace - time.sleep(5 + timeout) - - self.watcher_process.stdin.close() - try: - self.watcher_process.wait() - except CommandFailedError: - pass - - if not self.match(): - log.error("Log output: \n{0}\n".format(self.watcher_process.stdout.getvalue())) - raise AssertionError("Expected log message not found: '{0}'".format(expected_pattern)) - - return ContextManager() - - def wait_for_health(self, pattern, timeout): - """ - Wait until 'ceph health' contains messages matching the pattern - """ - def seen_health_warning(): - health = self.ceph_cluster.mon_manager.get_mon_health() - codes = [s for s in health['checks']] - summary_strings = [s[1]['summary']['message'] for s in health['checks'].iteritems()] - if len(summary_strings) == 0: - log.debug("Not expected number of summary strings ({0})".format(summary_strings)) - return False - else: - for ss in summary_strings: - if pattern in ss: - return True - if pattern in codes: - return True - - log.debug("Not found expected summary strings yet ({0})".format(summary_strings)) - return False - - self.wait_until_true(seen_health_warning, timeout) - - def wait_for_health_clear(self, timeout): - """ - Wait until `ceph health` returns no messages - """ - def is_clear(): - health = self.ceph_cluster.mon_manager.get_mon_health() - return len(health['checks']) == 0 - - self.wait_until_true(is_clear, timeout) - - def wait_until_equal(self, get_fn, expect_val, timeout, reject_fn=None): - period = 5 - elapsed = 0 - while True: - val = get_fn() - if val == expect_val: - return - elif reject_fn and reject_fn(val): - raise RuntimeError("wait_until_equal: forbidden value {0} seen".format(val)) - else: - if elapsed >= timeout: - raise RuntimeError("Timed out after {0} seconds waiting for {1} (currently {2})".format( - elapsed, expect_val, val - )) - else: - log.debug("wait_until_equal: {0} != {1}, waiting...".format(val, expect_val)) - time.sleep(period) - elapsed += period - - log.debug("wait_until_equal: success") - - def wait_until_true(self, condition, timeout): - period = 5 - elapsed = 0 - while True: - if condition(): - log.debug("wait_until_true: success in {0}s".format(elapsed)) - return - else: - if elapsed >= timeout: - raise 
RuntimeError("Timed out after {0}s".format(elapsed)) - else: - log.debug("wait_until_true: waiting...") - time.sleep(period) - elapsed += period - - diff --git a/src/ceph/qa/tasks/cephfs/__init__.py b/src/ceph/qa/tasks/cephfs/__init__.py deleted file mode 100644 index e69de29..0000000 --- a/src/ceph/qa/tasks/cephfs/__init__.py +++ /dev/null diff --git a/src/ceph/qa/tasks/cephfs/cephfs_test_case.py b/src/ceph/qa/tasks/cephfs/cephfs_test_case.py deleted file mode 100644 index 801d0d3..0000000 --- a/src/ceph/qa/tasks/cephfs/cephfs_test_case.py +++ /dev/null @@ -1,315 +0,0 @@ -import json -import logging -from unittest import case -from tasks.ceph_test_case import CephTestCase -import os -import re -from StringIO import StringIO - -from tasks.cephfs.fuse_mount import FuseMount - -from teuthology.orchestra import run -from teuthology.orchestra.run import CommandFailedError - - -log = logging.getLogger(__name__) - - -def for_teuthology(f): - """ - Decorator that adds an "is_for_teuthology" attribute to the wrapped function - """ - f.is_for_teuthology = True - return f - - -def needs_trimming(f): - """ - Mark fn as requiring a client capable of trimming its cache (i.e. for ceph-fuse - this means it needs to be able to run as root, currently) - """ - f.needs_trimming = True - return f - - -class CephFSTestCase(CephTestCase): - """ - Test case for Ceph FS, requires caller to populate Filesystem and Mounts, - into the fs, mount_a, mount_b class attributes (setting mount_b is optional) - - Handles resetting the cluster under test between tests. - """ - - # FIXME weird explicit naming - mount_a = None - mount_b = None - recovery_mount = None - - # Declarative test requirements: subclasses should override these to indicate - # their special needs. If not met, tests will be skipped. - CLIENTS_REQUIRED = 1 - MDSS_REQUIRED = 1 - REQUIRE_KCLIENT_REMOTE = False - REQUIRE_ONE_CLIENT_REMOTE = False - REQUIRE_MEMSTORE = False - - # Whether to create the default filesystem during setUp - REQUIRE_FILESYSTEM = True - - # requires REQUIRE_FILESYSTEM = True - REQUIRE_RECOVERY_FILESYSTEM = False - - LOAD_SETTINGS = [] - - def setUp(self): - super(CephFSTestCase, self).setUp() - - if len(self.mds_cluster.mds_ids) < self.MDSS_REQUIRED: - raise case.SkipTest("Only have {0} MDSs, require {1}".format( - len(self.mds_cluster.mds_ids), self.MDSS_REQUIRED - )) - - if len(self.mounts) < self.CLIENTS_REQUIRED: - raise case.SkipTest("Only have {0} clients, require {1}".format( - len(self.mounts), self.CLIENTS_REQUIRED - )) - - if self.REQUIRE_KCLIENT_REMOTE: - if not isinstance(self.mounts[0], FuseMount) or not isinstance(self.mounts[1], FuseMount): - # kclient kill() power cycles nodes, so requires clients to each be on - # their own node - if self.mounts[0].client_remote.hostname == self.mounts[1].client_remote.hostname: - raise case.SkipTest("kclient clients must be on separate nodes") - - if self.REQUIRE_ONE_CLIENT_REMOTE: - if self.mounts[0].client_remote.hostname in self.mds_cluster.get_mds_hostnames(): - raise case.SkipTest("Require first client to be on separate server from MDSs") - - if self.REQUIRE_MEMSTORE: - objectstore = self.mds_cluster.get_config("osd_objectstore", "osd") - if objectstore != "memstore": - # You certainly *could* run this on a real OSD, but you don't want to sit - # here for hours waiting for the test to fill up a 1TB drive! 
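
# Illustrative sketch (not from the removed file): for_teuthology() and
# needs_trimming() above only attach marker attributes to test functions;
# a test runner can then filter on those markers. filter_tests() below is a
# made-up helper showing how such a marker is typically consumed.
def for_teuthology(f):
    f.is_for_teuthology = True
    return f

@for_teuthology
def test_long_running():
    pass

def filter_tests(tests, include_teuthology=False):
    return [t for t in tests
            if include_teuthology or not getattr(t, 'is_for_teuthology', False)]

print(filter_tests([test_long_running]))         # []
print(filter_tests([test_long_running], True))   # [<function test_long_running ...>]
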
- raise case.SkipTest("Require `memstore` OSD backend to simulate full drives") - - # Create friendly mount_a, mount_b attrs - for i in range(0, self.CLIENTS_REQUIRED): - setattr(self, "mount_{0}".format(chr(ord('a') + i)), self.mounts[i]) - - self.mds_cluster.clear_firewall() - - # Unmount all clients, we are about to blow away the filesystem - for mount in self.mounts: - if mount.is_mounted(): - mount.umount_wait(force=True) - - # To avoid any issues with e.g. unlink bugs, we destroy and recreate - # the filesystem rather than just doing a rm -rf of files - self.mds_cluster.mds_stop() - self.mds_cluster.mds_fail() - self.mds_cluster.delete_all_filesystems() - self.fs = None # is now invalid! - self.recovery_fs = None - - # In case the previous filesystem had filled up the RADOS cluster, wait for that - # flag to pass. - osd_mon_report_interval_max = int(self.mds_cluster.get_config("osd_mon_report_interval_max", service_type='osd')) - self.wait_until_true(lambda: not self.mds_cluster.is_full(), - timeout=osd_mon_report_interval_max * 5) - - # In case anything is in the OSD blacklist list, clear it out. This is to avoid - # the OSD map changing in the background (due to blacklist expiry) while tests run. - try: - self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blacklist", "clear") - except CommandFailedError: - # Fallback for older Ceph cluster - blacklist = json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd("osd", - "dump", "--format=json-pretty"))['blacklist'] - log.info("Removing {0} blacklist entries".format(len(blacklist))) - for addr, blacklisted_at in blacklist.items(): - self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blacklist", "rm", addr) - - client_mount_ids = [m.client_id for m in self.mounts] - # In case the test changes the IDs of clients, stash them so that we can - # reset in tearDown - self._original_client_ids = client_mount_ids - log.info(client_mount_ids) - - # In case there were any extra auth identities around from a previous - # test, delete them - for entry in self.auth_list(): - ent_type, ent_id = entry['entity'].split(".") - if ent_type == "client" and ent_id not in client_mount_ids and ent_id != "admin": - self.mds_cluster.mon_manager.raw_cluster_cmd("auth", "del", entry['entity']) - - if self.REQUIRE_FILESYSTEM: - self.fs = self.mds_cluster.newfs(create=True) - self.fs.mds_restart() - - # In case some test messed with auth caps, reset them - for client_id in client_mount_ids: - self.mds_cluster.mon_manager.raw_cluster_cmd_result( - 'auth', 'caps', "client.{0}".format(client_id), - 'mds', 'allow', - 'mon', 'allow r', - 'osd', 'allow rw pool={0}'.format(self.fs.get_data_pool_name())) - - # wait for mds restart to complete... 
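
# Illustrative sketch (not from the removed file): setUp() above clears the
# OSD blacklist by trying the newer `osd blacklist clear` command first and,
# on older clusters, falling back to removing the entries listed by
# `osd dump`. The same shape as a standalone helper; run_ceph() is a made-up
# thin subprocess wrapper and assumes a reachable cluster.
import json
import subprocess

def run_ceph(*args):
    return subprocess.check_output(['ceph'] + list(args))

def clear_osd_blacklist():
    try:
        run_ceph('osd', 'blacklist', 'clear')
    except subprocess.CalledProcessError:
        dump = json.loads(run_ceph('osd', 'dump', '--format=json-pretty'))
        for addr in dump.get('blacklist', {}):
            run_ceph('osd', 'blacklist', 'rm', addr)
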
- self.fs.wait_for_daemons() - - # Mount the requested number of clients - for i in range(0, self.CLIENTS_REQUIRED): - self.mounts[i].mount() - self.mounts[i].wait_until_mounted() - - if self.REQUIRE_RECOVERY_FILESYSTEM: - if not self.REQUIRE_FILESYSTEM: - raise case.SkipTest("Recovery filesystem requires a primary filesystem as well") - self.fs.mon_manager.raw_cluster_cmd('fs', 'flag', 'set', - 'enable_multiple', 'true', - '--yes-i-really-mean-it') - self.recovery_fs = self.mds_cluster.newfs(name="recovery_fs", create=False) - self.recovery_fs.set_metadata_overlay(True) - self.recovery_fs.set_data_pool_name(self.fs.get_data_pool_name()) - self.recovery_fs.create() - self.recovery_fs.getinfo(refresh=True) - self.recovery_fs.mds_restart() - self.recovery_fs.wait_for_daemons() - - # Load an config settings of interest - for setting in self.LOAD_SETTINGS: - setattr(self, setting, float(self.fs.mds_asok( - ['config', 'get', setting], self.mds_cluster.mds_ids[0] - )[setting])) - - self.configs_set = set() - - def tearDown(self): - super(CephFSTestCase, self).tearDown() - - self.mds_cluster.clear_firewall() - for m in self.mounts: - m.teardown() - - for i, m in enumerate(self.mounts): - m.client_id = self._original_client_ids[i] - - for subsys, key in self.configs_set: - self.mds_cluster.clear_ceph_conf(subsys, key) - - def set_conf(self, subsys, key, value): - self.configs_set.add((subsys, key)) - self.mds_cluster.set_ceph_conf(subsys, key, value) - - def auth_list(self): - """ - Convenience wrapper on "ceph auth ls" - """ - return json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd( - "auth", "ls", "--format=json-pretty" - ))['auth_dump'] - - def assert_session_count(self, expected, ls_data=None, mds_id=None): - if ls_data is None: - ls_data = self.fs.mds_asok(['session', 'ls'], mds_id=mds_id) - - alive_count = len([s for s in ls_data if s['state'] != 'killing']) - - self.assertEqual(expected, alive_count, "Expected {0} sessions, found {1}".format( - expected, alive_count - )) - - def assert_session_state(self, client_id, expected_state): - self.assertEqual( - self._session_by_id( - self.fs.mds_asok(['session', 'ls'])).get(client_id, {'state': None})['state'], - expected_state) - - def get_session_data(self, client_id): - return self._session_by_id(client_id) - - def _session_list(self): - ls_data = self.fs.mds_asok(['session', 'ls']) - ls_data = [s for s in ls_data if s['state'] not in ['stale', 'closed']] - return ls_data - - def get_session(self, client_id, session_ls=None): - if session_ls is None: - session_ls = self.fs.mds_asok(['session', 'ls']) - - return self._session_by_id(session_ls)[client_id] - - def _session_by_id(self, session_ls): - return dict([(s['id'], s) for s in session_ls]) - - def wait_for_daemon_start(self, daemon_ids=None): - """ - Wait until all the daemons appear in the FSMap, either assigned - MDS ranks or in the list of standbys - """ - def get_daemon_names(): - return [info['name'] for info in self.mds_cluster.status().get_all()] - - if daemon_ids is None: - daemon_ids = self.mds_cluster.mds_ids - - try: - self.wait_until_true( - lambda: set(daemon_ids) & set(get_daemon_names()) == set(daemon_ids), - timeout=30 - ) - except RuntimeError: - log.warn("Timeout waiting for daemons {0}, while we have {1}".format( - daemon_ids, get_daemon_names() - )) - raise - - def assert_mds_crash(self, daemon_id): - """ - Assert that the a particular MDS daemon crashes (block until - it does) - """ - try: - self.mds_cluster.mds_daemons[daemon_id].proc.wait() - except 
CommandFailedError as e: - log.info("MDS '{0}' crashed with status {1} as expected".format(daemon_id, e.exitstatus)) - self.mds_cluster.mds_daemons[daemon_id].proc = None - - # Go remove the coredump from the crash, otherwise teuthology.internal.coredump will - # catch it later and treat it as a failure. - p = self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[ - "sudo", "sysctl", "-n", "kernel.core_pattern"], stdout=StringIO()) - core_pattern = p.stdout.getvalue().strip() - if os.path.dirname(core_pattern): # Non-default core_pattern with a directory in it - # We have seen a core_pattern that looks like it's from teuthology's coredump - # task, so proceed to clear out the core file - log.info("Clearing core from pattern: {0}".format(core_pattern)) - - # Determine the PID of the crashed MDS by inspecting the MDSMap, it had - # to talk to the mons to get assigned a rank to reach the point of crashing - addr = self.mds_cluster.mon_manager.get_mds_status(daemon_id)['addr'] - pid_str = addr.split("/")[1] - log.info("Determined crasher PID was {0}".format(pid_str)) - - # Substitute PID into core_pattern to get a glob - core_glob = core_pattern.replace("%p", pid_str) - core_glob = re.sub("%[a-z]", "*", core_glob) # Match all for all other % tokens - - # Verify that we see the expected single coredump matching the expected pattern - ls_proc = self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[ - "sudo", "ls", run.Raw(core_glob) - ], stdout=StringIO()) - cores = [f for f in ls_proc.stdout.getvalue().strip().split("\n") if f] - log.info("Enumerated cores: {0}".format(cores)) - self.assertEqual(len(cores), 1) - - log.info("Found core file {0}, deleting it".format(cores[0])) - - self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[ - "sudo", "rm", "-f", cores[0] - ]) - else: - log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)") - - else: - raise AssertionError("MDS daemon '{0}' did not crash as expected".format(daemon_id)) diff --git a/src/ceph/qa/tasks/cephfs/filesystem.py b/src/ceph/qa/tasks/cephfs/filesystem.py deleted file mode 100644 index 9638fd5..0000000 --- a/src/ceph/qa/tasks/cephfs/filesystem.py +++ /dev/null @@ -1,1213 +0,0 @@ - -from StringIO import StringIO -import json -import logging -from gevent import Greenlet -import os -import time -import datetime -import re -import errno -import random - -from teuthology.exceptions import CommandFailedError -from teuthology import misc -from teuthology.nuke import clear_firewall -from teuthology.parallel import parallel -from tasks.ceph_manager import write_conf -from tasks import ceph_manager - - -log = logging.getLogger(__name__) - - -DAEMON_WAIT_TIMEOUT = 120 -ROOT_INO = 1 - - -class ObjectNotFound(Exception): - def __init__(self, object_name): - self._object_name = object_name - - def __str__(self): - return "Object not found: '{0}'".format(self._object_name) - -class FSStatus(object): - """ - Operations on a snapshot of the FSMap. - """ - def __init__(self, mon_manager): - self.mon = mon_manager - self.map = json.loads(self.mon.raw_cluster_cmd("fs", "dump", "--format=json")) - - def __str__(self): - return json.dumps(self.map, indent = 2, sort_keys = True) - - # Expose the fsmap for manual inspection. - def __getitem__(self, key): - """ - Get a field from the fsmap. - """ - return self.map[key] - - def get_filesystems(self): - """ - Iterator for all filesystems. 
- """ - for fs in self.map['filesystems']: - yield fs - - def get_all(self): - """ - Iterator for all the mds_info components in the FSMap. - """ - for info in self.get_standbys(): - yield info - for fs in self.map['filesystems']: - for info in fs['mdsmap']['info'].values(): - yield info - - def get_standbys(self): - """ - Iterator for all standbys. - """ - for info in self.map['standbys']: - yield info - - def get_fsmap(self, fscid): - """ - Get the fsmap for the given FSCID. - """ - for fs in self.map['filesystems']: - if fscid is None or fs['id'] == fscid: - return fs - raise RuntimeError("FSCID {0} not in map".format(fscid)) - - def get_fsmap_byname(self, name): - """ - Get the fsmap for the given file system name. - """ - for fs in self.map['filesystems']: - if name is None or fs['mdsmap']['fs_name'] == name: - return fs - raise RuntimeError("FS {0} not in map".format(name)) - - def get_replays(self, fscid): - """ - Get the standby:replay MDS for the given FSCID. - """ - fs = self.get_fsmap(fscid) - for info in fs['mdsmap']['info'].values(): - if info['state'] == 'up:standby-replay': - yield info - - def get_ranks(self, fscid): - """ - Get the ranks for the given FSCID. - """ - fs = self.get_fsmap(fscid) - for info in fs['mdsmap']['info'].values(): - if info['rank'] >= 0: - yield info - - def get_rank(self, fscid, rank): - """ - Get the rank for the given FSCID. - """ - for info in self.get_ranks(fscid): - if info['rank'] == rank: - return info - raise RuntimeError("FSCID {0} has no rank {1}".format(fscid, rank)) - - def get_mds(self, name): - """ - Get the info for the given MDS name. - """ - for info in self.get_all(): - if info['name'] == name: - return info - return None - - def get_mds_addr(self, name): - """ - Return the instance addr as a string, like "10.214.133.138:6807\/10825" - """ - info = self.get_mds(name) - if info: - return info['addr'] - else: - log.warn(json.dumps(list(self.get_all()), indent=2)) # dump for debugging - raise RuntimeError("MDS id '{0}' not found in map".format(name)) - -class CephCluster(object): - @property - def admin_remote(self): - first_mon = misc.get_first_mon(self._ctx, None) - (result,) = self._ctx.cluster.only(first_mon).remotes.iterkeys() - return result - - def __init__(self, ctx): - self._ctx = ctx - self.mon_manager = ceph_manager.CephManager(self.admin_remote, ctx=ctx, logger=log.getChild('ceph_manager')) - - def get_config(self, key, service_type=None): - """ - Get config from mon by default, or a specific service if caller asks for it - """ - if service_type is None: - service_type = 'mon' - - service_id = sorted(misc.all_roles_of_type(self._ctx.cluster, service_type))[0] - return self.json_asok(['config', 'get', key], service_type, service_id)[key] - - def set_ceph_conf(self, subsys, key, value): - if subsys not in self._ctx.ceph['ceph'].conf: - self._ctx.ceph['ceph'].conf[subsys] = {} - self._ctx.ceph['ceph'].conf[subsys][key] = value - write_conf(self._ctx) # XXX because we don't have the ceph task's config object, if they - # used a different config path this won't work. 
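
# Illustrative sketch (not from the removed file): FSStatus above walks the
# parsed output of `ceph fs dump --format=json`, where every MDS appears
# either under 'standbys' or under a filesystem's mdsmap['info'], and active
# daemons carry rank >= 0. sample_map is a made-up minimal document.
def iter_ranked_mds(fsmap):
    for fs in fsmap['filesystems']:
        for info in fs['mdsmap']['info'].values():
            if info['rank'] >= 0:
                yield fs['mdsmap']['fs_name'], info

sample_map = {
    'standbys': [{'name': 'b', 'rank': -1}],
    'filesystems': [
        {'mdsmap': {'fs_name': 'cephfs',
                    'info': {'gid_4242': {'name': 'a', 'rank': 0}}}},
    ],
}
for fs_name, info in iter_ranked_mds(sample_map):
    print('%s %s rank %d' % (fs_name, info['name'], info['rank']))   # cephfs a rank 0
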
- - def clear_ceph_conf(self, subsys, key): - del self._ctx.ceph['ceph'].conf[subsys][key] - write_conf(self._ctx) - - def json_asok(self, command, service_type, service_id): - proc = self.mon_manager.admin_socket(service_type, service_id, command) - response_data = proc.stdout.getvalue() - log.info("_json_asok output: {0}".format(response_data)) - if response_data.strip(): - return json.loads(response_data) - else: - return None - - -class MDSCluster(CephCluster): - """ - Collective operations on all the MDS daemons in the Ceph cluster. These - daemons may be in use by various Filesystems. - - For the benefit of pre-multi-filesystem tests, this class is also - a parent of Filesystem. The correct way to use MDSCluster going forward is - as a separate instance outside of your (multiple) Filesystem instances. - """ - def __init__(self, ctx): - super(MDSCluster, self).__init__(ctx) - - self.mds_ids = list(misc.all_roles_of_type(ctx.cluster, 'mds')) - - if len(self.mds_ids) == 0: - raise RuntimeError("This task requires at least one MDS") - - if hasattr(self._ctx, "daemons"): - # Presence of 'daemons' attribute implies ceph task rather than ceph_deploy task - self.mds_daemons = dict([(mds_id, self._ctx.daemons.get_daemon('mds', mds_id)) for mds_id in self.mds_ids]) - - def _one_or_all(self, mds_id, cb, in_parallel=True): - """ - Call a callback for a single named MDS, or for all. - - Note that the parallelism here isn't for performance, it's to avoid being overly kind - to the cluster by waiting a graceful ssh-latency of time between doing things, and to - avoid being overly kind by executing them in a particular order. However, some actions - don't cope with being done in parallel, so it's optional (`in_parallel`) - - :param mds_id: MDS daemon name, or None - :param cb: Callback taking single argument of MDS daemon name - :param in_parallel: whether to invoke callbacks concurrently (else one after the other) - """ - if mds_id is None: - if in_parallel: - with parallel() as p: - for mds_id in self.mds_ids: - p.spawn(cb, mds_id) - else: - for mds_id in self.mds_ids: - cb(mds_id) - else: - cb(mds_id) - - def get_config(self, key, service_type=None): - """ - get_config specialization of service_type="mds" - """ - if service_type != "mds": - return super(MDSCluster, self).get_config(key, service_type) - - # Some tests stop MDS daemons, don't send commands to a dead one: - service_id = random.sample(filter(lambda i: self.mds_daemons[i].running(), self.mds_daemons), 1)[0] - return self.json_asok(['config', 'get', key], service_type, service_id)[key] - - def mds_stop(self, mds_id=None): - """ - Stop the MDS daemon process(se). If it held a rank, that rank - will eventually go laggy. - """ - self._one_or_all(mds_id, lambda id_: self.mds_daemons[id_].stop()) - - def mds_fail(self, mds_id=None): - """ - Inform MDSMonitor of the death of the daemon process(es). If it held - a rank, that rank will be relinquished. - """ - self._one_or_all(mds_id, lambda id_: self.mon_manager.raw_cluster_cmd("mds", "fail", id_)) - - def mds_restart(self, mds_id=None): - self._one_or_all(mds_id, lambda id_: self.mds_daemons[id_].restart()) - - def mds_fail_restart(self, mds_id=None): - """ - Variation on restart that includes marking MDSs as failed, so that doing this - operation followed by waiting for healthy daemon states guarantees that they - have gone down and come up, rather than potentially seeing the healthy states - that existed before the restart. 
- """ - def _fail_restart(id_): - self.mds_daemons[id_].stop() - self.mon_manager.raw_cluster_cmd("mds", "fail", id_) - self.mds_daemons[id_].restart() - - self._one_or_all(mds_id, _fail_restart) - - def newfs(self, name='cephfs', create=True): - return Filesystem(self._ctx, name=name, create=create) - - def status(self): - return FSStatus(self.mon_manager) - - def delete_all_filesystems(self): - """ - Remove all filesystems that exist, and any pools in use by them. - """ - pools = json.loads(self.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools'] - pool_id_name = {} - for pool in pools: - pool_id_name[pool['pool']] = pool['pool_name'] - - # mark cluster down for each fs to prevent churn during deletion - status = self.status() - for fs in status.get_filesystems(): - self.mon_manager.raw_cluster_cmd("fs", "set", fs['mdsmap']['fs_name'], "cluster_down", "true") - - # get a new copy as actives may have since changed - status = self.status() - for fs in status.get_filesystems(): - mdsmap = fs['mdsmap'] - metadata_pool = pool_id_name[mdsmap['metadata_pool']] - - for gid in mdsmap['up'].values(): - self.mon_manager.raw_cluster_cmd('mds', 'fail', gid.__str__()) - - self.mon_manager.raw_cluster_cmd('fs', 'rm', mdsmap['fs_name'], '--yes-i-really-mean-it') - self.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete', - metadata_pool, metadata_pool, - '--yes-i-really-really-mean-it') - for data_pool in mdsmap['data_pools']: - data_pool = pool_id_name[data_pool] - try: - self.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete', - data_pool, data_pool, - '--yes-i-really-really-mean-it') - except CommandFailedError as e: - if e.exitstatus == 16: # EBUSY, this data pool is used - pass # by two metadata pools, let the 2nd - else: # pass delete it - raise - - def get_standby_daemons(self): - return set([s['name'] for s in self.status().get_standbys()]) - - def get_mds_hostnames(self): - result = set() - for mds_id in self.mds_ids: - mds_remote = self.mon_manager.find_remote('mds', mds_id) - result.add(mds_remote.hostname) - - return list(result) - - def set_clients_block(self, blocked, mds_id=None): - """ - Block (using iptables) client communications to this MDS. Be careful: if - other services are running on this MDS, or other MDSs try to talk to this - MDS, their communications may also be blocked as collatoral damage. 
- - :param mds_id: Optional ID of MDS to block, default to all - :return: - """ - da_flag = "-A" if blocked else "-D" - - def set_block(_mds_id): - remote = self.mon_manager.find_remote('mds', _mds_id) - status = self.status() - - addr = status.get_mds_addr(_mds_id) - ip_str, port_str, inst_str = re.match("(.+):(.+)/(.+)", addr).groups() - - remote.run( - args=["sudo", "iptables", da_flag, "OUTPUT", "-p", "tcp", "--sport", port_str, "-j", "REJECT", "-m", - "comment", "--comment", "teuthology"]) - remote.run( - args=["sudo", "iptables", da_flag, "INPUT", "-p", "tcp", "--dport", port_str, "-j", "REJECT", "-m", - "comment", "--comment", "teuthology"]) - - self._one_or_all(mds_id, set_block, in_parallel=False) - - def clear_firewall(self): - clear_firewall(self._ctx) - - def get_mds_info(self, mds_id): - return FSStatus(self.mon_manager).get_mds(mds_id) - - def is_full(self): - flags = json.loads(self.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['flags'] - return 'full' in flags - - def is_pool_full(self, pool_name): - pools = json.loads(self.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools'] - for pool in pools: - if pool['pool_name'] == pool_name: - return 'full' in pool['flags_names'].split(",") - - raise RuntimeError("Pool not found '{0}'".format(pool_name)) - -class Filesystem(MDSCluster): - """ - This object is for driving a CephFS filesystem. The MDS daemons driven by - MDSCluster may be shared with other Filesystems. - """ - def __init__(self, ctx, fscid=None, name=None, create=False, - ec_profile=None): - super(Filesystem, self).__init__(ctx) - - self.name = name - self.ec_profile = ec_profile - self.id = None - self.metadata_pool_name = None - self.metadata_overlay = False - self.data_pool_name = None - self.data_pools = None - - client_list = list(misc.all_roles_of_type(self._ctx.cluster, 'client')) - self.client_id = client_list[0] - self.client_remote = list(misc.get_clients(ctx=ctx, roles=["client.{0}".format(self.client_id)]))[0][1] - - if name is not None: - if fscid is not None: - raise RuntimeError("cannot specify fscid when creating fs") - if create and not self.legacy_configured(): - self.create() - else: - if fscid is not None: - self.id = fscid - self.getinfo(refresh = True) - - # Stash a reference to the first created filesystem on ctx, so - # that if someone drops to the interactive shell they can easily - # poke our methods. 
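
# Illustrative sketch (not from the removed file): set_clients_block() above
# extracts the port from an MDS instance address of the form "ip:port/nonce"
# (the format shown in get_mds_addr()'s docstring) before building its
# iptables REJECT rules. The parsing step in isolation:
import re

def split_mds_addr(addr):
    ip, port, nonce = re.match(r"(.+):(.+)/(.+)", addr).groups()
    return ip, int(port), nonce

print(split_mds_addr("10.214.133.138:6807/10825"))  # ('10.214.133.138', 6807, '10825')
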
- if not hasattr(self._ctx, "filesystem"): - self._ctx.filesystem = self - - def getinfo(self, refresh = False): - status = self.status() - if self.id is not None: - fsmap = status.get_fsmap(self.id) - elif self.name is not None: - fsmap = status.get_fsmap_byname(self.name) - else: - fss = [fs for fs in status.get_filesystems()] - if len(fss) == 1: - fsmap = fss[0] - elif len(fss) == 0: - raise RuntimeError("no file system available") - else: - raise RuntimeError("more than one file system available") - self.id = fsmap['id'] - self.name = fsmap['mdsmap']['fs_name'] - self.get_pool_names(status = status, refresh = refresh) - return status - - def set_metadata_overlay(self, overlay): - if self.id is not None: - raise RuntimeError("cannot specify fscid when configuring overlay") - self.metadata_overlay = overlay - - def deactivate(self, rank): - if rank < 0: - raise RuntimeError("invalid rank") - elif rank == 0: - raise RuntimeError("cannot deactivate rank 0") - self.mon_manager.raw_cluster_cmd("mds", "deactivate", "%d:%d" % (self.id, rank)) - - def set_max_mds(self, max_mds): - self.mon_manager.raw_cluster_cmd("fs", "set", self.name, "max_mds", "%d" % max_mds) - - def set_allow_dirfrags(self, yes): - self.mon_manager.raw_cluster_cmd("fs", "set", self.name, "allow_dirfrags", str(yes).lower(), '--yes-i-really-mean-it') - - def get_pgs_per_fs_pool(self): - """ - Calculate how many PGs to use when creating a pool, in order to avoid raising any - health warnings about mon_pg_warn_min_per_osd - - :return: an integer number of PGs - """ - pg_warn_min_per_osd = int(self.get_config('mon_pg_warn_min_per_osd')) - osd_count = len(list(misc.all_roles_of_type(self._ctx.cluster, 'osd'))) - return pg_warn_min_per_osd * osd_count - - def create(self): - if self.name is None: - self.name = "cephfs" - if self.metadata_pool_name is None: - self.metadata_pool_name = "{0}_metadata".format(self.name) - if self.data_pool_name is None: - data_pool_name = "{0}_data".format(self.name) - else: - data_pool_name = self.data_pool_name - - log.info("Creating filesystem '{0}'".format(self.name)) - - pgs_per_fs_pool = self.get_pgs_per_fs_pool() - - self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', - self.metadata_pool_name, pgs_per_fs_pool.__str__()) - if self.metadata_overlay: - self.mon_manager.raw_cluster_cmd('fs', 'new', - self.name, self.metadata_pool_name, data_pool_name, - '--allow-dangerous-metadata-overlay') - else: - if self.ec_profile: - log.info("EC profile is %s", self.ec_profile) - cmd = ['osd', 'erasure-code-profile', 'set', data_pool_name] - cmd.extend(self.ec_profile) - self.mon_manager.raw_cluster_cmd(*cmd) - self.mon_manager.raw_cluster_cmd( - 'osd', 'pool', 'create', - data_pool_name, pgs_per_fs_pool.__str__(), 'erasure', - data_pool_name) - self.mon_manager.raw_cluster_cmd( - 'osd', 'pool', 'set', - data_pool_name, 'allow_ec_overwrites', 'true') - else: - self.mon_manager.raw_cluster_cmd( - 'osd', 'pool', 'create', - data_pool_name, pgs_per_fs_pool.__str__()) - self.mon_manager.raw_cluster_cmd('fs', 'new', - self.name, self.metadata_pool_name, data_pool_name) - self.check_pool_application(self.metadata_pool_name) - self.check_pool_application(data_pool_name) - # Turn off spurious standby count warnings from modifying max_mds in tests. 
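- # ('standby_count_wanted' only exists from luminous onwards, so the
- # EINVAL failure from an older monitor is tolerated below.)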
- try: - self.mon_manager.raw_cluster_cmd('fs', 'set', self.name, 'standby_count_wanted', '0') - except CommandFailedError as e: - if e.exitstatus == 22: - # standby_count_wanted not available prior to luminous (upgrade tests would fail otherwise) - pass - else: - raise - - self.getinfo(refresh = True) - - - def check_pool_application(self, pool_name): - osd_map = self.mon_manager.get_osd_dump_json() - for pool in osd_map['pools']: - if pool['pool_name'] == pool_name: - if "application_metadata" in pool: - if not "cephfs" in pool['application_metadata']: - raise RuntimeError("Pool %p does not name cephfs as application!".\ - format(pool_name)) - - - def __del__(self): - if getattr(self._ctx, "filesystem", None) == self: - delattr(self._ctx, "filesystem") - - def exists(self): - """ - Whether a filesystem exists in the mon's filesystem list - """ - fs_list = json.loads(self.mon_manager.raw_cluster_cmd('fs', 'ls', '--format=json-pretty')) - return self.name in [fs['name'] for fs in fs_list] - - def legacy_configured(self): - """ - Check if a legacy (i.e. pre "fs new") filesystem configuration is present. If this is - the case, the caller should avoid using Filesystem.create - """ - try: - out_text = self.mon_manager.raw_cluster_cmd('--format=json-pretty', 'osd', 'lspools') - pools = json.loads(out_text) - metadata_pool_exists = 'metadata' in [p['poolname'] for p in pools] - if metadata_pool_exists: - self.metadata_pool_name = 'metadata' - except CommandFailedError as e: - # For use in upgrade tests, Ceph cuttlefish and earlier don't support - # structured output (--format) from the CLI. - if e.exitstatus == 22: - metadata_pool_exists = True - else: - raise - - return metadata_pool_exists - - def _df(self): - return json.loads(self.mon_manager.raw_cluster_cmd("df", "--format=json-pretty")) - - def get_mds_map(self): - return self.status().get_fsmap(self.id)['mdsmap'] - - def add_data_pool(self, name): - self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', name, self.get_pgs_per_fs_pool().__str__()) - self.mon_manager.raw_cluster_cmd('fs', 'add_data_pool', self.name, name) - self.get_pool_names(refresh = True) - for poolid, fs_name in self.data_pools.items(): - if name == fs_name: - return poolid - raise RuntimeError("could not get just created pool '{0}'".format(name)) - - def get_pool_names(self, refresh = False, status = None): - if refresh or self.metadata_pool_name is None or self.data_pools is None: - if status is None: - status = self.status() - fsmap = status.get_fsmap(self.id) - - osd_map = self.mon_manager.get_osd_dump_json() - id_to_name = {} - for p in osd_map['pools']: - id_to_name[p['pool']] = p['pool_name'] - - self.metadata_pool_name = id_to_name[fsmap['mdsmap']['metadata_pool']] - self.data_pools = {} - for data_pool in fsmap['mdsmap']['data_pools']: - self.data_pools[data_pool] = id_to_name[data_pool] - - def get_data_pool_name(self, refresh = False): - if refresh or self.data_pools is None: - self.get_pool_names(refresh = True) - assert(len(self.data_pools) == 1) - return self.data_pools.values()[0] - - def get_data_pool_id(self, refresh = False): - """ - Don't call this if you have multiple data pools - :return: integer - """ - if refresh or self.data_pools is None: - self.get_pool_names(refresh = True) - assert(len(self.data_pools) == 1) - return self.data_pools.keys()[0] - - def get_data_pool_names(self, refresh = False): - if refresh or self.data_pools is None: - self.get_pool_names(refresh = True) - return self.data_pools.values() - - def 
get_metadata_pool_name(self): - return self.metadata_pool_name - - def set_data_pool_name(self, name): - if self.id is not None: - raise RuntimeError("can't set filesystem name if its fscid is set") - self.data_pool_name = name - - def get_namespace_id(self): - return self.id - - def get_pool_df(self, pool_name): - """ - Return a dict like: - {u'bytes_used': 0, u'max_avail': 83848701, u'objects': 0, u'kb_used': 0} - """ - for pool_df in self._df()['pools']: - if pool_df['name'] == pool_name: - return pool_df['stats'] - - raise RuntimeError("Pool name '{0}' not found".format(pool_name)) - - def get_usage(self): - return self._df()['stats']['total_used_bytes'] - - def are_daemons_healthy(self): - """ - Return true if all daemons are in one of active, standby, standby-replay, and - at least max_mds daemons are in 'active'. - - Unlike most of Filesystem, this function is tolerant of new-style `fs` - commands being missing, because we are part of the ceph installation - process during upgrade suites, so must fall back to old style commands - when we get an EINVAL on a new style command. - - :return: - """ - - active_count = 0 - try: - mds_map = self.get_mds_map() - except CommandFailedError as cfe: - # Old version, fall back to non-multi-fs commands - if cfe.exitstatus == errno.EINVAL: - mds_map = json.loads( - self.mon_manager.raw_cluster_cmd('mds', 'dump', '--format=json')) - else: - raise - - log.info("are_daemons_healthy: mds map: {0}".format(mds_map)) - - for mds_id, mds_status in mds_map['info'].items(): - if mds_status['state'] not in ["up:active", "up:standby", "up:standby-replay"]: - log.warning("Unhealthy mds state {0}:{1}".format(mds_id, mds_status['state'])) - return False - elif mds_status['state'] == 'up:active': - active_count += 1 - - log.info("are_daemons_healthy: {0}/{1}".format( - active_count, mds_map['max_mds'] - )) - - if active_count >= mds_map['max_mds']: - # The MDSMap says these guys are active, but let's check they really are - for mds_id, mds_status in mds_map['info'].items(): - if mds_status['state'] == 'up:active': - try: - daemon_status = self.mds_asok(["status"], mds_id=mds_status['name']) - except CommandFailedError as cfe: - if cfe.exitstatus == errno.EINVAL: - # Old version, can't do this check - continue - else: - # MDS not even running - return False - - if daemon_status['state'] != 'up:active': - # MDS hasn't taken the latest map yet - return False - - return True - else: - return False - - def get_daemon_names(self, state=None): - """ - Return MDS daemon names of those daemons in the given state - :param state: - :return: - """ - status = self.get_mds_map() - result = [] - for mds_status in sorted(status['info'].values(), lambda a, b: cmp(a['rank'], b['rank'])): - if mds_status['state'] == state or state is None: - result.append(mds_status['name']) - - return result - - def get_active_names(self): - """ - Return MDS daemon names of those daemons holding ranks - in state up:active - - :return: list of strings like ['a', 'b'], sorted by rank - """ - return self.get_daemon_names("up:active") - - def get_all_mds_rank(self): - status = self.get_mds_map() - result = [] - for mds_status in sorted(status['info'].values(), lambda a, b: cmp(a['rank'], b['rank'])): - if mds_status['rank'] != -1 and mds_status['state'] != 'up:standby-replay': - result.append(mds_status['rank']) - - return result - - def get_rank_names(self): - """ - Return MDS daemon names of those daemons holding a rank, - sorted by rank. This includes e.g. 
up:replay/reconnect - as well as active, but does not include standby or - standby-replay. - """ - status = self.get_mds_map() - result = [] - for mds_status in sorted(status['info'].values(), lambda a, b: cmp(a['rank'], b['rank'])): - if mds_status['rank'] != -1 and mds_status['state'] != 'up:standby-replay': - result.append(mds_status['name']) - - return result - - def wait_for_daemons(self, timeout=None): - """ - Wait until all daemons are healthy - :return: - """ - - if timeout is None: - timeout = DAEMON_WAIT_TIMEOUT - - elapsed = 0 - while True: - if self.are_daemons_healthy(): - return - else: - time.sleep(1) - elapsed += 1 - - if elapsed > timeout: - raise RuntimeError("Timed out waiting for MDS daemons to become healthy") - - def get_lone_mds_id(self): - """ - Get a single MDS ID: the only one if there is only one - configured, else the only one currently holding a rank, - else raise an error. - """ - if len(self.mds_ids) != 1: - alive = self.get_rank_names() - if len(alive) == 1: - return alive[0] - else: - raise ValueError("Explicit MDS argument required when multiple MDSs in use") - else: - return self.mds_ids[0] - - def recreate(self): - log.info("Creating new filesystem") - self.delete_all_filesystems() - self.id = None - self.create() - - def put_metadata_object_raw(self, object_id, infile): - """ - Save an object to the metadata pool - """ - temp_bin_path = infile - self.client_remote.run(args=[ - 'sudo', os.path.join(self._prefix, 'rados'), '-p', self.metadata_pool_name, 'put', object_id, temp_bin_path - ]) - - def get_metadata_object_raw(self, object_id): - """ - Retrieve an object from the metadata pool and store it in a file. - """ - temp_bin_path = '/tmp/' + object_id + '.bin' - - self.client_remote.run(args=[ - 'sudo', os.path.join(self._prefix, 'rados'), '-p', self.metadata_pool_name, 'get', object_id, temp_bin_path - ]) - - return temp_bin_path - - def get_metadata_object(self, object_type, object_id): - """ - Retrieve an object from the metadata pool, pass it through - ceph-dencoder to dump it to JSON, and return the decoded object. - """ - temp_bin_path = '/tmp/out.bin' - - self.client_remote.run(args=[ - 'sudo', os.path.join(self._prefix, 'rados'), '-p', self.metadata_pool_name, 'get', object_id, temp_bin_path - ]) - - stdout = StringIO() - self.client_remote.run(args=[ - 'sudo', os.path.join(self._prefix, 'ceph-dencoder'), 'type', object_type, 'import', temp_bin_path, 'decode', 'dump_json' - ], stdout=stdout) - dump_json = stdout.getvalue().strip() - try: - dump = json.loads(dump_json) - except (TypeError, ValueError): - log.error("Failed to decode JSON: '{0}'".format(dump_json)) - raise - - return dump - - def get_journal_version(self): - """ - Read the JournalPointer and Journal::Header objects to learn the version of - encoding in use. 
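- 
- For example, rank 0's JournalPointer is the metadata-pool object 400.00000000;
- its 'front' field names the journal inode, and the corresponding header object
- ("<ino-hex>.00000000") carries the stream_format version returned here.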
- """ - journal_pointer_object = '400.00000000' - journal_pointer_dump = self.get_metadata_object("JournalPointer", journal_pointer_object) - journal_ino = journal_pointer_dump['journal_pointer']['front'] - - journal_header_object = "{0:x}.00000000".format(journal_ino) - journal_header_dump = self.get_metadata_object('Journaler::Header', journal_header_object) - - version = journal_header_dump['journal_header']['stream_format'] - log.info("Read journal version {0}".format(version)) - - return version - - def mds_asok(self, command, mds_id=None): - if mds_id is None: - mds_id = self.get_lone_mds_id() - - return self.json_asok(command, 'mds', mds_id) - - def read_cache(self, path, depth=None): - cmd = ["dump", "tree", path] - if depth is not None: - cmd.append(depth.__str__()) - result = self.mds_asok(cmd) - if len(result) == 0: - raise RuntimeError("Path not found in cache: {0}".format(path)) - - return result - - def wait_for_state(self, goal_state, reject=None, timeout=None, mds_id=None, rank=None): - """ - Block until the MDS reaches a particular state, or a failure condition - is met. - - When there are multiple MDSs, succeed when exaclty one MDS is in the - goal state, or fail when any MDS is in the reject state. - - :param goal_state: Return once the MDS is in this state - :param reject: Fail if the MDS enters this state before the goal state - :param timeout: Fail if this many seconds pass before reaching goal - :return: number of seconds waited, rounded down to integer - """ - - started_at = time.time() - while True: - status = self.status() - if rank is not None: - mds_info = status.get_rank(self.id, rank) - current_state = mds_info['state'] if mds_info else None - log.info("Looked up MDS state for mds.{0}: {1}".format(rank, current_state)) - elif mds_id is not None: - # mds_info is None if no daemon with this ID exists in the map - mds_info = status.get_mds(mds_id) - current_state = mds_info['state'] if mds_info else None - log.info("Looked up MDS state for {0}: {1}".format(mds_id, current_state)) - else: - # In general, look for a single MDS - states = [m['state'] for m in status.get_ranks(self.id)] - if [s for s in states if s == goal_state] == [goal_state]: - current_state = goal_state - elif reject in states: - current_state = reject - else: - current_state = None - log.info("mapped states {0} to {1}".format(states, current_state)) - - elapsed = time.time() - started_at - if current_state == goal_state: - log.info("reached state '{0}' in {1}s".format(current_state, elapsed)) - return elapsed - elif reject is not None and current_state == reject: - raise RuntimeError("MDS in reject state {0}".format(current_state)) - elif timeout is not None and elapsed > timeout: - log.error("MDS status at timeout: {0}".format(status.get_fsmap(self.id))) - raise RuntimeError( - "Reached timeout after {0} seconds waiting for state {1}, while in state {2}".format( - elapsed, goal_state, current_state - )) - else: - time.sleep(1) - - def _read_data_xattr(self, ino_no, xattr_name, type, pool): - mds_id = self.mds_ids[0] - remote = self.mds_daemons[mds_id].remote - if pool is None: - pool = self.get_data_pool_name() - - obj_name = "{0:x}.00000000".format(ino_no) - - args = [ - os.path.join(self._prefix, "rados"), "-p", pool, "getxattr", obj_name, xattr_name - ] - try: - proc = remote.run( - args=args, - stdout=StringIO()) - except CommandFailedError as e: - log.error(e.__str__()) - raise ObjectNotFound(obj_name) - - data = proc.stdout.getvalue() - - p = remote.run( - 
args=[os.path.join(self._prefix, "ceph-dencoder"), "type", type, "import", "-", "decode", "dump_json"], - stdout=StringIO(), - stdin=data - ) - - return json.loads(p.stdout.getvalue().strip()) - - def _write_data_xattr(self, ino_no, xattr_name, data, pool=None): - """ - Write to an xattr of the 0th data object of an inode. Will - succeed whether the object and/or xattr already exist or not. - - :param ino_no: integer inode number - :param xattr_name: string name of the xattr - :param data: byte array data to write to the xattr - :param pool: name of data pool or None to use primary data pool - :return: None - """ - remote = self.mds_daemons[self.mds_ids[0]].remote - if pool is None: - pool = self.get_data_pool_name() - - obj_name = "{0:x}.00000000".format(ino_no) - args = [ - os.path.join(self._prefix, "rados"), "-p", pool, "setxattr", - obj_name, xattr_name, data - ] - remote.run( - args=args, - stdout=StringIO()) - - def read_backtrace(self, ino_no, pool=None): - """ - Read the backtrace from the data pool, return a dict in the format - given by inode_backtrace_t::dump, which is something like: - - :: - - rados -p cephfs_data getxattr 10000000002.00000000 parent > out.bin - ceph-dencoder type inode_backtrace_t import out.bin decode dump_json - - { "ino": 1099511627778, - "ancestors": [ - { "dirino": 1, - "dname": "blah", - "version": 11}], - "pool": 1, - "old_pools": []} - - :param pool: name of pool to read backtrace from. If omitted, FS must have only - one data pool and that will be used. - """ - return self._read_data_xattr(ino_no, "parent", "inode_backtrace_t", pool) - - def read_layout(self, ino_no, pool=None): - """ - Read 'layout' xattr of an inode and parse the result, returning a dict like: - :: - { - "stripe_unit": 4194304, - "stripe_count": 1, - "object_size": 4194304, - "pool_id": 1, - "pool_ns": "", - } - - :param pool: name of pool to read backtrace from. If omitted, FS must have only - one data pool and that will be used. - """ - return self._read_data_xattr(ino_no, "layout", "file_layout_t", pool) - - def _enumerate_data_objects(self, ino, size): - """ - Get the list of expected data objects for a range, and the list of objects - that really exist. 
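- 
- (Data objects are named "<ino-hex>.<8-digit-hex stripe index>", so with the
- 4 MB stripe size assumed here a 10 MB file with inode 0x10000000002 is expected
- to map to 10000000002.00000000 through 10000000002.00000002.)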
- - :return a tuple of two lists of strings (expected, actual) - """ - stripe_size = 1024 * 1024 * 4 - - size = max(stripe_size, size) - - want_objects = [ - "{0:x}.{1:08x}".format(ino, n) - for n in range(0, ((size - 1) / stripe_size) + 1) - ] - - exist_objects = self.rados(["ls"], pool=self.get_data_pool_name()).split("\n") - - return want_objects, exist_objects - - def data_objects_present(self, ino, size): - """ - Check that *all* the expected data objects for an inode are present in the data pool - """ - - want_objects, exist_objects = self._enumerate_data_objects(ino, size) - missing = set(want_objects) - set(exist_objects) - - if missing: - log.info("Objects missing (ino {0}, size {1}): {2}".format( - ino, size, missing - )) - return False - else: - log.info("All objects for ino {0} size {1} found".format(ino, size)) - return True - - def data_objects_absent(self, ino, size): - want_objects, exist_objects = self._enumerate_data_objects(ino, size) - present = set(want_objects) & set(exist_objects) - - if present: - log.info("Objects not absent (ino {0}, size {1}): {2}".format( - ino, size, present - )) - return False - else: - log.info("All objects for ino {0} size {1} are absent".format(ino, size)) - return True - - def dirfrag_exists(self, ino, frag): - try: - self.rados(["stat", "{0:x}.{1:08x}".format(ino, frag)]) - except CommandFailedError as e: - return False - else: - return True - - def rados(self, args, pool=None, namespace=None, stdin_data=None): - """ - Call into the `rados` CLI from an MDS - """ - - if pool is None: - pool = self.get_metadata_pool_name() - - # Doesn't matter which MDS we use to run rados commands, they all - # have access to the pools - mds_id = self.mds_ids[0] - remote = self.mds_daemons[mds_id].remote - - # NB we could alternatively use librados pybindings for this, but it's a one-liner - # using the `rados` CLI - args = ([os.path.join(self._prefix, "rados"), "-p", pool] + - (["--namespace", namespace] if namespace else []) + - args) - p = remote.run( - args=args, - stdin=stdin_data, - stdout=StringIO()) - return p.stdout.getvalue().strip() - - def list_dirfrag(self, dir_ino): - """ - Read the named object and return the list of omap keys - - :return a list of 0 or more strings - """ - - dirfrag_obj_name = "{0:x}.00000000".format(dir_ino) - - try: - key_list_str = self.rados(["listomapkeys", dirfrag_obj_name]) - except CommandFailedError as e: - log.error(e.__str__()) - raise ObjectNotFound(dirfrag_obj_name) - - return key_list_str.split("\n") if key_list_str else [] - - def erase_metadata_objects(self, prefix): - """ - For all objects in the metadata pool matching the prefix, - erase them. - - This O(N) with the number of objects in the pool, so only suitable - for use on toy test filesystems. - """ - all_objects = self.rados(["ls"]).split("\n") - matching_objects = [o for o in all_objects if o.startswith(prefix)] - for o in matching_objects: - self.rados(["rm", o]) - - def erase_mds_objects(self, rank): - """ - Erase all the per-MDS objects for a particular rank. This includes - inotable, sessiontable, journal - """ - - def obj_prefix(multiplier): - """ - MDS object naming conventions like rank 1's - journal is at 201.*** - """ - return "%x." 
% (multiplier * 0x100 + rank) - - # MDS_INO_LOG_OFFSET - self.erase_metadata_objects(obj_prefix(2)) - # MDS_INO_LOG_BACKUP_OFFSET - self.erase_metadata_objects(obj_prefix(3)) - # MDS_INO_LOG_POINTER_OFFSET - self.erase_metadata_objects(obj_prefix(4)) - # MDSTables & SessionMap - self.erase_metadata_objects("mds{rank:d}_".format(rank=rank)) - - @property - def _prefix(self): - """ - Override this to set a different - """ - return "" - - def _run_tool(self, tool, args, rank=None, quiet=False): - # Tests frequently have [client] configuration that jacks up - # the objecter log level (unlikely to be interesting here) - # and does not set the mds log level (very interesting here) - if quiet: - base_args = [os.path.join(self._prefix, tool), '--debug-mds=1', '--debug-objecter=1'] - else: - base_args = [os.path.join(self._prefix, tool), '--debug-mds=4', '--debug-objecter=1'] - - if rank is not None: - base_args.extend(["--rank", "%d" % rank]) - - t1 = datetime.datetime.now() - r = self.tool_remote.run( - args=base_args + args, - stdout=StringIO()).stdout.getvalue().strip() - duration = datetime.datetime.now() - t1 - log.info("Ran {0} in time {1}, result:\n{2}".format( - base_args + args, duration, r - )) - return r - - @property - def tool_remote(self): - """ - An arbitrary remote to use when invoking recovery tools. Use an MDS host because - it'll definitely have keys with perms to access cephfs metadata pool. This is public - so that tests can use this remote to go get locally written output files from the tools. - """ - mds_id = self.mds_ids[0] - return self.mds_daemons[mds_id].remote - - def journal_tool(self, args, rank=None, quiet=False): - """ - Invoke cephfs-journal-tool with the passed arguments, and return its stdout - """ - return self._run_tool("cephfs-journal-tool", args, rank, quiet) - - def table_tool(self, args, quiet=False): - """ - Invoke cephfs-table-tool with the passed arguments, and return its stdout - """ - return self._run_tool("cephfs-table-tool", args, None, quiet) - - def data_scan(self, args, quiet=False, worker_count=1): - """ - Invoke cephfs-data-scan with the passed arguments, and return its stdout - - :param worker_count: if greater than 1, multiple workers will be run - in parallel and the return value will be None - """ - - workers = [] - - for n in range(0, worker_count): - if worker_count > 1: - # data-scan args first token is a command, followed by args to it. - # insert worker arguments after the command. 
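- # (For illustration only: with worker_count=4, args such as
- # ["scan_inodes", "--force-corrupt"] become, for worker 0,
- # ["scan_inodes", "--worker_n", "0", "--worker_m", "4", "--force-corrupt"].)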
- cmd = args[0] - worker_args = [cmd] + ["--worker_n", n.__str__(), "--worker_m", worker_count.__str__()] + args[1:] - else: - worker_args = args - - workers.append(Greenlet.spawn(lambda wargs=worker_args: - self._run_tool("cephfs-data-scan", wargs, None, quiet))) - - for w in workers: - w.get() - - if worker_count == 1: - return workers[0].value - else: - return None diff --git a/src/ceph/qa/tasks/cephfs/fuse_mount.py b/src/ceph/qa/tasks/cephfs/fuse_mount.py deleted file mode 100644 index 8d8410c..0000000 --- a/src/ceph/qa/tasks/cephfs/fuse_mount.py +++ /dev/null @@ -1,428 +0,0 @@ - -from StringIO import StringIO -import json -import time -import logging -from textwrap import dedent - -from teuthology import misc -from teuthology.contextutil import MaxWhileTries -from teuthology.orchestra import run -from teuthology.orchestra.run import CommandFailedError -from .mount import CephFSMount - -log = logging.getLogger(__name__) - - -class FuseMount(CephFSMount): - def __init__(self, client_config, test_dir, client_id, client_remote): - super(FuseMount, self).__init__(test_dir, client_id, client_remote) - - self.client_config = client_config if client_config else {} - self.fuse_daemon = None - self._fuse_conn = None - - def mount(self, mount_path=None, mount_fs_name=None): - try: - return self._mount(mount_path, mount_fs_name) - except RuntimeError: - # Catch exceptions by the mount() logic (i.e. not remote command - # failures) and ensure the mount is not left half-up. - # Otherwise we might leave a zombie mount point that causes - # anyone traversing cephtest/ to get hung up on. - log.warn("Trying to clean up after failed mount") - self.umount_wait(force=True) - raise - - def _mount(self, mount_path, mount_fs_name): - log.info("Client client.%s config is %s" % (self.client_id, self.client_config)) - - daemon_signal = 'kill' - if self.client_config.get('coverage') or self.client_config.get('valgrind') is not None: - daemon_signal = 'term' - - log.info('Mounting ceph-fuse client.{id} at {remote} {mnt}...'.format( - id=self.client_id, remote=self.client_remote, mnt=self.mountpoint)) - - self.client_remote.run( - args=[ - 'mkdir', - '--', - self.mountpoint, - ], - ) - - run_cmd = [ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=self.test_dir), - 'daemon-helper', - daemon_signal, - ] - - fuse_cmd = ['ceph-fuse', "-f"] - - if mount_path is not None: - fuse_cmd += ["--client_mountpoint={0}".format(mount_path)] - - if mount_fs_name is not None: - fuse_cmd += ["--client_mds_namespace={0}".format(mount_fs_name)] - - fuse_cmd += [ - '--name', 'client.{id}'.format(id=self.client_id), - # TODO ceph-fuse doesn't understand dash dash '--', - self.mountpoint, - ] - - if self.client_config.get('valgrind') is not None: - run_cmd = misc.get_valgrind_args( - self.test_dir, - 'client.{id}'.format(id=self.client_id), - run_cmd, - self.client_config.get('valgrind'), - ) - - run_cmd.extend(fuse_cmd) - - def list_connections(): - self.client_remote.run( - args=["sudo", "mount", "-t", "fusectl", "/sys/fs/fuse/connections", "/sys/fs/fuse/connections"], - check_status=False - ) - p = self.client_remote.run( - args=["ls", "/sys/fs/fuse/connections"], - stdout=StringIO(), - check_status=False - ) - if p.exitstatus != 0: - return [] - - ls_str = p.stdout.getvalue().strip() - if ls_str: - return [int(n) for n in ls_str.split("\n")] - else: - return [] - - # Before starting ceph-fuse process, note the contents of - # /sys/fs/fuse/connections - pre_mount_conns = list_connections() - 
log.info("Pre-mount connections: {0}".format(pre_mount_conns)) - - proc = self.client_remote.run( - args=run_cmd, - logger=log.getChild('ceph-fuse.{id}'.format(id=self.client_id)), - stdin=run.PIPE, - wait=False, - ) - self.fuse_daemon = proc - - # Wait for the connection reference to appear in /sys - mount_wait = self.client_config.get('mount_wait', 0) - if mount_wait > 0: - log.info("Fuse mount waits {0} seconds before checking /sys/".format(mount_wait)) - time.sleep(mount_wait) - timeout = int(self.client_config.get('mount_timeout', 30)) - waited = 0 - - post_mount_conns = list_connections() - while len(post_mount_conns) <= len(pre_mount_conns): - if self.fuse_daemon.finished: - # Did mount fail? Raise the CommandFailedError instead of - # hitting the "failed to populate /sys/" timeout - self.fuse_daemon.wait() - time.sleep(1) - waited += 1 - if waited > timeout: - raise RuntimeError("Fuse mount failed to populate /sys/ after {0} seconds".format( - waited - )) - else: - post_mount_conns = list_connections() - - log.info("Post-mount connections: {0}".format(post_mount_conns)) - - # Record our fuse connection number so that we can use it when - # forcing an unmount - new_conns = list(set(post_mount_conns) - set(pre_mount_conns)) - if len(new_conns) == 0: - raise RuntimeError("New fuse connection directory not found ({0})".format(new_conns)) - elif len(new_conns) > 1: - raise RuntimeError("Unexpectedly numerous fuse connections {0}".format(new_conns)) - else: - self._fuse_conn = new_conns[0] - - def is_mounted(self): - proc = self.client_remote.run( - args=[ - 'stat', - '--file-system', - '--printf=%T\n', - '--', - self.mountpoint, - ], - stdout=StringIO(), - stderr=StringIO(), - wait=False - ) - try: - proc.wait() - except CommandFailedError: - if ("endpoint is not connected" in proc.stderr.getvalue() - or "Software caused connection abort" in proc.stderr.getvalue()): - # This happens is fuse is killed without unmount - log.warn("Found stale moutn point at {0}".format(self.mountpoint)) - return True - else: - # This happens if the mount directory doesn't exist - log.info('mount point does not exist: %s', self.mountpoint) - return False - - fstype = proc.stdout.getvalue().rstrip('\n') - if fstype == 'fuseblk': - log.info('ceph-fuse is mounted on %s', self.mountpoint) - return True - else: - log.debug('ceph-fuse not mounted, got fs type {fstype!r}'.format( - fstype=fstype)) - return False - - def wait_until_mounted(self): - """ - Check to make sure that fuse is mounted on mountpoint. If not, - sleep for 5 seconds and check again. - """ - - while not self.is_mounted(): - # Even if it's not mounted, it should at least - # be running: catch simple failures where it has terminated. - assert not self.fuse_daemon.poll() - - time.sleep(5) - - # Now that we're mounted, set permissions so that the rest of the test will have - # unrestricted access to the filesystem mount. 
- self.client_remote.run( - args=['sudo', 'chmod', '1777', self.mountpoint]) - - def _mountpoint_exists(self): - return self.client_remote.run(args=["ls", "-d", self.mountpoint], check_status=False).exitstatus == 0 - - def umount(self): - try: - log.info('Running fusermount -u on {name}...'.format(name=self.client_remote.name)) - self.client_remote.run( - args=[ - 'sudo', - 'fusermount', - '-u', - self.mountpoint, - ], - ) - except run.CommandFailedError: - log.info('Failed to unmount ceph-fuse on {name}, aborting...'.format(name=self.client_remote.name)) - - self.client_remote.run(args=[ - 'sudo', - run.Raw('PATH=/usr/sbin:$PATH'), - 'lsof', - run.Raw(';'), - 'ps', - 'auxf', - ]) - - # abort the fuse mount, killing all hung processes - if self._fuse_conn: - self.run_python(dedent(""" - import os - path = "/sys/fs/fuse/connections/{0}/abort" - if os.path.exists(path): - open(path, "w").write("1") - """).format(self._fuse_conn)) - self._fuse_conn = None - - stderr = StringIO() - try: - # make sure its unmounted - self.client_remote.run( - args=[ - 'sudo', - 'umount', - '-l', - '-f', - self.mountpoint, - ], - stderr=stderr - ) - except CommandFailedError: - if self.is_mounted(): - raise - - assert not self.is_mounted() - self._fuse_conn = None - - def umount_wait(self, force=False, require_clean=False): - """ - :param force: Complete cleanly even if the MDS is offline - """ - if force: - assert not require_clean # mutually exclusive - - # When we expect to be forcing, kill the ceph-fuse process directly. - # This should avoid hitting the more aggressive fallback killing - # in umount() which can affect other mounts too. - self.fuse_daemon.stdin.close() - - # However, we will still hit the aggressive wait if there is an ongoing - # mount -o remount (especially if the remount is stuck because MDSs - # are unavailable) - - self.umount() - - try: - if self.fuse_daemon: - # Permit a timeout, so that we do not block forever - run.wait([self.fuse_daemon], 900) - except MaxWhileTries: - log.error("process failed to terminate after unmount. This probably" - "indicates a bug within ceph-fuse.") - raise - except CommandFailedError: - if require_clean: - raise - - self.cleanup() - - def cleanup(self): - """ - Remove the mount point. - - Prerequisite: the client is not mounted. - """ - stderr = StringIO() - try: - self.client_remote.run( - args=[ - 'rmdir', - '--', - self.mountpoint, - ], - stderr=stderr - ) - except CommandFailedError: - if "No such file or directory" in stderr.getvalue(): - pass - else: - raise - - def kill(self): - """ - Terminate the client without removing the mount point. - """ - self.fuse_daemon.stdin.close() - try: - self.fuse_daemon.wait() - except CommandFailedError: - pass - - def kill_cleanup(self): - """ - Follow up ``kill`` to get to a clean unmounted state. - """ - self.umount() - self.cleanup() - - def teardown(self): - """ - Whatever the state of the mount, get it gone. 
- """ - super(FuseMount, self).teardown() - - self.umount() - - if self.fuse_daemon and not self.fuse_daemon.finished: - self.fuse_daemon.stdin.close() - try: - self.fuse_daemon.wait() - except CommandFailedError: - pass - - # Indiscriminate, unlike the touchier cleanup() - self.client_remote.run( - args=[ - 'rm', - '-rf', - self.mountpoint, - ], - ) - - def _asok_path(self): - return "/var/run/ceph/ceph-client.{0}.*.asok".format(self.client_id) - - @property - def _prefix(self): - return "" - - def admin_socket(self, args): - pyscript = """ -import glob -import re -import os -import subprocess - -def find_socket(client_name): - asok_path = "{asok_path}" - files = glob.glob(asok_path) - - # Given a non-glob path, it better be there - if "*" not in asok_path: - assert(len(files) == 1) - return files[0] - - for f in files: - pid = re.match(".*\.(\d+)\.asok$", f).group(1) - if os.path.exists("/proc/{{0}}".format(pid)): - return f - raise RuntimeError("Client socket {{0}} not found".format(client_name)) - -print find_socket("{client_name}") -""".format( - asok_path=self._asok_path(), - client_name="client.{0}".format(self.client_id)) - - # Find the admin socket - p = self.client_remote.run(args=[ - 'python', '-c', pyscript - ], stdout=StringIO()) - asok_path = p.stdout.getvalue().strip() - log.info("Found client admin socket at {0}".format(asok_path)) - - # Query client ID from admin socket - p = self.client_remote.run( - args=['sudo', self._prefix + 'ceph', '--admin-daemon', asok_path] + args, - stdout=StringIO()) - return json.loads(p.stdout.getvalue()) - - def get_global_id(self): - """ - Look up the CephFS client ID for this mount - """ - - return self.admin_socket(['mds_sessions'])['id'] - - def get_osd_epoch(self): - """ - Return 2-tuple of osd_epoch, osd_epoch_barrier - """ - status = self.admin_socket(['status']) - return status['osd_epoch'], status['osd_epoch_barrier'] - - def get_dentry_count(self): - """ - Return 2-tuple of dentry_count, dentry_pinned_count - """ - status = self.admin_socket(['status']) - return status['dentry_count'], status['dentry_pinned_count'] - - def set_cache_size(self, size): - return self.admin_socket(['config', 'set', 'client_cache_size', str(size)]) diff --git a/src/ceph/qa/tasks/cephfs/kernel_mount.py b/src/ceph/qa/tasks/cephfs/kernel_mount.py deleted file mode 100644 index bfa1ac6..0000000 --- a/src/ceph/qa/tasks/cephfs/kernel_mount.py +++ /dev/null @@ -1,267 +0,0 @@ -from StringIO import StringIO -import json -import logging -from textwrap import dedent -from teuthology.orchestra.run import CommandFailedError -from teuthology import misc - -from teuthology.orchestra import remote as orchestra_remote -from teuthology.orchestra import run -from teuthology.contextutil import MaxWhileTries -from .mount import CephFSMount - -log = logging.getLogger(__name__) - - -UMOUNT_TIMEOUT = 300 - - -class KernelMount(CephFSMount): - def __init__(self, mons, test_dir, client_id, client_remote, - ipmi_user, ipmi_password, ipmi_domain): - super(KernelMount, self).__init__(test_dir, client_id, client_remote) - self.mons = mons - - self.mounted = False - self.ipmi_user = ipmi_user - self.ipmi_password = ipmi_password - self.ipmi_domain = ipmi_domain - - def write_secret_file(self, remote, role, keyring, filename): - """ - Stash the keyring in the filename specified. 
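- 
- The extracted key is what mount() below hands to mount.ceph via its
- 'secretfile=' option.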
- """ - remote.run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=self.test_dir), - 'ceph-authtool', - '--name={role}'.format(role=role), - '--print-key', - keyring, - run.Raw('>'), - filename, - ], - ) - - def mount(self, mount_path=None, mount_fs_name=None): - log.info('Mounting kclient client.{id} at {remote} {mnt}...'.format( - id=self.client_id, remote=self.client_remote, mnt=self.mountpoint)) - - keyring = self.get_keyring_path() - secret = '{tdir}/ceph.data/client.{id}.secret'.format(tdir=self.test_dir, id=self.client_id) - self.write_secret_file(self.client_remote, 'client.{id}'.format(id=self.client_id), - keyring, secret) - - self.client_remote.run( - args=[ - 'mkdir', - '--', - self.mountpoint, - ], - ) - - if mount_path is None: - mount_path = "/" - - opts = 'name={id},secretfile={secret},norequire_active_mds'.format(id=self.client_id, - secret=secret) - - if mount_fs_name is not None: - opts += ",mds_namespace={0}".format(mount_fs_name) - - self.client_remote.run( - args=[ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=self.test_dir), - '/sbin/mount.ceph', - '{mons}:{mount_path}'.format(mons=','.join(self.mons), mount_path=mount_path), - self.mountpoint, - '-v', - '-o', - opts - ], - ) - - self.client_remote.run( - args=['sudo', 'chmod', '1777', self.mountpoint]) - - self.mounted = True - - def umount(self, force=False): - log.debug('Unmounting client client.{id}...'.format(id=self.client_id)) - - cmd=['sudo', 'umount', self.mountpoint] - if force: - cmd.append('-f') - - try: - self.client_remote.run(args=cmd) - except Exception as e: - self.client_remote.run(args=[ - 'sudo', - run.Raw('PATH=/usr/sbin:$PATH'), - 'lsof', - run.Raw(';'), - 'ps', 'auxf', - ]) - raise e - - rproc = self.client_remote.run( - args=[ - 'rmdir', - '--', - self.mountpoint, - ], - wait=False - ) - run.wait([rproc], UMOUNT_TIMEOUT) - self.mounted = False - - def cleanup(self): - pass - - def umount_wait(self, force=False, require_clean=False): - """ - Unlike the fuse client, the kernel client's umount is immediate - """ - if not self.is_mounted(): - return - - try: - self.umount(force) - except (CommandFailedError, MaxWhileTries): - if not force: - raise - - self.kill() - self.kill_cleanup() - - self.mounted = False - - def is_mounted(self): - return self.mounted - - def wait_until_mounted(self): - """ - Unlike the fuse client, the kernel client is up and running as soon - as the initial mount() function returns. - """ - assert self.mounted - - def teardown(self): - super(KernelMount, self).teardown() - if self.mounted: - self.umount() - - def kill(self): - """ - The Ceph kernel client doesn't have a mechanism to kill itself (doing - that in side the kernel would be weird anyway), so we reboot the whole node - to get the same effect. - - We use IPMI to reboot, because we don't want the client to send any - releases of capabilities. 
- """ - - con = orchestra_remote.getRemoteConsole(self.client_remote.hostname, - self.ipmi_user, - self.ipmi_password, - self.ipmi_domain) - con.power_off() - - self.mounted = False - - def kill_cleanup(self): - assert not self.mounted - - con = orchestra_remote.getRemoteConsole(self.client_remote.hostname, - self.ipmi_user, - self.ipmi_password, - self.ipmi_domain) - con.power_on() - - # Wait for node to come back up after reboot - misc.reconnect(None, 300, [self.client_remote]) - - # Remove mount directory - self.client_remote.run( - args=[ - 'rmdir', - '--', - self.mountpoint, - ], - ) - - def _find_debug_dir(self): - """ - Find the debugfs folder for this mount - """ - pyscript = dedent(""" - import glob - import os - import json - - def get_id_to_dir(): - result = {} - for dir in glob.glob("/sys/kernel/debug/ceph/*"): - mds_sessions_lines = open(os.path.join(dir, "mds_sessions")).readlines() - client_id = mds_sessions_lines[1].split()[1].strip('"') - - result[client_id] = dir - return result - - print json.dumps(get_id_to_dir()) - """) - - p = self.client_remote.run(args=[ - 'sudo', 'python', '-c', pyscript - ], stdout=StringIO()) - client_id_to_dir = json.loads(p.stdout.getvalue()) - - try: - return client_id_to_dir[self.client_id] - except KeyError: - log.error("Client id '{0}' debug dir not found (clients seen were: {1})".format( - self.client_id, ",".join(client_id_to_dir.keys()) - )) - raise - - def _read_debug_file(self, filename): - debug_dir = self._find_debug_dir() - - pyscript = dedent(""" - import os - - print open(os.path.join("{debug_dir}", "{filename}")).read() - """).format(debug_dir=debug_dir, filename=filename) - - p = self.client_remote.run(args=[ - 'sudo', 'python', '-c', pyscript - ], stdout=StringIO()) - return p.stdout.getvalue() - - def get_global_id(self): - """ - Look up the CephFS client ID for this mount, using debugfs. 
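- 
- This reads the per-mount "mds_sessions" file under /sys/kernel/debug/ceph/
- (located by _find_debug_dir) and parses the global id out of its first line.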
- """ - - assert self.mounted - - mds_sessions = self._read_debug_file("mds_sessions") - lines = mds_sessions.split("\n") - return int(lines[0].split()[1]) - - def get_osd_epoch(self): - """ - Return 2-tuple of osd_epoch, osd_epoch_barrier - """ - osd_map = self._read_debug_file("osdmap") - lines = osd_map.split("\n") - first_line_tokens = lines[0].split() - epoch, barrier = int(first_line_tokens[1]), int(first_line_tokens[3]) - - return epoch, barrier diff --git a/src/ceph/qa/tasks/cephfs/mount.py b/src/ceph/qa/tasks/cephfs/mount.py deleted file mode 100644 index 4f96e6c..0000000 --- a/src/ceph/qa/tasks/cephfs/mount.py +++ /dev/null @@ -1,627 +0,0 @@ -from contextlib import contextmanager -import json -import logging -import datetime -import time -from textwrap import dedent -import os -from StringIO import StringIO -from teuthology.orchestra import run -from teuthology.orchestra.run import CommandFailedError, ConnectionLostError - -log = logging.getLogger(__name__) - - -class CephFSMount(object): - def __init__(self, test_dir, client_id, client_remote): - """ - :param test_dir: Global teuthology test dir - :param client_id: Client ID, the 'foo' in client.foo - :param client_remote: Remote instance for the host where client will run - """ - - self.test_dir = test_dir - self.client_id = client_id - self.client_remote = client_remote - self.mountpoint_dir_name = 'mnt.{id}'.format(id=self.client_id) - - self.test_files = ['a', 'b', 'c'] - - self.background_procs = [] - - @property - def mountpoint(self): - return os.path.join( - self.test_dir, '{dir_name}'.format(dir_name=self.mountpoint_dir_name)) - - def is_mounted(self): - raise NotImplementedError() - - def mount(self, mount_path=None, mount_fs_name=None): - raise NotImplementedError() - - def umount(self): - raise NotImplementedError() - - def umount_wait(self, force=False, require_clean=False): - """ - - :param force: Expect that the mount will not shutdown cleanly: kill - it hard. - :param require_clean: Wait for the Ceph client associated with the - mount (e.g. ceph-fuse) to terminate, and - raise if it doesn't do so cleanly. - :return: - """ - raise NotImplementedError() - - def kill_cleanup(self): - raise NotImplementedError() - - def kill(self): - raise NotImplementedError() - - def cleanup(self): - raise NotImplementedError() - - def wait_until_mounted(self): - raise NotImplementedError() - - def get_keyring_path(self): - return '/etc/ceph/ceph.client.{id}.keyring'.format(id=self.client_id) - - @property - def config_path(self): - """ - Path to ceph.conf: override this if you're not a normal systemwide ceph install - :return: stringv - """ - return "/etc/ceph/ceph.conf" - - @contextmanager - def mounted(self): - """ - A context manager, from an initially unmounted state, to mount - this, yield, and then unmount and clean up. 
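- 
- Illustrative usage (a minimal sketch, assuming a FuseMount/KernelMount
- instance named 'mount'):
- 
- with mount.mounted():
- mount.create_files()
- mount.check_files()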
- """ - self.mount() - self.wait_until_mounted() - try: - yield - finally: - self.umount_wait() - - def create_files(self): - assert(self.is_mounted()) - - for suffix in self.test_files: - log.info("Creating file {0}".format(suffix)) - self.client_remote.run(args=[ - 'sudo', 'touch', os.path.join(self.mountpoint, suffix) - ]) - - def check_files(self): - assert(self.is_mounted()) - - for suffix in self.test_files: - log.info("Checking file {0}".format(suffix)) - r = self.client_remote.run(args=[ - 'sudo', 'ls', os.path.join(self.mountpoint, suffix) - ], check_status=False) - if r.exitstatus != 0: - raise RuntimeError("Expected file {0} not found".format(suffix)) - - def create_destroy(self): - assert(self.is_mounted()) - - filename = "{0} {1}".format(datetime.datetime.now(), self.client_id) - log.debug("Creating test file {0}".format(filename)) - self.client_remote.run(args=[ - 'sudo', 'touch', os.path.join(self.mountpoint, filename) - ]) - log.debug("Deleting test file {0}".format(filename)) - self.client_remote.run(args=[ - 'sudo', 'rm', '-f', os.path.join(self.mountpoint, filename) - ]) - - def _run_python(self, pyscript): - return self.client_remote.run(args=[ - 'sudo', 'adjust-ulimits', 'daemon-helper', 'kill', 'python', '-c', pyscript - ], wait=False, stdin=run.PIPE, stdout=StringIO()) - - def run_python(self, pyscript): - p = self._run_python(pyscript) - p.wait() - return p.stdout.getvalue().strip() - - def run_shell(self, args, wait=True): - args = ["cd", self.mountpoint, run.Raw('&&'), "sudo"] + args - return self.client_remote.run(args=args, stdout=StringIO(), - stderr=StringIO(), wait=wait) - - def open_no_data(self, basename): - """ - A pure metadata operation - """ - assert(self.is_mounted()) - - path = os.path.join(self.mountpoint, basename) - - p = self._run_python(dedent( - """ - f = open("{path}", 'w') - """.format(path=path) - )) - p.wait() - - def open_background(self, basename="background_file"): - """ - Open a file for writing, then block such that the client - will hold a capability. - - Don't return until the remote process has got as far as opening - the file, then return the RemoteProcess instance. - """ - assert(self.is_mounted()) - - path = os.path.join(self.mountpoint, basename) - - pyscript = dedent(""" - import time - - f = open("{path}", 'w') - f.write('content') - f.flush() - f.write('content2') - while True: - time.sleep(1) - """).format(path=path) - - rproc = self._run_python(pyscript) - self.background_procs.append(rproc) - - # This wait would not be sufficient if the file had already - # existed, but it's simple and in practice users of open_background - # are not using it on existing files. 
- self.wait_for_visible(basename) - - return rproc - - def wait_for_visible(self, basename="background_file", timeout=30): - i = 0 - while i < timeout: - r = self.client_remote.run(args=[ - 'sudo', 'ls', os.path.join(self.mountpoint, basename) - ], check_status=False) - if r.exitstatus == 0: - log.debug("File {0} became visible from {1} after {2}s".format( - basename, self.client_id, i)) - return - else: - time.sleep(1) - i += 1 - - raise RuntimeError("Timed out after {0}s waiting for {1} to become visible from {2}".format( - i, basename, self.client_id)) - - def lock_background(self, basename="background_file", do_flock=True): - """ - Open and lock a files for writing, hold the lock in a background process - """ - assert(self.is_mounted()) - - path = os.path.join(self.mountpoint, basename) - - script_builder = """ - import time - import fcntl - import struct""" - if do_flock: - script_builder += """ - f1 = open("{path}-1", 'w') - fcntl.flock(f1, fcntl.LOCK_EX | fcntl.LOCK_NB)""" - script_builder += """ - f2 = open("{path}-2", 'w') - lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0) - fcntl.fcntl(f2, fcntl.F_SETLK, lockdata) - while True: - time.sleep(1) - """ - - pyscript = dedent(script_builder).format(path=path) - - log.info("lock_background file {0}".format(basename)) - rproc = self._run_python(pyscript) - self.background_procs.append(rproc) - return rproc - - def lock_and_release(self, basename="background_file"): - assert(self.is_mounted()) - - path = os.path.join(self.mountpoint, basename) - - script = """ - import time - import fcntl - import struct - f1 = open("{path}-1", 'w') - fcntl.flock(f1, fcntl.LOCK_EX) - f2 = open("{path}-2", 'w') - lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0) - fcntl.fcntl(f2, fcntl.F_SETLK, lockdata) - """ - pyscript = dedent(script).format(path=path) - - log.info("lock_and_release file {0}".format(basename)) - return self._run_python(pyscript) - - def check_filelock(self, basename="background_file", do_flock=True): - assert(self.is_mounted()) - - path = os.path.join(self.mountpoint, basename) - - script_builder = """ - import fcntl - import errno - import struct""" - if do_flock: - script_builder += """ - f1 = open("{path}-1", 'r') - try: - fcntl.flock(f1, fcntl.LOCK_EX | fcntl.LOCK_NB) - except IOError, e: - if e.errno == errno.EAGAIN: - pass - else: - raise RuntimeError("flock on file {path}-1 not found")""" - script_builder += """ - f2 = open("{path}-2", 'r') - try: - lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0) - fcntl.fcntl(f2, fcntl.F_SETLK, lockdata) - except IOError, e: - if e.errno == errno.EAGAIN: - pass - else: - raise RuntimeError("posix lock on file {path}-2 not found") - """ - pyscript = dedent(script_builder).format(path=path) - - log.info("check lock on file {0}".format(basename)) - self.client_remote.run(args=[ - 'sudo', 'python', '-c', pyscript - ]) - - def write_background(self, basename="background_file", loop=False): - """ - Open a file for writing, complete as soon as you can - :param basename: - :return: - """ - assert(self.is_mounted()) - - path = os.path.join(self.mountpoint, basename) - - pyscript = dedent(""" - import os - import time - - fd = os.open("{path}", os.O_RDWR | os.O_CREAT, 0644) - try: - while True: - os.write(fd, 'content') - time.sleep(1) - if not {loop}: - break - except IOError, e: - pass - os.close(fd) - """).format(path=path, loop=str(loop)) - - rproc = self._run_python(pyscript) - self.background_procs.append(rproc) - return rproc - - def write_n_mb(self, 
filename, n_mb, seek=0, wait=True): - """ - Write the requested number of megabytes to a file - """ - assert(self.is_mounted()) - - return self.run_shell(["dd", "if=/dev/urandom", "of={0}".format(filename), - "bs=1M", "conv=fdatasync", - "count={0}".format(n_mb), - "seek={0}".format(seek) - ], wait=wait) - - def write_test_pattern(self, filename, size): - log.info("Writing {0} bytes to {1}".format(size, filename)) - return self.run_python(dedent(""" - import zlib - path = "{path}" - f = open(path, 'w') - for i in range(0, {size}): - val = zlib.crc32("%s" % i) & 7 - f.write(chr(val)) - f.close() - """.format( - path=os.path.join(self.mountpoint, filename), - size=size - ))) - - def validate_test_pattern(self, filename, size): - log.info("Validating {0} bytes from {1}".format(size, filename)) - return self.run_python(dedent(""" - import zlib - path = "{path}" - f = open(path, 'r') - bytes = f.read() - f.close() - if len(bytes) != {size}: - raise RuntimeError("Bad length {{0}} vs. expected {{1}}".format( - len(bytes), {size} - )) - for i, b in enumerate(bytes): - val = zlib.crc32("%s" % i) & 7 - if b != chr(val): - raise RuntimeError("Bad data at offset {{0}}".format(i)) - """.format( - path=os.path.join(self.mountpoint, filename), - size=size - ))) - - def open_n_background(self, fs_path, count): - """ - Open N files for writing, hold them open in a background process - - :param fs_path: Path relative to CephFS root, e.g. "foo/bar" - :return: a RemoteProcess - """ - assert(self.is_mounted()) - - abs_path = os.path.join(self.mountpoint, fs_path) - - pyscript = dedent(""" - import sys - import time - import os - - n = {count} - abs_path = "{abs_path}" - - if not os.path.exists(os.path.dirname(abs_path)): - os.makedirs(os.path.dirname(abs_path)) - - handles = [] - for i in range(0, n): - fname = "{{0}}_{{1}}".format(abs_path, i) - handles.append(open(fname, 'w')) - - while True: - time.sleep(1) - """).format(abs_path=abs_path, count=count) - - rproc = self._run_python(pyscript) - self.background_procs.append(rproc) - return rproc - - def create_n_files(self, fs_path, count, sync=False): - assert(self.is_mounted()) - - abs_path = os.path.join(self.mountpoint, fs_path) - - pyscript = dedent(""" - import sys - import time - import os - - n = {count} - abs_path = "{abs_path}" - - if not os.path.exists(os.path.dirname(abs_path)): - os.makedirs(os.path.dirname(abs_path)) - - for i in range(0, n): - fname = "{{0}}_{{1}}".format(abs_path, i) - h = open(fname, 'w') - h.write('content') - if {sync}: - h.flush() - os.fsync(h.fileno()) - h.close() - """).format(abs_path=abs_path, count=count, sync=str(sync)) - - self.run_python(pyscript) - - def teardown(self): - for p in self.background_procs: - log.info("Terminating background process") - self._kill_background(p) - - self.background_procs = [] - - def _kill_background(self, p): - if p.stdin: - p.stdin.close() - try: - p.wait() - except (CommandFailedError, ConnectionLostError): - pass - - def kill_background(self, p): - """ - For a process that was returned by one of the _background member functions, - kill it hard. 
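- 
- Illustrative usage (a minimal sketch; the file name is arbitrary):
- 
- p = mount.open_background("some_file")
- # ... provoke the failure being tested ...
- mount.kill_background(p)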
- """ - self._kill_background(p) - self.background_procs.remove(p) - - def get_global_id(self): - raise NotImplementedError() - - def get_osd_epoch(self): - raise NotImplementedError() - - def stat(self, fs_path, wait=True): - """ - stat a file, and return the result as a dictionary like this: - { - "st_ctime": 1414161137.0, - "st_mtime": 1414161137.0, - "st_nlink": 33, - "st_gid": 0, - "st_dev": 16777218, - "st_size": 1190, - "st_ino": 2, - "st_uid": 0, - "st_mode": 16877, - "st_atime": 1431520593.0 - } - - Raises exception on absent file. - """ - abs_path = os.path.join(self.mountpoint, fs_path) - - pyscript = dedent(""" - import os - import stat - import json - import sys - - try: - s = os.stat("{path}") - except OSError as e: - sys.exit(e.errno) - - attrs = ["st_mode", "st_ino", "st_dev", "st_nlink", "st_uid", "st_gid", "st_size", "st_atime", "st_mtime", "st_ctime"] - print json.dumps( - dict([(a, getattr(s, a)) for a in attrs]), - indent=2) - """).format(path=abs_path) - proc = self._run_python(pyscript) - if wait: - proc.wait() - return json.loads(proc.stdout.getvalue().strip()) - else: - return proc - - def touch(self, fs_path): - """ - Create a dentry if it doesn't already exist. This python - implementation exists because the usual command line tool doesn't - pass through error codes like EIO. - - :param fs_path: - :return: - """ - abs_path = os.path.join(self.mountpoint, fs_path) - pyscript = dedent(""" - import sys - import errno - - try: - f = open("{path}", "w") - f.close() - except IOError as e: - sys.exit(errno.EIO) - """).format(path=abs_path) - proc = self._run_python(pyscript) - proc.wait() - - def path_to_ino(self, fs_path, follow_symlinks=True): - abs_path = os.path.join(self.mountpoint, fs_path) - - if follow_symlinks: - pyscript = dedent(""" - import os - import stat - - print os.stat("{path}").st_ino - """).format(path=abs_path) - else: - pyscript = dedent(""" - import os - import stat - - print os.lstat("{path}").st_ino - """).format(path=abs_path) - - proc = self._run_python(pyscript) - proc.wait() - return int(proc.stdout.getvalue().strip()) - - def path_to_nlink(self, fs_path): - abs_path = os.path.join(self.mountpoint, fs_path) - - pyscript = dedent(""" - import os - import stat - - print os.stat("{path}").st_nlink - """).format(path=abs_path) - - proc = self._run_python(pyscript) - proc.wait() - return int(proc.stdout.getvalue().strip()) - - def ls(self, path=None): - """ - Wrap ls: return a list of strings - """ - cmd = ["ls"] - if path: - cmd.append(path) - - ls_text = self.run_shell(cmd).stdout.getvalue().strip() - - if ls_text: - return ls_text.split("\n") - else: - # Special case because otherwise split on empty string - # gives you [''] instead of [] - return [] - - def setfattr(self, path, key, val): - """ - Wrap setfattr. - - :param path: relative to mount point - :param key: xattr name - :param val: xattr value - :return: None - """ - self.run_shell(["setfattr", "-n", key, "-v", val, path]) - - def getfattr(self, path, attr): - """ - Wrap getfattr: return the values of a named xattr on one file, or - None if the attribute is not found. 
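- For instance, getfattr("./file", "ceph.file.layout.pool") returns the name of
- the file's data pool, the same vxattr that test_backtrace below sets via setfattr.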
- - :return: a string - """ - p = self.run_shell(["getfattr", "--only-values", "-n", attr, path], wait=False) - try: - p.wait() - except CommandFailedError as e: - if e.exitstatus == 1 and "No such attribute" in p.stderr.getvalue(): - return None - else: - raise - - return p.stdout.getvalue() - - def df(self): - """ - Wrap df: return a dict of usage fields in bytes - """ - - p = self.run_shell(["df", "-B1", "."]) - lines = p.stdout.getvalue().strip().split("\n") - fs, total, used, avail = lines[1].split()[:4] - log.warn(lines) - - return { - "total": int(total), - "used": int(used), - "available": int(avail) - } diff --git a/src/ceph/qa/tasks/cephfs/test_auto_repair.py b/src/ceph/qa/tasks/cephfs/test_auto_repair.py deleted file mode 100644 index c0aa2e4..0000000 --- a/src/ceph/qa/tasks/cephfs/test_auto_repair.py +++ /dev/null @@ -1,90 +0,0 @@ - -""" -Exercise the MDS's auto repair functions -""" - -import logging -import time - -from teuthology.orchestra.run import CommandFailedError -from tasks.cephfs.cephfs_test_case import CephFSTestCase - - -log = logging.getLogger(__name__) - - -# Arbitrary timeouts for operations involving restarting -# an MDS or waiting for it to come up -MDS_RESTART_GRACE = 60 - - -class TestMDSAutoRepair(CephFSTestCase): - def test_backtrace_repair(self): - """ - MDS should verify/fix backtrace on fetch dirfrag - """ - - self.mount_a.run_shell(["mkdir", "testdir1"]) - self.mount_a.run_shell(["touch", "testdir1/testfile"]) - dir_objname = "{:x}.00000000".format(self.mount_a.path_to_ino("testdir1")) - - # drop inodes caps - self.mount_a.umount_wait() - - # flush journal entries to dirfrag objects, and expire journal - self.fs.mds_asok(['flush', 'journal']) - - # Restart the MDS to drop the metadata cache (because we expired the journal, - # nothing gets replayed into cache on restart) - self.fs.mds_stop() - self.fs.mds_fail_restart() - self.fs.wait_for_daemons() - - # remove testdir1's backtrace - self.fs.rados(["rmxattr", dir_objname, "parent"]) - - # readdir (fetch dirfrag) should fix testdir1's backtrace - self.mount_a.mount() - self.mount_a.wait_until_mounted() - self.mount_a.run_shell(["ls", "testdir1"]) - - # flush journal entries to dirfrag objects - self.fs.mds_asok(['flush', 'journal']) - - # check if backtrace exists - self.fs.rados(["getxattr", dir_objname, "parent"]) - - def test_mds_readonly(self): - """ - test if MDS behave correct when it's readonly - """ - # operation should successd when MDS is not readonly - self.mount_a.run_shell(["touch", "test_file1"]) - writer = self.mount_a.write_background(loop=True) - - time.sleep(10) - self.assertFalse(writer.finished) - - # force MDS to read-only mode - self.fs.mds_asok(['force_readonly']) - time.sleep(10) - - # touching test file should fail - try: - self.mount_a.run_shell(["touch", "test_file1"]) - except CommandFailedError: - pass - else: - self.assertTrue(False) - - # background writer also should fail - self.assertTrue(writer.finished) - - # The MDS should report its readonly health state to the mon - self.wait_for_health("MDS_READ_ONLY", timeout=30) - - # restart mds to make it writable - self.fs.mds_fail_restart() - self.fs.wait_for_daemons() - - self.wait_for_health_clear(timeout=30) diff --git a/src/ceph/qa/tasks/cephfs/test_backtrace.py b/src/ceph/qa/tasks/cephfs/test_backtrace.py deleted file mode 100644 index af246a1..0000000 --- a/src/ceph/qa/tasks/cephfs/test_backtrace.py +++ /dev/null @@ -1,78 +0,0 @@ - -from tasks.cephfs.cephfs_test_case import CephFSTestCase - - -class 
TestBacktrace(CephFSTestCase): - def test_backtrace(self): - """ - That the 'parent' and 'layout' xattrs on the head objects of files - are updated correctly. - """ - - old_data_pool_name = self.fs.get_data_pool_name() - old_pool_id = self.fs.get_data_pool_id() - - # Create a file for subsequent checks - self.mount_a.run_shell(["mkdir", "parent_a"]) - self.mount_a.run_shell(["touch", "parent_a/alpha"]) - file_ino = self.mount_a.path_to_ino("parent_a/alpha") - - # That backtrace and layout are written after initial flush - self.fs.mds_asok(["flush", "journal"]) - backtrace = self.fs.read_backtrace(file_ino) - self.assertEqual(['alpha', 'parent_a'], [a['dname'] for a in backtrace['ancestors']]) - layout = self.fs.read_layout(file_ino) - self.assertDictEqual(layout, { - "stripe_unit": 4194304, - "stripe_count": 1, - "object_size": 4194304, - "pool_id": old_pool_id, - "pool_ns": "", - }) - self.assertEqual(backtrace['pool'], old_pool_id) - - # That backtrace is written after parentage changes - self.mount_a.run_shell(["mkdir", "parent_b"]) - self.mount_a.run_shell(["mv", "parent_a/alpha", "parent_b/alpha"]) - - self.fs.mds_asok(["flush", "journal"]) - backtrace = self.fs.read_backtrace(file_ino) - self.assertEqual(['alpha', 'parent_b'], [a['dname'] for a in backtrace['ancestors']]) - - # Create a new data pool - new_pool_name = "data_new" - new_pool_id = self.fs.add_data_pool(new_pool_name) - - # That an object which has switched pools gets its backtrace updated - self.mount_a.setfattr("./parent_b/alpha", - "ceph.file.layout.pool", new_pool_name) - self.fs.mds_asok(["flush", "journal"]) - backtrace_old_pool = self.fs.read_backtrace(file_ino, pool=old_data_pool_name) - self.assertEqual(backtrace_old_pool['pool'], new_pool_id) - backtrace_new_pool = self.fs.read_backtrace(file_ino, pool=new_pool_name) - self.assertEqual(backtrace_new_pool['pool'], new_pool_id) - new_pool_layout = self.fs.read_layout(file_ino, pool=new_pool_name) - self.assertEqual(new_pool_layout['pool_id'], new_pool_id) - self.assertEqual(new_pool_layout['pool_ns'], '') - - # That subsequent linkage changes are only written to new pool backtrace - self.mount_a.run_shell(["mkdir", "parent_c"]) - self.mount_a.run_shell(["mv", "parent_b/alpha", "parent_c/alpha"]) - self.fs.mds_asok(["flush", "journal"]) - backtrace_old_pool = self.fs.read_backtrace(file_ino, pool=old_data_pool_name) - self.assertEqual(['alpha', 'parent_b'], [a['dname'] for a in backtrace_old_pool['ancestors']]) - backtrace_new_pool = self.fs.read_backtrace(file_ino, pool=new_pool_name) - self.assertEqual(['alpha', 'parent_c'], [a['dname'] for a in backtrace_new_pool['ancestors']]) - - # That layout is written to new pool after change to other field in layout - self.mount_a.setfattr("./parent_c/alpha", - "ceph.file.layout.object_size", "8388608") - - self.fs.mds_asok(["flush", "journal"]) - new_pool_layout = self.fs.read_layout(file_ino, pool=new_pool_name) - self.assertEqual(new_pool_layout['object_size'], 8388608) - - # ...but not to the old pool: the old pool's backtrace points to the new pool, and that's enough, - # we don't update the layout in all the old pools whenever it changes - old_pool_layout = self.fs.read_layout(file_ino, pool=old_data_pool_name) - self.assertEqual(old_pool_layout['object_size'], 4194304) diff --git a/src/ceph/qa/tasks/cephfs/test_cap_flush.py b/src/ceph/qa/tasks/cephfs/test_cap_flush.py deleted file mode 100644 index 1cd102f..0000000 --- a/src/ceph/qa/tasks/cephfs/test_cap_flush.py +++ /dev/null @@ -1,64 +0,0 @@ - -import os -import 
time -from textwrap import dedent -from unittest import SkipTest -from tasks.cephfs.fuse_mount import FuseMount -from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology - -class TestCapFlush(CephFSTestCase): - @for_teuthology - def test_replay_create(self): - """ - MDS starts to handle client caps when it enters clientreplay stage. - When handling a client cap in clientreplay stage, it's possible that - corresponding inode does not exist because the client request which - creates inode hasn't been replayed. - """ - - if not isinstance(self.mount_a, FuseMount): - raise SkipTest("Require FUSE client to inject client release failure") - - dir_path = os.path.join(self.mount_a.mountpoint, "testdir") - py_script = dedent(""" - import os - os.mkdir("{0}") - fd = os.open("{0}", os.O_RDONLY) - os.fchmod(fd, 0777) - os.fsync(fd) - """).format(dir_path) - self.mount_a.run_python(py_script) - - self.fs.mds_asok(["flush", "journal"]) - - # client will only get unsafe replay - self.fs.mds_asok(["config", "set", "mds_log_pause", "1"]) - - file_name = "testfile" - file_path = dir_path + "/" + file_name - - # Create a file and modify its mode. ceph-fuse will mark Ax cap dirty - py_script = dedent(""" - import os - os.chdir("{0}") - os.setgid(65534) - os.setuid(65534) - fd = os.open("{1}", os.O_CREAT | os.O_RDWR, 0644) - os.fchmod(fd, 0640) - """).format(dir_path, file_name) - self.mount_a.run_python(py_script) - - # Modify file mode by different user. ceph-fuse will send a setattr request - self.mount_a.run_shell(["chmod", "600", file_path], wait=False) - - time.sleep(10) - - # Restart mds. Client will re-send the unsafe request and cap flush - self.fs.mds_stop() - self.fs.mds_fail_restart() - self.fs.wait_for_daemons() - - mode = self.mount_a.run_shell(['stat', '-c' '%a', file_path]).stdout.getvalue().strip() - # If the cap flush get dropped, mode should be 0644. - # (Ax cap stays in dirty state, which prevents setattr reply from updating file mode) - self.assertEqual(mode, "600") diff --git a/src/ceph/qa/tasks/cephfs/test_client_limits.py b/src/ceph/qa/tasks/cephfs/test_client_limits.py deleted file mode 100644 index cb5e3a4..0000000 --- a/src/ceph/qa/tasks/cephfs/test_client_limits.py +++ /dev/null @@ -1,239 +0,0 @@ - -""" -Exercise the MDS's behaviour when clients and the MDCache reach or -exceed the limits of how many caps/inodes they should hold. -""" - -import logging -from textwrap import dedent -from unittest import SkipTest -from teuthology.orchestra.run import CommandFailedError -from tasks.cephfs.cephfs_test_case import CephFSTestCase, needs_trimming -from tasks.cephfs.fuse_mount import FuseMount -import os - - -log = logging.getLogger(__name__) - - -# Arbitrary timeouts for operations involving restarting -# an MDS or waiting for it to come up -MDS_RESTART_GRACE = 60 - -# Hardcoded values from Server::recall_client_state -CAP_RECALL_RATIO = 0.8 -CAP_RECALL_MIN = 100 - - -class TestClientLimits(CephFSTestCase): - REQUIRE_KCLIENT_REMOTE = True - CLIENTS_REQUIRED = 2 - - def _test_client_pin(self, use_subdir, open_files): - """ - When a client pins an inode in its cache, for example because the file is held open, - it should reject requests from the MDS to trim these caps. The MDS should complain - to the user that it is unable to enforce its cache size limits because of this - objectionable client. 
- - :param use_subdir: whether to put test files in a subdir or use root - """ - - cache_size = open_files/2 - - self.set_conf('mds', 'mds cache size', cache_size) - self.fs.mds_fail_restart() - self.fs.wait_for_daemons() - - mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client")) - self.assertTrue(open_files >= mds_min_caps_per_client) - mds_max_ratio_caps_per_client = float(self.fs.get_config("mds_max_ratio_caps_per_client")) - - mount_a_client_id = self.mount_a.get_global_id() - path = "subdir/mount_a" if use_subdir else "mount_a" - open_proc = self.mount_a.open_n_background(path, open_files) - - # Client should now hold: - # `open_files` caps for the open files - # 1 cap for root - # 1 cap for subdir - self.wait_until_equal(lambda: self.get_session(mount_a_client_id)['num_caps'], - open_files + (2 if use_subdir else 1), - timeout=600, - reject_fn=lambda x: x > open_files + 2) - - # MDS should not be happy about that, as the client is failing to comply - # with the SESSION_RECALL messages it is being sent - mds_recall_state_timeout = float(self.fs.get_config("mds_recall_state_timeout")) - self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_state_timeout+10) - - # We can also test that the MDS health warning for oversized - # cache is functioning as intended. - self.wait_for_health("MDS_CACHE_OVERSIZED", - mds_recall_state_timeout + 10) - - # When the client closes the files, it should retain only as many caps as allowed - # under the SESSION_RECALL policy - log.info("Terminating process holding files open") - open_proc.stdin.close() - try: - open_proc.wait() - except CommandFailedError: - # We killed it, so it raises an error - pass - - # The remaining caps should comply with the numbers sent from MDS in SESSION_RECALL message, - # which depend on the caps outstanding, cache size and overall ratio - recall_expected_value = int((1.0-mds_max_ratio_caps_per_client)*(open_files+2)) - def expected_caps(): - num_caps = self.get_session(mount_a_client_id)['num_caps'] - if num_caps < mds_min_caps_per_client: - raise RuntimeError("client caps fell below min!") - elif num_caps == mds_min_caps_per_client: - return True - elif recall_expected_value*.95 <= num_caps <= recall_expected_value*1.05: - return True - else: - return False - - self.wait_until_true(expected_caps, timeout=60) - - @needs_trimming - def test_client_pin_root(self): - self._test_client_pin(False, 400) - - @needs_trimming - def test_client_pin(self): - self._test_client_pin(True, 800) - - @needs_trimming - def test_client_pin_mincaps(self): - self._test_client_pin(True, 200) - - def test_client_release_bug(self): - """ - When a client has a bug (which we will simulate) preventing it from releasing caps, - the MDS should notice that releases are not being sent promptly, and generate a health - metric to that effect. - """ - - # The debug hook to inject the failure only exists in the fuse client - if not isinstance(self.mount_a, FuseMount): - raise SkipTest("Require FUSE client to inject client release failure") - - self.set_conf('client.{0}'.format(self.mount_a.client_id), 'client inject release failure', 'true') - self.mount_a.teardown() - self.mount_a.mount() - self.mount_a.wait_until_mounted() - mount_a_client_id = self.mount_a.get_global_id() - - # Client A creates a file. 
He will hold the write caps on the file, and later (simulated bug) fail - # to comply with the MDSs request to release that cap - self.mount_a.run_shell(["touch", "file1"]) - - # Client B tries to stat the file that client A created - rproc = self.mount_b.write_background("file1") - - # After mds_revoke_cap_timeout, we should see a health warning (extra lag from - # MDS beacon period) - mds_revoke_cap_timeout = float(self.fs.get_config("mds_revoke_cap_timeout")) - self.wait_for_health("MDS_CLIENT_LATE_RELEASE", mds_revoke_cap_timeout + 10) - - # Client B should still be stuck - self.assertFalse(rproc.finished) - - # Kill client A - self.mount_a.kill() - self.mount_a.kill_cleanup() - - # Client B should complete - self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id]) - rproc.wait() - - def test_client_oldest_tid(self): - """ - When a client does not advance its oldest tid, the MDS should notice that - and generate health warnings. - """ - - # num of requests client issues - max_requests = 1000 - - # The debug hook to inject the failure only exists in the fuse client - if not isinstance(self.mount_a, FuseMount): - raise SkipTest("Require FUSE client to inject client release failure") - - self.set_conf('client', 'client inject fixed oldest tid', 'true') - self.mount_a.teardown() - self.mount_a.mount() - self.mount_a.wait_until_mounted() - - self.fs.mds_asok(['config', 'set', 'mds_max_completed_requests', '{0}'.format(max_requests)]) - - # Create lots of files - self.mount_a.create_n_files("testdir/file1", max_requests + 100) - - # Create a few files synchronously. This makes sure previous requests are completed - self.mount_a.create_n_files("testdir/file2", 5, True) - - # Wait for the health warnings. Assume mds can handle 10 request per second at least - self.wait_for_health("MDS_CLIENT_OLDEST_TID", max_requests / 10) - - def _test_client_cache_size(self, mount_subdir): - """ - check if client invalidate kernel dcache according to its cache size config - """ - - # The debug hook to inject the failure only exists in the fuse client - if not isinstance(self.mount_a, FuseMount): - raise SkipTest("Require FUSE client to inject client release failure") - - if mount_subdir: - # fuse assigns a fix inode number (1) to root inode. But in mounting into - # subdir case, the actual inode number of root is not 1. This mismatch - # confuses fuse_lowlevel_notify_inval_entry() when invalidating dentries - # in root directory. 
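# Illustrative aside, not part of the original file: the root-inode check made
# below via path_to_ino(".") can be approximated with plain os.stat(). On a FUSE
# mount of the filesystem root, st_ino is expected to be 1, while a subdirectory
# mount reports that subdirectory's real inode number. The mountpoint path in
# the usage note is hypothetical.
import os

def mount_root_ino(mountpoint):
    # Inode number reported for the root of a mounted path.
    return os.stat(mountpoint).st_ino

# usage (root mount): mount_root_ino("/mnt/cephfs") == 1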
- self.mount_a.run_shell(["mkdir", "subdir"]) - self.mount_a.umount_wait() - self.set_conf('client', 'client mountpoint', '/subdir') - self.mount_a.mount() - self.mount_a.wait_until_mounted() - root_ino = self.mount_a.path_to_ino(".") - self.assertEqual(root_ino, 1); - - dir_path = os.path.join(self.mount_a.mountpoint, "testdir") - - mkdir_script = dedent(""" - import os - os.mkdir("{path}") - for n in range(0, {num_dirs}): - os.mkdir("{path}/dir{{0}}".format(n)) - """) - - num_dirs = 1000 - self.mount_a.run_python(mkdir_script.format(path=dir_path, num_dirs=num_dirs)) - self.mount_a.run_shell(["sync"]) - - dentry_count, dentry_pinned_count = self.mount_a.get_dentry_count() - self.assertGreaterEqual(dentry_count, num_dirs) - self.assertGreaterEqual(dentry_pinned_count, num_dirs) - - cache_size = num_dirs / 10 - self.mount_a.set_cache_size(cache_size) - - def trimmed(): - dentry_count, dentry_pinned_count = self.mount_a.get_dentry_count() - log.info("waiting, dentry_count, dentry_pinned_count: {0}, {1}".format( - dentry_count, dentry_pinned_count - )) - if dentry_count > cache_size or dentry_pinned_count > cache_size: - return False - - return True - - self.wait_until_true(trimmed, 30) - - @needs_trimming - def test_client_cache_size(self): - self._test_client_cache_size(False) - self._test_client_cache_size(True) diff --git a/src/ceph/qa/tasks/cephfs/test_client_recovery.py b/src/ceph/qa/tasks/cephfs/test_client_recovery.py deleted file mode 100644 index fd58c14..0000000 --- a/src/ceph/qa/tasks/cephfs/test_client_recovery.py +++ /dev/null @@ -1,474 +0,0 @@ - -""" -Teuthology task for exercising CephFS client recovery -""" - -import logging -from textwrap import dedent -import time -import distutils.version as version -import re -import os - -from teuthology.orchestra.run import CommandFailedError, ConnectionLostError -from tasks.cephfs.cephfs_test_case import CephFSTestCase -from teuthology.packaging import get_package_version - - -log = logging.getLogger(__name__) - - -# Arbitrary timeouts for operations involving restarting -# an MDS or waiting for it to come up -MDS_RESTART_GRACE = 60 - - -class TestClientNetworkRecovery(CephFSTestCase): - REQUIRE_KCLIENT_REMOTE = True - REQUIRE_ONE_CLIENT_REMOTE = True - CLIENTS_REQUIRED = 2 - - LOAD_SETTINGS = ["mds_session_timeout", "mds_reconnect_timeout", "ms_max_backoff"] - - # Environment references - mds_session_timeout = None - mds_reconnect_timeout = None - ms_max_backoff = None - - def test_network_death(self): - """ - Simulate software freeze or temporary network failure. - - Check that the client blocks I/O during failure, and completes - I/O after failure. 
- """ - - # We only need one client - self.mount_b.umount_wait() - - # Initially our one client session should be visible - client_id = self.mount_a.get_global_id() - ls_data = self._session_list() - self.assert_session_count(1, ls_data) - self.assertEqual(ls_data[0]['id'], client_id) - self.assert_session_state(client_id, "open") - - # ...and capable of doing I/O without blocking - self.mount_a.create_files() - - # ...but if we turn off the network - self.fs.set_clients_block(True) - - # ...and try and start an I/O - write_blocked = self.mount_a.write_background() - - # ...then it should block - self.assertFalse(write_blocked.finished) - self.assert_session_state(client_id, "open") - time.sleep(self.mds_session_timeout * 1.5) # Long enough for MDS to consider session stale - self.assertFalse(write_blocked.finished) - self.assert_session_state(client_id, "stale") - - # ...until we re-enable I/O - self.fs.set_clients_block(False) - - # ...when it should complete promptly - a = time.time() - self.wait_until_true(lambda: write_blocked.finished, self.ms_max_backoff * 2) - write_blocked.wait() # Already know we're finished, wait() to raise exception on errors - recovery_time = time.time() - a - log.info("recovery time: {0}".format(recovery_time)) - self.assert_session_state(client_id, "open") - - -class TestClientRecovery(CephFSTestCase): - REQUIRE_KCLIENT_REMOTE = True - CLIENTS_REQUIRED = 2 - - LOAD_SETTINGS = ["mds_session_timeout", "mds_reconnect_timeout", "ms_max_backoff"] - - # Environment references - mds_session_timeout = None - mds_reconnect_timeout = None - ms_max_backoff = None - - def test_basic(self): - # Check that two clients come up healthy and see each others' files - # ===================================================== - self.mount_a.create_files() - self.mount_a.check_files() - self.mount_a.umount_wait() - - self.mount_b.check_files() - - self.mount_a.mount() - self.mount_a.wait_until_mounted() - - # Check that the admin socket interface is correctly reporting - # two sessions - # ===================================================== - ls_data = self._session_list() - self.assert_session_count(2, ls_data) - - self.assertSetEqual( - set([l['id'] for l in ls_data]), - {self.mount_a.get_global_id(), self.mount_b.get_global_id()} - ) - - def test_restart(self): - # Check that after an MDS restart both clients reconnect and continue - # to handle I/O - # ===================================================== - self.fs.mds_fail_restart() - self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE) - - self.mount_a.create_destroy() - self.mount_b.create_destroy() - - def _session_num_caps(self, client_id): - ls_data = self.fs.mds_asok(['session', 'ls']) - return int(self._session_by_id(ls_data).get(client_id, {'num_caps': None})['num_caps']) - - def test_reconnect_timeout(self): - # Reconnect timeout - # ================= - # Check that if I stop an MDS and a client goes away, the MDS waits - # for the reconnect period - self.fs.mds_stop() - self.fs.mds_fail() - - mount_a_client_id = self.mount_a.get_global_id() - self.mount_a.umount_wait(force=True) - - self.fs.mds_restart() - - self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE) - # Check that the MDS locally reports its state correctly - status = self.fs.mds_asok(['status']) - self.assertIn("reconnect_status", status) - - ls_data = self._session_list() - self.assert_session_count(2, ls_data) - - # The session for the dead client should have the 'reconnect' flag set - 
self.assertTrue(self.get_session(mount_a_client_id)['reconnecting']) - - # Wait for the reconnect state to clear, this should take the - # reconnect timeout period. - in_reconnect_for = self.fs.wait_for_state('up:active', timeout=self.mds_reconnect_timeout * 2) - # Check that the period we waited to enter active is within a factor - # of two of the reconnect timeout. - self.assertGreater(in_reconnect_for, self.mds_reconnect_timeout / 2, - "Should have been in reconnect phase for {0} but only took {1}".format( - self.mds_reconnect_timeout, in_reconnect_for - )) - - self.assert_session_count(1) - - # Check that the client that timed out during reconnect can - # mount again and do I/O - self.mount_a.mount() - self.mount_a.wait_until_mounted() - self.mount_a.create_destroy() - - self.assert_session_count(2) - - def test_reconnect_eviction(self): - # Eviction during reconnect - # ========================= - mount_a_client_id = self.mount_a.get_global_id() - - self.fs.mds_stop() - self.fs.mds_fail() - - # The mount goes away while the MDS is offline - self.mount_a.kill() - - self.fs.mds_restart() - - # Enter reconnect phase - self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE) - self.assert_session_count(2) - - # Evict the stuck client - self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id]) - self.assert_session_count(1) - - # Observe that we proceed to active phase without waiting full reconnect timeout - evict_til_active = self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE) - # Once we evict the troublemaker, the reconnect phase should complete - # in well under the reconnect timeout. - self.assertLess(evict_til_active, self.mds_reconnect_timeout * 0.5, - "reconnect did not complete soon enough after eviction, took {0}".format( - evict_til_active - )) - - # We killed earlier so must clean up before trying to use again - self.mount_a.kill_cleanup() - - # Bring the client back - self.mount_a.mount() - self.mount_a.wait_until_mounted() - self.mount_a.create_destroy() - - def test_stale_caps(self): - # Capability release from stale session - # ===================================== - cap_holder = self.mount_a.open_background() - - # Wait for the file to be visible from another client, indicating - # that mount_a has completed its network ops - self.mount_b.wait_for_visible() - - # Simulate client death - self.mount_a.kill() - - try: - # Now, after mds_session_timeout seconds, the waiter should - # complete their operation when the MDS marks the holder's - # session stale. 
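# Illustrative sketch, not from the original file: the timing check performed
# just below, reduced to a generic helper that times a blocking call and asserts
# the wait lands within a band around the configured session timeout.
import time

def assert_waited_about(blocking_fn, expected_secs, low=0.5, high=2.0):
    start = time.time()
    blocking_fn()
    waited = time.time() - start
    assert expected_secs * low <= waited <= expected_secs * high, \
        "waited {0:.1f}s, expected roughly {1:.1f}s".format(waited, expected_secs)
    return waited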
- cap_waiter = self.mount_b.write_background() - a = time.time() - cap_waiter.wait() - b = time.time() - - # Should have succeeded - self.assertEqual(cap_waiter.exitstatus, 0) - - cap_waited = b - a - log.info("cap_waiter waited {0}s".format(cap_waited)) - self.assertTrue(self.mds_session_timeout / 2.0 <= cap_waited <= self.mds_session_timeout * 2.0, - "Capability handover took {0}, expected approx {1}".format( - cap_waited, self.mds_session_timeout - )) - - cap_holder.stdin.close() - try: - cap_holder.wait() - except (CommandFailedError, ConnectionLostError): - # We killed it (and possibly its node), so it raises an error - pass - finally: - # teardown() doesn't quite handle this case cleanly, so help it out - self.mount_a.kill_cleanup() - - self.mount_a.mount() - self.mount_a.wait_until_mounted() - - def test_evicted_caps(self): - # Eviction while holding a capability - # =================================== - - # Take out a write capability on a file on client A, - # and then immediately kill it. - cap_holder = self.mount_a.open_background() - mount_a_client_id = self.mount_a.get_global_id() - - # Wait for the file to be visible from another client, indicating - # that mount_a has completed its network ops - self.mount_b.wait_for_visible() - - # Simulate client death - self.mount_a.kill() - - try: - # The waiter should get stuck waiting for the capability - # held on the MDS by the now-dead client A - cap_waiter = self.mount_b.write_background() - time.sleep(5) - self.assertFalse(cap_waiter.finished) - - self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id]) - # Now, because I evicted the old holder of the capability, it should - # immediately get handed over to the waiter - a = time.time() - cap_waiter.wait() - b = time.time() - cap_waited = b - a - log.info("cap_waiter waited {0}s".format(cap_waited)) - # This is the check that it happened 'now' rather than waiting - # for the session timeout - self.assertLess(cap_waited, self.mds_session_timeout / 2.0, - "Capability handover took {0}, expected less than {1}".format( - cap_waited, self.mds_session_timeout / 2.0 - )) - - cap_holder.stdin.close() - try: - cap_holder.wait() - except (CommandFailedError, ConnectionLostError): - # We killed it (and possibly its node), so it raises an error - pass - finally: - self.mount_a.kill_cleanup() - - self.mount_a.mount() - self.mount_a.wait_until_mounted() - - def test_trim_caps(self): - # Trim capability when reconnecting MDS - # =================================== - - count = 500 - # Create lots of files - for i in range(count): - self.mount_a.run_shell(["touch", "f{0}".format(i)]) - - # Populate mount_b's cache - self.mount_b.run_shell(["ls", "-l"]) - - client_id = self.mount_b.get_global_id() - num_caps = self._session_num_caps(client_id) - self.assertGreaterEqual(num_caps, count) - - # Restart MDS. 
client should trim its cache when reconnecting to the MDS - self.fs.mds_fail_restart() - self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE) - - num_caps = self._session_num_caps(client_id) - self.assertLess(num_caps, count, - "should have less than {0} capabilities, have {1}".format( - count, num_caps - )) - - def _is_flockable(self): - a_version_str = get_package_version(self.mount_a.client_remote, "fuse") - b_version_str = get_package_version(self.mount_b.client_remote, "fuse") - flock_version_str = "2.9" - - version_regex = re.compile(r"[0-9\.]+") - a_result = version_regex.match(a_version_str) - self.assertTrue(a_result) - b_result = version_regex.match(b_version_str) - self.assertTrue(b_result) - a_version = version.StrictVersion(a_result.group()) - b_version = version.StrictVersion(b_result.group()) - flock_version=version.StrictVersion(flock_version_str) - - if (a_version >= flock_version and b_version >= flock_version): - log.info("flock locks are available") - return True - else: - log.info("not testing flock locks, machines have versions {av} and {bv}".format( - av=a_version_str,bv=b_version_str)) - return False - - def test_filelock(self): - """ - Check that file lock doesn't get lost after an MDS restart - """ - - flockable = self._is_flockable() - lock_holder = self.mount_a.lock_background(do_flock=flockable) - - self.mount_b.wait_for_visible("background_file-2") - self.mount_b.check_filelock(do_flock=flockable) - - self.fs.mds_fail_restart() - self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE) - - self.mount_b.check_filelock(do_flock=flockable) - - # Tear down the background process - lock_holder.stdin.close() - try: - lock_holder.wait() - except (CommandFailedError, ConnectionLostError): - # We killed it, so it raises an error - pass - - def test_filelock_eviction(self): - """ - Check that file lock held by evicted client is given to - waiting client. - """ - if not self._is_flockable(): - self.skipTest("flock is not available") - - lock_holder = self.mount_a.lock_background() - self.mount_b.wait_for_visible("background_file-2") - self.mount_b.check_filelock() - - lock_taker = self.mount_b.lock_and_release() - # Check the taker is waiting (doesn't get it immediately) - time.sleep(2) - self.assertFalse(lock_holder.finished) - self.assertFalse(lock_taker.finished) - - try: - mount_a_client_id = self.mount_a.get_global_id() - self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id]) - - # Evicting mount_a should let mount_b's attempt to take the lock - # succeed - self.wait_until_true(lambda: lock_taker.finished, timeout=10) - finally: - # teardown() doesn't quite handle this case cleanly, so help it out - self.mount_a.kill() - self.mount_a.kill_cleanup() - - # Bring the client back - self.mount_a.mount() - self.mount_a.wait_until_mounted() - - def test_dir_fsync(self): - self._test_fsync(True); - - def test_create_fsync(self): - self._test_fsync(False); - - def _test_fsync(self, dirfsync): - """ - That calls to fsync guarantee visibility of metadata to another - client immediately after the fsyncing client dies. - """ - - # Leave this guy out until he's needed - self.mount_b.umount_wait() - - # Create dir + child dentry on client A, and fsync the dir - path = os.path.join(self.mount_a.mountpoint, "subdir") - self.mount_a.run_python( - dedent(""" - import os - import time - - path = "{path}" - - print "Starting creation..." 
- start = time.time() - - os.mkdir(path) - dfd = os.open(path, os.O_DIRECTORY) - - fd = open(os.path.join(path, "childfile"), "w") - print "Finished creation in {{0}}s".format(time.time() - start) - - print "Starting fsync..." - start = time.time() - if {dirfsync}: - os.fsync(dfd) - else: - os.fsync(fd) - print "Finished fsync in {{0}}s".format(time.time() - start) - """.format(path=path,dirfsync=str(dirfsync))) - ) - - # Immediately kill the MDS and then client A - self.fs.mds_stop() - self.fs.mds_fail() - self.mount_a.kill() - self.mount_a.kill_cleanup() - - # Restart the MDS. Wait for it to come up, it'll have to time out in clientreplay - self.fs.mds_restart() - log.info("Waiting for reconnect...") - self.fs.wait_for_state("up:reconnect") - log.info("Waiting for active...") - self.fs.wait_for_state("up:active", timeout=MDS_RESTART_GRACE + self.mds_reconnect_timeout) - log.info("Reached active...") - - # Is the child dentry visible from mount B? - self.mount_b.mount() - self.mount_b.wait_until_mounted() - self.mount_b.run_shell(["ls", "subdir/childfile"]) diff --git a/src/ceph/qa/tasks/cephfs/test_config_commands.py b/src/ceph/qa/tasks/cephfs/test_config_commands.py deleted file mode 100644 index ce0619f..0000000 --- a/src/ceph/qa/tasks/cephfs/test_config_commands.py +++ /dev/null @@ -1,63 +0,0 @@ - -from unittest import case -from tasks.cephfs.cephfs_test_case import CephFSTestCase -from tasks.cephfs.fuse_mount import FuseMount - - -class TestConfigCommands(CephFSTestCase): - """ - Test that daemons and clients respond to the otherwise rarely-used - runtime config modification operations. - """ - - CLIENTS_REQUIRED = 1 - MDSS_REQUIRED = 1 - - def test_client_config(self): - """ - That I can successfully issue asok "config set" commands - - :return: - """ - - if not isinstance(self.mount_a, FuseMount): - raise case.SkipTest("Test only applies to FUSE clients") - - test_key = "client_cache_size" - test_val = "123" - self.mount_a.admin_socket(['config', 'set', test_key, test_val]) - out = self.mount_a.admin_socket(['config', 'get', test_key]) - self.assertEqual(out[test_key], test_val) - - self.mount_a.write_n_mb("file.bin", 1); - - # Implicitly asserting that things don't have lockdep error in shutdown - self.mount_a.umount_wait(require_clean=True) - self.fs.mds_stop() - - def test_mds_config_asok(self): - test_key = "mds_max_purge_ops" - test_val = "123" - self.fs.mds_asok(['config', 'set', test_key, test_val]) - out = self.fs.mds_asok(['config', 'get', test_key]) - self.assertEqual(out[test_key], test_val) - - # Implicitly asserting that things don't have lockdep error in shutdown - self.mount_a.umount_wait(require_clean=True) - self.fs.mds_stop() - - def test_mds_config_tell(self): - test_key = "mds_max_purge_ops" - test_val = "123" - - mds_id = self.fs.get_lone_mds_id() - self.fs.mon_manager.raw_cluster_cmd("tell", "mds.{0}".format(mds_id), "injectargs", - "--{0}={1}".format(test_key, test_val)) - - # Read it back with asok because there is no `tell` equivalent - out = self.fs.mds_asok(['config', 'get', test_key]) - self.assertEqual(out[test_key], test_val) - - # Implicitly asserting that things don't have lockdep error in shutdown - self.mount_a.umount_wait(require_clean=True) - self.fs.mds_stop() diff --git a/src/ceph/qa/tasks/cephfs/test_damage.py b/src/ceph/qa/tasks/cephfs/test_damage.py deleted file mode 100644 index 380b49c..0000000 --- a/src/ceph/qa/tasks/cephfs/test_damage.py +++ /dev/null @@ -1,548 +0,0 @@ -import json -import logging -import errno -import re -from 
teuthology.contextutil import MaxWhileTries -from teuthology.exceptions import CommandFailedError -from teuthology.orchestra.run import wait -from tasks.cephfs.fuse_mount import FuseMount -from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology - -DAMAGED_ON_START = "damaged_on_start" -DAMAGED_ON_LS = "damaged_on_ls" -CRASHED = "server crashed" -NO_DAMAGE = "no damage" -FAILED_CLIENT = "client failed" -FAILED_SERVER = "server failed" - -# An EIO in response to a stat from the client -EIO_ON_LS = "eio" - -# An EIO, but nothing in damage table (not ever what we expect) -EIO_NO_DAMAGE = "eio without damage entry" - - -log = logging.getLogger(__name__) - - -class TestDamage(CephFSTestCase): - def _simple_workload_write(self): - self.mount_a.run_shell(["mkdir", "subdir"]) - self.mount_a.write_n_mb("subdir/sixmegs", 6) - return self.mount_a.stat("subdir/sixmegs") - - def is_marked_damaged(self, rank): - mds_map = self.fs.get_mds_map() - return rank in mds_map['damaged'] - - @for_teuthology #459s - def test_object_deletion(self): - """ - That the MDS has a clean 'damaged' response to loss of any single metadata object - """ - - self._simple_workload_write() - - # Hmm, actually it would be nice to permute whether the metadata pool - # state contains sessions or not, but for the moment close this session - # to avoid waiting through reconnect on every MDS start. - self.mount_a.umount_wait() - for mds_name in self.fs.get_active_names(): - self.fs.mds_asok(["flush", "journal"], mds_name) - - self.fs.mds_stop() - self.fs.mds_fail() - - self.fs.rados(['export', '/tmp/metadata.bin']) - - def is_ignored(obj_id, dentry=None): - """ - A filter to avoid redundantly mutating many similar objects (e.g. - stray dirfrags) or similar dentries (e.g. stray dir dentries) - """ - if re.match("60.\.00000000", obj_id) and obj_id != "600.00000000": - return True - - if dentry and obj_id == "100.00000000": - if re.match("stray.+_head", dentry) and dentry != "stray0_head": - return True - - return False - - def get_path(obj_id, dentry=None): - """ - What filesystem path does this object or dentry correspond to? i.e. - what should I poke to see EIO after damaging it? 
- """ - - if obj_id == "1.00000000" and dentry == "subdir_head": - return "./subdir" - elif obj_id == "10000000000.00000000" and dentry == "sixmegs_head": - return "./subdir/sixmegs" - - # None means ls will do an "ls -R" in hope of seeing some errors - return None - - objects = self.fs.rados(["ls"]).split("\n") - objects = [o for o in objects if not is_ignored(o)] - - # Find all objects with an OMAP header - omap_header_objs = [] - for o in objects: - header = self.fs.rados(["getomapheader", o]) - # The rados CLI wraps the header output in a hex-printed style - header_bytes = int(re.match("header \((.+) bytes\)", header).group(1)) - if header_bytes > 0: - omap_header_objs.append(o) - - # Find all OMAP key/vals - omap_keys = [] - for o in objects: - keys_str = self.fs.rados(["listomapkeys", o]) - if keys_str: - for key in keys_str.split("\n"): - if not is_ignored(o, key): - omap_keys.append((o, key)) - - # Find objects that have data in their bodies - data_objects = [] - for obj_id in objects: - stat_out = self.fs.rados(["stat", obj_id]) - size = int(re.match(".+, size (.+)$", stat_out).group(1)) - if size > 0: - data_objects.append(obj_id) - - # Define the various forms of damage we will inflict - class MetadataMutation(object): - def __init__(self, obj_id_, desc_, mutate_fn_, expectation_, ls_path=None): - self.obj_id = obj_id_ - self.desc = desc_ - self.mutate_fn = mutate_fn_ - self.expectation = expectation_ - if ls_path is None: - self.ls_path = "." - else: - self.ls_path = ls_path - - def __eq__(self, other): - return self.desc == other.desc - - def __hash__(self): - return hash(self.desc) - - junk = "deadbeef" * 10 - mutations = [] - - # Removals - for obj_id in objects: - if obj_id in [ - # JournalPointers are auto-replaced if missing (same path as upgrade) - "400.00000000", - # Missing dirfrags for non-system dirs result in empty directory - "10000000000.00000000", - # PurgeQueue is auto-created if not found on startup - "500.00000000" - ]: - expectation = NO_DAMAGE - else: - expectation = DAMAGED_ON_START - - log.info("Expectation on rm '{0}' will be '{1}'".format( - obj_id, expectation - )) - - mutations.append(MetadataMutation( - obj_id, - "Delete {0}".format(obj_id), - lambda o=obj_id: self.fs.rados(["rm", o]), - expectation - )) - - # Blatant corruptions - mutations.extend([ - MetadataMutation( - o, - "Corrupt {0}".format(o), - lambda o=o: self.fs.rados(["put", o, "-"], stdin_data=junk), - DAMAGED_ON_START - ) for o in data_objects - ]) - - # Truncations - for obj_id in data_objects: - if obj_id == "500.00000000": - # The PurgeQueue is allowed to be empty: Journaler interprets - # an empty header object as an empty journal. 
- expectation = NO_DAMAGE - else: - expectation = DAMAGED_ON_START - - mutations.append( - MetadataMutation( - o, - "Truncate {0}".format(o), - lambda o=o: self.fs.rados(["truncate", o, "0"]), - DAMAGED_ON_START - )) - - # OMAP value corruptions - for o, k in omap_keys: - if o.startswith("100."): - # Anything in rank 0's 'mydir' - expectation = DAMAGED_ON_START - else: - expectation = EIO_ON_LS - - mutations.append( - MetadataMutation( - o, - "Corrupt omap key {0}:{1}".format(o, k), - lambda o=o,k=k: self.fs.rados(["setomapval", o, k, junk]), - expectation, - get_path(o, k) - ) - ) - - # OMAP header corruptions - for obj_id in omap_header_objs: - if re.match("60.\.00000000", obj_id) \ - or obj_id in ["1.00000000", "100.00000000", "mds0_sessionmap"]: - expectation = DAMAGED_ON_START - else: - expectation = NO_DAMAGE - - log.info("Expectation on corrupt header '{0}' will be '{1}'".format( - obj_id, expectation - )) - - mutations.append( - MetadataMutation( - obj_id, - "Corrupt omap header on {0}".format(obj_id), - lambda o=obj_id: self.fs.rados(["setomapheader", o, junk]), - expectation - ) - ) - - results = {} - - for mutation in mutations: - log.info("Applying mutation '{0}'".format(mutation.desc)) - - # Reset MDS state - self.mount_a.umount_wait(force=True) - self.fs.mds_stop() - self.fs.mds_fail() - self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0') - - # Reset RADOS pool state - self.fs.rados(['import', '/tmp/metadata.bin']) - - # Inject the mutation - mutation.mutate_fn() - - # Try starting the MDS - self.fs.mds_restart() - - # How long we'll wait between starting a daemon and expecting - # it to make it through startup, and potentially declare itself - # damaged to the mon cluster. - startup_timeout = 60 - - if mutation.expectation not in (EIO_ON_LS, DAMAGED_ON_LS, NO_DAMAGE): - if mutation.expectation == DAMAGED_ON_START: - # The MDS may pass through active before making it to damaged - try: - self.wait_until_true(lambda: self.is_marked_damaged(0), startup_timeout) - except RuntimeError: - pass - - # Wait for MDS to either come up or go into damaged state - try: - self.wait_until_true(lambda: self.is_marked_damaged(0) or self.fs.are_daemons_healthy(), startup_timeout) - except RuntimeError: - crashed = False - # Didn't make it to healthy or damaged, did it crash? - for daemon_id, daemon in self.fs.mds_daemons.items(): - if daemon.proc and daemon.proc.finished: - crashed = True - log.error("Daemon {0} crashed!".format(daemon_id)) - daemon.proc = None # So that subsequent stop() doesn't raise error - if not crashed: - # Didn't go health, didn't go damaged, didn't crash, so what? 
- raise - else: - log.info("Result: Mutation '{0}' led to crash".format(mutation.desc)) - results[mutation] = CRASHED - continue - if self.is_marked_damaged(0): - log.info("Result: Mutation '{0}' led to DAMAGED state".format(mutation.desc)) - results[mutation] = DAMAGED_ON_START - continue - else: - log.info("Mutation '{0}' did not prevent MDS startup, attempting ls...".format(mutation.desc)) - else: - try: - self.wait_until_true(self.fs.are_daemons_healthy, 60) - except RuntimeError: - log.info("Result: Mutation '{0}' should have left us healthy, actually not.".format(mutation.desc)) - if self.is_marked_damaged(0): - results[mutation] = DAMAGED_ON_START - else: - results[mutation] = FAILED_SERVER - continue - log.info("Daemons came up after mutation '{0}', proceeding to ls".format(mutation.desc)) - - # MDS is up, should go damaged on ls or client mount - self.mount_a.mount() - self.mount_a.wait_until_mounted() - if mutation.ls_path == ".": - proc = self.mount_a.run_shell(["ls", "-R", mutation.ls_path], wait=False) - else: - proc = self.mount_a.stat(mutation.ls_path, wait=False) - - if mutation.expectation == DAMAGED_ON_LS: - try: - self.wait_until_true(lambda: self.is_marked_damaged(0), 60) - log.info("Result: Mutation '{0}' led to DAMAGED state after ls".format(mutation.desc)) - results[mutation] = DAMAGED_ON_LS - except RuntimeError: - if self.fs.are_daemons_healthy(): - log.error("Result: Failed to go damaged on mutation '{0}', actually went active".format( - mutation.desc)) - results[mutation] = NO_DAMAGE - else: - log.error("Result: Failed to go damaged on mutation '{0}'".format(mutation.desc)) - results[mutation] = FAILED_SERVER - - else: - try: - wait([proc], 20) - log.info("Result: Mutation '{0}' did not caused DAMAGED state".format(mutation.desc)) - results[mutation] = NO_DAMAGE - except MaxWhileTries: - log.info("Result: Failed to complete client IO on mutation '{0}'".format(mutation.desc)) - results[mutation] = FAILED_CLIENT - except CommandFailedError as e: - if e.exitstatus == errno.EIO: - log.info("Result: EIO on client") - results[mutation] = EIO_ON_LS - else: - log.info("Result: unexpected error {0} on client".format(e)) - results[mutation] = FAILED_CLIENT - - if mutation.expectation == EIO_ON_LS: - # EIOs mean something handled by DamageTable: assert that it has - # been populated - damage = json.loads( - self.fs.mon_manager.raw_cluster_cmd( - 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), "damage", "ls", '--format=json-pretty')) - if len(damage) == 0: - results[mutation] = EIO_NO_DAMAGE - - failures = [(mutation, result) for (mutation, result) in results.items() if mutation.expectation != result] - if failures: - log.error("{0} mutations had unexpected outcomes:".format(len(failures))) - for mutation, result in failures: - log.error(" Expected '{0}' actually '{1}' from '{2}'".format( - mutation.expectation, result, mutation.desc - )) - raise RuntimeError("{0} mutations had unexpected outcomes".format(len(failures))) - else: - log.info("All {0} mutations had expected outcomes".format(len(mutations))) - - def test_damaged_dentry(self): - # Damage to dentrys is interesting because it leaves the - # directory's `complete` flag in a subtle state where - # we have marked the dir complete in order that folks - # can access it, but in actual fact there is a dentry - # missing - self.mount_a.run_shell(["mkdir", "subdir/"]) - - self.mount_a.run_shell(["touch", "subdir/file_undamaged"]) - self.mount_a.run_shell(["touch", "subdir/file_to_be_damaged"]) - - subdir_ino = 
self.mount_a.path_to_ino("subdir") - - self.mount_a.umount_wait() - for mds_name in self.fs.get_active_names(): - self.fs.mds_asok(["flush", "journal"], mds_name) - - self.fs.mds_stop() - self.fs.mds_fail() - - # Corrupt a dentry - junk = "deadbeef" * 10 - dirfrag_obj = "{0:x}.00000000".format(subdir_ino) - self.fs.rados(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk]) - - # Start up and try to list it - self.fs.mds_restart() - self.fs.wait_for_daemons() - - self.mount_a.mount() - self.mount_a.wait_until_mounted() - dentries = self.mount_a.ls("subdir/") - - # The damaged guy should have disappeared - self.assertEqual(dentries, ["file_undamaged"]) - - # I should get ENOENT if I try and read it normally, because - # the dir is considered complete - try: - self.mount_a.stat("subdir/file_to_be_damaged", wait=True) - except CommandFailedError as e: - self.assertEqual(e.exitstatus, errno.ENOENT) - else: - raise AssertionError("Expected ENOENT") - - # The fact that there is damaged should have bee recorded - damage = json.loads( - self.fs.mon_manager.raw_cluster_cmd( - 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), - "damage", "ls", '--format=json-pretty')) - self.assertEqual(len(damage), 1) - damage_id = damage[0]['id'] - - # If I try to create a dentry with the same name as the damaged guy - # then that should be forbidden - try: - self.mount_a.touch("subdir/file_to_be_damaged") - except CommandFailedError as e: - self.assertEqual(e.exitstatus, errno.EIO) - else: - raise AssertionError("Expected EIO") - - # Attempting that touch will clear the client's complete flag, now - # when I stat it I'll get EIO instead of ENOENT - try: - self.mount_a.stat("subdir/file_to_be_damaged", wait=True) - except CommandFailedError as e: - if isinstance(self.mount_a, FuseMount): - self.assertEqual(e.exitstatus, errno.EIO) - else: - # Kernel client handles this case differently - self.assertEqual(e.exitstatus, errno.ENOENT) - else: - raise AssertionError("Expected EIO") - - nfiles = self.mount_a.getfattr("./subdir", "ceph.dir.files") - self.assertEqual(nfiles, "2") - - self.mount_a.umount_wait() - - # Now repair the stats - scrub_json = self.fs.mds_asok(["scrub_path", "/subdir", "repair"]) - log.info(json.dumps(scrub_json, indent=2)) - - self.assertEqual(scrub_json["passed_validation"], False) - self.assertEqual(scrub_json["raw_stats"]["checked"], True) - self.assertEqual(scrub_json["raw_stats"]["passed"], False) - - # Check that the file count is now correct - self.mount_a.mount() - self.mount_a.wait_until_mounted() - nfiles = self.mount_a.getfattr("./subdir", "ceph.dir.files") - self.assertEqual(nfiles, "1") - - # Clean up the omap object - self.fs.rados(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk]) - - # Clean up the damagetable entry - self.fs.mon_manager.raw_cluster_cmd( - 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), - "damage", "rm", "{did}".format(did=damage_id)) - - # Now I should be able to create a file with the same name as the - # damaged guy if I want. 
- self.mount_a.touch("subdir/file_to_be_damaged") - - def test_open_ino_errors(self): - """ - That errors encountered during opening inos are properly propagated - """ - - self.mount_a.run_shell(["mkdir", "dir1"]) - self.mount_a.run_shell(["touch", "dir1/file1"]) - self.mount_a.run_shell(["mkdir", "dir2"]) - self.mount_a.run_shell(["touch", "dir2/file2"]) - self.mount_a.run_shell(["mkdir", "testdir"]) - self.mount_a.run_shell(["ln", "dir1/file1", "testdir/hardlink1"]) - self.mount_a.run_shell(["ln", "dir2/file2", "testdir/hardlink2"]) - - file1_ino = self.mount_a.path_to_ino("dir1/file1") - file2_ino = self.mount_a.path_to_ino("dir2/file2") - dir2_ino = self.mount_a.path_to_ino("dir2") - - # Ensure everything is written to backing store - self.mount_a.umount_wait() - self.fs.mds_asok(["flush", "journal"]) - - # Drop everything from the MDS cache - self.mds_cluster.mds_stop() - self.fs.journal_tool(['journal', 'reset']) - self.mds_cluster.mds_fail_restart() - self.fs.wait_for_daemons() - - self.mount_a.mount() - - # Case 1: un-decodeable backtrace - - # Validate that the backtrace is present and decodable - self.fs.read_backtrace(file1_ino) - # Go corrupt the backtrace of alpha/target (used for resolving - # bravo/hardlink). - self.fs._write_data_xattr(file1_ino, "parent", "rhubarb") - - # Check that touching the hardlink gives EIO - ran = self.mount_a.run_shell(["stat", "testdir/hardlink1"], wait=False) - try: - ran.wait() - except CommandFailedError: - self.assertTrue("Input/output error" in ran.stderr.getvalue()) - - # Check that an entry is created in the damage table - damage = json.loads( - self.fs.mon_manager.raw_cluster_cmd( - 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), - "damage", "ls", '--format=json-pretty')) - self.assertEqual(len(damage), 1) - self.assertEqual(damage[0]['damage_type'], "backtrace") - self.assertEqual(damage[0]['ino'], file1_ino) - - self.fs.mon_manager.raw_cluster_cmd( - 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), - "damage", "rm", str(damage[0]['id'])) - - - # Case 2: missing dirfrag for the target inode - - self.fs.rados(["rm", "{0:x}.00000000".format(dir2_ino)]) - - # Check that touching the hardlink gives EIO - ran = self.mount_a.run_shell(["stat", "testdir/hardlink2"], wait=False) - try: - ran.wait() - except CommandFailedError: - self.assertTrue("Input/output error" in ran.stderr.getvalue()) - - # Check that an entry is created in the damage table - damage = json.loads( - self.fs.mon_manager.raw_cluster_cmd( - 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), - "damage", "ls", '--format=json-pretty')) - self.assertEqual(len(damage), 2) - if damage[0]['damage_type'] == "backtrace" : - self.assertEqual(damage[0]['ino'], file2_ino) - self.assertEqual(damage[1]['damage_type'], "dir_frag") - self.assertEqual(damage[1]['ino'], dir2_ino) - else: - self.assertEqual(damage[0]['damage_type'], "dir_frag") - self.assertEqual(damage[0]['ino'], dir2_ino) - self.assertEqual(damage[1]['damage_type'], "backtrace") - self.assertEqual(damage[1]['ino'], file2_ino) - - for entry in damage: - self.fs.mon_manager.raw_cluster_cmd( - 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), - "damage", "rm", str(entry['id'])) diff --git a/src/ceph/qa/tasks/cephfs/test_data_scan.py b/src/ceph/qa/tasks/cephfs/test_data_scan.py deleted file mode 100644 index a2d3157..0000000 --- a/src/ceph/qa/tasks/cephfs/test_data_scan.py +++ /dev/null @@ -1,600 +0,0 @@ - -""" -Test our tools for recovering metadata from the data pool -""" -import json - -import 
logging -import os -from textwrap import dedent -import traceback -from collections import namedtuple, defaultdict - -from teuthology.orchestra.run import CommandFailedError -from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology - -log = logging.getLogger(__name__) - - -ValidationError = namedtuple("ValidationError", ["exception", "backtrace"]) - - -class Workload(object): - def __init__(self, filesystem, mount): - self._mount = mount - self._filesystem = filesystem - self._initial_state = None - - # Accumulate backtraces for every failed validation, and return them. Backtraces - # are rather verbose, but we only see them when something breaks, and they - # let us see which check failed without having to decorate each check with - # a string - self._errors = [] - - def assert_equal(self, a, b): - try: - if a != b: - raise AssertionError("{0} != {1}".format(a, b)) - except AssertionError as e: - self._errors.append( - ValidationError(e, traceback.format_exc(3)) - ) - - def write(self): - """ - Write the workload files to the mount - """ - raise NotImplementedError() - - def validate(self): - """ - Read from the mount and validate that the workload files are present (i.e. have - survived or been reconstructed from the test scenario) - """ - raise NotImplementedError() - - def damage(self): - """ - Damage the filesystem pools in ways that will be interesting to recover from. By - default just wipe everything in the metadata pool - """ - # Delete every object in the metadata pool - objects = self._filesystem.rados(["ls"]).split("\n") - for o in objects: - self._filesystem.rados(["rm", o]) - - def flush(self): - """ - Called after client unmount, after write: flush whatever you want - """ - self._filesystem.mds_asok(["flush", "journal"]) - - -class SimpleWorkload(Workload): - """ - Single file, single directory, check that it gets recovered and so does its size - """ - def write(self): - self._mount.run_shell(["mkdir", "subdir"]) - self._mount.write_n_mb("subdir/sixmegs", 6) - self._initial_state = self._mount.stat("subdir/sixmegs") - - def validate(self): - self._mount.run_shell(["ls", "subdir"]) - st = self._mount.stat("subdir/sixmegs") - self.assert_equal(st['st_size'], self._initial_state['st_size']) - return self._errors - - -class MovedFile(Workload): - def write(self): - # Create a file whose backtrace disagrees with his eventual position - # in the metadata. We will see that he gets reconstructed in his - # original position according to his backtrace. 
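# Illustrative sketch, not in the original: the error-accumulation idea behind
# Workload.assert_equal() above, as a standalone helper. Every check runs,
# failures are collected with their tracebacks, and they can be reported
# together instead of aborting at the first mismatch.
import traceback

class CheckCollector(object):
    def __init__(self):
        self.errors = []

    def assert_equal(self, a, b):
        try:
            if a != b:
                raise AssertionError("{0} != {1}".format(a, b))
        except AssertionError as e:
            self.errors.append((e, traceback.format_exc()))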
- self._mount.run_shell(["mkdir", "subdir_alpha"]) - self._mount.run_shell(["mkdir", "subdir_bravo"]) - self._mount.write_n_mb("subdir_alpha/sixmegs", 6) - self._filesystem.mds_asok(["flush", "journal"]) - self._mount.run_shell(["mv", "subdir_alpha/sixmegs", "subdir_bravo/sixmegs"]) - self._initial_state = self._mount.stat("subdir_bravo/sixmegs") - - def flush(self): - pass - - def validate(self): - self.assert_equal(self._mount.ls(), ["subdir_alpha"]) - st = self._mount.stat("subdir_alpha/sixmegs") - self.assert_equal(st['st_size'], self._initial_state['st_size']) - return self._errors - - -class BacktracelessFile(Workload): - def write(self): - self._mount.run_shell(["mkdir", "subdir"]) - self._mount.write_n_mb("subdir/sixmegs", 6) - self._initial_state = self._mount.stat("subdir/sixmegs") - - def flush(self): - # Never flush metadata, so backtrace won't be written - pass - - def validate(self): - ino_name = "%x" % self._initial_state["st_ino"] - - # The inode should be linked into lost+found because we had no path for it - self.assert_equal(self._mount.ls(), ["lost+found"]) - self.assert_equal(self._mount.ls("lost+found"), [ino_name]) - st = self._mount.stat("lost+found/{ino_name}".format(ino_name=ino_name)) - - # We might not have got the name or path, but we should still get the size - self.assert_equal(st['st_size'], self._initial_state['st_size']) - - return self._errors - - -class StripedStashedLayout(Workload): - def __init__(self, fs, m): - super(StripedStashedLayout, self).__init__(fs, m) - - # Nice small stripes so we can quickly do our writes+validates - self.sc = 4 - self.ss = 65536 - self.os = 262144 - - self.interesting_sizes = [ - # Exactly stripe_count objects will exist - self.os * self.sc, - # Fewer than stripe_count objects will exist - self.os * self.sc / 2, - self.os * (self.sc - 1) + self.os / 2, - self.os * (self.sc - 1) + self.os / 2 - 1, - self.os * (self.sc + 1) + self.os / 2, - self.os * (self.sc + 1) + self.os / 2 + 1, - # More than stripe_count objects will exist - self.os * self.sc + self.os * self.sc / 2 - ] - - def write(self): - # Create a dir with a striped layout set on it - self._mount.run_shell(["mkdir", "stripey"]) - - self._mount.setfattr("./stripey", "ceph.dir.layout", - "stripe_unit={ss} stripe_count={sc} object_size={os} pool={pool}".format( - ss=self.ss, os=self.os, sc=self.sc, - pool=self._filesystem.get_data_pool_name() - )) - - # Write files, then flush metadata so that its layout gets written into an xattr - for i, n_bytes in enumerate(self.interesting_sizes): - self._mount.write_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes) - # This is really just validating the validator - self._mount.validate_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes) - self._filesystem.mds_asok(["flush", "journal"]) - - # Write another file in the same way, but this time don't flush the metadata, - # so that it won't have the layout xattr - self._mount.write_test_pattern("stripey/unflushed_file", 1024 * 512) - self._mount.validate_test_pattern("stripey/unflushed_file", 1024 * 512) - - self._initial_state = { - "unflushed_ino": self._mount.path_to_ino("stripey/unflushed_file") - } - - def flush(self): - # Pass because we already selectively flushed during write - pass - - def validate(self): - # The first files should have been recovered into its original location - # with the correct layout: read back correct data - for i, n_bytes in enumerate(self.interesting_sizes): - try: - 
self._mount.validate_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes) - except CommandFailedError as e: - self._errors.append( - ValidationError("File {0} (size {1}): {2}".format(i, n_bytes, e), traceback.format_exc(3)) - ) - - # The unflushed file should have been recovered into lost+found without - # the correct layout: read back junk - ino_name = "%x" % self._initial_state["unflushed_ino"] - self.assert_equal(self._mount.ls("lost+found"), [ino_name]) - try: - self._mount.validate_test_pattern(os.path.join("lost+found", ino_name), 1024 * 512) - except CommandFailedError: - pass - else: - self._errors.append( - ValidationError("Unexpectedly valid data in unflushed striped file", "") - ) - - return self._errors - - -class ManyFilesWorkload(Workload): - def __init__(self, filesystem, mount, file_count): - super(ManyFilesWorkload, self).__init__(filesystem, mount) - self.file_count = file_count - - def write(self): - self._mount.run_shell(["mkdir", "subdir"]) - for n in range(0, self.file_count): - self._mount.write_test_pattern("subdir/{0}".format(n), 6 * 1024 * 1024) - - def validate(self): - for n in range(0, self.file_count): - try: - self._mount.validate_test_pattern("subdir/{0}".format(n), 6 * 1024 * 1024) - except CommandFailedError as e: - self._errors.append( - ValidationError("File {0}: {1}".format(n, e), traceback.format_exc(3)) - ) - - return self._errors - - -class MovedDir(Workload): - def write(self): - # Create a nested dir that we will then move. Two files with two different - # backtraces referring to the moved dir, claiming two different locations for - # it. We will see that only one backtrace wins and the dir ends up with - # single linkage. - self._mount.run_shell(["mkdir", "-p", "grandmother/parent"]) - self._mount.write_n_mb("grandmother/parent/orig_pos_file", 1) - self._filesystem.mds_asok(["flush", "journal"]) - self._mount.run_shell(["mkdir", "grandfather"]) - self._mount.run_shell(["mv", "grandmother/parent", "grandfather"]) - self._mount.write_n_mb("grandfather/parent/new_pos_file", 2) - self._filesystem.mds_asok(["flush", "journal"]) - - self._initial_state = ( - self._mount.stat("grandfather/parent/orig_pos_file"), - self._mount.stat("grandfather/parent/new_pos_file") - ) - - def validate(self): - root_files = self._mount.ls() - self.assert_equal(len(root_files), 1) - self.assert_equal(root_files[0] in ["grandfather", "grandmother"], True) - winner = root_files[0] - st_opf = self._mount.stat("{0}/parent/orig_pos_file".format(winner)) - st_npf = self._mount.stat("{0}/parent/new_pos_file".format(winner)) - - self.assert_equal(st_opf['st_size'], self._initial_state[0]['st_size']) - self.assert_equal(st_npf['st_size'], self._initial_state[1]['st_size']) - - -class MissingZerothObject(Workload): - def write(self): - self._mount.run_shell(["mkdir", "subdir"]) - self._mount.write_n_mb("subdir/sixmegs", 6) - self._initial_state = self._mount.stat("subdir/sixmegs") - - def damage(self): - super(MissingZerothObject, self).damage() - zeroth_id = "{0:x}.00000000".format(self._initial_state['st_ino']) - self._filesystem.rados(["rm", zeroth_id], pool=self._filesystem.get_data_pool_name()) - - def validate(self): - st = self._mount.stat("lost+found/{0:x}".format(self._initial_state['st_ino'])) - self.assert_equal(st['st_size'], self._initial_state['st_size']) - - -class NonDefaultLayout(Workload): - """ - Check that the reconstruction copes with files that have a different - object size in their layout - """ - def write(self): - self._mount.run_shell(["touch", 
"datafile"]) - self._mount.setfattr("./datafile", "ceph.file.layout.object_size", "8388608") - self._mount.run_shell(["dd", "if=/dev/urandom", "of=./datafile", "bs=1M", "count=32"]) - self._initial_state = self._mount.stat("datafile") - - def validate(self): - # Check we got the layout reconstructed properly - object_size = int(self._mount.getfattr( - "./datafile", "ceph.file.layout.object_size")) - self.assert_equal(object_size, 8388608) - - # Check we got the file size reconstructed properly - st = self._mount.stat("datafile") - self.assert_equal(st['st_size'], self._initial_state['st_size']) - - -class TestDataScan(CephFSTestCase): - MDSS_REQUIRED = 2 - - def is_marked_damaged(self, rank): - mds_map = self.fs.get_mds_map() - return rank in mds_map['damaged'] - - def _rebuild_metadata(self, workload, workers=1): - """ - That when all objects in metadata pool are removed, we can rebuild a metadata pool - based on the contents of a data pool, and a client can see and read our files. - """ - - # First, inject some files - - workload.write() - - # Unmount the client and flush the journal: the tool should also cope with - # situations where there is dirty metadata, but we'll test that separately - self.mount_a.umount_wait() - workload.flush() - - # Stop the MDS - self.fs.mds_stop() - self.fs.mds_fail() - - # After recovery, we need the MDS to not be strict about stats (in production these options - # are off by default, but in QA we need to explicitly disable them) - self.fs.set_ceph_conf('mds', 'mds verify scatter', False) - self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False) - - # Apply any data damage the workload wants - workload.damage() - - # Reset the MDS map in case multiple ranks were in play: recovery procedure - # only understands how to rebuild metadata under rank 0 - self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name, - '--yes-i-really-mean-it') - - self.fs.mds_restart() - - def get_state(mds_id): - info = self.mds_cluster.get_mds_info(mds_id) - return info['state'] if info is not None else None - - self.wait_until_true(lambda: self.is_marked_damaged(0), 60) - for mds_id in self.fs.mds_ids: - self.wait_until_equal( - lambda: get_state(mds_id), - "up:standby", - timeout=60) - - self.fs.table_tool([self.fs.name + ":0", "reset", "session"]) - self.fs.table_tool([self.fs.name + ":0", "reset", "snap"]) - self.fs.table_tool([self.fs.name + ":0", "reset", "inode"]) - - # Run the recovery procedure - if False: - with self.assertRaises(CommandFailedError): - # Normal reset should fail when no objects are present, we'll use --force instead - self.fs.journal_tool(["journal", "reset"]) - - self.fs.journal_tool(["journal", "reset", "--force"]) - self.fs.data_scan(["init"]) - self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()], worker_count=workers) - self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()], worker_count=workers) - - # Mark the MDS repaired - self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0') - - # Start the MDS - self.fs.mds_restart() - self.fs.wait_for_daemons() - log.info(str(self.mds_cluster.status())) - - # Mount a client - self.mount_a.mount() - self.mount_a.wait_until_mounted() - - # See that the files are present and correct - errors = workload.validate() - if errors: - log.error("Validation errors found: {0}".format(len(errors))) - for e in errors: - log.error(e.exception) - log.error(e.backtrace) - raise AssertionError("Validation failed, first error: {0}\n{1}".format( - errors[0].exception, errors[0].backtrace - )) 
- - def test_rebuild_simple(self): - self._rebuild_metadata(SimpleWorkload(self.fs, self.mount_a)) - - def test_rebuild_moved_file(self): - self._rebuild_metadata(MovedFile(self.fs, self.mount_a)) - - def test_rebuild_backtraceless(self): - self._rebuild_metadata(BacktracelessFile(self.fs, self.mount_a)) - - def test_rebuild_moved_dir(self): - self._rebuild_metadata(MovedDir(self.fs, self.mount_a)) - - def test_rebuild_missing_zeroth(self): - self._rebuild_metadata(MissingZerothObject(self.fs, self.mount_a)) - - def test_rebuild_nondefault_layout(self): - self._rebuild_metadata(NonDefaultLayout(self.fs, self.mount_a)) - - def test_stashed_layout(self): - self._rebuild_metadata(StripedStashedLayout(self.fs, self.mount_a)) - - def _dirfrag_keys(self, object_id): - keys_str = self.fs.rados(["listomapkeys", object_id]) - if keys_str: - return keys_str.split("\n") - else: - return [] - - def test_fragmented_injection(self): - """ - That when injecting a dentry into a fragmented directory, we put it in the right fragment. - """ - - self.fs.set_allow_dirfrags(True) - - file_count = 100 - file_names = ["%s" % n for n in range(0, file_count)] - - # Create a directory of `file_count` files, each named after its - # decimal number and containing the string of its decimal number - self.mount_a.run_python(dedent(""" - import os - path = os.path.join("{path}", "subdir") - os.mkdir(path) - for n in range(0, {file_count}): - open(os.path.join(path, "%s" % n), 'w').write("%s" % n) - """.format( - path=self.mount_a.mountpoint, - file_count=file_count - ))) - - dir_ino = self.mount_a.path_to_ino("subdir") - - # Only one MDS should be active! - self.assertEqual(len(self.fs.get_active_names()), 1) - - # Ensure that one directory is fragmented - mds_id = self.fs.get_active_names()[0] - self.fs.mds_asok(["dirfrag", "split", "/subdir", "0/0", "1"], mds_id) - - # Flush journal and stop MDS - self.mount_a.umount_wait() - self.fs.mds_asok(["flush", "journal"], mds_id) - self.fs.mds_stop() - self.fs.mds_fail() - - # Pick a dentry and wipe out its key - # Because I did a 1 bit split, I know one frag will be named <inode>.01000000 - frag_obj_id = "{0:x}.01000000".format(dir_ino) - keys = self._dirfrag_keys(frag_obj_id) - victim_key = keys[7] # arbitrary choice - log.info("victim_key={0}".format(victim_key)) - victim_dentry = victim_key.split("_head")[0] - self.fs.rados(["rmomapkey", frag_obj_id, victim_key]) - - # Start filesystem back up, observe that the file appears to be gone in an `ls` - self.fs.mds_restart() - self.fs.wait_for_daemons() - self.mount_a.mount() - self.mount_a.wait_until_mounted() - files = self.mount_a.run_shell(["ls", "subdir/"]).stdout.getvalue().strip().split("\n") - self.assertListEqual(sorted(files), sorted(list(set(file_names) - set([victim_dentry])))) - - # Stop the filesystem - self.mount_a.umount_wait() - self.fs.mds_stop() - self.fs.mds_fail() - - # Run data-scan, observe that it inserts our dentry back into the correct fragment - # by checking the omap now has the dentry's key again - self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()]) - self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()]) - self.assertIn(victim_key, self._dirfrag_keys(frag_obj_id)) - - # Start the filesystem and check that the dentry we deleted is now once again visible - # and points to the correct file data. 
- self.fs.mds_restart() - self.fs.wait_for_daemons() - self.mount_a.mount() - self.mount_a.wait_until_mounted() - out = self.mount_a.run_shell(["cat", "subdir/{0}".format(victim_dentry)]).stdout.getvalue().strip() - self.assertEqual(out, victim_dentry) - - # Finally, close the loop by checking our injected dentry survives a merge - mds_id = self.fs.get_active_names()[0] - self.mount_a.ls("subdir") # Do an ls to ensure both frags are in cache so the merge will work - self.fs.mds_asok(["dirfrag", "merge", "/subdir", "0/0"], mds_id) - self.fs.mds_asok(["flush", "journal"], mds_id) - frag_obj_id = "{0:x}.00000000".format(dir_ino) - keys = self._dirfrag_keys(frag_obj_id) - self.assertListEqual(sorted(keys), sorted(["%s_head" % f for f in file_names])) - - @for_teuthology - def test_parallel_execution(self): - self._rebuild_metadata(ManyFilesWorkload(self.fs, self.mount_a, 25), workers=7) - - def test_pg_files(self): - """ - That the pg files command tells us which files are associated with - a particular PG - """ - file_count = 20 - self.mount_a.run_shell(["mkdir", "mydir"]) - self.mount_a.create_n_files("mydir/myfile", file_count) - - # Some files elsewhere in the system that we will ignore - # to check that the tool is filtering properly - self.mount_a.run_shell(["mkdir", "otherdir"]) - self.mount_a.create_n_files("otherdir/otherfile", file_count) - - pgs_to_files = defaultdict(list) - # Rough (slow) reimplementation of the logic - for i in range(0, file_count): - file_path = "mydir/myfile_{0}".format(i) - ino = self.mount_a.path_to_ino(file_path) - obj = "{0:x}.{1:08x}".format(ino, 0) - pgid = json.loads(self.fs.mon_manager.raw_cluster_cmd( - "osd", "map", self.fs.get_data_pool_name(), obj, - "--format=json-pretty" - ))['pgid'] - pgs_to_files[pgid].append(file_path) - log.info("{0}: {1}".format(file_path, pgid)) - - pg_count = self.fs.get_pgs_per_fs_pool() - for pg_n in range(0, pg_count): - pg_str = "{0}.{1}".format(self.fs.get_data_pool_id(), pg_n) - out = self.fs.data_scan(["pg_files", "mydir", pg_str]) - lines = [l for l in out.split("\n") if l] - log.info("{0}: {1}".format(pg_str, lines)) - self.assertSetEqual(set(lines), set(pgs_to_files[pg_str])) - - def test_scan_links(self): - """ - The scan_links command fixes linkage errors - """ - self.mount_a.run_shell(["mkdir", "testdir1"]) - self.mount_a.run_shell(["mkdir", "testdir2"]) - dir1_ino = self.mount_a.path_to_ino("testdir1") - dir2_ino = self.mount_a.path_to_ino("testdir2") - dirfrag1_oid = "{0:x}.00000000".format(dir1_ino) - dirfrag2_oid = "{0:x}.00000000".format(dir2_ino) - - self.mount_a.run_shell(["touch", "testdir1/file1"]) - self.mount_a.run_shell(["ln", "testdir1/file1", "testdir1/link1"]) - self.mount_a.run_shell(["ln", "testdir1/file1", "testdir2/link2"]) - - mds_id = self.fs.get_active_names()[0] - self.fs.mds_asok(["flush", "journal"], mds_id) - - dirfrag1_keys = self._dirfrag_keys(dirfrag1_oid) - - # introduce duplicated primary link - file1_key = "file1_head" - self.assertIn(file1_key, dirfrag1_keys) - file1_omap_data = self.fs.rados(["getomapval", dirfrag1_oid, file1_key, '-']) - self.fs.rados(["setomapval", dirfrag2_oid, file1_key], stdin_data=file1_omap_data) - self.assertIn(file1_key, self._dirfrag_keys(dirfrag2_oid)) - - # remove a remote link, make inode link count incorrect - link1_key = 'link1_head' - self.assertIn(link1_key, dirfrag1_keys) - self.fs.rados(["rmomapkey", dirfrag1_oid, link1_key]) - - # increase good primary link's version - self.mount_a.run_shell(["touch", "testdir1/file1"]) - 
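For reference, test_pg_files above recomputes the file-to-PG mapping by hand: a file's first data object is named "<inode hex>.00000000", and `ceph osd map` reports which PG that object maps to in the data pool. A minimal standalone sketch of the same lookup (the pool name and inode number are placeholders):

    # Sketch: locate the PG holding a file's first data object, mirroring the
    # manual reimplementation in test_pg_files above.
    import json
    import subprocess

    def first_object_pgid(ino, data_pool="cephfs_data"):
        obj = "{0:x}.{1:08x}".format(ino, 0)      # e.g. 10000000001.00000000
        out = subprocess.check_output(
            ["ceph", "osd", "map", data_pool, obj, "--format=json"])
        return json.loads(out)["pgid"]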
self.mount_a.umount_wait() - - self.fs.mds_asok(["flush", "journal"], mds_id) - self.fs.mds_stop() - self.fs.mds_fail() - - # repair linkage errors - self.fs.data_scan(["scan_links"]) - - # primary link in testdir2 was deleted? - self.assertNotIn(file1_key, self._dirfrag_keys(dirfrag2_oid)) - - self.fs.mds_restart() - self.fs.wait_for_daemons() - - self.mount_a.mount() - self.mount_a.wait_until_mounted() - - # link count was adjusted? - file1_nlink = self.mount_a.path_to_nlink("testdir1/file1") - self.assertEqual(file1_nlink, 2) diff --git a/src/ceph/qa/tasks/cephfs/test_dump_tree.py b/src/ceph/qa/tasks/cephfs/test_dump_tree.py deleted file mode 100644 index 6d943f9..0000000 --- a/src/ceph/qa/tasks/cephfs/test_dump_tree.py +++ /dev/null @@ -1,66 +0,0 @@ -from tasks.cephfs.cephfs_test_case import CephFSTestCase -import random -import os - -class TestDumpTree(CephFSTestCase): - def get_paths_to_ino(self): - inos = {} - p = self.mount_a.run_shell(["find", "./"]) - paths = p.stdout.getvalue().strip().split() - for path in paths: - inos[path] = self.mount_a.path_to_ino(path, False) - - return inos - - def populate(self): - self.mount_a.run_shell(["git", "clone", - "https://github.com/ceph/ceph-qa-suite"]) - - def test_basic(self): - self.mount_a.run_shell(["mkdir", "parent"]) - self.mount_a.run_shell(["mkdir", "parent/child"]) - self.mount_a.run_shell(["touch", "parent/child/file"]) - self.mount_a.run_shell(["mkdir", "parent/child/grandchild"]) - self.mount_a.run_shell(["touch", "parent/child/grandchild/file"]) - - inos = self.get_paths_to_ino() - tree = self.fs.mds_asok(["dump", "tree", "/parent/child", "1"]) - - target_inos = [inos["./parent/child"], inos["./parent/child/file"], - inos["./parent/child/grandchild"]] - - for ino in tree: - del target_inos[target_inos.index(ino['ino'])] # don't catch! - - assert(len(target_inos) == 0) - - def test_random(self): - random.seed(0) - - self.populate() - inos = self.get_paths_to_ino() - target = random.choice(inos.keys()) - - if target != "./": - target = os.path.dirname(target) - - subtree = [path for path in inos.keys() if path.startswith(target)] - target_inos = [inos[path] for path in subtree] - tree = self.fs.mds_asok(["dump", "tree", target[1:]]) - - for ino in tree: - del target_inos[target_inos.index(ino['ino'])] # don't catch! - - assert(len(target_inos) == 0) - - target_depth = target.count('/') - maxdepth = max([path.count('/') for path in subtree]) - target_depth - depth = random.randint(0, maxdepth) - target_inos = [inos[path] for path in subtree \ - if path.count('/') <= depth + target_depth] - tree = self.fs.mds_asok(["dump", "tree", target[1:], str(depth)]) - - for ino in tree: - del target_inos[target_inos.index(ino['ino'])] # don't catch! 
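The dump-tree assertions above go through mds_asok(); outside the harness the same admin-socket command can be issued with `ceph daemon`. A small sketch (MDS daemon name and path are placeholders) that collects the inode numbers reported for a subtree:

    # Sketch: dump the in-cache metadata tree below a path to a given depth and
    # collect the inode numbers, as the tests above do via mds_asok().
    import json
    import subprocess

    def cached_inos(mds_name, path, depth=1):
        out = subprocess.check_output(
            ["ceph", "daemon", "mds." + mds_name,
             "dump", "tree", path, str(depth)])
        return {entry["ino"] for entry in json.loads(out)}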
- - assert(len(target_inos) == 0) diff --git a/src/ceph/qa/tasks/cephfs/test_exports.py b/src/ceph/qa/tasks/cephfs/test_exports.py deleted file mode 100644 index 913999d..0000000 --- a/src/ceph/qa/tasks/cephfs/test_exports.py +++ /dev/null @@ -1,107 +0,0 @@ -import logging -import time -from tasks.cephfs.fuse_mount import FuseMount -from tasks.cephfs.cephfs_test_case import CephFSTestCase - -log = logging.getLogger(__name__) - -class TestExports(CephFSTestCase): - MDSS_REQUIRED = 2 - - def _wait_subtrees(self, status, rank, test): - timeout = 30 - pause = 2 - test = sorted(test) - for i in range(timeout/pause): - subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, rank)['name']) - subtrees = filter(lambda s: s['dir']['path'].startswith('/'), subtrees) - filtered = sorted([(s['dir']['path'], s['auth_first']) for s in subtrees]) - log.info("%s =?= %s", filtered, test) - if filtered == test: - # Confirm export_pin in output is correct: - for s in subtrees: - self.assertTrue(s['export_pin'] == s['auth_first']) - return subtrees - time.sleep(pause) - raise RuntimeError("rank {0} failed to reach desired subtree state", rank) - - def test_export_pin(self): - self.fs.set_max_mds(2) - self.fs.wait_for_daemons() - - status = self.fs.status() - - self.mount_a.run_shell(["mkdir", "-p", "1/2/3"]) - self._wait_subtrees(status, 0, []) - - # NOP - self.mount_a.setfattr("1", "ceph.dir.pin", "-1") - self._wait_subtrees(status, 0, []) - - # NOP (rank < -1) - self.mount_a.setfattr("1", "ceph.dir.pin", "-2341") - self._wait_subtrees(status, 0, []) - - # pin /1 to rank 1 - self.mount_a.setfattr("1", "ceph.dir.pin", "1") - self._wait_subtrees(status, 1, [('/1', 1)]) - - # Check export_targets is set properly - status = self.fs.status() - log.info(status) - r0 = status.get_rank(self.fs.id, 0) - self.assertTrue(sorted(r0['export_targets']) == [1]) - - # redundant pin /1/2 to rank 1 - self.mount_a.setfattr("1/2", "ceph.dir.pin", "1") - self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 1)]) - - # change pin /1/2 to rank 0 - self.mount_a.setfattr("1/2", "ceph.dir.pin", "0") - self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 0)]) - self._wait_subtrees(status, 0, [('/1', 1), ('/1/2', 0)]) - - # change pin /1/2/3 to (presently) non-existent rank 2 - self.mount_a.setfattr("1/2/3", "ceph.dir.pin", "2") - self._wait_subtrees(status, 0, [('/1', 1), ('/1/2', 0)]) - self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 0)]) - - # change pin /1/2 back to rank 1 - self.mount_a.setfattr("1/2", "ceph.dir.pin", "1") - self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 1)]) - - # add another directory pinned to 1 - self.mount_a.run_shell(["mkdir", "-p", "1/4/5"]) - self.mount_a.setfattr("1/4/5", "ceph.dir.pin", "1") - self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 1), ('/1/4/5', 1)]) - - # change pin /1 to 0 - self.mount_a.setfattr("1", "ceph.dir.pin", "0") - self._wait_subtrees(status, 0, [('/1', 0), ('/1/2', 1), ('/1/4/5', 1)]) - - # change pin /1/2 to default (-1); does the subtree root properly respect it's parent pin? 
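The export-pin behaviour exercised above is driven entirely by the ceph.dir.pin virtual extended attribute: a non-negative value pins the subtree to that rank, while -1 (the default) makes the directory inherit its parent's pin, which is exactly what the next step below verifies. A minimal sketch of doing the same from an ordinary CephFS mount (the mount path is a placeholder):

    # Sketch: pin a directory subtree to an MDS rank via the ceph.dir.pin
    # virtual xattr, as the test above does through setfattr on the mount.
    import os

    def pin_subtree(path, rank):
        os.setxattr(path, "ceph.dir.pin", str(rank).encode())

    # pin_subtree("/mnt/cephfs/1", 1)    # pin to rank 1
    # pin_subtree("/mnt/cephfs/1", -1)   # revert to inheriting the parent pin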
- self.mount_a.setfattr("1/2", "ceph.dir.pin", "-1") - self._wait_subtrees(status, 0, [('/1', 0), ('/1/4/5', 1)]) - - if len(list(status.get_standbys())): - self.fs.set_max_mds(3) - self.fs.wait_for_state('up:active', rank=2) - self._wait_subtrees(status, 0, [('/1', 0), ('/1/4/5', 1), ('/1/2/3', 2)]) - - # Check export_targets is set properly - status = self.fs.status() - log.info(status) - r0 = status.get_rank(self.fs.id, 0) - self.assertTrue(sorted(r0['export_targets']) == [1,2]) - r1 = status.get_rank(self.fs.id, 1) - self.assertTrue(sorted(r1['export_targets']) == [0]) - r2 = status.get_rank(self.fs.id, 2) - self.assertTrue(sorted(r2['export_targets']) == []) - - # Test rename - self.mount_a.run_shell(["mkdir", "-p", "a/b", "aa/bb"]) - self.mount_a.setfattr("a", "ceph.dir.pin", "1") - self.mount_a.setfattr("aa/bb", "ceph.dir.pin", "0") - self._wait_subtrees(status, 0, [('/1', 0), ('/1/4/5', 1), ('/1/2/3', 2), ('/a', 1), ('/aa/bb', 0)]) - self.mount_a.run_shell(["mv", "aa", "a/b/"]) - self._wait_subtrees(status, 0, [('/1', 0), ('/1/4/5', 1), ('/1/2/3', 2), ('/a', 1), ('/a/b/aa/bb', 0)]) diff --git a/src/ceph/qa/tasks/cephfs/test_failover.py b/src/ceph/qa/tasks/cephfs/test_failover.py deleted file mode 100644 index 9d3392c..0000000 --- a/src/ceph/qa/tasks/cephfs/test_failover.py +++ /dev/null @@ -1,645 +0,0 @@ -import json -import logging -from unittest import case, SkipTest - -from cephfs_test_case import CephFSTestCase -from teuthology.exceptions import CommandFailedError -from teuthology import misc as teuthology -from tasks.cephfs.fuse_mount import FuseMount - -log = logging.getLogger(__name__) - - -class TestFailover(CephFSTestCase): - CLIENTS_REQUIRED = 1 - MDSS_REQUIRED = 2 - - def test_simple(self): - """ - That when the active MDS is killed, a standby MDS is promoted into - its rank after the grace period. - - This is just a simple unit test, the harder cases are covered - in thrashing tests. - """ - - # Need all my standbys up as well as the active daemons - self.wait_for_daemon_start() - - (original_active, ) = self.fs.get_active_names() - original_standbys = self.mds_cluster.get_standby_daemons() - - # Kill the rank 0 daemon's physical process - self.fs.mds_stop(original_active) - - grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon")) - - # Wait until the monitor promotes his replacement - def promoted(): - active = self.fs.get_active_names() - return active and active[0] in original_standbys - - log.info("Waiting for promotion of one of the original standbys {0}".format( - original_standbys)) - self.wait_until_true( - promoted, - timeout=grace*2) - - # Start the original rank 0 daemon up again, see that he becomes a standby - self.fs.mds_restart(original_active) - self.wait_until_true( - lambda: original_active in self.mds_cluster.get_standby_daemons(), - timeout=60 # Approximately long enough for MDS to start and mon to notice - ) - - def test_client_abort(self): - """ - That a client will respect fuse_require_active_mds and error out - when the cluster appears to be unavailable. 
- """ - - if not isinstance(self.mount_a, FuseMount): - raise SkipTest("Requires FUSE client to inject client metadata") - - require_active = self.fs.get_config("fuse_require_active_mds", service_type="mon").lower() == "true" - if not require_active: - raise case.SkipTest("fuse_require_active_mds is not set") - - grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon")) - - # Check it's not laggy to begin with - (original_active, ) = self.fs.get_active_names() - self.assertNotIn("laggy_since", self.fs.mon_manager.get_mds_status(original_active)) - - self.mounts[0].umount_wait() - - # Control: that we can mount and unmount usually, while the cluster is healthy - self.mounts[0].mount() - self.mounts[0].wait_until_mounted() - self.mounts[0].umount_wait() - - # Stop the daemon processes - self.fs.mds_stop() - - # Wait for everyone to go laggy - def laggy(): - mdsmap = self.fs.get_mds_map() - for info in mdsmap['info'].values(): - if "laggy_since" not in info: - return False - - return True - - self.wait_until_true(laggy, grace * 2) - with self.assertRaises(CommandFailedError): - self.mounts[0].mount() - - def test_standby_count_wanted(self): - """ - That cluster health warnings are generated by insufficient standbys available. - """ - - # Need all my standbys up as well as the active daemons - self.wait_for_daemon_start() - - grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon")) - - standbys = self.mds_cluster.get_standby_daemons() - self.assertGreaterEqual(len(standbys), 1) - self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys))) - - # Kill a standby and check for warning - victim = standbys.pop() - self.fs.mds_stop(victim) - log.info("waiting for insufficient standby daemon warning") - self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2) - - # restart the standby, see that he becomes a standby, check health clears - self.fs.mds_restart(victim) - self.wait_until_true( - lambda: victim in self.mds_cluster.get_standby_daemons(), - timeout=60 # Approximately long enough for MDS to start and mon to notice - ) - self.wait_for_health_clear(timeout=30) - - # Set it one greater than standbys ever seen - standbys = self.mds_cluster.get_standby_daemons() - self.assertGreaterEqual(len(standbys), 1) - self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)+1)) - log.info("waiting for insufficient standby daemon warning") - self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2) - - # Set it to 0 - self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', '0') - self.wait_for_health_clear(timeout=30) - - - - -class TestStandbyReplay(CephFSTestCase): - MDSS_REQUIRED = 4 - REQUIRE_FILESYSTEM = False - - def set_standby_for(self, leader, follower, replay): - self.set_conf("mds.{0}".format(follower), "mds_standby_for_name", leader) - if replay: - self.set_conf("mds.{0}".format(follower), "mds_standby_replay", "true") - - def get_info_by_name(self, mds_name): - status = self.mds_cluster.status() - info = status.get_mds(mds_name) - if info is None: - log.warn(str(status)) - raise RuntimeError("MDS '{0}' not found".format(mds_name)) - else: - return info - - def test_standby_replay_unused(self): - # Pick out exactly 3 daemons to be run during test - use_daemons = sorted(self.mds_cluster.mds_ids[0:3]) - mds_a, mds_b, mds_c = use_daemons - log.info("Using MDS daemons: {0}".format(use_daemons)) - - # B and C should both follow A, but only one 
will - # really get into standby replay state. - self.set_standby_for(mds_a, mds_b, True) - self.set_standby_for(mds_a, mds_c, True) - - # Create FS and start A - fs_a = self.mds_cluster.newfs("alpha") - self.mds_cluster.mds_restart(mds_a) - fs_a.wait_for_daemons() - self.assertEqual(fs_a.get_active_names(), [mds_a]) - - # Start B, he should go into standby replay - self.mds_cluster.mds_restart(mds_b) - self.wait_for_daemon_start([mds_b]) - info_b = self.get_info_by_name(mds_b) - self.assertEqual(info_b['state'], "up:standby-replay") - self.assertEqual(info_b['standby_for_name'], mds_a) - self.assertEqual(info_b['rank'], 0) - - # Start C, he should go into standby (*not* replay) - self.mds_cluster.mds_restart(mds_c) - self.wait_for_daemon_start([mds_c]) - info_c = self.get_info_by_name(mds_c) - self.assertEqual(info_c['state'], "up:standby") - self.assertEqual(info_c['standby_for_name'], mds_a) - self.assertEqual(info_c['rank'], -1) - - # Kill B, C should go into standby replay - self.mds_cluster.mds_stop(mds_b) - self.mds_cluster.mds_fail(mds_b) - self.wait_until_equal( - lambda: self.get_info_by_name(mds_c)['state'], - "up:standby-replay", - 60) - info_c = self.get_info_by_name(mds_c) - self.assertEqual(info_c['state'], "up:standby-replay") - self.assertEqual(info_c['standby_for_name'], mds_a) - self.assertEqual(info_c['rank'], 0) - - def test_standby_failure(self): - """ - That the failure of a standby-replay daemon happens cleanly - and doesn't interrupt anything else. - """ - # Pick out exactly 2 daemons to be run during test - use_daemons = sorted(self.mds_cluster.mds_ids[0:2]) - mds_a, mds_b = use_daemons - log.info("Using MDS daemons: {0}".format(use_daemons)) - - # Configure two pairs of MDSs that are standby for each other - self.set_standby_for(mds_a, mds_b, True) - self.set_standby_for(mds_b, mds_a, False) - - # Create FS alpha and get mds_a to come up as active - fs_a = self.mds_cluster.newfs("alpha") - self.mds_cluster.mds_restart(mds_a) - fs_a.wait_for_daemons() - self.assertEqual(fs_a.get_active_names(), [mds_a]) - - # Start the standbys - self.mds_cluster.mds_restart(mds_b) - self.wait_for_daemon_start([mds_b]) - - # See the standby come up as the correct rank - info_b = self.get_info_by_name(mds_b) - self.assertEqual(info_b['state'], "up:standby-replay") - self.assertEqual(info_b['standby_for_name'], mds_a) - self.assertEqual(info_b['rank'], 0) - - # Kill the standby - self.mds_cluster.mds_stop(mds_b) - self.mds_cluster.mds_fail(mds_b) - - # See that the standby is gone and the active remains - self.assertEqual(fs_a.get_active_names(), [mds_a]) - mds_map = fs_a.get_mds_map() - self.assertEqual(len(mds_map['info']), 1) - self.assertEqual(mds_map['failed'], []) - self.assertEqual(mds_map['damaged'], []) - self.assertEqual(mds_map['stopped'], []) - - def test_rank_stopped(self): - """ - That when a rank is STOPPED, standby replays for - that rank get torn down - """ - # Pick out exactly 2 daemons to be run during test - use_daemons = sorted(self.mds_cluster.mds_ids[0:4]) - mds_a, mds_b, mds_a_s, mds_b_s = use_daemons - log.info("Using MDS daemons: {0}".format(use_daemons)) - - # a and b both get a standby - self.set_standby_for(mds_a, mds_a_s, True) - self.set_standby_for(mds_b, mds_b_s, True) - - # Create FS alpha and get mds_a to come up as active - fs_a = self.mds_cluster.newfs("alpha") - fs_a.set_max_mds(2) - - self.mds_cluster.mds_restart(mds_a) - self.wait_until_equal(lambda: fs_a.get_active_names(), [mds_a], 30) - self.mds_cluster.mds_restart(mds_b) - 
fs_a.wait_for_daemons() - self.assertEqual(sorted(fs_a.get_active_names()), [mds_a, mds_b]) - - # Start the standbys - self.mds_cluster.mds_restart(mds_b_s) - self.wait_for_daemon_start([mds_b_s]) - self.mds_cluster.mds_restart(mds_a_s) - self.wait_for_daemon_start([mds_a_s]) - info_b_s = self.get_info_by_name(mds_b_s) - self.assertEqual(info_b_s['state'], "up:standby-replay") - info_a_s = self.get_info_by_name(mds_a_s) - self.assertEqual(info_a_s['state'], "up:standby-replay") - - # Shrink the cluster - fs_a.set_max_mds(1) - fs_a.mon_manager.raw_cluster_cmd("mds", "stop", "{0}:1".format(fs_a.name)) - self.wait_until_equal( - lambda: fs_a.get_active_names(), [mds_a], - 60 - ) - - # Both 'b' and 'b_s' should go back to being standbys - self.wait_until_equal( - lambda: self.mds_cluster.get_standby_daemons(), {mds_b, mds_b_s}, - 60 - ) - - -class TestMultiFilesystems(CephFSTestCase): - CLIENTS_REQUIRED = 2 - MDSS_REQUIRED = 4 - - # We'll create our own filesystems and start our own daemons - REQUIRE_FILESYSTEM = False - - def setUp(self): - super(TestMultiFilesystems, self).setUp() - self.mds_cluster.mon_manager.raw_cluster_cmd("fs", "flag", "set", - "enable_multiple", "true", - "--yes-i-really-mean-it") - - def _setup_two(self): - fs_a = self.mds_cluster.newfs("alpha") - fs_b = self.mds_cluster.newfs("bravo") - - self.mds_cluster.mds_restart() - - # Wait for both filesystems to go healthy - fs_a.wait_for_daemons() - fs_b.wait_for_daemons() - - # Reconfigure client auth caps - for mount in self.mounts: - self.mds_cluster.mon_manager.raw_cluster_cmd_result( - 'auth', 'caps', "client.{0}".format(mount.client_id), - 'mds', 'allow', - 'mon', 'allow r', - 'osd', 'allow rw pool={0}, allow rw pool={1}'.format( - fs_a.get_data_pool_name(), fs_b.get_data_pool_name())) - - return fs_a, fs_b - - def test_clients(self): - fs_a, fs_b = self._setup_two() - - # Mount a client on fs_a - self.mount_a.mount(mount_fs_name=fs_a.name) - self.mount_a.write_n_mb("pad.bin", 1) - self.mount_a.write_n_mb("test.bin", 2) - a_created_ino = self.mount_a.path_to_ino("test.bin") - self.mount_a.create_files() - - # Mount a client on fs_b - self.mount_b.mount(mount_fs_name=fs_b.name) - self.mount_b.write_n_mb("test.bin", 1) - b_created_ino = self.mount_b.path_to_ino("test.bin") - self.mount_b.create_files() - - # Check that a non-default filesystem mount survives an MDS - # failover (i.e. that map subscription is continuous, not - # just the first time), reproduces #16022 - old_fs_b_mds = fs_b.get_active_names()[0] - self.mds_cluster.mds_stop(old_fs_b_mds) - self.mds_cluster.mds_fail(old_fs_b_mds) - fs_b.wait_for_daemons() - background = self.mount_b.write_background() - # Raise exception if the write doesn't finish (i.e. 
if client - # has not kept up with MDS failure) - try: - self.wait_until_true(lambda: background.finished, timeout=30) - except RuntimeError: - # The mount is stuck, we'll have to force it to fail cleanly - background.stdin.close() - self.mount_b.umount_wait(force=True) - raise - - self.mount_a.umount_wait() - self.mount_b.umount_wait() - - # See that the client's files went into the correct pool - self.assertTrue(fs_a.data_objects_present(a_created_ino, 1024 * 1024)) - self.assertTrue(fs_b.data_objects_present(b_created_ino, 1024 * 1024)) - - def test_standby(self): - fs_a, fs_b = self._setup_two() - - # Assert that the remaining two MDS daemons are now standbys - a_daemons = fs_a.get_active_names() - b_daemons = fs_b.get_active_names() - self.assertEqual(len(a_daemons), 1) - self.assertEqual(len(b_daemons), 1) - original_a = a_daemons[0] - original_b = b_daemons[0] - expect_standby_daemons = set(self.mds_cluster.mds_ids) - (set(a_daemons) | set(b_daemons)) - - # Need all my standbys up as well as the active daemons - self.wait_for_daemon_start() - self.assertEqual(expect_standby_daemons, self.mds_cluster.get_standby_daemons()) - - # Kill fs_a's active MDS, see a standby take over - self.mds_cluster.mds_stop(original_a) - self.mds_cluster.mon_manager.raw_cluster_cmd("mds", "fail", original_a) - self.wait_until_equal(lambda: len(fs_a.get_active_names()), 1, 30, - reject_fn=lambda v: v > 1) - # Assert that it's a *different* daemon that has now appeared in the map for fs_a - self.assertNotEqual(fs_a.get_active_names()[0], original_a) - - # Kill fs_b's active MDS, see a standby take over - self.mds_cluster.mds_stop(original_b) - self.mds_cluster.mon_manager.raw_cluster_cmd("mds", "fail", original_b) - self.wait_until_equal(lambda: len(fs_b.get_active_names()), 1, 30, - reject_fn=lambda v: v > 1) - # Assert that it's a *different* daemon that has now appeared in the map for fs_a - self.assertNotEqual(fs_b.get_active_names()[0], original_b) - - # Both of the original active daemons should be gone, and all standbys used up - self.assertEqual(self.mds_cluster.get_standby_daemons(), set()) - - # Restart the ones I killed, see them reappear as standbys - self.mds_cluster.mds_restart(original_a) - self.mds_cluster.mds_restart(original_b) - self.wait_until_true( - lambda: {original_a, original_b} == self.mds_cluster.get_standby_daemons(), - timeout=30 - ) - - def test_grow_shrink(self): - # Usual setup... - fs_a, fs_b = self._setup_two() - - # Increase max_mds on fs_b, see a standby take up the role - fs_b.set_max_mds(2) - self.wait_until_equal(lambda: len(fs_b.get_active_names()), 2, 30, - reject_fn=lambda v: v > 2 or v < 1) - - # Increase max_mds on fs_a, see a standby take up the role - fs_a.set_max_mds(2) - self.wait_until_equal(lambda: len(fs_a.get_active_names()), 2, 30, - reject_fn=lambda v: v > 2 or v < 1) - - # Shrink fs_b back to 1, see a daemon go back to standby - fs_b.set_max_mds(1) - fs_b.deactivate(1) - self.wait_until_equal(lambda: len(fs_b.get_active_names()), 1, 30, - reject_fn=lambda v: v > 2 or v < 1) - - # Grow fs_a up to 3, see the former fs_b daemon join it. 
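The grow/shrink steps above ultimately just adjust max_mds, plus an explicit deactivation of the highest rank when shrinking, as releases of this vintage required; the test then grows fs_a to three ranks below. A hedged sketch of the same operations from the CLI, with the filesystem name as a placeholder and flag spelling that may differ on newer releases:

    # Sketch: grow a filesystem to two active MDS ranks, then shrink it back,
    # mirroring set_max_mds()/deactivate() used in the test above.
    import subprocess

    def run(*cmd):
        subprocess.check_call(list(cmd))

    run("ceph", "fs", "set", "bravo", "max_mds", "2")   # a standby takes rank 1
    run("ceph", "fs", "set", "bravo", "max_mds", "1")   # shrink the target...
    run("ceph", "mds", "deactivate", "bravo:1")         # ...and retire rank 1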
- fs_a.set_max_mds(3) - self.wait_until_equal(lambda: len(fs_a.get_active_names()), 3, 60, - reject_fn=lambda v: v > 3 or v < 2) - - def test_standby_for_name(self): - # Pick out exactly 4 daemons to be run during test - use_daemons = sorted(self.mds_cluster.mds_ids[0:4]) - mds_a, mds_b, mds_c, mds_d = use_daemons - log.info("Using MDS daemons: {0}".format(use_daemons)) - - def set_standby_for(leader, follower, replay): - self.set_conf("mds.{0}".format(follower), "mds_standby_for_name", leader) - if replay: - self.set_conf("mds.{0}".format(follower), "mds_standby_replay", "true") - - # Configure two pairs of MDSs that are standby for each other - set_standby_for(mds_a, mds_b, True) - set_standby_for(mds_b, mds_a, False) - set_standby_for(mds_c, mds_d, True) - set_standby_for(mds_d, mds_c, False) - - # Create FS alpha and get mds_a to come up as active - fs_a = self.mds_cluster.newfs("alpha") - self.mds_cluster.mds_restart(mds_a) - fs_a.wait_for_daemons() - self.assertEqual(fs_a.get_active_names(), [mds_a]) - - # Create FS bravo and get mds_c to come up as active - fs_b = self.mds_cluster.newfs("bravo") - self.mds_cluster.mds_restart(mds_c) - fs_b.wait_for_daemons() - self.assertEqual(fs_b.get_active_names(), [mds_c]) - - # Start the standbys - self.mds_cluster.mds_restart(mds_b) - self.mds_cluster.mds_restart(mds_d) - self.wait_for_daemon_start([mds_b, mds_d]) - - def get_info_by_name(fs, mds_name): - mds_map = fs.get_mds_map() - for gid_str, info in mds_map['info'].items(): - if info['name'] == mds_name: - return info - - log.warn(json.dumps(mds_map, indent=2)) - raise RuntimeError("MDS '{0}' not found in filesystem MDSMap".format(mds_name)) - - # See both standbys come up as standby replay for the correct ranks - # mds_b should be in filesystem alpha following mds_a - info_b = get_info_by_name(fs_a, mds_b) - self.assertEqual(info_b['state'], "up:standby-replay") - self.assertEqual(info_b['standby_for_name'], mds_a) - self.assertEqual(info_b['rank'], 0) - # mds_d should be in filesystem alpha following mds_c - info_d = get_info_by_name(fs_b, mds_d) - self.assertEqual(info_d['state'], "up:standby-replay") - self.assertEqual(info_d['standby_for_name'], mds_c) - self.assertEqual(info_d['rank'], 0) - - # Kill both active daemons - self.mds_cluster.mds_stop(mds_a) - self.mds_cluster.mds_fail(mds_a) - self.mds_cluster.mds_stop(mds_c) - self.mds_cluster.mds_fail(mds_c) - - # Wait for standbys to take over - fs_a.wait_for_daemons() - self.assertEqual(fs_a.get_active_names(), [mds_b]) - fs_b.wait_for_daemons() - self.assertEqual(fs_b.get_active_names(), [mds_d]) - - # Start the original active daemons up again - self.mds_cluster.mds_restart(mds_a) - self.mds_cluster.mds_restart(mds_c) - self.wait_for_daemon_start([mds_a, mds_c]) - - self.assertEqual(set(self.mds_cluster.get_standby_daemons()), - {mds_a, mds_c}) - - def test_standby_for_rank(self): - use_daemons = sorted(self.mds_cluster.mds_ids[0:4]) - mds_a, mds_b, mds_c, mds_d = use_daemons - log.info("Using MDS daemons: {0}".format(use_daemons)) - - def set_standby_for(leader_rank, leader_fs, follower_id): - self.set_conf("mds.{0}".format(follower_id), - "mds_standby_for_rank", leader_rank) - - fscid = leader_fs.get_namespace_id() - self.set_conf("mds.{0}".format(follower_id), - "mds_standby_for_fscid", fscid) - - fs_a = self.mds_cluster.newfs("alpha") - fs_b = self.mds_cluster.newfs("bravo") - set_standby_for(0, fs_a, mds_a) - set_standby_for(0, fs_a, mds_b) - set_standby_for(0, fs_b, mds_c) - set_standby_for(0, fs_b, mds_d) - - 
self.mds_cluster.mds_restart(mds_a) - fs_a.wait_for_daemons() - self.assertEqual(fs_a.get_active_names(), [mds_a]) - - self.mds_cluster.mds_restart(mds_c) - fs_b.wait_for_daemons() - self.assertEqual(fs_b.get_active_names(), [mds_c]) - - self.mds_cluster.mds_restart(mds_b) - self.mds_cluster.mds_restart(mds_d) - self.wait_for_daemon_start([mds_b, mds_d]) - - self.mds_cluster.mds_stop(mds_a) - self.mds_cluster.mds_fail(mds_a) - self.mds_cluster.mds_stop(mds_c) - self.mds_cluster.mds_fail(mds_c) - - fs_a.wait_for_daemons() - self.assertEqual(fs_a.get_active_names(), [mds_b]) - fs_b.wait_for_daemons() - self.assertEqual(fs_b.get_active_names(), [mds_d]) - - def test_standby_for_fscid(self): - """ - That I can set a standby FSCID with no rank, and the result is - that daemons join any rank for that filesystem. - """ - use_daemons = sorted(self.mds_cluster.mds_ids[0:4]) - mds_a, mds_b, mds_c, mds_d = use_daemons - - log.info("Using MDS daemons: {0}".format(use_daemons)) - - def set_standby_for(leader_fs, follower_id): - fscid = leader_fs.get_namespace_id() - self.set_conf("mds.{0}".format(follower_id), - "mds_standby_for_fscid", fscid) - - # Create two filesystems which should have two ranks each - fs_a = self.mds_cluster.newfs("alpha") - - fs_b = self.mds_cluster.newfs("bravo") - - fs_a.set_max_mds(2) - fs_b.set_max_mds(2) - - # Set all the daemons to have a FSCID assignment but no other - # standby preferences. - set_standby_for(fs_a, mds_a) - set_standby_for(fs_a, mds_b) - set_standby_for(fs_b, mds_c) - set_standby_for(fs_b, mds_d) - - # Now when we start all daemons at once, they should fall into - # ranks in the right filesystem - self.mds_cluster.mds_restart(mds_a) - self.mds_cluster.mds_restart(mds_b) - self.mds_cluster.mds_restart(mds_c) - self.mds_cluster.mds_restart(mds_d) - self.wait_for_daemon_start([mds_a, mds_b, mds_c, mds_d]) - fs_a.wait_for_daemons() - fs_b.wait_for_daemons() - self.assertEqual(set(fs_a.get_active_names()), {mds_a, mds_b}) - self.assertEqual(set(fs_b.get_active_names()), {mds_c, mds_d}) - - def test_standby_for_invalid_fscid(self): - """ - That an invalid standby_fscid does not cause a mon crash - """ - use_daemons = sorted(self.mds_cluster.mds_ids[0:3]) - mds_a, mds_b, mds_c = use_daemons - log.info("Using MDS daemons: {0}".format(use_daemons)) - - def set_standby_for_rank(leader_rank, follower_id): - self.set_conf("mds.{0}".format(follower_id), - "mds_standby_for_rank", leader_rank) - - # Create one fs - fs_a = self.mds_cluster.newfs("cephfs") - - # Get configured mons in the cluster, so we can see if any - # crashed later. - configured_mons = fs_a.mon_manager.get_mon_quorum() - - # Set all the daemons to have a rank assignment but no other - # standby preferences. 
- set_standby_for_rank(0, mds_a) - set_standby_for_rank(0, mds_b) - - # Set third daemon to have invalid fscid assignment and no other - # standby preferences - invalid_fscid = 123 - self.set_conf("mds.{0}".format(mds_c), "mds_standby_for_fscid", invalid_fscid) - - #Restart all the daemons to make the standby preference applied - self.mds_cluster.mds_restart(mds_a) - self.mds_cluster.mds_restart(mds_b) - self.mds_cluster.mds_restart(mds_c) - self.wait_for_daemon_start([mds_a, mds_b, mds_c]) - - #Stop active mds daemon service of fs - if (fs_a.get_active_names(), [mds_a]): - self.mds_cluster.mds_stop(mds_a) - self.mds_cluster.mds_fail(mds_a) - fs_a.wait_for_daemons() - else: - self.mds_cluster.mds_stop(mds_b) - self.mds_cluster.mds_fail(mds_b) - fs_a.wait_for_daemons() - - #Get active mons from cluster - active_mons = fs_a.mon_manager.get_mon_quorum() - - #Check for active quorum mon status and configured mon status - self.assertEqual(active_mons, configured_mons, - "Not all mons are in quorum Invalid standby invalid fscid test failed!") diff --git a/src/ceph/qa/tasks/cephfs/test_flush.py b/src/ceph/qa/tasks/cephfs/test_flush.py deleted file mode 100644 index 1f84e42..0000000 --- a/src/ceph/qa/tasks/cephfs/test_flush.py +++ /dev/null @@ -1,113 +0,0 @@ - -from textwrap import dedent -from tasks.cephfs.cephfs_test_case import CephFSTestCase -from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO - - -class TestFlush(CephFSTestCase): - def test_flush(self): - self.mount_a.run_shell(["mkdir", "mydir"]) - self.mount_a.run_shell(["touch", "mydir/alpha"]) - dir_ino = self.mount_a.path_to_ino("mydir") - file_ino = self.mount_a.path_to_ino("mydir/alpha") - - # Unmount the client so that it isn't still holding caps - self.mount_a.umount_wait() - - # Before flush, the dirfrag object does not exist - with self.assertRaises(ObjectNotFound): - self.fs.list_dirfrag(dir_ino) - - # Before flush, the file's backtrace has not been written - with self.assertRaises(ObjectNotFound): - self.fs.read_backtrace(file_ino) - - # Before flush, there are no dentries in the root - self.assertEqual(self.fs.list_dirfrag(ROOT_INO), []) - - # Execute flush - flush_data = self.fs.mds_asok(["flush", "journal"]) - self.assertEqual(flush_data['return_code'], 0) - - # After flush, the dirfrag object has been created - dir_list = self.fs.list_dirfrag(dir_ino) - self.assertEqual(dir_list, ["alpha_head"]) - - # And the 'mydir' dentry is in the root - self.assertEqual(self.fs.list_dirfrag(ROOT_INO), ['mydir_head']) - - # ...and the data object has its backtrace - backtrace = self.fs.read_backtrace(file_ino) - self.assertEqual(['alpha', 'mydir'], [a['dname'] for a in backtrace['ancestors']]) - self.assertEqual([dir_ino, 1], [a['dirino'] for a in backtrace['ancestors']]) - self.assertEqual(file_ino, backtrace['ino']) - - # ...and the journal is truncated to just a single subtreemap from the - # newly created segment - summary_output = self.fs.journal_tool(["event", "get", "summary"]) - try: - self.assertEqual(summary_output, - dedent( - """ - Events by type: - SUBTREEMAP: 1 - Errors: 0 - """ - ).strip()) - except AssertionError: - # In some states, flushing the journal will leave you - # an extra event from locks a client held. This is - # correct behaviour: the MDS is flushing the journal, - # it's just that new events are getting added too. - # In this case, we should nevertheless see a fully - # empty journal after a second flush. 
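The read_backtrace() checks above rely on the backtrace being stored as the "parent" xattr on the file's first data object ("<inode hex>.00000000") in the data pool. A hedged sketch of fetching and decoding it by hand, assuming ceph-dencoder is available on the admin host and using placeholder pool/inode values:

    # Sketch: fetch a file's backtrace straight from RADOS and decode it,
    # approximating what read_backtrace() provides for the assertions above.
    import json
    import subprocess
    import tempfile

    def read_backtrace_raw(ino, data_pool="cephfs_data"):
        obj = "{0:x}.00000000".format(ino)
        raw = subprocess.check_output(
            ["rados", "-p", data_pool, "getxattr", obj, "parent"])
        with tempfile.NamedTemporaryFile() as f:
            f.write(raw)
            f.flush()
            out = subprocess.check_output(
                ["ceph-dencoder", "type", "inode_backtrace_t",
                 "import", f.name, "decode", "dump_json"])
        return json.loads(out)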
- self.assertEqual(summary_output, - dedent( - """ - Events by type: - SUBTREEMAP: 1 - UPDATE: 1 - Errors: 0 - """ - ).strip()) - flush_data = self.fs.mds_asok(["flush", "journal"]) - self.assertEqual(flush_data['return_code'], 0) - self.assertEqual(self.fs.journal_tool(["event", "get", "summary"]), - dedent( - """ - Events by type: - SUBTREEMAP: 1 - Errors: 0 - """ - ).strip()) - - # Now for deletion! - # We will count the RADOS deletions and MDS file purges, to verify that - # the expected behaviour is happening as a result of the purge - initial_dels = self.fs.mds_asok(['perf', 'dump', 'objecter'])['objecter']['osdop_delete'] - initial_purges = self.fs.mds_asok(['perf', 'dump', 'mds_cache'])['mds_cache']['strays_enqueued'] - - # Use a client to delete a file - self.mount_a.mount() - self.mount_a.wait_until_mounted() - self.mount_a.run_shell(["rm", "-rf", "mydir"]) - - # Flush the journal so that the directory inode can be purged - flush_data = self.fs.mds_asok(["flush", "journal"]) - self.assertEqual(flush_data['return_code'], 0) - - # We expect to see a single file purge - self.wait_until_true( - lambda: self.fs.mds_asok(['perf', 'dump', 'mds_cache'])['mds_cache']['strays_enqueued'] - initial_purges >= 2, - 60) - - # We expect two deletions, one of the dirfrag and one of the backtrace - self.wait_until_true( - lambda: self.fs.mds_asok(['perf', 'dump', 'objecter'])['objecter']['osdop_delete'] - initial_dels >= 2, - 60) # timeout is fairly long to allow for tick+rados latencies - - with self.assertRaises(ObjectNotFound): - self.fs.list_dirfrag(dir_ino) - with self.assertRaises(ObjectNotFound): - self.fs.read_backtrace(file_ino) - self.assertEqual(self.fs.list_dirfrag(ROOT_INO), []) diff --git a/src/ceph/qa/tasks/cephfs/test_forward_scrub.py b/src/ceph/qa/tasks/cephfs/test_forward_scrub.py deleted file mode 100644 index ac912dd..0000000 --- a/src/ceph/qa/tasks/cephfs/test_forward_scrub.py +++ /dev/null @@ -1,291 +0,0 @@ - -""" -Test that the forward scrub functionality can traverse metadata and apply -requested tags, on well formed metadata. - -This is *not* the real testing for forward scrub, which will need to test -how the functionality responds to damaged metadata. 
- -""" -import json - -import logging -from collections import namedtuple -from textwrap import dedent - -from teuthology.orchestra.run import CommandFailedError -from tasks.cephfs.cephfs_test_case import CephFSTestCase - -import struct - -log = logging.getLogger(__name__) - - -ValidationError = namedtuple("ValidationError", ["exception", "backtrace"]) - - -class TestForwardScrub(CephFSTestCase): - MDSS_REQUIRED = 1 - - def _read_str_xattr(self, pool, obj, attr): - """ - Read a ceph-encoded string from a rados xattr - """ - output = self.fs.rados(["getxattr", obj, attr], pool=pool) - strlen = struct.unpack('i', output[0:4])[0] - return output[4:(4 + strlen)] - - def _get_paths_to_ino(self): - inos = {} - p = self.mount_a.run_shell(["find", "./"]) - paths = p.stdout.getvalue().strip().split() - for path in paths: - inos[path] = self.mount_a.path_to_ino(path) - - return inos - - def test_apply_tag(self): - self.mount_a.run_shell(["mkdir", "parentdir"]) - self.mount_a.run_shell(["mkdir", "parentdir/childdir"]) - self.mount_a.run_shell(["touch", "rfile"]) - self.mount_a.run_shell(["touch", "parentdir/pfile"]) - self.mount_a.run_shell(["touch", "parentdir/childdir/cfile"]) - - # Build a structure mapping path to inode, as we will later want - # to check object by object and objects are named after ino number - inos = self._get_paths_to_ino() - - # Flush metadata: this is a friendly test of forward scrub so we're skipping - # the part where it's meant to cope with dirty metadata - self.mount_a.umount_wait() - self.fs.mds_asok(["flush", "journal"]) - - tag = "mytag" - - # Execute tagging forward scrub - self.fs.mds_asok(["tag", "path", "/parentdir", tag]) - # Wait for completion - import time - time.sleep(10) - # FIXME watching clog isn't a nice mechanism for this, once we have a ScrubMap we'll - # watch that instead - - # Check that dirs were tagged - for dirpath in ["./parentdir", "./parentdir/childdir"]: - self.assertTagged(inos[dirpath], tag, self.fs.get_metadata_pool_name()) - - # Check that files were tagged - for filepath in ["./parentdir/pfile", "./parentdir/childdir/cfile"]: - self.assertTagged(inos[filepath], tag, self.fs.get_data_pool_name()) - - # This guy wasn't in the tag path, shouldn't have been tagged - self.assertUntagged(inos["./rfile"]) - - def assertUntagged(self, ino): - file_obj_name = "{0:x}.00000000".format(ino) - with self.assertRaises(CommandFailedError): - self._read_str_xattr( - self.fs.get_data_pool_name(), - file_obj_name, - "scrub_tag" - ) - - def assertTagged(self, ino, tag, pool): - file_obj_name = "{0:x}.00000000".format(ino) - wrote = self._read_str_xattr( - pool, - file_obj_name, - "scrub_tag" - ) - self.assertEqual(wrote, tag) - - def _validate_linkage(self, expected): - inos = self._get_paths_to_ino() - try: - self.assertDictEqual(inos, expected) - except AssertionError: - log.error("Expected: {0}".format(json.dumps(expected, indent=2))) - log.error("Actual: {0}".format(json.dumps(inos, indent=2))) - raise - - def test_orphan_scan(self): - # Create some files whose metadata we will flush - self.mount_a.run_python(dedent(""" - import os - mount_point = "{mount_point}" - parent = os.path.join(mount_point, "parent") - os.mkdir(parent) - flushed = os.path.join(parent, "flushed") - os.mkdir(flushed) - for f in ["alpha", "bravo", "charlie"]: - open(os.path.join(flushed, f), 'w').write(f) - """.format(mount_point=self.mount_a.mountpoint))) - - inos = self._get_paths_to_ino() - - # Flush journal - # Umount before flush to avoid cap releases putting - # things we 
don't want in the journal later. - self.mount_a.umount_wait() - self.fs.mds_asok(["flush", "journal"]) - - # Create a new inode that's just in the log, i.e. would - # look orphaned to backward scan if backward scan wisnae - # respectin' tha scrub_tag xattr. - self.mount_a.mount() - self.mount_a.run_shell(["mkdir", "parent/unflushed"]) - self.mount_a.run_shell(["dd", "if=/dev/urandom", - "of=./parent/unflushed/jfile", - "bs=1M", "count=8"]) - inos["./parent/unflushed"] = self.mount_a.path_to_ino("./parent/unflushed") - inos["./parent/unflushed/jfile"] = self.mount_a.path_to_ino("./parent/unflushed/jfile") - self.mount_a.umount_wait() - - # Orphan an inode by deleting its dentry - # Our victim will be.... bravo. - self.mount_a.umount_wait() - self.fs.mds_stop() - self.fs.mds_fail() - self.fs.set_ceph_conf('mds', 'mds verify scatter', False) - self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False) - frag_obj_id = "{0:x}.00000000".format(inos["./parent/flushed"]) - self.fs.rados(["rmomapkey", frag_obj_id, "bravo_head"]) - - self.fs.mds_restart() - self.fs.wait_for_daemons() - - # See that the orphaned file is indeed missing from a client's POV - self.mount_a.mount() - damaged_state = self._get_paths_to_ino() - self.assertNotIn("./parent/flushed/bravo", damaged_state) - self.mount_a.umount_wait() - - # Run a tagging forward scrub - tag = "mytag123" - self.fs.mds_asok(["tag", "path", "/parent", tag]) - - # See that the orphan wisnae tagged - self.assertUntagged(inos['./parent/flushed/bravo']) - - # See that the flushed-metadata-and-still-present files are tagged - self.assertTagged(inos['./parent/flushed/alpha'], tag, self.fs.get_data_pool_name()) - self.assertTagged(inos['./parent/flushed/charlie'], tag, self.fs.get_data_pool_name()) - - # See that journalled-but-not-flushed file *was* tagged - self.assertTagged(inos['./parent/unflushed/jfile'], tag, self.fs.get_data_pool_name()) - - # Run cephfs-data-scan targeting only orphans - self.fs.mds_stop() - self.fs.mds_fail() - self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()]) - self.fs.data_scan([ - "scan_inodes", - "--filter-tag", tag, - self.fs.get_data_pool_name() - ]) - - # After in-place injection stats should be kosher again - self.fs.set_ceph_conf('mds', 'mds verify scatter', True) - self.fs.set_ceph_conf('mds', 'mds debug scatterstat', True) - - # And we should have all the same linkage we started with, - # and no lost+found, and no extra inodes! 
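The assertTagged/assertUntagged helpers above read the scrub_tag xattr that a tagging forward scrub stamps onto each object, and _read_str_xattr shows the wire format: a 32-bit (little-endian) length followed by the string bytes. A small standalone sketch of the same check, with the pool and object name as placeholders:

    # Sketch: read an object's scrub_tag xattr and decode the length-prefixed
    # string, mirroring _read_str_xattr() used by the assertions above.
    import struct
    import subprocess

    def read_scrub_tag(pool, obj):
        raw = subprocess.check_output(
            ["rados", "-p", pool, "getxattr", obj, "scrub_tag"])
        (strlen,) = struct.unpack("<i", raw[0:4])
        return raw[4:4 + strlen].decode()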
- self.fs.mds_restart() - self.fs.wait_for_daemons() - self.mount_a.mount() - self._validate_linkage(inos) - - def _stash_inotable(self): - # Get all active ranks - ranks = self.fs.get_all_mds_rank() - - inotable_dict = {} - for rank in ranks: - inotable_oid = "mds{rank:d}_".format(rank=rank) + "inotable" - print "Trying to fetch inotable object: " + inotable_oid - - #self.fs.get_metadata_object("InoTable", "mds0_inotable") - inotable_raw = self.fs.get_metadata_object_raw(inotable_oid) - inotable_dict[inotable_oid] = inotable_raw - return inotable_dict - - def test_inotable_sync(self): - self.mount_a.write_n_mb("file1_sixmegs", 6) - - # Flush journal - self.mount_a.umount_wait() - self.fs.mds_asok(["flush", "journal"]) - - inotable_copy = self._stash_inotable() - - self.mount_a.mount() - - self.mount_a.write_n_mb("file2_sixmegs", 6) - self.mount_a.write_n_mb("file3_sixmegs", 6) - - inos = self._get_paths_to_ino() - - # Flush journal - self.mount_a.umount_wait() - self.fs.mds_asok(["flush", "journal"]) - - self.mount_a.umount_wait() - - with self.assert_cluster_log("inode table repaired", invert_match=True): - self.fs.mds_asok(["scrub_path", "/", "repair", "recursive"]) - - self.mds_cluster.mds_stop() - self.mds_cluster.mds_fail() - - # Truncate the journal (to ensure the inotable on disk - # is all that will be in the InoTable in memory) - - self.fs.journal_tool(["event", "splice", - "--inode={0}".format(inos["./file2_sixmegs"]), "summary"]) - - self.fs.journal_tool(["event", "splice", - "--inode={0}".format(inos["./file3_sixmegs"]), "summary"]) - - # Revert to old inotable. - for key, value in inotable_copy.iteritems(): - self.fs.put_metadata_object_raw(key, value) - - self.mds_cluster.mds_restart() - self.fs.wait_for_daemons() - - with self.assert_cluster_log("inode table repaired"): - self.fs.mds_asok(["scrub_path", "/", "repair", "recursive"]) - - self.mds_cluster.mds_stop() - table_text = self.fs.table_tool(["0", "show", "inode"]) - table = json.loads(table_text) - self.assertGreater( - table['0']['data']['inotable']['free'][0]['start'], - inos['./file3_sixmegs']) - - def test_backtrace_repair(self): - """ - That the MDS can repair an inodes backtrace in the data pool - if it is found to be damaged. 
- """ - # Create a file for subsequent checks - self.mount_a.run_shell(["mkdir", "parent_a"]) - self.mount_a.run_shell(["touch", "parent_a/alpha"]) - file_ino = self.mount_a.path_to_ino("parent_a/alpha") - - # That backtrace and layout are written after initial flush - self.fs.mds_asok(["flush", "journal"]) - backtrace = self.fs.read_backtrace(file_ino) - self.assertEqual(['alpha', 'parent_a'], - [a['dname'] for a in backtrace['ancestors']]) - - # Go corrupt the backtrace - self.fs._write_data_xattr(file_ino, "parent", - "oh i'm sorry did i overwrite your xattr?") - - with self.assert_cluster_log("bad backtrace on inode"): - self.fs.mds_asok(["scrub_path", "/", "repair", "recursive"]) - self.fs.mds_asok(["flush", "journal"]) - backtrace = self.fs.read_backtrace(file_ino) - self.assertEqual(['alpha', 'parent_a'], - [a['dname'] for a in backtrace['ancestors']]) diff --git a/src/ceph/qa/tasks/cephfs/test_fragment.py b/src/ceph/qa/tasks/cephfs/test_fragment.py deleted file mode 100644 index a62ef74..0000000 --- a/src/ceph/qa/tasks/cephfs/test_fragment.py +++ /dev/null @@ -1,232 +0,0 @@ - - -from tasks.cephfs.cephfs_test_case import CephFSTestCase -from teuthology.orchestra import run - -import logging -log = logging.getLogger(__name__) - - -class TestFragmentation(CephFSTestCase): - CLIENTS_REQUIRED = 1 - MDSS_REQUIRED = 1 - - def get_splits(self): - return self.fs.mds_asok(['perf', 'dump', 'mds'])['mds']['dir_split'] - - def get_merges(self): - return self.fs.mds_asok(['perf', 'dump', 'mds'])['mds']['dir_merge'] - - def get_dir_ino(self, path): - dir_cache = self.fs.read_cache(path, 0) - dir_ino = None - dir_inono = self.mount_a.path_to_ino(path.strip("/")) - for ino in dir_cache: - if ino['ino'] == dir_inono: - dir_ino = ino - break - self.assertIsNotNone(dir_ino) - return dir_ino - - def _configure(self, **kwargs): - """ - Apply kwargs as MDS configuration settings, enable dirfrags - and restart the MDSs. - """ - kwargs['mds_bal_frag'] = "true" - - for k, v in kwargs.items(): - self.ceph_cluster.set_ceph_conf("mds", k, v.__str__()) - - self.fs.set_allow_dirfrags(True) - - self.mds_cluster.mds_fail_restart() - self.fs.wait_for_daemons() - - def test_oversize(self): - """ - That a directory is split when it becomes too large. - """ - - split_size = 20 - merge_size = 5 - - self._configure( - mds_bal_split_size=split_size, - mds_bal_merge_size=merge_size, - mds_bal_split_bits=1 - ) - - self.assertEqual(self.get_splits(), 0) - - self.mount_a.create_n_files("splitdir/file", split_size + 1) - - self.wait_until_true( - lambda: self.get_splits() == 1, - timeout=30 - ) - - frags = self.get_dir_ino("/splitdir")['dirfrags'] - self.assertEqual(len(frags), 2) - self.assertEqual(frags[0]['dirfrag'], "0x10000000000.0*") - self.assertEqual(frags[1]['dirfrag'], "0x10000000000.1*") - self.assertEqual( - sum([len(f['dentries']) for f in frags]), - split_size + 1 - ) - - self.assertEqual(self.get_merges(), 0) - - self.mount_a.run_shell(["rm", "-f", run.Raw("splitdir/file*")]) - - self.wait_until_true( - lambda: self.get_merges() == 1, - timeout=30 - ) - - self.assertEqual(len(self.get_dir_ino("/splitdir")["dirfrags"]), 1) - - def test_rapid_creation(self): - """ - That the fast-splitting limit of 1.5x normal limit is - applied when creating dentries quickly. 
- """ - - split_size = 100 - merge_size = 1 - - self._configure( - mds_bal_split_size=split_size, - mds_bal_merge_size=merge_size, - mds_bal_split_bits=3, - mds_bal_fragment_size_max=int(split_size * 1.5 + 2) - ) - - # We test this only at a single split level. If a client was sending - # IO so fast that it hit a second split before the first split - # was complete, it could violate mds_bal_fragment_size_max -- there - # is a window where the child dirfrags of a split are unfrozen - # (so they can grow), but still have STATE_FRAGMENTING (so they - # can't be split). - - # By writing 4x the split size when the split bits are set - # to 3 (i.e. 4-ways), I am reasonably sure to see precisely - # one split. The test is to check whether that split - # happens soon enough that the client doesn't exceed - # 2x the split_size (the "immediate" split mode should - # kick in at 1.5x the split size). - - self.assertEqual(self.get_splits(), 0) - self.mount_a.create_n_files("splitdir/file", split_size * 4) - self.wait_until_equal( - self.get_splits, - 1, - reject_fn=lambda s: s > 1, - timeout=30 - ) - - def test_deep_split(self): - """ - That when the directory grows many times larger than split size, - the fragments get split again. - """ - - split_size = 100 - merge_size = 1 # i.e. don't merge frag unless its empty - split_bits = 1 - - branch_factor = 2**split_bits - - # Arbitrary: how many levels shall we try fragmenting before - # ending the test? - max_depth = 5 - - self._configure( - mds_bal_split_size=split_size, - mds_bal_merge_size=merge_size, - mds_bal_split_bits=split_bits - ) - - # Each iteration we will create another level of fragments. The - # placement of dentries into fragments is by hashes (i.e. pseudo - # random), so we rely on statistics to get the behaviour that - # by writing about 1.5x as many dentries as the split_size times - # the number of frags, we will get them all to exceed their - # split size and trigger a split. - depth = 0 - files_written = 0 - splits_expected = 0 - while depth < max_depth: - log.info("Writing files for depth {0}".format(depth)) - target_files = branch_factor**depth * int(split_size * 1.5) - create_files = target_files - files_written - - self.ceph_cluster.mon_manager.raw_cluster_cmd("log", - "{0} Writing {1} files (depth={2})".format( - self.__class__.__name__, create_files, depth - )) - self.mount_a.create_n_files("splitdir/file_{0}".format(depth), - create_files) - self.ceph_cluster.mon_manager.raw_cluster_cmd("log", - "{0} Done".format(self.__class__.__name__)) - - files_written += create_files - log.info("Now have {0} files".format(files_written)) - - splits_expected += branch_factor**depth - log.info("Waiting to see {0} splits".format(splits_expected)) - try: - self.wait_until_equal( - self.get_splits, - splits_expected, - timeout=30, - reject_fn=lambda x: x > splits_expected - ) - - frags = self.get_dir_ino("/splitdir")['dirfrags'] - self.assertEqual(len(frags), branch_factor**(depth+1)) - self.assertEqual( - sum([len(f['dentries']) for f in frags]), - target_files - ) - except: - # On failures, log what fragmentation we actually ended - # up with. This block is just for logging, at the end - # we raise the exception again. 
- frags = self.get_dir_ino("/splitdir")['dirfrags'] - log.info("depth={0} splits_expected={1} files_written={2}".format( - depth, splits_expected, files_written - )) - log.info("Dirfrags:") - for f in frags: - log.info("{0}: {1}".format( - f['dirfrag'], len(f['dentries']) - )) - raise - - depth += 1 - - # Remember the inode number because we will be checking for - # objects later. - dir_inode_no = self.mount_a.path_to_ino("splitdir") - - self.mount_a.run_shell(["rm", "-rf", "splitdir/"]) - self.mount_a.umount_wait() - - self.fs.mds_asok(['flush', 'journal']) - - # Wait for all strays to purge - self.wait_until_equal( - lambda: self.fs.mds_asok(['perf', 'dump', 'mds_cache'] - )['mds_cache']['num_strays'], - 0, - timeout=1200 - ) - # Check that the metadata pool objects for all the myriad - # child fragments are gone - metadata_objs = self.fs.rados(["ls"]) - frag_objs = [] - for o in metadata_objs: - if o.startswith("{0:x}.".format(dir_inode_no)): - frag_objs.append(o) - self.assertListEqual(frag_objs, []) diff --git a/src/ceph/qa/tasks/cephfs/test_full.py b/src/ceph/qa/tasks/cephfs/test_full.py deleted file mode 100644 index e69ccb3..0000000 --- a/src/ceph/qa/tasks/cephfs/test_full.py +++ /dev/null @@ -1,414 +0,0 @@ - - -import json -import logging -import os -from textwrap import dedent -import time -from teuthology.orchestra.run import CommandFailedError -from tasks.cephfs.fuse_mount import FuseMount -from tasks.cephfs.cephfs_test_case import CephFSTestCase - - -log = logging.getLogger(__name__) - - -class FullnessTestCase(CephFSTestCase): - CLIENTS_REQUIRED = 2 - - # Subclasses define whether they're filling whole cluster or just data pool - data_only = False - - # Subclasses define how many bytes should be written to achieve fullness - pool_capacity = None - fill_mb = None - - # Subclasses define what fullness means to them - def is_full(self): - raise NotImplementedError() - - def setUp(self): - CephFSTestCase.setUp(self) - - # These tests just use a single active MDS throughout, so remember its ID - # for use in mds_asok calls - self.active_mds_id = self.fs.get_active_names()[0] - - # Capture the initial OSD map epoch for later use - self.initial_osd_epoch = json.loads( - self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json").strip() - )['epoch'] - - # Check the initial barrier epoch on the MDS: this should be - # set to the latest map at MDS startup. We do this check in - # setUp to get in there before subclasses might touch things - # in their own setUp functions. - self.assertGreaterEqual(self.fs.mds_asok(["status"], mds_id=self.active_mds_id)['osdmap_epoch_barrier'], - self.initial_osd_epoch) - - def test_barrier(self): - """ - That when an OSD epoch barrier is set on an MDS, subsequently - issued capabilities cause clients to update their OSD map to that - epoch. - """ - - # Sync up clients with initial MDS OSD map barrier - self.mount_a.open_no_data("foo") - self.mount_b.open_no_data("bar") - - # Grab mounts' initial OSD epochs: later we will check that - # it hasn't advanced beyond this point. 
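- # get_osd_epoch() returns a pair: the OSD map epoch the client currently
- # holds and the epoch barrier it has been told to respect, i.e.
- #   epoch, barrier = self.mount_a.get_osd_epoch()
- # Here we only record the epoch half as a baseline for the later checks.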
- mount_a_initial_epoch = self.mount_a.get_osd_epoch()[0] - mount_b_initial_epoch = self.mount_b.get_osd_epoch()[0] - - # Freshly mounted at start of test, should be up to date with OSD map - self.assertGreaterEqual(mount_a_initial_epoch, self.initial_osd_epoch) - self.assertGreaterEqual(mount_b_initial_epoch, self.initial_osd_epoch) - - # Set and unset a flag to cause OSD epoch to increment - self.fs.mon_manager.raw_cluster_cmd("osd", "set", "pause") - self.fs.mon_manager.raw_cluster_cmd("osd", "unset", "pause") - - out = self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json").strip() - new_epoch = json.loads(out)['epoch'] - self.assertNotEqual(self.initial_osd_epoch, new_epoch) - - # Do a metadata operation on clients, witness that they end up with - # the old OSD map from startup time (nothing has prompted client - # to update its map) - self.mount_a.open_no_data("alpha") - self.mount_b.open_no_data("bravo1") - - # Sleep long enough that if the OSD map was propagating it would - # have done so (this is arbitrary because we are 'waiting' for something - # to *not* happen). - time.sleep(30) - - mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch() - self.assertEqual(mount_a_epoch, mount_a_initial_epoch) - mount_b_epoch, mount_b_barrier = self.mount_b.get_osd_epoch() - self.assertEqual(mount_b_epoch, mount_b_initial_epoch) - - # Set a barrier on the MDS - self.fs.mds_asok(["osdmap", "barrier", new_epoch.__str__()], mds_id=self.active_mds_id) - - # Do an operation on client B, witness that it ends up with - # the latest OSD map from the barrier. This shouldn't generate any - # cap revokes to A because B was already the last one to touch - # a file in root. - self.mount_b.run_shell(["touch", "bravo2"]) - self.mount_b.open_no_data("bravo2") - - # Some time passes here because the metadata part of the operation - # completes immediately, while the resulting OSD map update happens - # asynchronously (it's an Objecter::_maybe_request_map) as a result - # of seeing the new epoch barrier. - self.wait_until_equal( - lambda: self.mount_b.get_osd_epoch(), - (new_epoch, new_epoch), - 30, - lambda x: x[0] > new_epoch or x[1] > new_epoch) - - # ...and none of this should have affected the oblivious mount a, - # because it wasn't doing any data or metadata IO - mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch() - self.assertEqual(mount_a_epoch, mount_a_initial_epoch) - - def _data_pool_name(self): - data_pool_names = self.fs.get_data_pool_names() - if len(data_pool_names) > 1: - raise RuntimeError("This test can't handle multiple data pools") - else: - return data_pool_names[0] - - def _test_full(self, easy_case): - """ - - That a client trying to write data to a file is prevented - from doing so with an -EFULL result - - That they are also prevented from creating new files by the MDS. - - That they may delete another file to get the system healthy again - - :param easy_case: if true, delete a successfully written file to - free up space. else, delete the file that experienced - the failed write. - """ - - osd_mon_report_interval_max = int(self.fs.get_config("osd_mon_report_interval_max", service_type='osd')) - - log.info("Writing {0}MB should fill this cluster".format(self.fill_mb)) - - # Fill up the cluster. 
This dd may or may not fail, as it depends on - # how soon the cluster recognises its own fullness - self.mount_a.write_n_mb("large_file_a", self.fill_mb / 2) - try: - self.mount_a.write_n_mb("large_file_b", self.fill_mb / 2) - except CommandFailedError: - log.info("Writing file B failed (full status happened already)") - assert self.is_full() - else: - log.info("Writing file B succeeded (full status will happen soon)") - self.wait_until_true(lambda: self.is_full(), - timeout=osd_mon_report_interval_max * 5) - - # Attempting to write more data should give me ENOSPC - with self.assertRaises(CommandFailedError) as ar: - self.mount_a.write_n_mb("large_file_b", 50, seek=self.fill_mb / 2) - self.assertEqual(ar.exception.exitstatus, 1) # dd returns 1 on "No space" - - # Wait for the MDS to see the latest OSD map so that it will reliably - # be applying the policy of rejecting non-deletion metadata operations - # while in the full state. - osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch'] - self.wait_until_true( - lambda: self.fs.mds_asok(['status'], mds_id=self.active_mds_id)['osdmap_epoch'] >= osd_epoch, - timeout=10) - - if not self.data_only: - with self.assertRaises(CommandFailedError): - self.mount_a.write_n_mb("small_file_1", 0) - - # Clear out some space - if easy_case: - self.mount_a.run_shell(['rm', '-f', 'large_file_a']) - self.mount_a.run_shell(['rm', '-f', 'large_file_b']) - else: - # In the hard case it is the file that filled the system. - # Before the new #7317 (ENOSPC, epoch barrier) changes, this - # would fail because the last objects written would be - # stuck in the client cache as objecter operations. - self.mount_a.run_shell(['rm', '-f', 'large_file_b']) - self.mount_a.run_shell(['rm', '-f', 'large_file_a']) - - # Here we are waiting for two things to happen: - # * The MDS to purge the stray folder and execute object deletions - # * The OSDs to inform the mon that they are no longer full - self.wait_until_true(lambda: not self.is_full(), - timeout=osd_mon_report_interval_max * 5) - - # Wait for the MDS to see the latest OSD map so that it will reliably - # be applying the free space policy - osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch'] - self.wait_until_true( - lambda: self.fs.mds_asok(['status'], mds_id=self.active_mds_id)['osdmap_epoch'] >= osd_epoch, - timeout=10) - - # Now I should be able to write again - self.mount_a.write_n_mb("large_file", 50, seek=0) - - # Ensure that the MDS keeps its OSD epoch barrier across a restart - - def test_full_different_file(self): - self._test_full(True) - - def test_full_same_file(self): - self._test_full(False) - - def _remote_write_test(self, template): - """ - Run some remote python in a way that's useful for - testing free space behaviour (see test_* methods using this) - """ - file_path = os.path.join(self.mount_a.mountpoint, "full_test_file") - - # Enough to trip the full flag - osd_mon_report_interval_max = int(self.fs.get_config("osd_mon_report_interval_max", service_type='osd')) - mon_tick_interval = int(self.fs.get_config("mon_tick_interval", service_type="mon")) - - # Sufficient data to cause RADOS cluster to go 'full' - log.info("pool capacity {0}, {1}MB should be enough to fill it".format(self.pool_capacity, self.fill_mb)) - - # Long enough for RADOS cluster to notice it is full and set flag on mons - # (report_interval for mon to learn PG stats, tick interval for it to update OSD map, - # factor of 1.5 
for I/O + network latency in committing OSD map and distributing it - # to the OSDs) - full_wait = (osd_mon_report_interval_max + mon_tick_interval) * 1.5 - - # Configs for this test should bring this setting down in order to - # run reasonably quickly - if osd_mon_report_interval_max > 10: - log.warn("This test may run rather slowly unless you decrease" - "osd_mon_report_interval_max (5 is a good setting)!") - - self.mount_a.run_python(template.format( - fill_mb=self.fill_mb, - file_path=file_path, - full_wait=full_wait, - is_fuse=isinstance(self.mount_a, FuseMount) - )) - - def test_full_fclose(self): - # A remote script which opens a file handle, fills up the filesystem, and then - # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync - remote_script = dedent(""" - import time - import datetime - import subprocess - import os - - # Write some buffered data through before going full, all should be well - print "writing some data through which we expect to succeed" - bytes = 0 - f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT) - bytes += os.write(f, 'a' * 4096) - os.fsync(f) - print "fsync'ed data successfully, will now attempt to fill fs" - - # Okay, now we're going to fill up the filesystem, and then keep - # writing until we see an error from fsync. As long as we're doing - # buffered IO, the error should always only appear from fsync and not - # from write - full = False - - for n in range(0, {fill_mb}): - bytes += os.write(f, 'x' * 1024 * 1024) - print "wrote bytes via buffered write, may repeat" - print "done writing bytes" - - # OK, now we should sneak in under the full condition - # due to the time it takes the OSDs to report to the - # mons, and get a successful fsync on our full-making data - os.fsync(f) - print "successfully fsync'ed prior to getting full state reported" - - # Now wait for the full flag to get set so that our - # next flush IO will fail - time.sleep(30) - - # A buffered IO, should succeed - print "starting buffered write we expect to succeed" - os.write(f, 'x' * 4096) - print "wrote, now waiting 30s and then doing a close we expect to fail" - - # Wait long enough for a background flush that should fail - time.sleep(30) - - if {is_fuse}: - # ...and check that the failed background flush is reflected in fclose - try: - os.close(f) - except OSError: - print "close() returned an error as expected" - else: - raise RuntimeError("close() failed to raise error") - else: - # The kernel cephfs client does not raise errors on fclose - os.close(f) - - os.unlink("{file_path}") - """) - self._remote_write_test(remote_script) - - def test_full_fsync(self): - """ - That when the full flag is encountered during asynchronous - flushes, such that an fwrite() succeeds but an fsync/fclose() - should return the ENOSPC error. - """ - - # A remote script which opens a file handle, fills up the filesystem, and then - # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync - remote_script = dedent(""" - import time - import datetime - import subprocess - import os - - # Write some buffered data through before going full, all should be well - print "writing some data through which we expect to succeed" - bytes = 0 - f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT) - bytes += os.write(f, 'a' * 4096) - os.fsync(f) - print "fsync'ed data successfully, will now attempt to fill fs" - - # Okay, now we're going to fill up the filesystem, and then keep - # writing until we see an error from fsync. 
As long as we're doing - # buffered IO, the error should always only appear from fsync and not - # from write - full = False - - for n in range(0, {fill_mb} + 1): - try: - bytes += os.write(f, 'x' * 1024 * 1024) - print "wrote bytes via buffered write, moving on to fsync" - except OSError as e: - print "Unexpected error %s from write() instead of fsync()" % e - raise - - try: - os.fsync(f) - print "fsync'ed successfully" - except OSError as e: - print "Reached fullness after %.2f MB" % (bytes / (1024.0 * 1024.0)) - full = True - break - else: - print "Not full yet after %.2f MB" % (bytes / (1024.0 * 1024.0)) - - if n > {fill_mb} * 0.8: - # Be cautious in the last region where we expect to hit - # the full condition, so that we don't overshoot too dramatically - print "sleeping a bit as we've exceeded 80% of our expected full ratio" - time.sleep({full_wait}) - - if not full: - raise RuntimeError("Failed to reach fullness after writing %d bytes" % bytes) - - # close() should not raise an error because we already caught it in - # fsync. There shouldn't have been any more writeback errors - # since then because all IOs got cancelled on the full flag. - print "calling close" - os.close(f) - print "close() did not raise error" - - os.unlink("{file_path}") - """) - - self._remote_write_test(remote_script) - - -class TestQuotaFull(FullnessTestCase): - """ - Test per-pool fullness, which indicates quota limits exceeded - """ - pool_capacity = 1024 * 1024 * 32 # arbitrary low-ish limit - fill_mb = pool_capacity / (1024 * 1024) - - # We are only testing quota handling on the data pool, not the metadata - # pool. - data_only = True - - def setUp(self): - super(TestQuotaFull, self).setUp() - - pool_name = self.fs.get_data_pool_name() - self.fs.mon_manager.raw_cluster_cmd("osd", "pool", "set-quota", pool_name, - "max_bytes", "{0}".format(self.pool_capacity)) - - def is_full(self): - return self.fs.is_pool_full(self.fs.get_data_pool_name()) - - -class TestClusterFull(FullnessTestCase): - """ - Test cluster-wide fullness, which indicates that an OSD has become too full - """ - pool_capacity = None - REQUIRE_MEMSTORE = True - - def setUp(self): - super(TestClusterFull, self).setUp() - - if self.pool_capacity is None: - # This is a hack to overcome weird fluctuations in the reported - # `max_avail` attribute of pools that sometimes occurs in between - # tests (reason as yet unclear, but this dodges the issue) - TestClusterFull.pool_capacity = self.fs.get_pool_df(self._data_pool_name())['max_avail'] - TestClusterFull.fill_mb = int(1.05 * (self.pool_capacity / (1024.0 * 1024.0))) - - def is_full(self): - return self.fs.is_full() - -# Hide the parent class so that unittest.loader doesn't try to run it. 
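-# (FullnessTestCase defines concrete test_* methods but leaves is_full() and
-# the capacity attributes abstract, so letting the unittest loader collect it
-# directly would only produce spurious failures; deleting the name leaves just
-# the TestQuotaFull and TestClusterFull subclasses visible to the loader.)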
-del globals()['FullnessTestCase'] diff --git a/src/ceph/qa/tasks/cephfs/test_journal_migration.py b/src/ceph/qa/tasks/cephfs/test_journal_migration.py deleted file mode 100644 index 64fe939..0000000 --- a/src/ceph/qa/tasks/cephfs/test_journal_migration.py +++ /dev/null @@ -1,118 +0,0 @@ - -from StringIO import StringIO -from tasks.cephfs.cephfs_test_case import CephFSTestCase -from tasks.workunit import task as workunit - -JOURNAL_FORMAT_LEGACY = 0 -JOURNAL_FORMAT_RESILIENT = 1 - - -class TestJournalMigration(CephFSTestCase): - CLIENTS_REQUIRED = 1 - MDSS_REQUIRED = 2 - - def test_journal_migration(self): - old_journal_version = JOURNAL_FORMAT_LEGACY - new_journal_version = JOURNAL_FORMAT_RESILIENT - - # Pick out two daemons to use - mds_a, mds_b = sorted(self.mds_cluster.mds_ids[0:2]) - - self.mount_a.umount_wait() - self.fs.mds_stop() - - # Enable standby replay, to cover the bug case #8811 where - # a standby replay might mistakenly end up trying to rewrite - # the journal at the same time as an active daemon. - self.fs.set_ceph_conf('mds', 'mds standby replay', "true") - self.fs.set_ceph_conf('mds', 'mds standby for rank', "0") - - # Create a filesystem using the older journal format. - self.fs.set_ceph_conf('mds', 'mds journal format', old_journal_version) - self.fs.recreate() - self.fs.mds_restart(mds_id=mds_a) - self.fs.wait_for_daemons() - self.assertEqual(self.fs.get_active_names(), [mds_a]) - - def replay_names(): - return [s['name'] - for s in self.fs.status().get_replays(fscid = self.fs.id)] - - # Start the standby and wait for it to come up - self.fs.mds_restart(mds_id=mds_b) - self.wait_until_equal( - replay_names, - [mds_b], - timeout = 30) - - # Do some client work so that the log is populated with something. - with self.mount_a.mounted(): - self.mount_a.create_files() - self.mount_a.check_files() # sanity, this should always pass - - # Run a more substantial workunit so that the length of the log to be - # coverted is going span at least a few segments - workunit(self.ctx, { - 'clients': { - "client.{0}".format(self.mount_a.client_id): ["suites/fsstress.sh"], - }, - "timeout": "3h" - }) - - # Modify the ceph.conf to ask the MDS to use the new journal format. - self.fs.set_ceph_conf('mds', 'mds journal format', new_journal_version) - - # Restart the MDS. - self.fs.mds_fail_restart(mds_id=mds_a) - self.fs.mds_fail_restart(mds_id=mds_b) - - # This ensures that all daemons come up into a valid state - self.fs.wait_for_daemons() - - # Check that files created in the initial client workload are still visible - # in a client mount. - with self.mount_a.mounted(): - self.mount_a.check_files() - - # Verify that the journal really has been rewritten. 
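- # After the restart the on-disk journal should identify itself as the new
- # format: get_journal_version() is expected to report
- # JOURNAL_FORMAT_RESILIENT rather than the legacy format the filesystem
- # was created with.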
- journal_version = self.fs.get_journal_version() - if journal_version != new_journal_version: - raise RuntimeError("Journal was not upgraded, version should be {0} but is {1}".format( - new_journal_version, journal_version() - )) - - # Verify that cephfs-journal-tool can now read the rewritten journal - inspect_out = self.fs.journal_tool(["journal", "inspect"]) - if not inspect_out.endswith(": OK"): - raise RuntimeError("Unexpected journal-tool result: '{0}'".format( - inspect_out - )) - - self.fs.journal_tool(["event", "get", "json", "--path", "/tmp/journal.json"]) - p = self.fs.tool_remote.run( - args=[ - "python", - "-c", - "import json; print len(json.load(open('/tmp/journal.json')))" - ], - stdout=StringIO()) - event_count = int(p.stdout.getvalue().strip()) - if event_count < 1000: - # Approximate value of "lots", expected from having run fsstress - raise RuntimeError("Unexpectedly few journal events: {0}".format(event_count)) - - # Do some client work to check that writing the log is still working - with self.mount_a.mounted(): - workunit(self.ctx, { - 'clients': { - "client.{0}".format(self.mount_a.client_id): ["fs/misc/trivial_sync.sh"], - }, - "timeout": "3h" - }) - - # Check that both an active and a standby replay are still up - self.assertEqual(len(replay_names()), 1) - self.assertEqual(len(self.fs.get_active_names()), 1) - self.assertTrue(self.mds_cluster.mds_daemons[mds_a].running()) - self.assertTrue(self.mds_cluster.mds_daemons[mds_b].running()) - diff --git a/src/ceph/qa/tasks/cephfs/test_journal_repair.py b/src/ceph/qa/tasks/cephfs/test_journal_repair.py deleted file mode 100644 index 62cbbb0..0000000 --- a/src/ceph/qa/tasks/cephfs/test_journal_repair.py +++ /dev/null @@ -1,443 +0,0 @@ - -""" -Test our tools for recovering the content of damaged journals -""" - -import json -import logging -from textwrap import dedent -import time - -from teuthology.exceptions import CommandFailedError, ConnectionLostError -from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO -from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology -from tasks.workunit import task as workunit - -log = logging.getLogger(__name__) - - -class TestJournalRepair(CephFSTestCase): - MDSS_REQUIRED = 2 - - def test_inject_to_empty(self): - """ - That when some dentries in the journal but nothing is in - the backing store, we correctly populate the backing store - from the journalled dentries. 
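- (This exercises "cephfs-journal-tool event recover_dentries" followed by a
- journal reset and an MDS restart, to confirm that the rebuilt backing
- store is actually usable by a client.)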
- """ - - # Inject metadata operations - self.mount_a.run_shell(["touch", "rootfile"]) - self.mount_a.run_shell(["mkdir", "subdir"]) - self.mount_a.run_shell(["touch", "subdir/subdirfile"]) - # There are several different paths for handling hardlinks, depending - # on whether an existing dentry (being overwritten) is also a hardlink - self.mount_a.run_shell(["mkdir", "linkdir"]) - - # Test inode -> remote transition for a dentry - self.mount_a.run_shell(["touch", "linkdir/link0"]) - self.mount_a.run_shell(["rm", "-f", "linkdir/link0"]) - self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link0"]) - - # Test nothing -> remote transition - self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link1"]) - - # Test remote -> inode transition - self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link2"]) - self.mount_a.run_shell(["rm", "-f", "linkdir/link2"]) - self.mount_a.run_shell(["touch", "linkdir/link2"]) - - # Test remote -> diff remote transition - self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link3"]) - self.mount_a.run_shell(["rm", "-f", "linkdir/link3"]) - self.mount_a.run_shell(["ln", "rootfile", "linkdir/link3"]) - - # Test an empty directory - self.mount_a.run_shell(["mkdir", "subdir/subsubdir"]) - self.mount_a.run_shell(["sync"]) - - # Before we unmount, make a note of the inode numbers, later we will - # check that they match what we recover from the journal - rootfile_ino = self.mount_a.path_to_ino("rootfile") - subdir_ino = self.mount_a.path_to_ino("subdir") - linkdir_ino = self.mount_a.path_to_ino("linkdir") - subdirfile_ino = self.mount_a.path_to_ino("subdir/subdirfile") - subsubdir_ino = self.mount_a.path_to_ino("subdir/subsubdir") - - self.mount_a.umount_wait() - - # Stop the MDS - self.fs.mds_stop() - self.fs.mds_fail() - - # Now, the journal should contain the operations, but the backing - # store shouldn't - with self.assertRaises(ObjectNotFound): - self.fs.list_dirfrag(subdir_ino) - self.assertEqual(self.fs.list_dirfrag(ROOT_INO), []) - - # Execute the dentry recovery, this should populate the backing store - self.fs.journal_tool(['event', 'recover_dentries', 'list']) - - # Dentries in ROOT_INO are present - self.assertEqual(sorted(self.fs.list_dirfrag(ROOT_INO)), sorted(['rootfile_head', 'subdir_head', 'linkdir_head'])) - self.assertEqual(self.fs.list_dirfrag(subdir_ino), ['subdirfile_head', 'subsubdir_head']) - self.assertEqual(sorted(self.fs.list_dirfrag(linkdir_ino)), - sorted(['link0_head', 'link1_head', 'link2_head', 'link3_head'])) - - # Now check the MDS can read what we wrote: truncate the journal - # and start the mds. - self.fs.journal_tool(['journal', 'reset']) - self.fs.mds_fail_restart() - self.fs.wait_for_daemons() - - # List files - self.mount_a.mount() - self.mount_a.wait_until_mounted() - - # First ls -R to populate MDCache, such that hardlinks will - # resolve properly (recover_dentries does not create backtraces, - # so ordinarily hardlinks to inodes that happen not to have backtraces - # will be invisible in readdir). 
- # FIXME: hook in forward scrub here to regenerate backtraces - proc = self.mount_a.run_shell(['ls', '-R']) - self.mount_a.umount_wait() # remount to clear client cache before our second ls - self.mount_a.mount() - self.mount_a.wait_until_mounted() - - proc = self.mount_a.run_shell(['ls', '-R']) - self.assertEqual(proc.stdout.getvalue().strip(), - dedent(""" - .: - linkdir - rootfile - subdir - - ./linkdir: - link0 - link1 - link2 - link3 - - ./subdir: - subdirfile - subsubdir - - ./subdir/subsubdir: - """).strip()) - - # Check the correct inos were preserved by path - self.assertEqual(rootfile_ino, self.mount_a.path_to_ino("rootfile")) - self.assertEqual(subdir_ino, self.mount_a.path_to_ino("subdir")) - self.assertEqual(subdirfile_ino, self.mount_a.path_to_ino("subdir/subdirfile")) - self.assertEqual(subsubdir_ino, self.mount_a.path_to_ino("subdir/subsubdir")) - - # Check that the hard link handling came out correctly - self.assertEqual(self.mount_a.path_to_ino("linkdir/link0"), subdirfile_ino) - self.assertEqual(self.mount_a.path_to_ino("linkdir/link1"), subdirfile_ino) - self.assertNotEqual(self.mount_a.path_to_ino("linkdir/link2"), subdirfile_ino) - self.assertEqual(self.mount_a.path_to_ino("linkdir/link3"), rootfile_ino) - - # Create a new file, ensure it is not issued the same ino as one of the - # recovered ones - self.mount_a.run_shell(["touch", "afterwards"]) - new_ino = self.mount_a.path_to_ino("afterwards") - self.assertNotIn(new_ino, [rootfile_ino, subdir_ino, subdirfile_ino]) - - # Check that we can do metadata ops in the recovered directory - self.mount_a.run_shell(["touch", "subdir/subsubdir/subsubdirfile"]) - - @for_teuthology # 308s - def test_reset(self): - """ - That after forcibly modifying the backing store, we can get back into - a good state by resetting the MDSMap. - - The scenario is that we have two active MDSs, and we lose the journals. Once - we have completely lost confidence in the integrity of the metadata, we want to - return the system to a single-MDS state to go into a scrub to recover what we - can. - """ - - # Set max_mds to 2 - self.fs.set_max_mds(2) - - # See that we have two active MDSs - self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30, - reject_fn=lambda v: v > 2 or v < 1) - active_mds_names = self.fs.get_active_names() - - # Switch off any unneeded MDS daemons - for unneeded_mds in set(self.mds_cluster.mds_ids) - set(active_mds_names): - self.mds_cluster.mds_stop(unneeded_mds) - self.mds_cluster.mds_fail(unneeded_mds) - - # Create a dir on each rank - self.mount_a.run_shell(["mkdir", "alpha"]) - self.mount_a.run_shell(["mkdir", "bravo"]) - self.mount_a.setfattr("alpha/", "ceph.dir.pin", "0") - self.mount_a.setfattr("bravo/", "ceph.dir.pin", "1") - - def subtrees_assigned(): - got_subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=active_mds_names[0]) - - for s in got_subtrees: - if s['dir']['path'] == '/bravo': - if s['auth_first'] == 1: - return True - else: - # Should not happen - raise RuntimeError("/bravo is subtree but not rank 1!") - - return False - - # Ensure the pinning has taken effect and the /bravo dir is now - # migrated to rank 1. - self.wait_until_true(subtrees_assigned, 30) - - # Do some IO (this should be split across ranks according to - # the rank-pinned dirs) - self.mount_a.create_n_files("alpha/file", 1000) - self.mount_a.create_n_files("bravo/file", 1000) - - # Flush the journals so that we have some backing store data - # belonging to one MDS, and some to the other MDS. 
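- # (The "flush journal" asok command forces each rank to write its
- # journalled metadata out to dirfrag objects, so both ranks have real
- # backing-store state before we start breaking things below.)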
- for mds_name in active_mds_names: - self.fs.mds_asok(["flush", "journal"], mds_name) - - # Stop (hard) the second MDS daemon - self.fs.mds_stop(active_mds_names[1]) - - # Wipe out the tables for MDS rank 1 so that it is broken and can't start - # (this is the simulated failure that we will demonstrate that the disaster - # recovery tools can get us back from) - self.fs.erase_metadata_objects(prefix="mds1_") - - # Try to access files from the client - blocked_ls = self.mount_a.run_shell(["ls", "-R"], wait=False) - - # Check that this "ls -R" blocked rather than completing: indicates - # it got stuck trying to access subtrees which were on the now-dead MDS. - log.info("Sleeping to check ls is blocked...") - time.sleep(60) - self.assertFalse(blocked_ls.finished) - - # This mount is now useless because it will depend on MDS rank 1, and MDS rank 1 - # is not coming back. Kill it. - log.info("Killing mount, it's blocked on the MDS we killed") - self.mount_a.kill() - self.mount_a.kill_cleanup() - try: - # Now that the mount is dead, the ls -R should error out. - blocked_ls.wait() - except (CommandFailedError, ConnectionLostError): - # The ConnectionLostError case is for kernel client, where - # killing the mount also means killing the node. - pass - - # See that the second MDS will crash when it starts and tries to - # acquire rank 1 - damaged_id = active_mds_names[1] - self.fs.mds_restart(damaged_id) - - # The daemon taking the damaged rank should start starting, then - # restart back into standby after asking the mon to mark the rank - # damaged. - def is_marked_damaged(): - mds_map = self.fs.get_mds_map() - return 1 in mds_map['damaged'] - - self.wait_until_true(is_marked_damaged, 60) - - def get_state(): - info = self.mds_cluster.get_mds_info(damaged_id) - return info['state'] if info is not None else None - - self.wait_until_equal( - get_state, - "up:standby", - timeout=60) - - self.fs.mds_stop(damaged_id) - self.fs.mds_fail(damaged_id) - - # Now give up and go through a disaster recovery procedure - self.fs.mds_stop(active_mds_names[0]) - self.fs.mds_fail(active_mds_names[0]) - # Invoke recover_dentries quietly, because otherwise log spews millions of lines - self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=0, quiet=True) - self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=1, quiet=True) - self.fs.table_tool(["0", "reset", "session"]) - self.fs.journal_tool(["journal", "reset"], rank=0) - self.fs.erase_mds_objects(1) - self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name, - '--yes-i-really-mean-it') - - # Bring an MDS back online, mount a client, and see that we can walk the full - # filesystem tree again - self.fs.mds_fail_restart(active_mds_names[0]) - self.wait_until_equal(lambda: self.fs.get_active_names(), [active_mds_names[0]], 30, - reject_fn=lambda v: len(v) > 1) - self.mount_a.mount() - self.mount_a.run_shell(["ls", "-R"], wait=True) - - def test_table_tool(self): - active_mdss = self.fs.get_active_names() - self.assertEqual(len(active_mdss), 1) - mds_name = active_mdss[0] - - self.mount_a.run_shell(["touch", "foo"]) - self.fs.mds_asok(["flush", "journal"], mds_name) - - log.info(self.fs.table_tool(["all", "show", "inode"])) - log.info(self.fs.table_tool(["all", "show", "snap"])) - log.info(self.fs.table_tool(["all", "show", "session"])) - - # Inode table should always be the same because initial state - # and choice of inode are deterministic. 
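- # (1099511627776 is 0x10000000000, the start of the client-usable inode
- # range; the free range in the expected output starts 1001 entries
- # later, which presumably covers the one created inode plus the
- # session's preallocated inode range.)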
- # Should see one inode consumed - self.assertEqual( - json.loads(self.fs.table_tool(["all", "show", "inode"])), - {"0": { - "data": { - "version": 2, - "inotable": { - "projected_free": [ - {"start": 1099511628777, - "len": 1099511626775}], - "free": [ - {"start": 1099511628777, - "len": 1099511626775}]}}, - "result": 0}} - - ) - - # Should see one session - session_data = json.loads(self.fs.table_tool( - ["all", "show", "session"])) - self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 1) - self.assertEqual(session_data["0"]["result"], 0) - - # Should see no snaps - self.assertEqual( - json.loads(self.fs.table_tool(["all", "show", "snap"])), - {"version": 0, - "snapserver": {"last_snap": 1, - "pending_noop": [], - "snaps": [], - "need_to_purge": {}, - "pending_update": [], - "pending_destroy": []}, - "result": 0} - ) - - # Reset everything - for table in ["session", "inode", "snap"]: - self.fs.table_tool(["all", "reset", table]) - - log.info(self.fs.table_tool(["all", "show", "inode"])) - log.info(self.fs.table_tool(["all", "show", "snap"])) - log.info(self.fs.table_tool(["all", "show", "session"])) - - # Should see 0 sessions - session_data = json.loads(self.fs.table_tool( - ["all", "show", "session"])) - self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 0) - self.assertEqual(session_data["0"]["result"], 0) - - # Should see entire inode range now marked free - self.assertEqual( - json.loads(self.fs.table_tool(["all", "show", "inode"])), - {"0": {"data": {"version": 1, - "inotable": {"projected_free": [ - {"start": 1099511627776, - "len": 1099511627776}], - "free": [ - {"start": 1099511627776, - "len": 1099511627776}]}}, - "result": 0}} - ) - - # Should see no snaps - self.assertEqual( - json.loads(self.fs.table_tool(["all", "show", "snap"])), - {"version": 1, - "snapserver": {"last_snap": 1, - "pending_noop": [], - "snaps": [], - "need_to_purge": {}, - "pending_update": [], - "pending_destroy": []}, - "result": 0} - ) - - def test_table_tool_take_inos(self): - initial_range_start = 1099511627776 - initial_range_len = 1099511627776 - # Initially a completely clear range - self.assertEqual( - json.loads(self.fs.table_tool(["all", "show", "inode"])), - {"0": {"data": {"version": 0, - "inotable": {"projected_free": [ - {"start": initial_range_start, - "len": initial_range_len}], - "free": [ - {"start": initial_range_start, - "len": initial_range_len}]}}, - "result": 0}} - ) - - # Remove some - self.assertEqual( - json.loads(self.fs.table_tool(["all", "take_inos", "{0}".format(initial_range_start + 100)])), - {"0": {"data": {"version": 1, - "inotable": {"projected_free": [ - {"start": initial_range_start + 101, - "len": initial_range_len - 101}], - "free": [ - {"start": initial_range_start + 101, - "len": initial_range_len - 101}]}}, - "result": 0}} - ) - - @for_teuthology # Hack: "for_teuthology" because .sh doesn't work outside teuth - def test_journal_smoke(self): - workunit(self.ctx, { - 'clients': { - "client.{0}".format(self.mount_a.client_id): [ - "fs/misc/trivial_sync.sh"], - }, - "timeout": "1h" - }) - - for mount in self.mounts: - mount.umount_wait() - - self.fs.mds_stop() - self.fs.mds_fail() - - # journal tool smoke - workunit(self.ctx, { - 'clients': { - "client.{0}".format(self.mount_a.client_id): [ - "suites/cephfs_journal_tool_smoke.sh"], - }, - "timeout": "1h" - }) - - - - self.fs.mds_restart() - self.fs.wait_for_daemons() - - self.mount_a.mount() - - # trivial sync moutn a - workunit(self.ctx, { - 'clients': { - 
"client.{0}".format(self.mount_a.client_id): [ - "fs/misc/trivial_sync.sh"], - }, - "timeout": "1h" - }) - diff --git a/src/ceph/qa/tasks/cephfs/test_mantle.py b/src/ceph/qa/tasks/cephfs/test_mantle.py deleted file mode 100644 index 6cd86ad..0000000 --- a/src/ceph/qa/tasks/cephfs/test_mantle.py +++ /dev/null @@ -1,109 +0,0 @@ -from tasks.cephfs.cephfs_test_case import CephFSTestCase -import json -import logging - -log = logging.getLogger(__name__) -failure = "using old balancer; mantle failed for balancer=" -success = "mantle balancer version changed: " - -class TestMantle(CephFSTestCase): - def start_mantle(self): - self.wait_for_health_clear(timeout=30) - self.fs.set_max_mds(2) - self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30, - reject_fn=lambda v: v > 2 or v < 1) - - for m in self.fs.get_active_names(): - self.fs.mds_asok(['config', 'set', 'debug_objecter', '20'], mds_id=m) - self.fs.mds_asok(['config', 'set', 'debug_ms', '0'], mds_id=m) - self.fs.mds_asok(['config', 'set', 'debug_mds', '0'], mds_id=m) - self.fs.mds_asok(['config', 'set', 'debug_mds_balancer', '5'], mds_id=m) - - def push_balancer(self, obj, lua_code, expect): - self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', obj) - self.fs.rados(["put", obj, "-"], stdin_data=lua_code) - with self.assert_cluster_log(failure + obj + " " + expect): - log.info("run a " + obj + " balancer that expects=" + expect) - - def test_version_empty(self): - self.start_mantle() - expect = " : (2) No such file or directory" - - ret = self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer') - assert(ret == 22) # EINVAL - - self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', " ") - with self.assert_cluster_log(failure + " " + expect): pass - - def test_version_not_in_rados(self): - self.start_mantle() - expect = failure + "ghost.lua : (2) No such file or directory" - self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "ghost.lua") - with self.assert_cluster_log(expect): pass - - def test_balancer_invalid(self): - self.start_mantle() - expect = ": (22) Invalid argument" - - lua_code = "this is invalid lua code!" 
- self.push_balancer("invalid.lua", lua_code, expect) - - lua_code = "BAL_LOG()" - self.push_balancer("invalid_log.lua", lua_code, expect) - - lua_code = "BAL_LOG(0)" - self.push_balancer("invalid_log_again.lua", lua_code, expect) - - def test_balancer_valid(self): - self.start_mantle() - lua_code = "BAL_LOG(0, \"test\")\nreturn {3, 4}" - self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "valid.lua") - self.fs.rados(["put", "valid.lua", "-"], stdin_data=lua_code) - with self.assert_cluster_log(success + "valid.lua"): - log.info("run a valid.lua balancer") - - def test_return_invalid(self): - self.start_mantle() - expect = ": (22) Invalid argument" - - lua_code = "return \"hello\"" - self.push_balancer("string.lua", lua_code, expect) - - lua_code = "return 3" - self.push_balancer("number.lua", lua_code, expect) - - lua_code = "return {}" - self.push_balancer("dict_empty.lua", lua_code, expect) - - lua_code = "return {\"this\", \"is\", \"a\", \"test\"}" - self.push_balancer("dict_of_strings.lua", lua_code, expect) - - lua_code = "return {3, \"test\"}" - self.push_balancer("dict_of_mixed.lua", lua_code, expect) - - lua_code = "return {3}" - self.push_balancer("not_enough_numbers.lua", lua_code, expect) - - lua_code = "return {3, 4, 5, 6, 7, 8, 9}" - self.push_balancer("too_many_numbers.lua", lua_code, expect) - - def test_dead_osd(self): - self.start_mantle() - expect = " : (110) Connection timed out" - - # kill the OSDs so that the balancer pull from RADOS times out - osd_map = json.loads(self.fs.mon_manager.raw_cluster_cmd('osd', 'dump', '--format=json-pretty')) - for i in range(0, len(osd_map['osds'])): - self.fs.mon_manager.raw_cluster_cmd_result('osd', 'down', str(i)) - self.fs.mon_manager.raw_cluster_cmd_result('osd', 'out', str(i)) - - # trigger a pull from RADOS - self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "valid.lua") - - # make the timeout a little longer since dead OSDs spam ceph -w - with self.assert_cluster_log(failure + "valid.lua" + expect, timeout=30): - log.info("run a balancer that should timeout") - - # cleanup - for i in range(0, len(osd_map['osds'])): - self.fs.mon_manager.raw_cluster_cmd_result('osd', 'in', str(i)) diff --git a/src/ceph/qa/tasks/cephfs/test_misc.py b/src/ceph/qa/tasks/cephfs/test_misc.py deleted file mode 100644 index d857cfd..0000000 --- a/src/ceph/qa/tasks/cephfs/test_misc.py +++ /dev/null @@ -1,149 +0,0 @@ - -from unittest import SkipTest -from tasks.cephfs.fuse_mount import FuseMount -from tasks.cephfs.cephfs_test_case import CephFSTestCase -from teuthology.orchestra.run import CommandFailedError -import errno -import time -import json - - -class TestMisc(CephFSTestCase): - CLIENTS_REQUIRED = 2 - - LOAD_SETTINGS = ["mds_session_autoclose"] - mds_session_autoclose = None - - def test_getattr_caps(self): - """ - Check if MDS recognizes the 'mask' parameter of open request. - The paramter allows client to request caps when opening file - """ - - if not isinstance(self.mount_a, FuseMount): - raise SkipTest("Require FUSE client") - - # Enable debug. Client will requests CEPH_CAP_XATTR_SHARED - # on lookup/open - self.mount_b.umount_wait() - self.set_conf('client', 'client debug getattr caps', 'true') - self.mount_b.mount() - self.mount_b.wait_until_mounted() - - # create a file and hold it open. MDS will issue CEPH_CAP_EXCL_* - # to mount_a - p = self.mount_a.open_background("testfile") - self.mount_b.wait_for_visible("testfile") - - # this tiggers a lookup request and an open request. 
The debug - # code will check if lookup/open reply contains xattrs - self.mount_b.run_shell(["cat", "testfile"]) - - self.mount_a.kill_background(p) - - def test_fs_new(self): - data_pool_name = self.fs.get_data_pool_name() - - self.fs.mds_stop() - self.fs.mds_fail() - - self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name, - '--yes-i-really-mean-it') - - self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete', - self.fs.metadata_pool_name, - self.fs.metadata_pool_name, - '--yes-i-really-really-mean-it') - self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', - self.fs.metadata_pool_name, - self.fs.get_pgs_per_fs_pool().__str__()) - - dummyfile = '/etc/fstab' - - self.fs.put_metadata_object_raw("key", dummyfile) - - def get_pool_df(fs, name): - try: - return fs.get_pool_df(name)['objects'] > 0 - except RuntimeError as e: - return False - - self.wait_until_true(lambda: get_pool_df(self.fs, self.fs.metadata_pool_name), timeout=30) - - try: - self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name, - self.fs.metadata_pool_name, - data_pool_name) - except CommandFailedError as e: - self.assertEqual(e.exitstatus, errno.EINVAL) - else: - raise AssertionError("Expected EINVAL") - - self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name, - self.fs.metadata_pool_name, - data_pool_name, "--force") - - self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name, - '--yes-i-really-mean-it') - - - self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete', - self.fs.metadata_pool_name, - self.fs.metadata_pool_name, - '--yes-i-really-really-mean-it') - self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', - self.fs.metadata_pool_name, - self.fs.get_pgs_per_fs_pool().__str__()) - self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name, - self.fs.metadata_pool_name, - data_pool_name) - - def test_evict_client(self): - """ - Check that a slow client session won't get evicted if it's the - only session - """ - - self.mount_b.umount_wait() - ls_data = self.fs.mds_asok(['session', 'ls']) - self.assert_session_count(1, ls_data) - - self.mount_a.kill() - self.mount_a.kill_cleanup() - - time.sleep(self.mds_session_autoclose * 1.5) - ls_data = self.fs.mds_asok(['session', 'ls']) - self.assert_session_count(1, ls_data) - - self.mount_a.mount() - self.mount_a.wait_until_mounted() - self.mount_b.mount() - self.mount_b.wait_until_mounted() - - ls_data = self._session_list() - self.assert_session_count(2, ls_data) - - self.mount_a.kill() - self.mount_a.kill_cleanup() - - time.sleep(self.mds_session_autoclose * 1.5) - ls_data = self.fs.mds_asok(['session', 'ls']) - self.assert_session_count(1, ls_data) - - def test_filtered_df(self): - pool_name = self.fs.get_data_pool_name() - raw_df = self.fs.get_pool_df(pool_name) - raw_avail = float(raw_df["max_avail"]) - out = self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'get', - pool_name, 'size', - '-f', 'json-pretty') - j = json.loads(out) - pool_size = int(j['size']) - - proc = self.mount_a.run_shell(['df', '.']) - output = proc.stdout.getvalue() - fs_avail = output.split('\n')[1].split()[3] - fs_avail = float(fs_avail) * 1024 - - ratio = raw_avail / fs_avail - assert 0.9 < ratio < 1.1 diff --git a/src/ceph/qa/tasks/cephfs/test_pool_perm.py b/src/ceph/qa/tasks/cephfs/test_pool_perm.py deleted file mode 100644 index 22775e7..0000000 --- a/src/ceph/qa/tasks/cephfs/test_pool_perm.py +++ /dev/null @@ -1,113 +0,0 @@ -from textwrap import dedent -from teuthology.exceptions import CommandFailedError -from 
tasks.cephfs.cephfs_test_case import CephFSTestCase -import os - - -class TestPoolPerm(CephFSTestCase): - def test_pool_perm(self): - self.mount_a.run_shell(["touch", "test_file"]) - - file_path = os.path.join(self.mount_a.mountpoint, "test_file") - - remote_script = dedent(""" - import os - import errno - - fd = os.open("{path}", os.O_RDWR) - try: - if {check_read}: - ret = os.read(fd, 1024) - else: - os.write(fd, 'content') - except OSError, e: - if e.errno != errno.EPERM: - raise - else: - raise RuntimeError("client does not check permission of data pool") - """) - - client_name = "client.{0}".format(self.mount_a.client_id) - - # set data pool read only - self.fs.mon_manager.raw_cluster_cmd_result( - 'auth', 'caps', client_name, 'mds', 'allow', 'mon', 'allow r', 'osd', - 'allow r pool={0}'.format(self.fs.get_data_pool_name())) - - self.mount_a.umount_wait() - self.mount_a.mount() - self.mount_a.wait_until_mounted() - - # write should fail - self.mount_a.run_python(remote_script.format(path=file_path, check_read=str(False))) - - # set data pool write only - self.fs.mon_manager.raw_cluster_cmd_result( - 'auth', 'caps', client_name, 'mds', 'allow', 'mon', 'allow r', 'osd', - 'allow w pool={0}'.format(self.fs.get_data_pool_name())) - - self.mount_a.umount_wait() - self.mount_a.mount() - self.mount_a.wait_until_mounted() - - # read should fail - self.mount_a.run_python(remote_script.format(path=file_path, check_read=str(True))) - - def test_forbidden_modification(self): - """ - That a client who does not have the capability for setting - layout pools is prevented from doing so. - """ - - # Set up - client_name = "client.{0}".format(self.mount_a.client_id) - new_pool_name = "data_new" - self.fs.add_data_pool(new_pool_name) - - self.mount_a.run_shell(["touch", "layoutfile"]) - self.mount_a.run_shell(["mkdir", "layoutdir"]) - - # Set MDS 'rw' perms: missing 'p' means no setting pool layouts - self.fs.mon_manager.raw_cluster_cmd_result( - 'auth', 'caps', client_name, 'mds', 'allow rw', 'mon', 'allow r', - 'osd', - 'allow rw pool={0},allow rw pool={1}'.format( - self.fs.get_data_pool_names()[0], - self.fs.get_data_pool_names()[1], - )) - - self.mount_a.umount_wait() - self.mount_a.mount() - self.mount_a.wait_until_mounted() - - with self.assertRaises(CommandFailedError): - self.mount_a.setfattr("layoutfile", "ceph.file.layout.pool", - new_pool_name) - with self.assertRaises(CommandFailedError): - self.mount_a.setfattr("layoutdir", "ceph.dir.layout.pool", - new_pool_name) - self.mount_a.umount_wait() - - # Set MDS 'rwp' perms: should now be able to set layouts - self.fs.mon_manager.raw_cluster_cmd_result( - 'auth', 'caps', client_name, 'mds', 'allow rwp', 'mon', 'allow r', - 'osd', - 'allow rw pool={0},allow rw pool={1}'.format( - self.fs.get_data_pool_names()[0], - self.fs.get_data_pool_names()[1], - )) - self.mount_a.mount() - self.mount_a.wait_until_mounted() - self.mount_a.setfattr("layoutfile", "ceph.file.layout.pool", - new_pool_name) - self.mount_a.setfattr("layoutdir", "ceph.dir.layout.pool", - new_pool_name) - self.mount_a.umount_wait() - - def tearDown(self): - self.fs.mon_manager.raw_cluster_cmd_result( - 'auth', 'caps', "client.{0}".format(self.mount_a.client_id), - 'mds', 'allow', 'mon', 'allow r', 'osd', - 'allow rw pool={0}'.format(self.fs.get_data_pool_names()[0])) - super(TestPoolPerm, self).tearDown() - diff --git a/src/ceph/qa/tasks/cephfs/test_quota.py b/src/ceph/qa/tasks/cephfs/test_quota.py deleted file mode 100644 index ee11c58..0000000 --- 
a/src/ceph/qa/tasks/cephfs/test_quota.py +++ /dev/null @@ -1,106 +0,0 @@ - -from cephfs_test_case import CephFSTestCase - -from teuthology.exceptions import CommandFailedError - -class TestQuota(CephFSTestCase): - CLIENTS_REQUIRED = 2 - MDSS_REQUIRED = 1 - - def test_remote_update_getfattr(self): - """ - That quota changes made from one client are visible to another - client looking at ceph.quota xattrs - """ - self.mount_a.run_shell(["mkdir", "subdir"]) - - self.assertEqual( - self.mount_a.getfattr("./subdir", "ceph.quota.max_files"), - None) - self.assertEqual( - self.mount_b.getfattr("./subdir", "ceph.quota.max_files"), - None) - - self.mount_a.setfattr("./subdir", "ceph.quota.max_files", "10") - self.assertEqual( - self.mount_a.getfattr("./subdir", "ceph.quota.max_files"), - "10") - - # Should be visible as soon as setxattr operation completes on - # mds (we get here sooner because setfattr gets an early reply) - self.wait_until_equal( - lambda: self.mount_b.getfattr("./subdir", "ceph.quota.max_files"), - "10", timeout=10) - - def test_remote_update_df(self): - """ - That when a client modifies the quota on a directory used - as another client's root, the other client sees the change - reflected in their statfs output. - """ - - self.mount_b.umount_wait() - - self.mount_a.run_shell(["mkdir", "subdir"]) - - size_before = 1024 * 1024 * 128 - self.mount_a.setfattr("./subdir", "ceph.quota.max_bytes", - "%s" % size_before) - - self.mount_b.mount(mount_path="/subdir") - - self.assertDictEqual( - self.mount_b.df(), - { - "total": size_before, - "used": 0, - "available": size_before - }) - - size_after = 1024 * 1024 * 256 - self.mount_a.setfattr("./subdir", "ceph.quota.max_bytes", - "%s" % size_after) - - # Should be visible as soon as setxattr operation completes on - # mds (we get here sooner because setfattr gets an early reply) - self.wait_until_equal( - lambda: self.mount_b.df(), - { - "total": size_after, - "used": 0, - "available": size_after - }, - timeout=10 - ) - - def test_remote_update_write(self): - """ - That when a client modifies the quota on a directory used - as another client's root, the other client sees the effect - of the change when writing data. 
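- (Quota enforcement is largely done by the client, so mount_b only starts
- refusing writes once it has observed the new quota xattr values.)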
- """ - - self.mount_a.run_shell(["mkdir", "subdir_files"]) - self.mount_a.run_shell(["mkdir", "subdir_data"]) - - # Set some nice high quotas that mount_b's initial operations - # will be well within - self.mount_a.setfattr("./subdir_files", "ceph.quota.max_files", "100") - self.mount_a.setfattr("./subdir_data", "ceph.quota.max_bytes", "104857600") - - # Do some writes within my quota - self.mount_b.create_n_files("subdir_files/file", 20) - self.mount_b.write_n_mb("subdir_data/file", 20) - - # Set quotas lower than what mount_b already wrote, it should - # refuse to write more once it's seen them - self.mount_a.setfattr("./subdir_files", "ceph.quota.max_files", "10") - self.mount_a.setfattr("./subdir_data", "ceph.quota.max_bytes", "1048576") - - # Do some writes that would have been okay within the old quota, - # but are forbidden under the new quota - with self.assertRaises(CommandFailedError): - self.mount_b.create_n_files("subdir_files/file", 40) - with self.assertRaises(CommandFailedError): - self.mount_b.write_n_mb("subdir_data/file", 40) - diff --git a/src/ceph/qa/tasks/cephfs/test_readahead.py b/src/ceph/qa/tasks/cephfs/test_readahead.py deleted file mode 100644 index 31e7bf1..0000000 --- a/src/ceph/qa/tasks/cephfs/test_readahead.py +++ /dev/null @@ -1,31 +0,0 @@ -import logging -from tasks.cephfs.fuse_mount import FuseMount -from tasks.cephfs.cephfs_test_case import CephFSTestCase - -log = logging.getLogger(__name__) - - -class TestReadahead(CephFSTestCase): - def test_flush(self): - if not isinstance(self.mount_a, FuseMount): - self.skipTest("FUSE needed for measuring op counts") - - # Create 32MB file - self.mount_a.run_shell(["dd", "if=/dev/urandom", "of=foo", "bs=1M", "count=32"]) - - # Unmount and remount the client to flush cache - self.mount_a.umount_wait() - self.mount_a.mount() - self.mount_a.wait_until_mounted() - - initial_op_r = self.mount_a.admin_socket(['perf', 'dump', 'objecter'])['objecter']['op_r'] - self.mount_a.run_shell(["dd", "if=foo", "of=/dev/null", "bs=128k", "count=32"]) - op_r = self.mount_a.admin_socket(['perf', 'dump', 'objecter'])['objecter']['op_r'] - assert op_r >= initial_op_r - op_r -= initial_op_r - log.info("read operations: {0}".format(op_r)) - - # with exponentially increasing readahead, we should see fewer than 10 operations - # but this test simply checks if the client is doing a remote read for each local read - if op_r >= 32: - raise RuntimeError("readahead not working") diff --git a/src/ceph/qa/tasks/cephfs/test_recovery_pool.py b/src/ceph/qa/tasks/cephfs/test_recovery_pool.py deleted file mode 100644 index 097342a..0000000 --- a/src/ceph/qa/tasks/cephfs/test_recovery_pool.py +++ /dev/null @@ -1,220 +0,0 @@ - -""" -Test our tools for recovering metadata from the data pool into an alternate pool -""" -import json - -import logging -import os -from textwrap import dedent -import traceback -from collections import namedtuple, defaultdict - -from teuthology.orchestra.run import CommandFailedError -from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology - -log = logging.getLogger(__name__) - - -ValidationError = namedtuple("ValidationError", ["exception", "backtrace"]) - - -class OverlayWorkload(object): - def __init__(self, orig_fs, recovery_fs, orig_mount, recovery_mount): - self._orig_fs = orig_fs - self._recovery_fs = recovery_fs - self._orig_mount = orig_mount - self._recovery_mount = recovery_mount - self._initial_state = None - - # Accumulate backtraces for every failed validation, and return them. 
Backtraces - # are rather verbose, but we only see them when something breaks, and they - # let us see which check failed without having to decorate each check with - # a string - self._errors = [] - - def assert_equal(self, a, b): - try: - if a != b: - raise AssertionError("{0} != {1}".format(a, b)) - except AssertionError as e: - self._errors.append( - ValidationError(e, traceback.format_exc(3)) - ) - - def write(self): - """ - Write the workload files to the mount - """ - raise NotImplementedError() - - def validate(self): - """ - Read from the mount and validate that the workload files are present (i.e. have - survived or been reconstructed from the test scenario) - """ - raise NotImplementedError() - - def damage(self): - """ - Damage the filesystem pools in ways that will be interesting to recover from. By - default just wipe everything in the metadata pool - """ - # Delete every object in the metadata pool - objects = self._orig_fs.rados(["ls"]).split("\n") - for o in objects: - self._orig_fs.rados(["rm", o]) - - def flush(self): - """ - Called after client unmount, after write: flush whatever you want - """ - self._orig_fs.mds_asok(["flush", "journal"]) - self._recovery_fs.mds_asok(["flush", "journal"]) - - -class SimpleOverlayWorkload(OverlayWorkload): - """ - Single file, single directory, check that it gets recovered and so does its size - """ - def write(self): - self._orig_mount.run_shell(["mkdir", "subdir"]) - self._orig_mount.write_n_mb("subdir/sixmegs", 6) - self._initial_state = self._orig_mount.stat("subdir/sixmegs") - - def validate(self): - self._recovery_mount.run_shell(["ls", "subdir"]) - st = self._recovery_mount.stat("subdir/sixmegs") - self.assert_equal(st['st_size'], self._initial_state['st_size']) - return self._errors - -class TestRecoveryPool(CephFSTestCase): - MDSS_REQUIRED = 2 - CLIENTS_REQUIRED = 2 - REQUIRE_RECOVERY_FILESYSTEM = True - - def is_marked_damaged(self, rank): - mds_map = self.fs.get_mds_map() - return rank in mds_map['damaged'] - - def _rebuild_metadata(self, workload, other_pool=None, workers=1): - """ - That when all objects in metadata pool are removed, we can rebuild a metadata pool - based on the contents of a data pool, and a client can see and read our files. 
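- (The procedure below drives cephfs-data-scan init/scan_extents/scan_inodes
- against an alternate metadata pool, recovers dentries with
- cephfs-journal-tool, resets the journal and then marks the rank repaired.)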
- """ - - # First, inject some files - - workload.write() - - # Unmount the client and flush the journal: the tool should also cope with - # situations where there is dirty metadata, but we'll test that separately - self.mount_a.umount_wait() - self.mount_b.umount_wait() - workload.flush() - - # Create the alternate pool if requested - recovery_fs = self.recovery_fs.name - recovery_pool = self.recovery_fs.get_metadata_pool_name() - self.recovery_fs.data_scan(['init', '--force-init', - '--filesystem', recovery_fs, - '--alternate-pool', recovery_pool]) - self.recovery_fs.mon_manager.raw_cluster_cmd('-s') - self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "session"]) - self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "snap"]) - self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "inode"]) - - # Stop the MDS - self.fs.mds_stop() - self.fs.mds_fail() - - # After recovery, we need the MDS to not be strict about stats (in production these options - # are off by default, but in QA we need to explicitly disable them) - self.fs.set_ceph_conf('mds', 'mds verify scatter', False) - self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False) - - # Apply any data damage the workload wants - workload.damage() - - # Reset the MDS map in case multiple ranks were in play: recovery procedure - # only understands how to rebuild metadata under rank 0 - self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name, - '--yes-i-really-mean-it') - - def get_state(mds_id): - info = self.mds_cluster.get_mds_info(mds_id) - return info['state'] if info is not None else None - - self.fs.table_tool([self.fs.name + ":0", "reset", "session"]) - self.fs.table_tool([self.fs.name + ":0", "reset", "snap"]) - self.fs.table_tool([self.fs.name + ":0", "reset", "inode"]) - - # Run the recovery procedure - if False: - with self.assertRaises(CommandFailedError): - # Normal reset should fail when no objects are present, we'll use --force instead - self.fs.journal_tool(["journal", "reset"]) - - self.fs.mds_stop() - self.fs.data_scan(['scan_extents', '--alternate-pool', - recovery_pool, '--filesystem', self.fs.name, - self.fs.get_data_pool_name()]) - self.fs.data_scan(['scan_inodes', '--alternate-pool', - recovery_pool, '--filesystem', self.fs.name, - '--force-corrupt', '--force-init', - self.fs.get_data_pool_name()]) - self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'event', - 'recover_dentries', 'list', - '--alternate-pool', recovery_pool]) - - self.fs.data_scan(['init', '--force-init', '--filesystem', - self.fs.name]) - self.fs.data_scan(['scan_inodes', '--filesystem', self.fs.name, - '--force-corrupt', '--force-init', - self.fs.get_data_pool_name()]) - self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'event', - 'recover_dentries', 'list']) - - self.fs.journal_tool(['--rank=' + recovery_fs + ":0", 'journal', - 'reset', '--force']) - self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'journal', - 'reset', '--force']) - self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', - recovery_fs + ":0") - - # Mark the MDS repaired - self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0') - - # Start the MDS - self.fs.mds_restart() - self.recovery_fs.mds_restart() - self.fs.wait_for_daemons() - self.recovery_fs.wait_for_daemons() - for mds_id in self.recovery_fs.mds_ids: - self.fs.mon_manager.raw_cluster_cmd('tell', "mds." + mds_id, - 'injectargs', '--debug-mds=20') - self.fs.mon_manager.raw_cluster_cmd('daemon', "mds." 
+ mds_id, - 'scrub_path', '/', - 'recursive', 'repair') - log.info(str(self.mds_cluster.status())) - - # Mount a client - self.mount_a.mount() - self.mount_b.mount(mount_fs_name=recovery_fs) - self.mount_a.wait_until_mounted() - self.mount_b.wait_until_mounted() - - # See that the files are present and correct - errors = workload.validate() - if errors: - log.error("Validation errors found: {0}".format(len(errors))) - for e in errors: - log.error(e.exception) - log.error(e.backtrace) - raise AssertionError("Validation failed, first error: {0}\n{1}".format( - errors[0].exception, errors[0].backtrace - )) - - def test_rebuild_simple(self): - self._rebuild_metadata(SimpleOverlayWorkload(self.fs, self.recovery_fs, - self.mount_a, self.mount_b)) diff --git a/src/ceph/qa/tasks/cephfs/test_scrub_checks.py b/src/ceph/qa/tasks/cephfs/test_scrub_checks.py deleted file mode 100644 index a2de527..0000000 --- a/src/ceph/qa/tasks/cephfs/test_scrub_checks.py +++ /dev/null @@ -1,245 +0,0 @@ -""" -MDS admin socket scrubbing-related tests. -""" -import json -import logging -import errno -import time -from teuthology.exceptions import CommandFailedError -import os -from tasks.cephfs.cephfs_test_case import CephFSTestCase - -log = logging.getLogger(__name__) - - -class TestScrubChecks(CephFSTestCase): - """ - Run flush and scrub commands on the specified files in the filesystem. This - task will run through a sequence of operations, but it is not comprehensive - on its own -- it doesn't manipulate the mds cache state to test on both - in- and out-of-memory parts of the hierarchy. So it's designed to be run - multiple times within a single test run, so that the test can manipulate - memory state. - - Usage: - mds_scrub_checks: - mds_rank: 0 - path: path/to/test/dir - client: 0 - run_seq: [0-9]+ - - Increment the run_seq on subsequent invocations within a single test run; - it uses that value to generate unique folder and file names. 
- """ - - MDSS_REQUIRED = 1 - CLIENTS_REQUIRED = 1 - - def test_scrub_checks(self): - self._checks(0) - self._checks(1) - - def _checks(self, run_seq): - mds_rank = 0 - test_dir = "scrub_test_path" - - abs_test_path = "/{0}".format(test_dir) - - log.info("mountpoint: {0}".format(self.mount_a.mountpoint)) - client_path = os.path.join(self.mount_a.mountpoint, test_dir) - log.info("client_path: {0}".format(client_path)) - - log.info("Cloning repo into place") - repo_path = self.clone_repo(self.mount_a, client_path) - - log.info("Initiating mds_scrub_checks on mds.{id_}, " + - "test_path {path}, run_seq {seq}".format( - id_=mds_rank, path=abs_test_path, seq=run_seq) - ) - - - success_validator = lambda j, r: self.json_validator(j, r, "return_code", 0) - - nep = "{test_path}/i/dont/exist".format(test_path=abs_test_path) - self.asok_command(mds_rank, "flush_path {nep}".format(nep=nep), - lambda j, r: self.json_validator(j, r, "return_code", -errno.ENOENT)) - self.asok_command(mds_rank, "scrub_path {nep}".format(nep=nep), - lambda j, r: self.json_validator(j, r, "return_code", -errno.ENOENT)) - - test_repo_path = "{test_path}/ceph-qa-suite".format(test_path=abs_test_path) - dirpath = "{repo_path}/suites".format(repo_path=test_repo_path) - - if run_seq == 0: - log.info("First run: flushing {dirpath}".format(dirpath=dirpath)) - command = "flush_path {dirpath}".format(dirpath=dirpath) - self.asok_command(mds_rank, command, success_validator) - command = "scrub_path {dirpath}".format(dirpath=dirpath) - self.asok_command(mds_rank, command, success_validator) - - filepath = "{repo_path}/suites/fs/verify/validater/valgrind.yaml".format( - repo_path=test_repo_path) - if run_seq == 0: - log.info("First run: flushing {filepath}".format(filepath=filepath)) - command = "flush_path {filepath}".format(filepath=filepath) - self.asok_command(mds_rank, command, success_validator) - command = "scrub_path {filepath}".format(filepath=filepath) - self.asok_command(mds_rank, command, success_validator) - - filepath = "{repo_path}/suites/fs/basic/clusters/fixed-3-cephfs.yaml". 
\ - format(repo_path=test_repo_path) - command = "scrub_path {filepath}".format(filepath=filepath) - self.asok_command(mds_rank, command, - lambda j, r: self.json_validator(j, r, "performed_validation", - False)) - - if run_seq == 0: - log.info("First run: flushing base dir /") - command = "flush_path /" - self.asok_command(mds_rank, command, success_validator) - command = "scrub_path /" - self.asok_command(mds_rank, command, success_validator) - - new_dir = "{repo_path}/new_dir_{i}".format(repo_path=repo_path, i=run_seq) - test_new_dir = "{repo_path}/new_dir_{i}".format(repo_path=test_repo_path, - i=run_seq) - self.mount_a.run_shell(["mkdir", new_dir]) - command = "flush_path {dir}".format(dir=test_new_dir) - self.asok_command(mds_rank, command, success_validator) - - new_file = "{repo_path}/new_file_{i}".format(repo_path=repo_path, - i=run_seq) - test_new_file = "{repo_path}/new_file_{i}".format(repo_path=test_repo_path, - i=run_seq) - self.mount_a.write_n_mb(new_file, 1) - - command = "flush_path {file}".format(file=test_new_file) - self.asok_command(mds_rank, command, success_validator) - - # check that scrub fails on errors - ino = self.mount_a.path_to_ino(new_file) - rados_obj_name = "{ino:x}.00000000".format(ino=ino) - command = "scrub_path {file}".format(file=test_new_file) - - # Missing parent xattr -> ENODATA - self.fs.rados(["rmxattr", rados_obj_name, "parent"], pool=self.fs.get_data_pool_name()) - self.asok_command(mds_rank, command, - lambda j, r: self.json_validator(j, r, "return_code", -errno.ENODATA)) - - # Missing object -> ENOENT - self.fs.rados(["rm", rados_obj_name], pool=self.fs.get_data_pool_name()) - self.asok_command(mds_rank, command, - lambda j, r: self.json_validator(j, r, "return_code", -errno.ENOENT)) - - command = "flush_path /" - self.asok_command(mds_rank, command, success_validator) - - def test_scrub_repair(self): - mds_rank = 0 - test_dir = "scrub_repair_path" - - self.mount_a.run_shell(["sudo", "mkdir", test_dir]) - self.mount_a.run_shell(["sudo", "touch", "{0}/file".format(test_dir)]) - dir_objname = "{:x}.00000000".format(self.mount_a.path_to_ino(test_dir)) - - self.mount_a.umount_wait() - - # flush journal entries to dirfrag objects, and expire journal - self.fs.mds_asok(['flush', 'journal']) - self.fs.mds_stop() - - # remove the dentry from dirfrag, cause incorrect fragstat/rstat - self.fs.rados(["rmomapkey", dir_objname, "file_head"], - pool=self.fs.get_metadata_pool_name()) - - self.fs.mds_fail_restart() - self.fs.wait_for_daemons() - - self.mount_a.mount() - self.mount_a.wait_until_mounted() - - # fragstat indicates the directory is not empty, rmdir should fail - with self.assertRaises(CommandFailedError) as ar: - self.mount_a.run_shell(["sudo", "rmdir", test_dir]) - self.assertEqual(ar.exception.exitstatus, 1) - - self.asok_command(mds_rank, "scrub_path /{0} repair".format(test_dir), - lambda j, r: self.json_validator(j, r, "return_code", 0)) - - # wait a few second for background repair - time.sleep(10) - - # fragstat should be fixed - self.mount_a.run_shell(["sudo", "rmdir", test_dir]) - - @staticmethod - def json_validator(json_out, rc, element, expected_value): - if rc != 0: - return False, "asok command returned error {rc}".format(rc=rc) - element_value = json_out.get(element) - if element_value != expected_value: - return False, "unexpectedly got {jv} instead of {ev}!".format( - jv=element_value, ev=expected_value) - return True, "Succeeded" - - def asok_command(self, mds_rank, command, validator): - log.info("Running command 
'{command}'".format(command=command)) - - command_list = command.split() - - # we just assume there's an active mds for every rank - mds_id = self.fs.get_active_names()[mds_rank] - proc = self.fs.mon_manager.admin_socket('mds', mds_id, - command_list, check_status=False) - rout = proc.exitstatus - sout = proc.stdout.getvalue() - - if sout.strip(): - jout = json.loads(sout) - else: - jout = None - - log.info("command '{command}' got response code " + - "'{rout}' and stdout '{sout}'".format( - command=command, rout=rout, sout=sout)) - - success, errstring = validator(jout, rout) - - if not success: - raise AsokCommandFailedError(command, rout, jout, errstring) - - return jout - - def clone_repo(self, client_mount, path): - repo = "ceph-qa-suite" - repo_path = os.path.join(path, repo) - client_mount.run_shell(["mkdir", "-p", path]) - - try: - client_mount.stat(repo_path) - except CommandFailedError: - client_mount.run_shell([ - "git", "clone", '--branch', 'giant', - "http://github.com/ceph/{repo}".format(repo=repo), - "{path}/{repo}".format(path=path, repo=repo) - ]) - - return repo_path - - -class AsokCommandFailedError(Exception): - """ - Exception thrown when we get an unexpected response - on an admin socket command - """ - - def __init__(self, command, rc, json_out, errstring): - self.command = command - self.rc = rc - self.json = json_out - self.errstring = errstring - - def __str__(self): - return "Admin socket: {command} failed with rc={rc}," + \ - "json output={json}, because '{es}'".format( - command=self.command, rc=self.rc, - json=self.json, es=self.errstring) diff --git a/src/ceph/qa/tasks/cephfs/test_sessionmap.py b/src/ceph/qa/tasks/cephfs/test_sessionmap.py deleted file mode 100644 index 9d12ab6..0000000 --- a/src/ceph/qa/tasks/cephfs/test_sessionmap.py +++ /dev/null @@ -1,235 +0,0 @@ -from StringIO import StringIO -import json -import logging -from unittest import SkipTest - -from tasks.cephfs.fuse_mount import FuseMount -from teuthology.exceptions import CommandFailedError -from tasks.cephfs.cephfs_test_case import CephFSTestCase - -log = logging.getLogger(__name__) - - -class TestSessionMap(CephFSTestCase): - CLIENTS_REQUIRED = 2 - MDSS_REQUIRED = 2 - - def test_tell_session_drop(self): - """ - That when a `tell` command is sent using the python CLI, - its MDS session is gone after it terminates - """ - self.mount_a.umount_wait() - self.mount_b.umount_wait() - - mds_id = self.fs.get_lone_mds_id() - self.fs.mon_manager.raw_cluster_cmd("tell", "mds.{0}".format(mds_id), "session", "ls") - - ls_data = self.fs.mds_asok(['session', 'ls']) - self.assertEqual(len(ls_data), 0) - - def _get_thread_count(self, mds_id): - remote = self.fs.mds_daemons[mds_id].remote - - ps_txt = remote.run( - args=["ps", "-ww", "axo", "nlwp,cmd"], - stdout=StringIO() - ).stdout.getvalue().strip() - lines = ps_txt.split("\n")[1:] - - for line in lines: - if "ceph-mds" in line and not "daemon-helper" in line: - if line.find("-i {0}".format(mds_id)) != -1: - log.info("Found ps line for daemon: {0}".format(line)) - return int(line.split()[0]) - - raise RuntimeError("No process found in ps output for MDS {0}: {1}".format( - mds_id, ps_txt - )) - - def test_tell_conn_close(self): - """ - That when a `tell` command is sent using the python CLI, - the thread count goes back to where it started (i.e. 
we aren't - leaving connections open) - """ - self.mount_a.umount_wait() - self.mount_b.umount_wait() - - mds_id = self.fs.get_lone_mds_id() - - initial_thread_count = self._get_thread_count(mds_id) - self.fs.mon_manager.raw_cluster_cmd("tell", "mds.{0}".format(mds_id), "session", "ls") - final_thread_count = self._get_thread_count(mds_id) - - self.assertEqual(initial_thread_count, final_thread_count) - - def test_mount_conn_close(self): - """ - That when a client unmounts, the thread count on the MDS goes back - to what it was before the client mounted - """ - self.mount_a.umount_wait() - self.mount_b.umount_wait() - - mds_id = self.fs.get_lone_mds_id() - - initial_thread_count = self._get_thread_count(mds_id) - self.mount_a.mount() - self.mount_a.wait_until_mounted() - self.assertGreater(self._get_thread_count(mds_id), initial_thread_count) - self.mount_a.umount_wait() - final_thread_count = self._get_thread_count(mds_id) - - self.assertEqual(initial_thread_count, final_thread_count) - - def test_version_splitting(self): - """ - That when many sessions are updated, they are correctly - split into multiple versions to obey mds_sessionmap_keys_per_op - """ - - # Start umounted - self.mount_a.umount_wait() - self.mount_b.umount_wait() - - # Configure MDS to write one OMAP key at once - self.set_conf('mds', 'mds_sessionmap_keys_per_op', 1) - self.fs.mds_fail_restart() - self.fs.wait_for_daemons() - - # I would like two MDSs, so that I can do an export dir later - self.fs.set_max_mds(2) - self.fs.wait_for_daemons() - - active_mds_names = self.fs.get_active_names() - rank_0_id = active_mds_names[0] - rank_1_id = active_mds_names[1] - log.info("Ranks 0 and 1 are {0} and {1}".format( - rank_0_id, rank_1_id)) - - # Bring the clients back - self.mount_a.mount() - self.mount_b.mount() - self.mount_a.create_files() # Kick the client into opening sessions - self.mount_b.create_files() - - # See that they've got sessions - self.assert_session_count(2, mds_id=rank_0_id) - - # See that we persist their sessions - self.fs.mds_asok(["flush", "journal"], rank_0_id) - table_json = json.loads(self.fs.table_tool(["0", "show", "session"])) - log.info("SessionMap: {0}".format(json.dumps(table_json, indent=2))) - self.assertEqual(table_json['0']['result'], 0) - self.assertEqual(len(table_json['0']['data']['Sessions']), 2) - - # Now, induce a "force_open_sessions" event by exporting a dir - self.mount_a.run_shell(["mkdir", "bravo"]) - self.mount_a.run_shell(["touch", "bravo/file"]) - self.mount_b.run_shell(["ls", "-l", "bravo/file"]) - - def get_omap_wrs(): - return self.fs.mds_asok(['perf', 'dump', 'objecter'], rank_1_id)['objecter']['omap_wr'] - - # Flush so that there are no dirty sessions on rank 1 - self.fs.mds_asok(["flush", "journal"], rank_1_id) - - # Export so that we get a force_open to rank 1 for the two sessions from rank 0 - initial_omap_wrs = get_omap_wrs() - self.fs.mds_asok(['export', 'dir', '/bravo', '1'], rank_0_id) - - # This is the critical (if rather subtle) check: that in the process of doing an export dir, - # we hit force_open_sessions, and as a result we end up writing out the sessionmap. There - # will be two sessions dirtied here, and because we have set keys_per_op to 1, we should see - # a single session get written out (the first of the two, triggered by the second getting marked - # dirty) - # The number of writes is two per session, because the header (sessionmap version) update and - # KV write both count. 
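For reference, the omap-write counter compared here comes from the objecter section of the MDS perf dump, fetched over the admin socket. A minimal stand-alone sketch of the same polling pattern, assuming only that a ceph CLI is on the PATH and that the daemon is reachable as mds.a (both illustrative assumptions, not part of the test harness), mirroring the wait_until_true() check that follows:

import json
import subprocess
import time

def get_omap_wrs(daemon="mds.a"):
    # Read the objecter perf counters over the admin socket; omap_wr counts
    # OMAP writes issued by the daemon (header and key-value updates both count).
    out = subprocess.check_output(["ceph", "daemon", daemon, "perf", "dump", "objecter"])
    return json.loads(out.decode("utf-8"))["objecter"]["omap_wr"]

def wait_for_omap_delta(initial, expected, timeout=10, daemon="mds.a"):
    # Poll until the counter has advanced by at least `expected` writes, or give up.
    deadline = time.time() + timeout
    while time.time() < deadline:
        if get_omap_wrs(daemon) - initial >= expected:
            return True
        time.sleep(1)
    return False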
- self.wait_until_true( - lambda: get_omap_wrs() - initial_omap_wrs == 2, - timeout=10 # Long enough for an export to get acked - ) - - # Now end our sessions and check the backing sessionmap is updated correctly - self.mount_a.umount_wait() - self.mount_b.umount_wait() - - # In-memory sessionmap check - self.assert_session_count(0, mds_id=rank_0_id) - - # On-disk sessionmap check - self.fs.mds_asok(["flush", "journal"], rank_0_id) - table_json = json.loads(self.fs.table_tool(["0", "show", "session"])) - log.info("SessionMap: {0}".format(json.dumps(table_json, indent=2))) - self.assertEqual(table_json['0']['result'], 0) - self.assertEqual(len(table_json['0']['data']['Sessions']), 0) - - def _sudo_write_file(self, remote, path, data): - """ - Write data to a remote file as super user - - :param remote: Remote site. - :param path: Path on the remote being written to. - :param data: Data to be written. - - Both perms and owner are passed directly to chmod. - """ - remote.run( - args=[ - 'sudo', - 'python', - '-c', - 'import shutil, sys; shutil.copyfileobj(sys.stdin, file(sys.argv[1], "wb"))', - path, - ], - stdin=data, - ) - - def _configure_auth(self, mount, id_name, mds_caps, osd_caps=None, mon_caps=None): - """ - Set up auth credentials for a client mount, and write out the keyring - for the client to use. - """ - - if osd_caps is None: - osd_caps = "allow rw" - - if mon_caps is None: - mon_caps = "allow r" - - out = self.fs.mon_manager.raw_cluster_cmd( - "auth", "get-or-create", "client.{name}".format(name=id_name), - "mds", mds_caps, - "osd", osd_caps, - "mon", mon_caps - ) - mount.client_id = id_name - self._sudo_write_file(mount.client_remote, mount.get_keyring_path(), out) - self.set_conf("client.{name}".format(name=id_name), "keyring", mount.get_keyring_path()) - - def test_session_reject(self): - if not isinstance(self.mount_a, FuseMount): - raise SkipTest("Requires FUSE client to inject client metadata") - - self.mount_a.run_shell(["mkdir", "foo"]) - self.mount_a.run_shell(["mkdir", "foo/bar"]) - self.mount_a.umount_wait() - - # Mount B will be my rejected client - self.mount_b.umount_wait() - - # Configure a client that is limited to /foo/bar - self._configure_auth(self.mount_b, "badguy", "allow rw path=/foo/bar") - # Check he can mount that dir and do IO - self.mount_b.mount(mount_path="/foo/bar") - self.mount_b.wait_until_mounted() - self.mount_b.create_destroy() - self.mount_b.umount_wait() - - # Configure the client to claim that its mount point metadata is /baz - self.set_conf("client.badguy", "client_metadata", "root=/baz") - # Try to mount the client, see that it fails - with self.assert_cluster_log("client session with invalid root '/baz' denied"): - with self.assertRaises(CommandFailedError): - self.mount_b.mount(mount_path="/foo/bar") diff --git a/src/ceph/qa/tasks/cephfs/test_strays.py b/src/ceph/qa/tasks/cephfs/test_strays.py deleted file mode 100644 index b64f3e9..0000000 --- a/src/ceph/qa/tasks/cephfs/test_strays.py +++ /dev/null @@ -1,1049 +0,0 @@ -import json -import time -import logging -from textwrap import dedent -import datetime -import gevent -import datetime - -from teuthology.orchestra.run import CommandFailedError, Raw -from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology - -log = logging.getLogger(__name__) - - -class TestStrays(CephFSTestCase): - MDSS_REQUIRED = 2 - - OPS_THROTTLE = 1 - FILES_THROTTLE = 2 - - # Range of different file sizes used in throttle test's workload - throttle_workload_size_range = 16 - - @for_teuthology - def 
test_ops_throttle(self): - self._test_throttling(self.OPS_THROTTLE) - - @for_teuthology - def test_files_throttle(self): - self._test_throttling(self.FILES_THROTTLE) - - def test_dir_deletion(self): - """ - That when deleting a bunch of dentries and the containing - directory, everything gets purged. - Catches cases where the client might e.g. fail to trim - the unlinked dir from its cache. - """ - file_count = 1000 - create_script = dedent(""" - import os - - mount_path = "{mount_path}" - subdir = "delete_me" - size = {size} - file_count = {file_count} - os.mkdir(os.path.join(mount_path, subdir)) - for i in xrange(0, file_count): - filename = "{{0}}_{{1}}.bin".format(i, size) - f = open(os.path.join(mount_path, subdir, filename), 'w') - f.write(size * 'x') - f.close() - """.format( - mount_path=self.mount_a.mountpoint, - size=1024, - file_count=file_count - )) - - self.mount_a.run_python(create_script) - - # That the dirfrag object is created - self.fs.mds_asok(["flush", "journal"]) - dir_ino = self.mount_a.path_to_ino("delete_me") - self.assertTrue(self.fs.dirfrag_exists(dir_ino, 0)) - - # Remove everything - self.mount_a.run_shell(["rm", "-rf", "delete_me"]) - self.fs.mds_asok(["flush", "journal"]) - - # That all the removed files get created as strays - strays = self.get_mdc_stat("strays_created") - self.assertEqual(strays, file_count + 1) - - # That the strays all get enqueued for purge - self.wait_until_equal( - lambda: self.get_mdc_stat("strays_enqueued"), - strays, - timeout=600 - - ) - - # That all the purge operations execute - self.wait_until_equal( - lambda: self.get_stat("purge_queue", "pq_executed"), - strays, - timeout=600 - ) - - # That finally, the directory metadata object is gone - self.assertFalse(self.fs.dirfrag_exists(dir_ino, 0)) - - # That finally, the data objects are all gone - self.await_data_pool_empty() - - def _test_throttling(self, throttle_type): - self.data_log = [] - try: - return self._do_test_throttling(throttle_type) - except: - for l in self.data_log: - log.info(",".join([l_.__str__() for l_ in l])) - raise - - def _do_test_throttling(self, throttle_type): - """ - That the mds_max_purge_ops setting is respected - """ - - def set_throttles(files, ops): - """ - Helper for updating ops/files limits, and calculating effective - ops_per_pg setting to give the same ops limit. - """ - self.set_conf('mds', 'mds_max_purge_files', "%d" % files) - self.set_conf('mds', 'mds_max_purge_ops', "%d" % ops) - - pgs = self.fs.mon_manager.get_pool_property( - self.fs.get_data_pool_name(), - "pg_num" - ) - ops_per_pg = float(ops) / pgs - self.set_conf('mds', 'mds_max_purge_ops_per_pg', "%s" % ops_per_pg) - - # Test conditions depend on what we're going to be exercising. - # * Lift the threshold on whatever throttle we are *not* testing, so - # that the throttle of interest is the one that will be the bottleneck - # * Create either many small files (test file count throttling) or fewer - # large files (test op throttling) - if throttle_type == self.OPS_THROTTLE: - set_throttles(files=100000000, ops=16) - size_unit = 1024 * 1024 # big files, generate lots of ops - file_multiplier = 100 - elif throttle_type == self.FILES_THROTTLE: - # The default value of file limit is pretty permissive, so to avoid - # the test running too fast, create lots of files and set the limit - # pretty low. 
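The set_throttles() helper above converts a cluster-wide purge-op budget into the per-PG config value by dividing by the data pool's pg_num; as a quick restatement of that arithmetic (the 64-PG figure is made up for illustration):

def purge_ops_per_pg(total_ops, pg_num):
    # Spread a cluster-wide purge-op budget evenly across placement groups,
    # matching set_throttles(): ops_per_pg = ops / pgs.
    return float(total_ops) / pg_num

# A 16-op budget over a 64-PG data pool yields 0.25, which is what would be
# written to mds_max_purge_ops_per_pg.
assert purge_ops_per_pg(16, 64) == 0.25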
- set_throttles(ops=100000000, files=6) - size_unit = 1024 # small, numerous files - file_multiplier = 200 - else: - raise NotImplemented(throttle_type) - - # Pick up config changes - self.fs.mds_fail_restart() - self.fs.wait_for_daemons() - - create_script = dedent(""" - import os - - mount_path = "{mount_path}" - subdir = "delete_me" - size_unit = {size_unit} - file_multiplier = {file_multiplier} - os.mkdir(os.path.join(mount_path, subdir)) - for i in xrange(0, file_multiplier): - for size in xrange(0, {size_range}*size_unit, size_unit): - filename = "{{0}}_{{1}}.bin".format(i, size / size_unit) - f = open(os.path.join(mount_path, subdir, filename), 'w') - f.write(size * 'x') - f.close() - """.format( - mount_path=self.mount_a.mountpoint, - size_unit=size_unit, - file_multiplier=file_multiplier, - size_range=self.throttle_workload_size_range - )) - - self.mount_a.run_python(create_script) - - # We will run the deletion in the background, to reduce the risk of it completing before - # we have started monitoring the stray statistics. - def background(): - self.mount_a.run_shell(["rm", "-rf", "delete_me"]) - self.fs.mds_asok(["flush", "journal"]) - - background_thread = gevent.spawn(background) - - total_inodes = file_multiplier * self.throttle_workload_size_range + 1 - mds_max_purge_ops = int(self.fs.get_config("mds_max_purge_ops", 'mds')) - mds_max_purge_files = int(self.fs.get_config("mds_max_purge_files", 'mds')) - - # During this phase we look for the concurrent ops to exceed half - # the limit (a heuristic) and not exceed the limit (a correctness - # condition). - purge_timeout = 600 - elapsed = 0 - files_high_water = 0 - ops_high_water = 0 - - while True: - stats = self.fs.mds_asok(['perf', 'dump']) - mdc_stats = stats['mds_cache'] - pq_stats = stats['purge_queue'] - if elapsed >= purge_timeout: - raise RuntimeError("Timeout waiting for {0} inodes to purge, stats:{1}".format(total_inodes, mdc_stats)) - - num_strays = mdc_stats['num_strays'] - num_strays_purging = pq_stats['pq_executing'] - num_purge_ops = pq_stats['pq_executing_ops'] - - self.data_log.append([datetime.datetime.now(), num_strays, num_strays_purging, num_purge_ops]) - - files_high_water = max(files_high_water, num_strays_purging) - ops_high_water = max(ops_high_water, num_purge_ops) - - total_strays_created = mdc_stats['strays_created'] - total_strays_purged = pq_stats['pq_executed'] - - if total_strays_purged == total_inodes: - log.info("Complete purge in {0} seconds".format(elapsed)) - break - elif total_strays_purged > total_inodes: - raise RuntimeError("Saw more strays than expected, mdc stats: {0}".format(mdc_stats)) - else: - if throttle_type == self.OPS_THROTTLE: - # 11 is filer_max_purge_ops plus one for the backtrace: - # limit is allowed to be overshot by this much. - if num_purge_ops > mds_max_purge_ops + 11: - raise RuntimeError("num_purge_ops violates threshold {0}/{1}".format( - num_purge_ops, mds_max_purge_ops - )) - elif throttle_type == self.FILES_THROTTLE: - if num_strays_purging > mds_max_purge_files: - raise RuntimeError("num_strays_purging violates threshold {0}/{1}".format( - num_strays_purging, mds_max_purge_files - )) - else: - raise NotImplemented(throttle_type) - - log.info("Waiting for purge to complete {0}/{1}, {2}/{3}".format( - num_strays_purging, num_strays, - total_strays_purged, total_strays_created - )) - time.sleep(1) - elapsed += 1 - - background_thread.join() - - # Check that we got up to a respectable rate during the purge. 
This is totally - # racy, but should be safeish unless the cluster is pathologically slow, or - # insanely fast such that the deletions all pass before we have polled the - # statistics. - if throttle_type == self.OPS_THROTTLE: - if ops_high_water < mds_max_purge_ops / 2: - raise RuntimeError("Ops in flight high water is unexpectedly low ({0} / {1})".format( - ops_high_water, mds_max_purge_ops - )) - elif throttle_type == self.FILES_THROTTLE: - if files_high_water < mds_max_purge_files / 2: - raise RuntimeError("Files in flight high water is unexpectedly low ({0} / {1})".format( - ops_high_water, mds_max_purge_files - )) - - # Sanity check all MDC stray stats - stats = self.fs.mds_asok(['perf', 'dump']) - mdc_stats = stats['mds_cache'] - pq_stats = stats['purge_queue'] - self.assertEqual(mdc_stats['num_strays'], 0) - self.assertEqual(mdc_stats['num_strays_delayed'], 0) - self.assertEqual(pq_stats['pq_executing'], 0) - self.assertEqual(pq_stats['pq_executing_ops'], 0) - self.assertEqual(mdc_stats['strays_created'], total_inodes) - self.assertEqual(mdc_stats['strays_enqueued'], total_inodes) - self.assertEqual(pq_stats['pq_executed'], total_inodes) - - def get_mdc_stat(self, name, mds_id=None): - return self.get_stat("mds_cache", name, mds_id) - - def get_stat(self, subsys, name, mds_id=None): - return self.fs.mds_asok(['perf', 'dump', subsys, name], - mds_id=mds_id)[subsys][name] - - def _wait_for_counter(self, subsys, counter, expect_val, timeout=60, - mds_id=None): - self.wait_until_equal( - lambda: self.get_stat(subsys, counter, mds_id), - expect_val=expect_val, timeout=timeout, - reject_fn=lambda x: x > expect_val - ) - - def test_open_inode(self): - """ - That the case of a dentry unlinked while a client holds an - inode open is handled correctly. - - The inode should be moved into a stray dentry, while the original - dentry and directory should be purged. - - The inode's data should be purged when the client eventually closes - it. 
- """ - mount_a_client_id = self.mount_a.get_global_id() - - # Write some bytes to a file - size_mb = 8 - - # Hold the file open - p = self.mount_a.open_background("open_file") - self.mount_a.write_n_mb("open_file", size_mb) - open_file_ino = self.mount_a.path_to_ino("open_file") - - self.assertEqual(self.get_session(mount_a_client_id)['num_caps'], 2) - - # Unlink the dentry - self.mount_a.run_shell(["rm", "-f", "open_file"]) - - # Wait to see the stray count increment - self.wait_until_equal( - lambda: self.get_mdc_stat("num_strays"), - expect_val=1, timeout=60, reject_fn=lambda x: x > 1) - - # See that while the stray count has incremented, none have passed - # on to the purge queue - self.assertEqual(self.get_mdc_stat("strays_created"), 1) - self.assertEqual(self.get_mdc_stat("strays_enqueued"), 0) - - # See that the client still holds 2 caps - self.assertEqual(self.get_session(mount_a_client_id)['num_caps'], 2) - - # See that the data objects remain in the data pool - self.assertTrue(self.fs.data_objects_present(open_file_ino, size_mb * 1024 * 1024)) - - # Now close the file - self.mount_a.kill_background(p) - - # Wait to see the client cap count decrement - self.wait_until_equal( - lambda: self.get_session(mount_a_client_id)['num_caps'], - expect_val=1, timeout=60, reject_fn=lambda x: x > 2 or x < 1 - ) - # Wait to see the purge counter increment, stray count go to zero - self._wait_for_counter("mds_cache", "strays_enqueued", 1) - self.wait_until_equal( - lambda: self.get_mdc_stat("num_strays"), - expect_val=0, timeout=6, reject_fn=lambda x: x > 1 - ) - self._wait_for_counter("purge_queue", "pq_executed", 1) - - # See that the data objects no longer exist - self.assertTrue(self.fs.data_objects_absent(open_file_ino, size_mb * 1024 * 1024)) - - self.await_data_pool_empty() - - def test_hardlink_reintegration(self): - """ - That removal of primary dentry of hardlinked inode results - in reintegration of inode into the previously-remote dentry, - rather than lingering as a stray indefinitely. - """ - # Write some bytes to file_a - size_mb = 8 - self.mount_a.run_shell(["mkdir", "dir_1"]) - self.mount_a.write_n_mb("dir_1/file_a", size_mb) - ino = self.mount_a.path_to_ino("dir_1/file_a") - - # Create a hardlink named file_b - self.mount_a.run_shell(["mkdir", "dir_2"]) - self.mount_a.run_shell(["ln", "dir_1/file_a", "dir_2/file_b"]) - self.assertEqual(self.mount_a.path_to_ino("dir_2/file_b"), ino) - - # Flush journal - self.fs.mds_asok(['flush', 'journal']) - - # See that backtrace for the file points to the file_a path - pre_unlink_bt = self.fs.read_backtrace(ino) - self.assertEqual(pre_unlink_bt['ancestors'][0]['dname'], "file_a") - - # empty mds cache. otherwise mds reintegrates stray when unlink finishes - self.mount_a.umount_wait() - self.fs.mds_asok(['flush', 'journal']) - self.fs.mds_fail_restart() - self.fs.wait_for_daemons() - self.mount_a.mount() - - # Unlink file_a - self.mount_a.run_shell(["rm", "-f", "dir_1/file_a"]) - - # See that a stray was created - self.assertEqual(self.get_mdc_stat("num_strays"), 1) - self.assertEqual(self.get_mdc_stat("strays_created"), 1) - - # Wait, see that data objects are still present (i.e. 
that the - # stray did not advance to purging given time) - time.sleep(30) - self.assertTrue(self.fs.data_objects_present(ino, size_mb * 1024 * 1024)) - self.assertEqual(self.get_mdc_stat("strays_enqueued"), 0) - - # See that before reintegration, the inode's backtrace points to a stray dir - self.fs.mds_asok(['flush', 'journal']) - self.assertTrue(self.get_backtrace_path(ino).startswith("stray")) - - last_reintegrated = self.get_mdc_stat("strays_reintegrated") - - # Do a metadata operation on the remaining link (mv is heavy handed, but - # others like touch may be satisfied from caps without poking MDS) - self.mount_a.run_shell(["mv", "dir_2/file_b", "dir_2/file_c"]) - - # Stray reintegration should happen as a result of the eval_remote call - # on responding to a client request. - self.wait_until_equal( - lambda: self.get_mdc_stat("num_strays"), - expect_val=0, - timeout=60 - ) - - # See the reintegration counter increment - curr_reintegrated = self.get_mdc_stat("strays_reintegrated") - self.assertGreater(curr_reintegrated, last_reintegrated) - last_reintegrated = curr_reintegrated - - # Flush the journal - self.fs.mds_asok(['flush', 'journal']) - - # See that the backtrace for the file points to the remaining link's path - post_reint_bt = self.fs.read_backtrace(ino) - self.assertEqual(post_reint_bt['ancestors'][0]['dname'], "file_c") - - # mds should reintegrates stray when unlink finishes - self.mount_a.run_shell(["ln", "dir_2/file_c", "dir_2/file_d"]) - self.mount_a.run_shell(["rm", "-f", "dir_2/file_c"]) - - # Stray reintegration should happen as a result of the notify_stray call - # on completion of unlink - self.wait_until_equal( - lambda: self.get_mdc_stat("num_strays"), - expect_val=0, - timeout=60 - ) - - # See the reintegration counter increment - curr_reintegrated = self.get_mdc_stat("strays_reintegrated") - self.assertGreater(curr_reintegrated, last_reintegrated) - last_reintegrated = curr_reintegrated - - # Flush the journal - self.fs.mds_asok(['flush', 'journal']) - - # See that the backtrace for the file points to the newest link's path - post_reint_bt = self.fs.read_backtrace(ino) - self.assertEqual(post_reint_bt['ancestors'][0]['dname'], "file_d") - - # Now really delete it - self.mount_a.run_shell(["rm", "-f", "dir_2/file_d"]) - self._wait_for_counter("mds_cache", "strays_enqueued", 1) - self._wait_for_counter("purge_queue", "pq_executed", 1) - - self.assert_purge_idle() - self.assertTrue(self.fs.data_objects_absent(ino, size_mb * 1024 * 1024)) - - # We caused the inode to go stray 3 times - self.assertEqual(self.get_mdc_stat("strays_created"), 3) - # We purged it at the last - self.assertEqual(self.get_mdc_stat("strays_enqueued"), 1) - - def test_mv_hardlink_cleanup(self): - """ - That when doing a rename from A to B, and B has hardlinks, - then we make a stray for B which is then reintegrated - into one of his hardlinks. 
- """ - # Create file_a, file_b, and a hardlink to file_b - size_mb = 8 - self.mount_a.write_n_mb("file_a", size_mb) - file_a_ino = self.mount_a.path_to_ino("file_a") - - self.mount_a.write_n_mb("file_b", size_mb) - file_b_ino = self.mount_a.path_to_ino("file_b") - - self.mount_a.run_shell(["ln", "file_b", "linkto_b"]) - self.assertEqual(self.mount_a.path_to_ino("linkto_b"), file_b_ino) - - # mv file_a file_b - self.mount_a.run_shell(["mv", "file_a", "file_b"]) - - # Stray reintegration should happen as a result of the notify_stray call on - # completion of rename - self.wait_until_equal( - lambda: self.get_mdc_stat("num_strays"), - expect_val=0, - timeout=60 - ) - - self.assertEqual(self.get_mdc_stat("strays_created"), 1) - self.assertGreaterEqual(self.get_mdc_stat("strays_reintegrated"), 1) - - # No data objects should have been deleted, as both files still have linkage. - self.assertTrue(self.fs.data_objects_present(file_a_ino, size_mb * 1024 * 1024)) - self.assertTrue(self.fs.data_objects_present(file_b_ino, size_mb * 1024 * 1024)) - - self.fs.mds_asok(['flush', 'journal']) - - post_reint_bt = self.fs.read_backtrace(file_b_ino) - self.assertEqual(post_reint_bt['ancestors'][0]['dname'], "linkto_b") - - def _setup_two_ranks(self): - # Set up two MDSs - self.fs.set_max_mds(2) - - # See that we have two active MDSs - self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30, - reject_fn=lambda v: v > 2 or v < 1) - - active_mds_names = self.fs.get_active_names() - rank_0_id = active_mds_names[0] - rank_1_id = active_mds_names[1] - log.info("Ranks 0 and 1 are {0} and {1}".format( - rank_0_id, rank_1_id)) - - # Get rid of other MDS daemons so that it's easier to know which - # daemons to expect in which ranks after restarts - for unneeded_mds in set(self.mds_cluster.mds_ids) - {rank_0_id, rank_1_id}: - self.mds_cluster.mds_stop(unneeded_mds) - self.mds_cluster.mds_fail(unneeded_mds) - - return rank_0_id, rank_1_id - - def _force_migrate(self, to_id, path, watch_ino): - """ - :param to_id: MDS id to move it to - :param path: Filesystem path (string) to move - :param watch_ino: Inode number to look for at destination to confirm move - :return: None - """ - self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "1", path]) - - # Poll the MDS cache dump to watch for the export completing - migrated = False - migrate_timeout = 60 - migrate_elapsed = 0 - while not migrated: - data = self.fs.mds_asok(["dump", "cache"], to_id) - for inode_data in data: - if inode_data['ino'] == watch_ino: - log.debug("Found ino in cache: {0}".format(json.dumps(inode_data, indent=2))) - if inode_data['is_auth'] is True: - migrated = True - break - - if not migrated: - if migrate_elapsed > migrate_timeout: - raise RuntimeError("Migration hasn't happened after {0}s!".format(migrate_elapsed)) - else: - migrate_elapsed += 1 - time.sleep(1) - - def _is_stopped(self, rank): - mds_map = self.fs.get_mds_map() - return rank not in [i['rank'] for i in mds_map['info'].values()] - - def test_purge_on_shutdown(self): - """ - That when an MDS rank is shut down, its purge queue is - drained in the process. 
- """ - rank_0_id, rank_1_id = self._setup_two_ranks() - - self.set_conf("mds.{0}".format(rank_1_id), 'mds_max_purge_files', "0") - self.mds_cluster.mds_fail_restart(rank_1_id) - self.fs.wait_for_daemons() - - file_count = 5 - - self.mount_a.create_n_files("delete_me/file", file_count) - - self._force_migrate(rank_1_id, "delete_me", - self.mount_a.path_to_ino("delete_me/file_0")) - - self.mount_a.run_shell(["rm", "-rf", Raw("delete_me/*")]) - self.mount_a.umount_wait() - - # See all the strays go into purge queue - self._wait_for_counter("mds_cache", "strays_created", file_count, mds_id=rank_1_id) - self._wait_for_counter("mds_cache", "strays_enqueued", file_count, mds_id=rank_1_id) - self.assertEqual(self.get_stat("mds_cache", "num_strays", mds_id=rank_1_id), 0) - - # See nothing get purged from the purge queue (yet) - time.sleep(10) - self.assertEqual(self.get_stat("purge_queue", "pq_executed", mds_id=rank_1_id), 0) - - # Shut down rank 1 - self.fs.set_max_mds(1) - self.fs.deactivate(1) - - # It shouldn't proceed past stopping because its still not allowed - # to purge - time.sleep(10) - self.assertEqual(self.get_stat("purge_queue", "pq_executed", mds_id=rank_1_id), 0) - self.assertFalse(self._is_stopped(1)) - - # Permit the daemon to start purging again - self.fs.mon_manager.raw_cluster_cmd('tell', 'mds.{0}'.format(rank_1_id), - 'injectargs', - "--mds_max_purge_files 100") - - # It should now proceed through shutdown - self.wait_until_true( - lambda: self._is_stopped(1), - timeout=60 - ) - - # ...and in the process purge all that data - self.await_data_pool_empty() - - def test_migration_on_shutdown(self): - """ - That when an MDS rank is shut down, any non-purgeable strays - get migrated to another rank. - """ - - rank_0_id, rank_1_id = self._setup_two_ranks() - - # Create a non-purgeable stray in a ~mds1 stray directory - # by doing a hard link and deleting the original file - self.mount_a.run_shell(["mkdir", "dir_1", "dir_2"]) - self.mount_a.run_shell(["touch", "dir_1/original"]) - self.mount_a.run_shell(["ln", "dir_1/original", "dir_2/linkto"]) - - self._force_migrate(rank_1_id, "dir_1", - self.mount_a.path_to_ino("dir_1/original")) - - # empty mds cache. otherwise mds reintegrates stray when unlink finishes - self.mount_a.umount_wait() - self.fs.mds_asok(['flush', 'journal'], rank_0_id) - self.fs.mds_asok(['flush', 'journal'], rank_1_id) - self.fs.mds_fail_restart() - self.fs.wait_for_daemons() - - active_mds_names = self.fs.get_active_names() - rank_0_id = active_mds_names[0] - rank_1_id = active_mds_names[1] - - self.mount_a.mount() - - self.mount_a.run_shell(["rm", "-f", "dir_1/original"]) - self.mount_a.umount_wait() - - self._wait_for_counter("mds_cache", "strays_created", 1, - mds_id=rank_1_id) - - # Shut down rank 1 - self.fs.mon_manager.raw_cluster_cmd_result('mds', 'set', "max_mds", "1") - self.fs.mon_manager.raw_cluster_cmd_result('mds', 'deactivate', "1") - - # Wait til we get to a single active MDS mdsmap state - self.wait_until_true(lambda: self._is_stopped(1), timeout=120) - - # See that the stray counter on rank 0 has incremented - self.assertEqual(self.get_mdc_stat("strays_created", rank_0_id), 1) - - def assert_backtrace(self, ino, expected_path): - """ - Assert that the backtrace in the data pool for an inode matches - an expected /foo/bar path. 
- """ - expected_elements = expected_path.strip("/").split("/") - bt = self.fs.read_backtrace(ino) - actual_elements = list(reversed([dn['dname'] for dn in bt['ancestors']])) - self.assertListEqual(expected_elements, actual_elements) - - def get_backtrace_path(self, ino): - bt = self.fs.read_backtrace(ino) - elements = reversed([dn['dname'] for dn in bt['ancestors']]) - return "/".join(elements) - - def assert_purge_idle(self): - """ - Assert that the MDS perf counters indicate no strays exist and - no ongoing purge activity. Sanity check for when PurgeQueue should - be idle. - """ - mdc_stats = self.fs.mds_asok(['perf', 'dump', "mds_cache"])['mds_cache'] - pq_stats = self.fs.mds_asok(['perf', 'dump', "purge_queue"])['purge_queue'] - self.assertEqual(mdc_stats["num_strays"], 0) - self.assertEqual(mdc_stats["num_strays_delayed"], 0) - self.assertEqual(pq_stats["pq_executing"], 0) - self.assertEqual(pq_stats["pq_executing_ops"], 0) - - def test_mv_cleanup(self): - """ - That when doing a rename from A to B, and B has no hardlinks, - then we make a stray for B and purge him. - """ - # Create file_a and file_b, write some to both - size_mb = 8 - self.mount_a.write_n_mb("file_a", size_mb) - file_a_ino = self.mount_a.path_to_ino("file_a") - self.mount_a.write_n_mb("file_b", size_mb) - file_b_ino = self.mount_a.path_to_ino("file_b") - - self.fs.mds_asok(['flush', 'journal']) - self.assert_backtrace(file_a_ino, "file_a") - self.assert_backtrace(file_b_ino, "file_b") - - # mv file_a file_b - self.mount_a.run_shell(['mv', 'file_a', 'file_b']) - - # See that stray counter increments - self.assertEqual(self.get_mdc_stat("strays_created"), 1) - # Wait for purge counter to increment - self._wait_for_counter("mds_cache", "strays_enqueued", 1) - self._wait_for_counter("purge_queue", "pq_executed", 1) - - self.assert_purge_idle() - - # file_b should have been purged - self.assertTrue(self.fs.data_objects_absent(file_b_ino, size_mb * 1024 * 1024)) - - # Backtrace should have updated from file_a to file_b - self.fs.mds_asok(['flush', 'journal']) - self.assert_backtrace(file_a_ino, "file_b") - - # file_a's data should still exist - self.assertTrue(self.fs.data_objects_present(file_a_ino, size_mb * 1024 * 1024)) - - def _pool_df(self, pool_name): - """ - Return a dict like - { - "kb_used": 0, - "bytes_used": 0, - "max_avail": 19630292406, - "objects": 0 - } - - :param pool_name: Which pool (must exist) - """ - out = self.fs.mon_manager.raw_cluster_cmd("df", "--format=json-pretty") - for p in json.loads(out)['pools']: - if p['name'] == pool_name: - return p['stats'] - - raise RuntimeError("Pool '{0}' not found".format(pool_name)) - - def await_data_pool_empty(self): - self.wait_until_true( - lambda: self._pool_df( - self.fs.get_data_pool_name() - )['objects'] == 0, - timeout=60) - - def test_snapshot_remove(self): - """ - That removal of a snapshot that references a now-unlinked file results - in purging on the stray for the file. 
- """ - # Enable snapshots - self.fs.mon_manager.raw_cluster_cmd("mds", "set", "allow_new_snaps", "true", - "--yes-i-really-mean-it") - - # Create a dir with a file in it - size_mb = 8 - self.mount_a.run_shell(["mkdir", "snapdir"]) - self.mount_a.run_shell(["mkdir", "snapdir/subdir"]) - self.mount_a.write_test_pattern("snapdir/subdir/file_a", size_mb * 1024 * 1024) - file_a_ino = self.mount_a.path_to_ino("snapdir/subdir/file_a") - - # Snapshot the dir - self.mount_a.run_shell(["mkdir", "snapdir/.snap/snap1"]) - - # Cause the head revision to deviate from the snapshot - self.mount_a.write_n_mb("snapdir/subdir/file_a", size_mb) - - # Flush the journal so that backtraces, dirfrag objects will actually be written - self.fs.mds_asok(["flush", "journal"]) - - # Unlink the file - self.mount_a.run_shell(["rm", "-f", "snapdir/subdir/file_a"]) - self.mount_a.run_shell(["rmdir", "snapdir/subdir"]) - - # Unmount the client because when I come back to check the data is still - # in the file I don't want to just see what's in the page cache. - self.mount_a.umount_wait() - - self.assertEqual(self.get_mdc_stat("strays_created"), 2) - - # FIXME: at this stage we see a purge and the stray count drops to - # zero, but there's actually still a stray, so at the very - # least the StrayManager stats code is slightly off - - self.mount_a.mount() - - # See that the data from the snapshotted revision of the file is still present - # and correct - self.mount_a.validate_test_pattern("snapdir/.snap/snap1/subdir/file_a", size_mb * 1024 * 1024) - - # Remove the snapshot - self.mount_a.run_shell(["rmdir", "snapdir/.snap/snap1"]) - - # Purging file_a doesn't happen until after we've flushed the journal, because - # it is referenced by the snapshotted subdir, and the snapshot isn't really - # gone until the journal references to it are gone - self.fs.mds_asok(["flush", "journal"]) - - # Wait for purging to complete, which requires the OSDMap to propagate to the OSDs. - # See also: http://tracker.ceph.com/issues/20072 - self.wait_until_true( - lambda: self.fs.data_objects_absent(file_a_ino, size_mb * 1024 * 1024), - timeout=60 - ) - - # See that a purge happens now - self._wait_for_counter("mds_cache", "strays_enqueued", 2) - self._wait_for_counter("purge_queue", "pq_executed", 2) - - self.await_data_pool_empty() - - def test_fancy_layout(self): - """ - purge stray file with fancy layout - """ - - file_name = "fancy_layout_file" - self.mount_a.run_shell(["touch", file_name]) - - file_layout = "stripe_unit=1048576 stripe_count=4 object_size=8388608" - self.mount_a.setfattr(file_name, "ceph.file.layout", file_layout) - - # 35MB requires 7 objects - size_mb = 35 - self.mount_a.write_n_mb(file_name, size_mb) - - self.mount_a.run_shell(["rm", "-f", file_name]) - self.fs.mds_asok(["flush", "journal"]) - - # can't use self.fs.data_objects_absent here, it does not support fancy layout - self.await_data_pool_empty() - - def test_dirfrag_limit(self): - """ - That the directory fragment size cannot exceed mds_bal_fragment_size_max (using a limit of 50 in all configurations). - - That fragmentation (forced) will allow more entries to be created. - - That unlinking fails when the stray directory fragment becomes too large and that unlinking may continue once those strays are purged. 
- """ - - self.fs.set_allow_dirfrags(True) - - LOW_LIMIT = 50 - for mds in self.fs.get_daemon_names(): - self.fs.mds_asok(["config", "set", "mds_bal_fragment_size_max", str(LOW_LIMIT)], mds) - - try: - self.mount_a.run_python(dedent(""" - import os - path = os.path.join("{path}", "subdir") - os.mkdir(path) - for n in range(0, {file_count}): - open(os.path.join(path, "%s" % n), 'w').write("%s" % n) - """.format( - path=self.mount_a.mountpoint, - file_count=LOW_LIMIT+1 - ))) - except CommandFailedError: - pass # ENOSPAC - else: - raise RuntimeError("fragment size exceeded") - - # Now test that we can go beyond the limit if we fragment the directory - - self.mount_a.run_python(dedent(""" - import os - path = os.path.join("{path}", "subdir2") - os.mkdir(path) - for n in range(0, {file_count}): - open(os.path.join(path, "%s" % n), 'w').write("%s" % n) - dfd = os.open(path, os.O_DIRECTORY) - os.fsync(dfd) - """.format( - path=self.mount_a.mountpoint, - file_count=LOW_LIMIT - ))) - - # Ensure that subdir2 is fragmented - mds_id = self.fs.get_active_names()[0] - self.fs.mds_asok(["dirfrag", "split", "/subdir2", "0/0", "1"], mds_id) - - # remount+flush (release client caps) - self.mount_a.umount_wait() - self.fs.mds_asok(["flush", "journal"], mds_id) - self.mount_a.mount() - self.mount_a.wait_until_mounted() - - # Create 50% more files than the current fragment limit - self.mount_a.run_python(dedent(""" - import os - path = os.path.join("{path}", "subdir2") - for n in range({file_count}, ({file_count}*3)//2): - open(os.path.join(path, "%s" % n), 'w').write("%s" % n) - """.format( - path=self.mount_a.mountpoint, - file_count=LOW_LIMIT - ))) - - # Now test the stray directory size is limited and recovers - strays_before = self.get_mdc_stat("strays_created") - try: - self.mount_a.run_python(dedent(""" - import os - path = os.path.join("{path}", "subdir3") - os.mkdir(path) - for n in range({file_count}): - fpath = os.path.join(path, "%s" % n) - f = open(fpath, 'w') - f.write("%s" % n) - f.close() - os.unlink(fpath) - """.format( - path=self.mount_a.mountpoint, - file_count=LOW_LIMIT*10 # 10 stray directories, should collide before this count - ))) - except CommandFailedError: - pass # ENOSPAC - else: - raise RuntimeError("fragment size exceeded") - - strays_after = self.get_mdc_stat("strays_created") - self.assertGreaterEqual(strays_after-strays_before, LOW_LIMIT) - - self._wait_for_counter("mds_cache", "strays_enqueued", strays_after) - self._wait_for_counter("purge_queue", "pq_executed", strays_after) - - self.mount_a.run_python(dedent(""" - import os - path = os.path.join("{path}", "subdir4") - os.mkdir(path) - for n in range({file_count}): - fpath = os.path.join(path, "%s" % n) - f = open(fpath, 'w') - f.write("%s" % n) - f.close() - os.unlink(fpath) - """.format( - path=self.mount_a.mountpoint, - file_count=LOW_LIMIT - ))) - - def test_purge_queue_upgrade(self): - """ - That when starting on a system with no purge queue in the metadata - pool, we silently create one. - :return: - """ - - self.mds_cluster.mds_stop() - self.mds_cluster.mds_fail() - self.fs.rados(["rm", "500.00000000"]) - self.mds_cluster.mds_restart() - self.fs.wait_for_daemons() - - def test_purge_queue_op_rate(self): - """ - A busy purge queue is meant to aggregate operations sufficiently - that our RADOS ops to the metadata pool are not O(files). Check - that that is so. 
- :return: - """ - - # For low rates of deletion, the rate of metadata ops actually - # will be o(files), so to see the desired behaviour we have to give - # the system a significant quantity, i.e. an order of magnitude - # more than the number of files it will purge at one time. - - max_purge_files = 2 - - self.set_conf('mds', 'mds_bal_frag', 'false') - self.set_conf('mds', 'mds_max_purge_files', "%d" % max_purge_files) - self.fs.mds_fail_restart() - self.fs.wait_for_daemons() - - phase_1_files = 256 - phase_2_files = 512 - - self.mount_a.run_shell(["mkdir", "phase1"]) - self.mount_a.create_n_files("phase1/file", phase_1_files) - - self.mount_a.run_shell(["mkdir", "phase2"]) - self.mount_a.create_n_files("phase2/file", phase_2_files) - - def unlink_and_count_ops(path, expected_deletions): - initial_ops = self.get_stat("objecter", "op") - initial_pq_executed = self.get_stat("purge_queue", "pq_executed") - - self.mount_a.run_shell(["rm", "-rf", path]) - - self._wait_for_counter( - "purge_queue", "pq_executed", initial_pq_executed + expected_deletions - ) - - final_ops = self.get_stat("objecter", "op") - - # Calculation of the *overhead* operations, i.e. do not include - # the operations where we actually delete files. - return final_ops - initial_ops - expected_deletions - - self.fs.mds_asok(['flush', 'journal']) - phase1_ops = unlink_and_count_ops("phase1/", phase_1_files + 1) - - self.fs.mds_asok(['flush', 'journal']) - phase2_ops = unlink_and_count_ops("phase2/", phase_2_files + 1) - - log.info("Phase 1: {0}".format(phase1_ops)) - log.info("Phase 2: {0}".format(phase2_ops)) - - # The success criterion is that deleting double the number - # of files doesn't generate double the number of overhead ops - # -- this comparison is a rough approximation of that rule. - self.assertTrue(phase2_ops < phase1_ops * 1.25) - - # Finally, check that our activity did include properly quiescing - # the queue (i.e. call to Journaler::write_head in the right place), - # by restarting the MDS and checking that it doesn't try re-executing - # any of the work we did. - self.fs.mds_asok(['flush', 'journal']) # flush to ensure no strays - # hanging around - self.fs.mds_fail_restart() - self.fs.wait_for_daemons() - time.sleep(10) - self.assertEqual(self.get_stat("purge_queue", "pq_executed"), 0) - - def test_replicated_delete_speed(self): - """ - That deletions of replicated metadata are not pathologically slow - """ - rank_0_id, rank_1_id = self._setup_two_ranks() - - self.set_conf("mds.{0}".format(rank_1_id), 'mds_max_purge_files', "0") - self.mds_cluster.mds_fail_restart(rank_1_id) - self.fs.wait_for_daemons() - - file_count = 10 - - self.mount_a.create_n_files("delete_me/file", file_count) - - self._force_migrate(rank_1_id, "delete_me", - self.mount_a.path_to_ino("delete_me/file_0")) - - begin = datetime.datetime.now() - self.mount_a.run_shell(["rm", "-rf", Raw("delete_me/*")]) - end = datetime.datetime.now() - - # What we're really checking here is that we are completing client - # operations immediately rather than delaying until the next tick. 
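The assertion that follows compares the measured deletion time against a quarter of file_count * mds_tick_interval. A self-contained restatement of that budget check, with a 5-second tick used purely as an illustrative value:

def assert_faster_than_tick_batching(duration_s, file_count, tick_period_s, factor=0.25):
    # If every unlink of a replicated dentry waited for the next MDS tick,
    # the run would take on the order of file_count * tick_period seconds;
    # require that we finish in a small fraction of that budget instead.
    budget = file_count * tick_period_s * factor
    if duration_s >= budget:
        raise AssertionError("deletes took %.2fs, budget was %.2fs" % (duration_s, budget))

# Illustration: 10 files and a 5 s tick give a 12.5 s budget; a 1.8 s run passes.
assert_faster_than_tick_batching(1.8, file_count=10, tick_period_s=5.0)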
- tick_period = float(self.fs.get_config("mds_tick_interval", - service_type="mds")) - - duration = (end - begin).total_seconds() - self.assertLess(duration, (file_count * tick_period) * 0.25) - diff --git a/src/ceph/qa/tasks/cephfs/test_volume_client.py b/src/ceph/qa/tasks/cephfs/test_volume_client.py deleted file mode 100644 index 0876af9..0000000 --- a/src/ceph/qa/tasks/cephfs/test_volume_client.py +++ /dev/null @@ -1,1016 +0,0 @@ -import json -import logging -import time -import os -from textwrap import dedent -from tasks.cephfs.cephfs_test_case import CephFSTestCase -from tasks.cephfs.fuse_mount import FuseMount -from teuthology.exceptions import CommandFailedError - -log = logging.getLogger(__name__) - - -class TestVolumeClient(CephFSTestCase): - # One for looking at the global filesystem, one for being - # the VolumeClient, two for mounting the created shares - CLIENTS_REQUIRED = 4 - - def _volume_client_python(self, client, script, vol_prefix=None, ns_prefix=None): - # Can't dedent this *and* the script we pass in, because they might have different - # levels of indentation to begin with, so leave this string zero-indented - if vol_prefix: - vol_prefix = "\"" + vol_prefix + "\"" - if ns_prefix: - ns_prefix = "\"" + ns_prefix + "\"" - return client.run_python(""" -from ceph_volume_client import CephFSVolumeClient, VolumePath -import logging -log = logging.getLogger("ceph_volume_client") -log.addHandler(logging.StreamHandler()) -log.setLevel(logging.DEBUG) -vc = CephFSVolumeClient("manila", "{conf_path}", "ceph", {vol_prefix}, {ns_prefix}) -vc.connect() -{payload} -vc.disconnect() - """.format(payload=script, conf_path=client.config_path, vol_prefix=vol_prefix, ns_prefix=ns_prefix)) - - def _sudo_write_file(self, remote, path, data): - """ - Write data to a remote file as super user - - :param remote: Remote site. - :param path: Path on the remote being written to. - :param data: Data to be written. - - Both perms and owner are passed directly to chmod. - """ - remote.run( - args=[ - 'sudo', - 'python', - '-c', - 'import shutil, sys; shutil.copyfileobj(sys.stdin, file(sys.argv[1], "wb"))', - path, - ], - stdin=data, - ) - - def _configure_vc_auth(self, mount, id_name): - """ - Set up auth credentials for the VolumeClient user - """ - out = self.fs.mon_manager.raw_cluster_cmd( - "auth", "get-or-create", "client.{name}".format(name=id_name), - "mds", "allow *", - "osd", "allow rw", - "mon", "allow *" - ) - mount.client_id = id_name - self._sudo_write_file(mount.client_remote, mount.get_keyring_path(), out) - self.set_conf("client.{name}".format(name=id_name), "keyring", mount.get_keyring_path()) - - def _configure_guest_auth(self, volumeclient_mount, guest_mount, - guest_entity, mount_path, - namespace_prefix=None, readonly=False, - tenant_id=None): - """ - Set up auth credentials for the guest client to mount a volume. - - :param volumeclient_mount: mount used as the handle for driving - volumeclient. - :param guest_mount: mount used by the guest client. - :param guest_entity: auth ID used by the guest client. - :param mount_path: path of the volume. - :param namespace_prefix: name prefix of the RADOS namespace, which - is used for the volume's layout. - :param readonly: defaults to False. If set to 'True' only read-only - mount access is granted to the guest. - :param tenant_id: (OpenStack) tenant ID of the guest client. 
- """ - - head, volume_id = os.path.split(mount_path) - head, group_id = os.path.split(head) - head, volume_prefix = os.path.split(head) - volume_prefix = "/" + volume_prefix - - # Authorize the guest client's auth ID to mount the volume. - key = self._volume_client_python(volumeclient_mount, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - auth_result = vc.authorize(vp, "{guest_entity}", readonly={readonly}, - tenant_id="{tenant_id}") - print auth_result['auth_key'] - """.format( - group_id=group_id, - volume_id=volume_id, - guest_entity=guest_entity, - readonly=readonly, - tenant_id=tenant_id)), volume_prefix, namespace_prefix - ) - - # CephFSVolumeClient's authorize() does not return the secret - # key to a caller who isn't multi-tenant aware. Explicitly - # query the key for such a client. - if not tenant_id: - key = self.fs.mon_manager.raw_cluster_cmd( - "auth", "get-key", "client.{name}".format(name=guest_entity), - ) - - # The guest auth ID should exist. - existing_ids = [a['entity'] for a in self.auth_list()] - self.assertIn("client.{0}".format(guest_entity), existing_ids) - - # Create keyring file for the guest client. - keyring_txt = dedent(""" - [client.{guest_entity}] - key = {key} - - """.format( - guest_entity=guest_entity, - key=key - )) - guest_mount.client_id = guest_entity - self._sudo_write_file(guest_mount.client_remote, - guest_mount.get_keyring_path(), - keyring_txt) - - # Add a guest client section to the ceph config file. - self.set_conf("client.{0}".format(guest_entity), "client quota", "True") - self.set_conf("client.{0}".format(guest_entity), "debug client", "20") - self.set_conf("client.{0}".format(guest_entity), "debug objecter", "20") - self.set_conf("client.{0}".format(guest_entity), - "keyring", guest_mount.get_keyring_path()) - - def test_default_prefix(self): - group_id = "grpid" - volume_id = "volid" - DEFAULT_VOL_PREFIX = "volumes" - DEFAULT_NS_PREFIX = "fsvolumens_" - - self.mount_b.umount_wait() - self._configure_vc_auth(self.mount_b, "manila") - - #create a volume with default prefix - self._volume_client_python(self.mount_b, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - vc.create_volume(vp, 10, data_isolated=True) - """.format( - group_id=group_id, - volume_id=volume_id, - ))) - - # The dir should be created - self.mount_a.stat(os.path.join(DEFAULT_VOL_PREFIX, group_id, volume_id)) - - #namespace should be set - ns_in_attr = self.mount_a.getfattr(os.path.join(DEFAULT_VOL_PREFIX, group_id, volume_id), "ceph.dir.layout.pool_namespace") - namespace = "{0}{1}".format(DEFAULT_NS_PREFIX, volume_id) - self.assertEqual(namespace, ns_in_attr) - - - def test_lifecycle(self): - """ - General smoke test for create, extend, destroy - """ - - # I'm going to use mount_c later as a guest for mounting the created - # shares - self.mounts[2].umount_wait() - - # I'm going to leave mount_b unmounted and just use it as a handle for - # driving volumeclient. It's a little hacky but we don't have a more - # general concept for librados/libcephfs clients as opposed to full - # blown mounting clients. 
- self.mount_b.umount_wait() - self._configure_vc_auth(self.mount_b, "manila") - - guest_entity = "guest" - group_id = "grpid" - volume_id = "volid" - - volume_prefix = "/myprefix" - namespace_prefix = "mynsprefix_" - - # Create a 100MB volume - volume_size = 100 - mount_path = self._volume_client_python(self.mount_b, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - create_result = vc.create_volume(vp, 1024*1024*{volume_size}) - print create_result['mount_path'] - """.format( - group_id=group_id, - volume_id=volume_id, - volume_size=volume_size - )), volume_prefix, namespace_prefix) - - # The dir should be created - self.mount_a.stat(os.path.join("myprefix", group_id, volume_id)) - - # Authorize and configure credentials for the guest to mount the - # the volume. - self._configure_guest_auth(self.mount_b, self.mounts[2], guest_entity, - mount_path, namespace_prefix) - self.mounts[2].mount(mount_path=mount_path) - - # The kernel client doesn't have the quota-based df behaviour, - # or quotas at all, so only exercise the client behaviour when - # running fuse. - if isinstance(self.mounts[2], FuseMount): - # df should see volume size, same as the quota set on volume's dir - self.assertEqual(self.mounts[2].df()['total'], - volume_size * 1024 * 1024) - self.assertEqual( - self.mount_a.getfattr( - os.path.join(volume_prefix.strip("/"), group_id, volume_id), - "ceph.quota.max_bytes"), - "%s" % (volume_size * 1024 * 1024)) - - # df granularity is 4MB block so have to write at least that much - data_bin_mb = 4 - self.mounts[2].write_n_mb("data.bin", data_bin_mb) - - # Write something outside volume to check this space usage is - # not reported in the volume's DF. - other_bin_mb = 8 - self.mount_a.write_n_mb("other.bin", other_bin_mb) - - # global: df should see all the writes (data + other). 
This is a > - # rather than a == because the global spaced used includes all pools - def check_df(): - used = self.mount_a.df()['used'] - return used >= (other_bin_mb * 1024 * 1024) - - self.wait_until_true(check_df, timeout=30) - - # Hack: do a metadata IO to kick rstats - self.mounts[2].run_shell(["touch", "foo"]) - - # volume: df should see the data_bin_mb consumed from quota, same - # as the rbytes for the volume's dir - self.wait_until_equal( - lambda: self.mounts[2].df()['used'], - data_bin_mb * 1024 * 1024, timeout=60) - self.wait_until_equal( - lambda: self.mount_a.getfattr( - os.path.join(volume_prefix.strip("/"), group_id, volume_id), - "ceph.dir.rbytes"), - "%s" % (data_bin_mb * 1024 * 1024), timeout=60) - - # sync so that file data are persist to rados - self.mounts[2].run_shell(["sync"]) - - # Our data should stay in particular rados namespace - pool_name = self.mount_a.getfattr(os.path.join("myprefix", group_id, volume_id), "ceph.dir.layout.pool") - namespace = "{0}{1}".format(namespace_prefix, volume_id) - ns_in_attr = self.mount_a.getfattr(os.path.join("myprefix", group_id, volume_id), "ceph.dir.layout.pool_namespace") - self.assertEqual(namespace, ns_in_attr) - - objects_in_ns = set(self.fs.rados(["ls"], pool=pool_name, namespace=namespace).split("\n")) - self.assertNotEqual(objects_in_ns, set()) - - # De-authorize the guest - self._volume_client_python(self.mount_b, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - vc.deauthorize(vp, "{guest_entity}") - vc.evict("{guest_entity}") - """.format( - group_id=group_id, - volume_id=volume_id, - guest_entity=guest_entity - )), volume_prefix, namespace_prefix) - - # Once deauthorized, the client should be unable to do any more metadata ops - # The way that the client currently behaves here is to block (it acts like - # it has lost network, because there is nothing to tell it that is messages - # are being dropped because it's identity is gone) - background = self.mounts[2].write_n_mb("rogue.bin", 1, wait=False) - time.sleep(10) # Approximate check for 'stuck' as 'still running after 10s' - self.assertFalse(background.finished) - - # After deauthorisation, the client ID should be gone (this was the only - # volume it was authorised for) - self.assertNotIn("client.{0}".format(guest_entity), [e['entity'] for e in self.auth_list()]) - - # Clean up the dead mount (ceph-fuse's behaviour here is a bit undefined) - self.mounts[2].kill() - self.mounts[2].kill_cleanup() - try: - background.wait() - except CommandFailedError: - # We killed the mount out from under you - pass - - self._volume_client_python(self.mount_b, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - vc.delete_volume(vp) - vc.purge_volume(vp) - """.format( - group_id=group_id, - volume_id=volume_id, - )), volume_prefix, namespace_prefix) - - def test_idempotency(self): - """ - That the volumeclient interface works when calling everything twice - """ - self.mount_b.umount_wait() - self._configure_vc_auth(self.mount_b, "manila") - - guest_entity = "guest" - group_id = "grpid" - volume_id = "volid" - self._volume_client_python(self.mount_b, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - vc.create_volume(vp, 10) - vc.create_volume(vp, 10) - vc.authorize(vp, "{guest_entity}") - vc.authorize(vp, "{guest_entity}") - vc.deauthorize(vp, "{guest_entity}") - vc.deauthorize(vp, "{guest_entity}") - vc.delete_volume(vp) - vc.delete_volume(vp) - vc.purge_volume(vp) - vc.purge_volume(vp) - - vc.create_volume(vp, 10, data_isolated=True) - 
vc.create_volume(vp, 10, data_isolated=True) - vc.authorize(vp, "{guest_entity}") - vc.authorize(vp, "{guest_entity}") - vc.deauthorize(vp, "{guest_entity}") - vc.deauthorize(vp, "{guest_entity}") - vc.evict("{guest_entity}") - vc.evict("{guest_entity}") - vc.delete_volume(vp, data_isolated=True) - vc.delete_volume(vp, data_isolated=True) - vc.purge_volume(vp, data_isolated=True) - vc.purge_volume(vp, data_isolated=True) - """.format( - group_id=group_id, - volume_id=volume_id, - guest_entity=guest_entity - ))) - - def test_data_isolated(self): - """ - That data isolated shares get their own pool - :return: - """ - - # Because the teuthology config template sets mon_max_pg_per_osd to - # 10000 (i.e. it just tries to ignore health warnings), reset it to something - # sane before using volume_client, to avoid creating pools with absurdly large - # numbers of PGs. - self.set_conf("global", "mon max pg per osd", "300") - for mon_daemon_state in self.ctx.daemons.iter_daemons_of_role('mon'): - mon_daemon_state.restart() - - self.mount_b.umount_wait() - self._configure_vc_auth(self.mount_b, "manila") - - # Calculate how many PGs we'll expect the new volume pool to have - osd_map = json.loads(self.fs.mon_manager.raw_cluster_cmd('osd', 'dump', '--format=json-pretty')) - max_per_osd = int(self.fs.get_config('mon_max_pg_per_osd')) - osd_count = len(osd_map['osds']) - max_overall = osd_count * max_per_osd - - existing_pg_count = 0 - for p in osd_map['pools']: - existing_pg_count += p['pg_num'] - - expected_pg_num = (max_overall - existing_pg_count) / 10 - log.info("max_per_osd {0}".format(max_per_osd)) - log.info("osd_count {0}".format(osd_count)) - log.info("max_overall {0}".format(max_overall)) - log.info("existing_pg_count {0}".format(existing_pg_count)) - log.info("expected_pg_num {0}".format(expected_pg_num)) - - pools_a = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools'] - - group_id = "grpid" - volume_id = "volid" - self._volume_client_python(self.mount_b, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - vc.create_volume(vp, 10, data_isolated=True) - """.format( - group_id=group_id, - volume_id=volume_id, - ))) - - pools_b = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools'] - - # Should have created one new pool - new_pools = set(p['pool_name'] for p in pools_b) - set([p['pool_name'] for p in pools_a]) - self.assertEqual(len(new_pools), 1) - - # It should have followed the heuristic for PG count - # (this is an overly strict test condition, so we may want to remove - # it at some point as/when the logic gets fancier) - created_pg_num = self.fs.mon_manager.get_pool_property(list(new_pools)[0], "pg_num") - self.assertEqual(expected_pg_num, created_pg_num) - - def test_15303(self): - """ - Reproducer for #15303 "Client holds incorrect complete flag on dir - after losing caps" (http://tracker.ceph.com/issues/15303) - """ - for m in self.mounts: - m.umount_wait() - - # Create a dir on mount A - self.mount_a.mount() - self.mount_a.run_shell(["mkdir", "parent1"]) - self.mount_a.run_shell(["mkdir", "parent2"]) - self.mount_a.run_shell(["mkdir", "parent1/mydir"]) - - # Put some files in it from mount B - self.mount_b.mount() - self.mount_b.run_shell(["touch", "parent1/mydir/afile"]) - self.mount_b.umount_wait() - - # List the dir's contents on mount A - self.assertListEqual(self.mount_a.ls("parent1/mydir"), - ["afile"]) - - def test_evict_client(self): - """ - That a volume client can be evicted based 
on its auth ID and the volume - path it has mounted. - """ - - if not isinstance(self.mount_a, FuseMount): - self.skipTest("Requires FUSE client to inject client metadata") - - # mounts[1] would be used as handle for driving VolumeClient. mounts[2] - # and mounts[3] would be used as guests to mount the volumes/shares. - - for i in range(1, 4): - self.mounts[i].umount_wait() - - volumeclient_mount = self.mounts[1] - self._configure_vc_auth(volumeclient_mount, "manila") - guest_mounts = (self.mounts[2], self.mounts[3]) - - guest_entity = "guest" - group_id = "grpid" - mount_paths = [] - volume_ids = [] - - # Create two volumes. Authorize 'guest' auth ID to mount the two - # volumes. Mount the two volumes. Write data to the volumes. - for i in range(2): - # Create volume. - volume_ids.append("volid_{0}".format(str(i))) - mount_paths.append( - self._volume_client_python(volumeclient_mount, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - create_result = vc.create_volume(vp, 10 * 1024 * 1024) - print create_result['mount_path'] - """.format( - group_id=group_id, - volume_id=volume_ids[i] - )))) - - # Authorize 'guest' auth ID to mount the volume. - self._configure_guest_auth(volumeclient_mount, guest_mounts[i], - guest_entity, mount_paths[i]) - - # Mount the volume. - guest_mounts[i].mountpoint_dir_name = 'mnt.{id}.{suffix}'.format( - id=guest_entity, suffix=str(i)) - guest_mounts[i].mount(mount_path=mount_paths[i]) - guest_mounts[i].write_n_mb("data.bin", 1) - - - # Evict client, guest_mounts[0], using auth ID 'guest' and has mounted - # one volume. - self._volume_client_python(self.mount_b, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - vc.deauthorize(vp, "{guest_entity}") - vc.evict("{guest_entity}", volume_path=vp) - """.format( - group_id=group_id, - volume_id=volume_ids[0], - guest_entity=guest_entity - ))) - - # Evicted guest client, guest_mounts[0], should not be able to do - # anymore metadata ops. It should start failing all operations - # when it sees that its own address is in the blacklist. - try: - guest_mounts[0].write_n_mb("rogue.bin", 1) - except CommandFailedError: - pass - else: - raise RuntimeError("post-eviction write should have failed!") - - # The blacklisted guest client should now be unmountable - guest_mounts[0].umount_wait() - - # Guest client, guest_mounts[1], using the same auth ID 'guest', but - # has mounted the other volume, should be able to use its volume - # unaffected. - guest_mounts[1].write_n_mb("data.bin.1", 1) - - # Cleanup. - for i in range(2): - self._volume_client_python(volumeclient_mount, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - vc.deauthorize(vp, "{guest_entity}") - vc.delete_volume(vp) - vc.purge_volume(vp) - """.format( - group_id=group_id, - volume_id=volume_ids[i], - guest_entity=guest_entity - ))) - - - def test_purge(self): - """ - Reproducer for #15266, exception trying to purge volumes that - contain non-ascii filenames. - - Additionally test any other purge corner cases here. - """ - # I'm going to leave mount_b unmounted and just use it as a handle for - # driving volumeclient. It's a little hacky but we don't have a more - # general concept for librados/libcephfs clients as opposed to full - # blown mounting clients. 
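[editor's note] A note on the eviction path exercised in test_evict_client above: evict() can be scoped to a single volume path, so the same auth ID keeps working on any other volume it is still authorized for. A sketch of the two calls, reusing a connected vc handle as in the earlier sketch (names are placeholders):

    vp = VolumePath("grpid", "volid_0")
    # Revoke the guest's caps for this one volume...
    vc.deauthorize(vp, "guest")
    # ...then evict any client session using that auth ID on this volume path only.
    vc.evict("guest", volume_path=vp)
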
- self.mount_b.umount_wait() - self._configure_vc_auth(self.mount_b, "manila") - - group_id = "grpid" - # Use a unicode volume ID (like Manila), to reproduce #15266 - volume_id = u"volid" - - # Create - mount_path = self._volume_client_python(self.mount_b, dedent(""" - vp = VolumePath("{group_id}", u"{volume_id}") - create_result = vc.create_volume(vp, 10) - print create_result['mount_path'] - """.format( - group_id=group_id, - volume_id=volume_id - ))) - - # Strip leading "/" - mount_path = mount_path[1:] - - # A file with non-ascii characters - self.mount_a.run_shell(["touch", os.path.join(mount_path, u"b\u00F6b")]) - - # A file with no permissions to do anything - self.mount_a.run_shell(["touch", os.path.join(mount_path, "noperms")]) - self.mount_a.run_shell(["chmod", "0000", os.path.join(mount_path, "noperms")]) - - self._volume_client_python(self.mount_b, dedent(""" - vp = VolumePath("{group_id}", u"{volume_id}") - vc.delete_volume(vp) - vc.purge_volume(vp) - """.format( - group_id=group_id, - volume_id=volume_id - ))) - - # Check it's really gone - self.assertEqual(self.mount_a.ls("volumes/_deleting"), []) - self.assertEqual(self.mount_a.ls("volumes/"), ["_deleting", group_id]) - - def test_readonly_authorization(self): - """ - That guest clients can be restricted to read-only mounts of volumes. - """ - - volumeclient_mount = self.mounts[1] - guest_mount = self.mounts[2] - volumeclient_mount.umount_wait() - guest_mount.umount_wait() - - # Configure volumeclient_mount as the handle for driving volumeclient. - self._configure_vc_auth(volumeclient_mount, "manila") - - guest_entity = "guest" - group_id = "grpid" - volume_id = "volid" - - # Create a volume. - mount_path = self._volume_client_python(volumeclient_mount, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - create_result = vc.create_volume(vp, 1024*1024*10) - print create_result['mount_path'] - """.format( - group_id=group_id, - volume_id=volume_id, - ))) - - # Authorize and configure credentials for the guest to mount the - # the volume with read-write access. - self._configure_guest_auth(volumeclient_mount, guest_mount, guest_entity, - mount_path, readonly=False) - - # Mount the volume, and write to it. - guest_mount.mount(mount_path=mount_path) - guest_mount.write_n_mb("data.bin", 1) - - # Change the guest auth ID's authorization to read-only mount access. - self._volume_client_python(volumeclient_mount, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - vc.deauthorize(vp, "{guest_entity}") - """.format( - group_id=group_id, - volume_id=volume_id, - guest_entity=guest_entity - ))) - self._configure_guest_auth(volumeclient_mount, guest_mount, guest_entity, - mount_path, readonly=True) - - # The effect of the change in access level to read-only is not - # immediate. The guest sees the change only after a remount of - # the volume. - guest_mount.umount_wait() - guest_mount.mount(mount_path=mount_path) - - # Read existing content of the volume. - self.assertListEqual(guest_mount.ls(guest_mount.mountpoint), ["data.bin"]) - # Cannot write into read-only volume. - with self.assertRaises(CommandFailedError): - guest_mount.write_n_mb("rogue.bin", 1) - - def test_get_authorized_ids(self): - """ - That for a volume, the authorized IDs and their access levels - can be obtained using CephFSVolumeClient's get_authorized_ids(). - """ - volumeclient_mount = self.mounts[1] - volumeclient_mount.umount_wait() - - # Configure volumeclient_mount as the handle for driving volumeclient. 
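[editor's note] test_purge above relies on the split between delete_volume() and purge_volume(): delete_volume() appears to only unlink the volume into volumes/_deleting, purge_volume() removes the data, and both are safe to call twice (see test_idempotency). In sketch form, with a connected vc handle and placeholder names:

    vp = VolumePath("grpid", u"volid")
    vc.delete_volume(vp)   # unlink the volume (it ends up under volumes/_deleting)
    vc.purge_volume(vp)    # actually remove the data; both calls are idempotent
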
- self._configure_vc_auth(volumeclient_mount, "manila") - - group_id = "grpid" - volume_id = "volid" - guest_entity_1 = "guest1" - guest_entity_2 = "guest2" - - log.info("print group ID: {0}".format(group_id)) - - # Create a volume. - auths = self._volume_client_python(volumeclient_mount, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - vc.create_volume(vp, 1024*1024*10) - auths = vc.get_authorized_ids(vp) - print auths - """.format( - group_id=group_id, - volume_id=volume_id, - ))) - # Check the list of authorized IDs for the volume. - expected_result = None - self.assertEqual(str(expected_result), auths) - - # Allow two auth IDs access to the volume. - auths = self._volume_client_python(volumeclient_mount, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - vc.authorize(vp, "{guest_entity_1}", readonly=False) - vc.authorize(vp, "{guest_entity_2}", readonly=True) - auths = vc.get_authorized_ids(vp) - print auths - """.format( - group_id=group_id, - volume_id=volume_id, - guest_entity_1=guest_entity_1, - guest_entity_2=guest_entity_2, - ))) - # Check the list of authorized IDs and their access levels. - expected_result = [(u'guest1', u'rw'), (u'guest2', u'r')] - self.assertItemsEqual(str(expected_result), auths) - - # Disallow both the auth IDs' access to the volume. - auths = self._volume_client_python(volumeclient_mount, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - vc.deauthorize(vp, "{guest_entity_1}") - vc.deauthorize(vp, "{guest_entity_2}") - auths = vc.get_authorized_ids(vp) - print auths - """.format( - group_id=group_id, - volume_id=volume_id, - guest_entity_1=guest_entity_1, - guest_entity_2=guest_entity_2, - ))) - # Check the list of authorized IDs for the volume. - expected_result = None - self.assertItemsEqual(str(expected_result), auths) - - def test_multitenant_volumes(self): - """ - That volume access can be restricted to a tenant. - - That metadata used to enforce tenant isolation of - volumes is stored as a two-way mapping between auth - IDs and volumes that they're authorized to access. - """ - volumeclient_mount = self.mounts[1] - volumeclient_mount.umount_wait() - - # Configure volumeclient_mount as the handle for driving volumeclient. - self._configure_vc_auth(volumeclient_mount, "manila") - - group_id = "groupid" - volume_id = "volumeid" - - # Guest clients belonging to different tenants, but using the same - # auth ID. - auth_id = "guest" - guestclient_1 = { - "auth_id": auth_id, - "tenant_id": "tenant1", - } - guestclient_2 = { - "auth_id": auth_id, - "tenant_id": "tenant2", - } - - # Create a volume. - self._volume_client_python(volumeclient_mount, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - vc.create_volume(vp, 1024*1024*10) - """.format( - group_id=group_id, - volume_id=volume_id, - ))) - - # Check that volume metadata file is created on volume creation. - vol_metadata_filename = "_{0}:{1}.meta".format(group_id, volume_id) - self.assertIn(vol_metadata_filename, self.mounts[0].ls("volumes")) - - # Authorize 'guestclient_1', using auth ID 'guest' and belonging to - # 'tenant1', with 'rw' access to the volume. 
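[editor's note] For reference, the access-level bookkeeping that test_get_authorized_ids above checks looks like this from the caller's side. A sketch with placeholder names; get_authorized_ids() returns None when no ID is authorized:

    vp = VolumePath("grpid", "volid")
    vc.create_volume(vp, 10 * 1024 * 1024)
    vc.authorize(vp, "guest1", readonly=False)
    vc.authorize(vp, "guest2", readonly=True)
    print(vc.get_authorized_ids(vp))   # e.g. [(u'guest1', u'rw'), (u'guest2', u'r')]
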
- self._volume_client_python(volumeclient_mount, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}") - """.format( - group_id=group_id, - volume_id=volume_id, - auth_id=guestclient_1["auth_id"], - tenant_id=guestclient_1["tenant_id"] - ))) - - # Check that auth metadata file for auth ID 'guest', is - # created on authorizing 'guest' access to the volume. - auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"]) - self.assertIn(auth_metadata_filename, self.mounts[0].ls("volumes")) - - # Verify that the auth metadata file stores the tenant ID that the - # auth ID belongs to, the auth ID's authorized access levels - # for different volumes, versioning details, etc. - expected_auth_metadata = { - u"version": 2, - u"compat_version": 1, - u"dirty": False, - u"tenant_id": u"tenant1", - u"volumes": { - u"groupid/volumeid": { - u"dirty": False, - u"access_level": u"rw", - } - } - } - - auth_metadata = self._volume_client_python(volumeclient_mount, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - auth_metadata = vc._auth_metadata_get("{auth_id}") - print auth_metadata - """.format( - group_id=group_id, - volume_id=volume_id, - auth_id=guestclient_1["auth_id"], - ))) - - self.assertItemsEqual(str(expected_auth_metadata), auth_metadata) - - # Verify that the volume metadata file stores info about auth IDs - # and their access levels to the volume, versioning details, etc. - expected_vol_metadata = { - u"version": 2, - u"compat_version": 1, - u"auths": { - u"guest": { - u"dirty": False, - u"access_level": u"rw" - } - } - } - - vol_metadata = self._volume_client_python(volumeclient_mount, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - volume_metadata = vc._volume_metadata_get(vp) - print volume_metadata - """.format( - group_id=group_id, - volume_id=volume_id, - ))) - self.assertItemsEqual(str(expected_vol_metadata), vol_metadata) - - # Cannot authorize 'guestclient_2' to access the volume. - # It uses auth ID 'guest', which has already been used by a - # 'guestclient_1' belonging to an another tenant for accessing - # the volume. - with self.assertRaises(CommandFailedError): - self._volume_client_python(volumeclient_mount, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}") - """.format( - group_id=group_id, - volume_id=volume_id, - auth_id=guestclient_2["auth_id"], - tenant_id=guestclient_2["tenant_id"] - ))) - - # Check that auth metadata file is cleaned up on removing - # auth ID's only access to a volume. - self._volume_client_python(volumeclient_mount, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - vc.deauthorize(vp, "{guest_entity}") - """.format( - group_id=group_id, - volume_id=volume_id, - guest_entity=guestclient_1["auth_id"] - ))) - - self.assertNotIn(auth_metadata_filename, self.mounts[0].ls("volumes")) - - # Check that volume metadata file is cleaned up on volume deletion. - self._volume_client_python(volumeclient_mount, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - vc.delete_volume(vp) - """.format( - group_id=group_id, - volume_id=volume_id, - ))) - self.assertNotIn(vol_metadata_filename, self.mounts[0].ls("volumes")) - - def test_recover_metadata(self): - """ - That volume client can recover from partial auth updates using - metadata files, which store auth info and its update status info. 
- """ - volumeclient_mount = self.mounts[1] - volumeclient_mount.umount_wait() - - # Configure volumeclient_mount as the handle for driving volumeclient. - self._configure_vc_auth(volumeclient_mount, "manila") - - group_id = "groupid" - volume_id = "volumeid" - - guestclient = { - "auth_id": "guest", - "tenant_id": "tenant", - } - - # Create a volume. - self._volume_client_python(volumeclient_mount, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - vc.create_volume(vp, 1024*1024*10) - """.format( - group_id=group_id, - volume_id=volume_id, - ))) - - # Authorize 'guestclient' access to the volume. - self._volume_client_python(volumeclient_mount, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}") - """.format( - group_id=group_id, - volume_id=volume_id, - auth_id=guestclient["auth_id"], - tenant_id=guestclient["tenant_id"] - ))) - - # Check that auth metadata file for auth ID 'guest' is created. - auth_metadata_filename = "${0}.meta".format(guestclient["auth_id"]) - self.assertIn(auth_metadata_filename, self.mounts[0].ls("volumes")) - - # Induce partial auth update state by modifying the auth metadata file, - # and then run recovery procedure. - self._volume_client_python(volumeclient_mount, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - auth_metadata = vc._auth_metadata_get("{auth_id}") - auth_metadata['dirty'] = True - vc._auth_metadata_set("{auth_id}", auth_metadata) - vc.recover() - """.format( - group_id=group_id, - volume_id=volume_id, - auth_id=guestclient["auth_id"], - ))) - - def test_put_object(self): - vc_mount = self.mounts[1] - vc_mount.umount_wait() - self._configure_vc_auth(vc_mount, "manila") - - obj_data = 'test data' - obj_name = 'test_vc_obj_1' - pool_name = self.fs.get_data_pool_names()[0] - - self._volume_client_python(vc_mount, dedent(""" - vc.put_object("{pool_name}", "{obj_name}", b"{obj_data}") - """.format( - pool_name = pool_name, - obj_name = obj_name, - obj_data = obj_data - ))) - - read_data = self.fs.rados(['get', obj_name, '-'], pool=pool_name) - self.assertEqual(obj_data, read_data) - - def test_get_object(self): - vc_mount = self.mounts[1] - vc_mount.umount_wait() - self._configure_vc_auth(vc_mount, "manila") - - obj_data = 'test_data' - obj_name = 'test_vc_ob_2' - pool_name = self.fs.get_data_pool_names()[0] - - self.fs.rados(['put', obj_name, '-'], pool=pool_name, stdin_data=obj_data) - - self._volume_client_python(vc_mount, dedent(""" - data_read = vc.get_object("{pool_name}", "{obj_name}") - assert data_read == b"{obj_data}" - """.format( - pool_name = pool_name, - obj_name = obj_name, - obj_data = obj_data - ))) - - def test_delete_object(self): - vc_mount = self.mounts[1] - vc_mount.umount_wait() - self._configure_vc_auth(vc_mount, "manila") - - obj_data = 'test data' - obj_name = 'test_vc_obj_3' - pool_name = self.fs.get_data_pool_names()[0] - - self.fs.rados(['put', obj_name, '-'], pool=pool_name, stdin_data=obj_data) - - self._volume_client_python(vc_mount, dedent(""" - data_read = vc.delete_object("{pool_name}", "{obj_name}") - """.format( - pool_name = pool_name, - obj_name = obj_name, - ))) - - with self.assertRaises(CommandFailedError): - self.fs.rados(['stat', obj_name], pool=pool_name) - - # Check idempotency -- no error raised trying to delete non-existent - # object - self._volume_client_python(vc_mount, dedent(""" - data_read = vc.delete_object("{pool_name}", "{obj_name}") - """.format( - pool_name = pool_name, - obj_name = obj_name, - ))) - - def 
test_21501(self): - """ - Reproducer for #21501 "ceph_volume_client: sets invalid caps for - existing IDs with no caps" (http://tracker.ceph.com/issues/21501) - """ - - vc_mount = self.mounts[1] - vc_mount.umount_wait() - - # Configure vc_mount as the handle for driving volumeclient - self._configure_vc_auth(vc_mount, "manila") - - # Create a volume - group_id = "grpid" - volume_id = "volid" - mount_path = self._volume_client_python(vc_mount, dedent(""" - vp = VolumePath("{group_id}", "{volume_id}") - create_result = vc.create_volume(vp, 1024*1024*10) - print create_result['mount_path'] - """.format( - group_id=group_id, - volume_id=volume_id - ))) - - # Create an auth ID with no caps - guest_id = '21501' - self.fs.mon_manager.raw_cluster_cmd_result( - 'auth', 'get-or-create', 'client.{0}'.format(guest_id)) - - guest_mount = self.mounts[2] - guest_mount.umount_wait() - - # Set auth caps for the auth ID using the volumeclient - self._configure_guest_auth(vc_mount, guest_mount, guest_id, mount_path) - - # Mount the volume in the guest using the auth ID to assert that the - # auth caps are valid - guest_mount.mount(mount_path=mount_path) diff --git a/src/ceph/qa/tasks/cephfs_test_runner.py b/src/ceph/qa/tasks/cephfs_test_runner.py deleted file mode 100644 index d57e85d..0000000 --- a/src/ceph/qa/tasks/cephfs_test_runner.py +++ /dev/null @@ -1,209 +0,0 @@ -import contextlib -import logging -import os -import unittest -from unittest import suite, loader, case -from teuthology.task import interactive -from teuthology import misc -from tasks.cephfs.filesystem import Filesystem, MDSCluster, CephCluster -from tasks.mgr.mgr_test_case import MgrCluster - -log = logging.getLogger(__name__) - - -class DecoratingLoader(loader.TestLoader): - """ - A specialization of TestLoader that tags some extra attributes - onto test classes as they are loaded. - """ - def __init__(self, params): - self._params = params - super(DecoratingLoader, self).__init__() - - def _apply_params(self, obj): - for k, v in self._params.items(): - setattr(obj, k, v) - - def loadTestsFromTestCase(self, testCaseClass): - self._apply_params(testCaseClass) - return super(DecoratingLoader, self).loadTestsFromTestCase(testCaseClass) - - def loadTestsFromName(self, name, module=None): - result = super(DecoratingLoader, self).loadTestsFromName(name, module) - - # Special case for when we were called with the name of a method, we get - # a suite with one TestCase - tests_in_result = list(result) - if len(tests_in_result) == 1 and isinstance(tests_in_result[0], case.TestCase): - self._apply_params(tests_in_result[0]) - - return result - - -class LogStream(object): - def __init__(self): - self.buffer = "" - - def write(self, data): - self.buffer += data - if "\n" in self.buffer: - lines = self.buffer.split("\n") - for line in lines[:-1]: - log.info(line) - self.buffer = lines[-1] - - def flush(self): - pass - - -class InteractiveFailureResult(unittest.TextTestResult): - """ - Specialization that implements interactive-on-error style - behavior. 
- """ - ctx = None - - def addFailure(self, test, err): - log.error(self._exc_info_to_string(err, test)) - log.error("Failure in test '{0}', going interactive".format( - self.getDescription(test) - )) - interactive.task(ctx=self.ctx, config=None) - - def addError(self, test, err): - log.error(self._exc_info_to_string(err, test)) - log.error("Error in test '{0}', going interactive".format( - self.getDescription(test) - )) - interactive.task(ctx=self.ctx, config=None) - - -@contextlib.contextmanager -def task(ctx, config): - """ - Run the CephFS test cases. - - Run everything in tasks/cephfs/test_*.py: - - :: - - tasks: - - install: - - ceph: - - ceph-fuse: - - cephfs_test_runner: - - `modules` argument allows running only some specific modules: - - :: - - tasks: - ... - - cephfs_test_runner: - modules: - - tasks.cephfs.test_sessionmap - - tasks.cephfs.test_auto_repair - - By default, any cases that can't be run on the current cluster configuration - will generate a failure. When the optional `fail_on_skip` argument is set - to false, any tests that can't be run on the current configuration will - simply be skipped: - - :: - tasks: - ... - - cephfs_test_runner: - fail_on_skip: false - - """ - - ceph_cluster = CephCluster(ctx) - - if len(list(misc.all_roles_of_type(ctx.cluster, 'mds'))): - mds_cluster = MDSCluster(ctx) - fs = Filesystem(ctx) - else: - mds_cluster = None - fs = None - - if len(list(misc.all_roles_of_type(ctx.cluster, 'mgr'))): - mgr_cluster = MgrCluster(ctx) - else: - mgr_cluster = None - - # Mount objects, sorted by ID - if hasattr(ctx, 'mounts'): - mounts = [v for k, v in sorted(ctx.mounts.items(), lambda a, b: cmp(a[0], b[0]))] - else: - # The test configuration has a filesystem but no fuse/kclient mounts - mounts = [] - - decorating_loader = DecoratingLoader({ - "ctx": ctx, - "mounts": mounts, - "fs": fs, - "ceph_cluster": ceph_cluster, - "mds_cluster": mds_cluster, - "mgr_cluster": mgr_cluster, - }) - - fail_on_skip = config.get('fail_on_skip', True) - - # Put useful things onto ctx for interactive debugging - ctx.fs = fs - ctx.mds_cluster = mds_cluster - ctx.mgr_cluster = mgr_cluster - - # Depending on config, either load specific modules, or scan for moduless - if config and 'modules' in config and config['modules']: - module_suites = [] - for mod_name in config['modules']: - # Test names like cephfs.test_auto_repair - module_suites.append(decorating_loader.loadTestsFromName(mod_name)) - overall_suite = suite.TestSuite(module_suites) - else: - # Default, run all tests - overall_suite = decorating_loader.discover( - os.path.join( - os.path.dirname(os.path.abspath(__file__)), - "cephfs/" - ) - ) - - if ctx.config.get("interactive-on-error", False): - InteractiveFailureResult.ctx = ctx - result_class = InteractiveFailureResult - else: - result_class = unittest.TextTestResult - - class LoggingResult(result_class): - def startTest(self, test): - log.info("Starting test: {0}".format(self.getDescription(test))) - return super(LoggingResult, self).startTest(test) - - def addSkip(self, test, reason): - if fail_on_skip: - # Don't just call addFailure because that requires a traceback - self.failures.append((test, reason)) - else: - super(LoggingResult, self).addSkip(test, reason) - - # Execute! 
- result = unittest.TextTestRunner( - stream=LogStream(), - resultclass=LoggingResult, - verbosity=2, - failfast=True).run(overall_suite) - - if not result.wasSuccessful(): - result.printErrors() # duplicate output at end for convenience - - bad_tests = [] - for test, error in result.errors: - bad_tests.append(str(test)) - for test, failure in result.failures: - bad_tests.append(str(test)) - - raise RuntimeError("Test failure: {0}".format(", ".join(bad_tests))) - - yield diff --git a/src/ceph/qa/tasks/check_counter.py b/src/ceph/qa/tasks/check_counter.py deleted file mode 100644 index a3d84e0..0000000 --- a/src/ceph/qa/tasks/check_counter.py +++ /dev/null @@ -1,96 +0,0 @@ - -import logging -import json - -from teuthology.task import Task -from teuthology import misc -import ceph_manager - -log = logging.getLogger(__name__) - - -class CheckCounter(Task): - """ - Use this task to validate that some daemon perf counters were - incremented by the nested tasks. - - Config: - 'cluster_name': optional, specify which cluster - 'target': dictionary of daemon type to list of performance counters. - 'dry_run': just log the value of the counters, don't fail if they - aren't nonzero. - - Success condition is that for all of the named counters, at least - one of the daemons of that type has the counter nonzero. - - Example to check cephfs dirfrag splits are happening: - - install: - - ceph: - - ceph-fuse: - - check-counter: - counters: - mds: - - "mds.dir_split" - - workunit: ... - """ - - def start(self): - log.info("START") - - def end(self): - cluster_name = self.config.get('cluster_name', None) - dry_run = self.config.get('dry_run', False) - targets = self.config.get('counters', {}) - - if cluster_name is None: - cluster_name = self.ctx.managers.keys()[0] - - for daemon_type, counters in targets.items(): - # List of 'a', 'b', 'c'... - daemon_ids = list(misc.all_roles_of_type(self.ctx.cluster, daemon_type)) - daemons = dict([(daemon_id, - self.ctx.daemons.get_daemon(daemon_type, daemon_id)) - for daemon_id in daemon_ids]) - - seen = set() - - for daemon_id, daemon in daemons.items(): - if not daemon.running(): - log.info("Ignoring daemon {0}, it isn't running".format(daemon_id)) - continue - else: - log.debug("Getting stats from {0}".format(daemon_id)) - - manager = self.ctx.managers[cluster_name] - proc = manager.admin_socket(daemon_type, daemon_id, ["perf", "dump"]) - response_data = proc.stdout.getvalue().strip() - if response_data: - perf_dump = json.loads(response_data) - else: - log.warning("No admin socket response from {0}, skipping".format(daemon_id)) - continue - - for counter in counters: - subsys, counter_id = counter.split(".") - if subsys not in perf_dump or counter_id not in perf_dump[subsys]: - log.warning("Counter '{0}' not found on daemon {1}.{2}".format( - counter, daemon_type, daemon_id)) - continue - value = perf_dump[subsys][counter_id] - - log.info("Daemon {0}.{1} {2}={3}".format( - daemon_type, daemon_id, counter, value - )) - - if value > 0: - seen.add(counter) - - if not dry_run: - unseen = set(counters) - set(seen) - if unseen: - raise RuntimeError("The following counters failed to be set " - "on {0} daemons: {1}".format( - daemon_type, unseen - )) - -task = CheckCounter diff --git a/src/ceph/qa/tasks/cifs_mount.py b/src/ceph/qa/tasks/cifs_mount.py deleted file mode 100644 index b282b0b..0000000 --- a/src/ceph/qa/tasks/cifs_mount.py +++ /dev/null @@ -1,137 +0,0 @@ -""" -Mount cifs clients. Unmount when finished. 
-""" -import contextlib -import logging -import os - -from teuthology import misc as teuthology -from teuthology.orchestra import run - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - Mount/unmount a cifs client. - - The config is optional and defaults to mounting on all clients. If - a config is given, it is expected to be a list of clients to do - this operation on. - - Example that starts smbd and mounts cifs on all nodes:: - - tasks: - - ceph: - - samba: - - cifs-mount: - - interactive: - - Example that splits smbd and cifs: - - tasks: - - ceph: - - samba: [samba.0] - - cifs-mount: [client.0] - - ceph-fuse: [client.1] - - interactive: - - Example that specifies the share name: - - tasks: - - ceph: - - ceph-fuse: - - samba: - samba.0: - cephfuse: "{testdir}/mnt.0" - - cifs-mount: - client.0: - share: cephfuse - - :param ctx: Context - :param config: Configuration - """ - log.info('Mounting cifs clients...') - - if config is None: - config = dict(('client.{id}'.format(id=id_), None) - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')) - elif isinstance(config, list): - config = dict((name, None) for name in config) - - clients = list(teuthology.get_clients(ctx=ctx, roles=config.keys())) - - from .samba import get_sambas - samba_roles = ['samba.{id_}'.format(id_=id_) for id_ in teuthology.all_roles_of_type(ctx.cluster, 'samba')] - sambas = list(get_sambas(ctx=ctx, roles=samba_roles)) - (ip, _) = sambas[0][1].ssh.get_transport().getpeername() - log.info('samba ip: {ip}'.format(ip=ip)) - - for id_, remote in clients: - mnt = os.path.join(teuthology.get_testdir(ctx), 'mnt.{id}'.format(id=id_)) - log.info('Mounting cifs client.{id} at {remote} {mnt}...'.format( - id=id_, remote=remote,mnt=mnt)) - - remote.run( - args=[ - 'mkdir', - '--', - mnt, - ], - ) - - rolestr = 'client.{id_}'.format(id_=id_) - unc = "ceph" - log.info("config: {c}".format(c=config)) - if config[rolestr] is not None and 'share' in config[rolestr]: - unc = config[rolestr]['share'] - - remote.run( - args=[ - 'sudo', - 'mount', - '-t', - 'cifs', - '//{sambaip}/{unc}'.format(sambaip=ip, unc=unc), - '-o', - 'username=ubuntu,password=ubuntu', - mnt, - ], - ) - - remote.run( - args=[ - 'sudo', - 'chown', - 'ubuntu:ubuntu', - '{m}/'.format(m=mnt), - ], - ) - - try: - yield - finally: - log.info('Unmounting cifs clients...') - for id_, remote in clients: - remote.run( - args=[ - 'sudo', - 'umount', - mnt, - ], - ) - for id_, remote in clients: - while True: - try: - remote.run( - args=[ - 'rmdir', '--', mnt, - run.Raw('2>&1'), - run.Raw('|'), - 'grep', 'Device or resource busy', - ], - ) - import time - time.sleep(1) - except Exception: - break diff --git a/src/ceph/qa/tasks/cram.py b/src/ceph/qa/tasks/cram.py deleted file mode 100644 index 02c6667..0000000 --- a/src/ceph/qa/tasks/cram.py +++ /dev/null @@ -1,155 +0,0 @@ -""" -Cram tests -""" -import logging -import os - -from teuthology import misc as teuthology -from teuthology.parallel import parallel -from teuthology.orchestra import run -from teuthology.config import config as teuth_config - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Run all cram tests from the specified urls on the specified - clients. Each client runs tests in parallel. - - Limitations: - Tests must have a .t suffix. Tests with duplicate names will - overwrite each other, so only the last one will run. 
- - For example:: - - tasks: - - ceph: - - cram: - clients: - client.0: - - http://download.ceph.com/qa/test.t - - http://download.ceph.com/qa/test2.t] - client.1: [http://download.ceph.com/qa/test.t] - branch: foo - - You can also run a list of cram tests on all clients:: - - tasks: - - ceph: - - cram: - clients: - all: [http://download.ceph.com/qa/test.t] - - :param ctx: Context - :param config: Configuration - """ - assert isinstance(config, dict) - assert 'clients' in config and isinstance(config['clients'], dict), \ - 'configuration must contain a dictionary of clients' - - clients = teuthology.replace_all_with_clients(ctx.cluster, - config['clients']) - testdir = teuthology.get_testdir(ctx) - - overrides = ctx.config.get('overrides', {}) - teuthology.deep_merge(config, overrides.get('workunit', {})) - - refspec = config.get('branch') - if refspec is None: - refspec = config.get('tag') - if refspec is None: - refspec = config.get('sha1') - if refspec is None: - refspec = 'HEAD' - - # hack: the git_url is always ceph-ci or ceph - git_url = teuth_config.get_ceph_git_url() - repo_name = 'ceph.git' - if git_url.count('ceph-ci'): - repo_name = 'ceph-ci.git' - - try: - for client, tests in clients.iteritems(): - (remote,) = ctx.cluster.only(client).remotes.iterkeys() - client_dir = '{tdir}/archive/cram.{role}'.format(tdir=testdir, role=client) - remote.run( - args=[ - 'mkdir', '--', client_dir, - run.Raw('&&'), - 'virtualenv', '{tdir}/virtualenv'.format(tdir=testdir), - run.Raw('&&'), - '{tdir}/virtualenv/bin/pip'.format(tdir=testdir), - 'install', 'cram==0.6', - ], - ) - for test in tests: - url = test.format(repo=repo_name, branch=refspec) - log.info('fetching test %s for %s', url, client) - assert test.endswith('.t'), 'tests must end in .t' - remote.run( - args=[ - 'wget', '-nc', '-nv', '-P', client_dir, '--', url, - ], - ) - - with parallel() as p: - for role in clients.iterkeys(): - p.spawn(_run_tests, ctx, role) - finally: - for client, tests in clients.iteritems(): - (remote,) = ctx.cluster.only(client).remotes.iterkeys() - client_dir = '{tdir}/archive/cram.{role}'.format(tdir=testdir, role=client) - test_files = set([test.rsplit('/', 1)[1] for test in tests]) - - # remove test files unless they failed - for test_file in test_files: - abs_file = os.path.join(client_dir, test_file) - remote.run( - args=[ - 'test', '-f', abs_file + '.err', - run.Raw('||'), - 'rm', '-f', '--', abs_file, - ], - ) - - # ignore failure since more than one client may - # be run on a host, and the client dir should be - # non-empty if the test failed - remote.run( - args=[ - 'rm', '-rf', '--', - '{tdir}/virtualenv'.format(tdir=testdir), - run.Raw(';'), - 'rmdir', '--ignore-fail-on-non-empty', client_dir, - ], - ) - -def _run_tests(ctx, role): - """ - For each role, check to make sure it's a client, then run the cram on that client - - :param ctx: Context - :param role: Roles - """ - assert isinstance(role, basestring) - PREFIX = 'client.' 
- assert role.startswith(PREFIX) - id_ = role[len(PREFIX):] - (remote,) = ctx.cluster.only(role).remotes.iterkeys() - ceph_ref = ctx.summary.get('ceph-sha1', 'master') - - testdir = teuthology.get_testdir(ctx) - log.info('Running tests for %s...', role) - remote.run( - args=[ - run.Raw('CEPH_REF={ref}'.format(ref=ceph_ref)), - run.Raw('CEPH_ID="{id}"'.format(id=id_)), - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - '{tdir}/virtualenv/bin/cram'.format(tdir=testdir), - '-v', '--', - run.Raw('{tdir}/archive/cram.{role}/*.t'.format(tdir=testdir, role=role)), - ], - logger=log.getChild(role), - ) diff --git a/src/ceph/qa/tasks/create_verify_lfn_objects.py b/src/ceph/qa/tasks/create_verify_lfn_objects.py deleted file mode 100644 index 01ab1a3..0000000 --- a/src/ceph/qa/tasks/create_verify_lfn_objects.py +++ /dev/null @@ -1,83 +0,0 @@ -""" -Rados modle-based integration tests -""" -import contextlib -import logging - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - For each combination of namespace and name_length, create - <num_objects> objects with name length <name_length> - on entry. On exit, verify that the objects still exist, can - be deleted, and then don't exist. - - Usage:: - - create_verify_lfn_objects.py: - pool: <pool_name> default: 'data' - prefix: <prefix> default: '' - namespace: [<namespace>] default: [''] - num_objects: [<num_objects>] default: 10 - name_length: [<name_length>] default: [400] - """ - pool = config.get('pool', 'data') - num_objects = config.get('num_objects', 10) - name_length = config.get('name_length', [400]) - namespace = config.get('namespace', [None]) - prefix = config.get('prefix', None) - manager = ctx.managers['ceph'] - - objects = [] - for l in name_length: - for ns in namespace: - def object_name(i): - nslength = 0 - if namespace is not '': - nslength = len(namespace) - numstr = str(i) - fillerlen = l - nslength - len(prefix) - len(numstr) - assert fillerlen >= 0 - return prefix + ('a'*fillerlen) + numstr - objects += [(ns, object_name(i)) for i in range(num_objects)] - - for ns, name in objects: - err = manager.do_put( - pool, - name, - '/etc/resolv.conf', - namespace=ns) - log.info("err is " + str(err)) - assert err == 0 - - try: - yield - finally: - log.info('ceph_verify_lfn_objects verifying...') - for ns, name in objects: - err = manager.do_get( - pool, - name, - namespace=ns) - log.info("err is " + str(err)) - assert err == 0 - - log.info('ceph_verify_lfn_objects deleting...') - for ns, name in objects: - err = manager.do_rm( - pool, - name, - namespace=ns) - log.info("err is " + str(err)) - assert err == 0 - - log.info('ceph_verify_lfn_objects verifying absent...') - for ns, name in objects: - err = manager.do_get( - pool, - name, - namespace=ns) - log.info("err is " + str(err)) - assert err != 0 diff --git a/src/ceph/qa/tasks/devstack.py b/src/ceph/qa/tasks/devstack.py deleted file mode 100644 index 943a9ff..0000000 --- a/src/ceph/qa/tasks/devstack.py +++ /dev/null @@ -1,382 +0,0 @@ -#!/usr/bin/env python -import contextlib -import logging -from cStringIO import StringIO -import textwrap -from configparser import ConfigParser -import time - -from teuthology.orchestra import run -from teuthology import misc -from teuthology.contextutil import nested - -log = logging.getLogger(__name__) - -DEVSTACK_GIT_REPO = 'https://github.com/openstack-dev/devstack.git' -DS_STABLE_BRANCHES = ("havana", "grizzly") - -is_devstack_node = lambda role: 
role.startswith('devstack') -is_osd_node = lambda role: role.startswith('osd') - - -@contextlib.contextmanager -def task(ctx, config): - if config is None: - config = {} - if not isinstance(config, dict): - raise TypeError("config must be a dict") - with nested(lambda: install(ctx=ctx, config=config), - lambda: smoke(ctx=ctx, config=config), - ): - yield - - -@contextlib.contextmanager -def install(ctx, config): - """ - Install OpenStack DevStack and configure it to use a Ceph cluster for - Glance and Cinder. - - Requires one node with a role 'devstack' - - Since devstack runs rampant on the system it's used on, typically you will - want to reprovision that machine after using devstack on it. - - Also, the default 2GB of RAM that is given to vps nodes is insufficient. I - recommend 4GB. Downburst can be instructed to give 4GB to a vps node by - adding this to the yaml: - - downburst: - ram: 4G - - This was created using documentation found here: - https://github.com/openstack-dev/devstack/blob/master/README.md - http://docs.ceph.com/docs/master/rbd/rbd-openstack/ - """ - if config is None: - config = {} - if not isinstance(config, dict): - raise TypeError("config must be a dict") - - devstack_node = ctx.cluster.only(is_devstack_node).remotes.keys()[0] - an_osd_node = ctx.cluster.only(is_osd_node).remotes.keys()[0] - - devstack_branch = config.get("branch", "master") - install_devstack(devstack_node, devstack_branch) - try: - configure_devstack_and_ceph(ctx, config, devstack_node, an_osd_node) - yield - finally: - pass - - -def install_devstack(devstack_node, branch="master"): - log.info("Cloning DevStack repo...") - - args = ['git', 'clone', DEVSTACK_GIT_REPO] - devstack_node.run(args=args) - - if branch != "master": - if branch in DS_STABLE_BRANCHES and not branch.startswith("stable"): - branch = "stable/" + branch - log.info("Checking out {branch} branch...".format(branch=branch)) - cmd = "cd devstack && git checkout " + branch - devstack_node.run(args=cmd) - - log.info("Installing DevStack...") - args = ['cd', 'devstack', run.Raw('&&'), './stack.sh'] - devstack_node.run(args=args) - - -def configure_devstack_and_ceph(ctx, config, devstack_node, ceph_node): - pool_size = config.get('pool_size', '128') - create_pools(ceph_node, pool_size) - distribute_ceph_conf(devstack_node, ceph_node) - # This is where we would install python-ceph and ceph-common but it appears - # the ceph task does that for us. 
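[editor's note] The pool and cap setup that configure_devstack_and_ceph drives (create_pools / generate_ceph_keys below) follows the standard rbd-openstack recipe. A hedged sketch of the equivalent commands wrapped in Python; the pool size of 128 and the cap strings mirror what the task runs, and client.cinder-backup is set up the same way against the backups pool:

    import subprocess

    def ceph(*args):
        # Run a ceph CLI command as root on the cluster node.
        subprocess.check_call(['sudo', 'ceph'] + list(args))

    for pool in ['volumes', 'images', 'backups']:
        ceph('osd', 'pool', 'create', pool, '128')

    ceph('auth', 'get-or-create', 'client.cinder',
         'mon', 'allow r',
         'osd', 'allow class-read object_prefix rbd_children, '
                'allow rwx pool=volumes, allow rx pool=images')
    ceph('auth', 'get-or-create', 'client.glance',
         'mon', 'allow r',
         'osd', 'allow class-read object_prefix rbd_children, allow rwx pool=images')
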
- generate_ceph_keys(ceph_node) - distribute_ceph_keys(devstack_node, ceph_node) - secret_uuid = set_libvirt_secret(devstack_node, ceph_node) - update_devstack_config_files(devstack_node, secret_uuid) - set_apache_servername(devstack_node) - # Rebooting is the most-often-used method of restarting devstack services - misc.reboot(devstack_node) - start_devstack(devstack_node) - restart_apache(devstack_node) - - -def create_pools(ceph_node, pool_size): - log.info("Creating pools on Ceph cluster...") - - for pool_name in ['volumes', 'images', 'backups']: - args = ['sudo', 'ceph', 'osd', 'pool', 'create', pool_name, pool_size] - ceph_node.run(args=args) - - -def distribute_ceph_conf(devstack_node, ceph_node): - log.info("Copying ceph.conf to DevStack node...") - - ceph_conf_path = '/etc/ceph/ceph.conf' - ceph_conf = misc.get_file(ceph_node, ceph_conf_path, sudo=True) - misc.sudo_write_file(devstack_node, ceph_conf_path, ceph_conf) - - -def generate_ceph_keys(ceph_node): - log.info("Generating Ceph keys...") - - ceph_auth_cmds = [ - ['sudo', 'ceph', 'auth', 'get-or-create', 'client.cinder', 'mon', - 'allow r', 'osd', 'allow class-read object_prefix rbd_children, allow rwx pool=volumes, allow rx pool=images'], # noqa - ['sudo', 'ceph', 'auth', 'get-or-create', 'client.glance', 'mon', - 'allow r', 'osd', 'allow class-read object_prefix rbd_children, allow rwx pool=images'], # noqa - ['sudo', 'ceph', 'auth', 'get-or-create', 'client.cinder-backup', 'mon', - 'allow r', 'osd', 'allow class-read object_prefix rbd_children, allow rwx pool=backups'], # noqa - ] - for cmd in ceph_auth_cmds: - ceph_node.run(args=cmd) - - -def distribute_ceph_keys(devstack_node, ceph_node): - log.info("Copying Ceph keys to DevStack node...") - - def copy_key(from_remote, key_name, to_remote, dest_path, owner): - key_stringio = StringIO() - from_remote.run( - args=['sudo', 'ceph', 'auth', 'get-or-create', key_name], - stdout=key_stringio) - key_stringio.seek(0) - misc.sudo_write_file(to_remote, dest_path, - key_stringio, owner=owner) - keys = [ - dict(name='client.glance', - path='/etc/ceph/ceph.client.glance.keyring', - # devstack appears to just want root:root - #owner='glance:glance', - ), - dict(name='client.cinder', - path='/etc/ceph/ceph.client.cinder.keyring', - # devstack appears to just want root:root - #owner='cinder:cinder', - ), - dict(name='client.cinder-backup', - path='/etc/ceph/ceph.client.cinder-backup.keyring', - # devstack appears to just want root:root - #owner='cinder:cinder', - ), - ] - for key_dict in keys: - copy_key(ceph_node, key_dict['name'], devstack_node, - key_dict['path'], key_dict.get('owner')) - - -def set_libvirt_secret(devstack_node, ceph_node): - log.info("Setting libvirt secret...") - - cinder_key_stringio = StringIO() - ceph_node.run(args=['sudo', 'ceph', 'auth', 'get-key', 'client.cinder'], - stdout=cinder_key_stringio) - cinder_key = cinder_key_stringio.getvalue().strip() - - uuid_stringio = StringIO() - devstack_node.run(args=['uuidgen'], stdout=uuid_stringio) - uuid = uuid_stringio.getvalue().strip() - - secret_path = '/tmp/secret.xml' - secret_template = textwrap.dedent(""" - <secret ephemeral='no' private='no'> - <uuid>{uuid}</uuid> - <usage type='ceph'> - <name>client.cinder secret</name> - </usage> - </secret>""") - misc.sudo_write_file(devstack_node, secret_path, - secret_template.format(uuid=uuid)) - devstack_node.run(args=['sudo', 'virsh', 'secret-define', '--file', - secret_path]) - devstack_node.run(args=['sudo', 'virsh', 'secret-set-value', '--secret', - uuid, '--base64', 
cinder_key]) - return uuid - - -def update_devstack_config_files(devstack_node, secret_uuid): - log.info("Updating DevStack config files to use Ceph...") - - def backup_config(node, file_name, backup_ext='.orig.teuth'): - node.run(args=['cp', '-f', file_name, file_name + backup_ext]) - - def update_config(config_name, config_stream, update_dict, - section='DEFAULT'): - parser = ConfigParser() - parser.read_file(config_stream) - for (key, value) in update_dict.items(): - parser.set(section, key, value) - out_stream = StringIO() - parser.write(out_stream) - out_stream.seek(0) - return out_stream - - updates = [ - dict(name='/etc/glance/glance-api.conf', options=dict( - default_store='rbd', - rbd_store_user='glance', - rbd_store_pool='images', - show_image_direct_url='True',)), - dict(name='/etc/cinder/cinder.conf', options=dict( - volume_driver='cinder.volume.drivers.rbd.RBDDriver', - rbd_pool='volumes', - rbd_ceph_conf='/etc/ceph/ceph.conf', - rbd_flatten_volume_from_snapshot='false', - rbd_max_clone_depth='5', - glance_api_version='2', - rbd_user='cinder', - rbd_secret_uuid=secret_uuid, - backup_driver='cinder.backup.drivers.ceph', - backup_ceph_conf='/etc/ceph/ceph.conf', - backup_ceph_user='cinder-backup', - backup_ceph_chunk_size='134217728', - backup_ceph_pool='backups', - backup_ceph_stripe_unit='0', - backup_ceph_stripe_count='0', - restore_discard_excess_bytes='true', - )), - dict(name='/etc/nova/nova.conf', options=dict( - libvirt_images_type='rbd', - libvirt_images_rbd_pool='volumes', - libvirt_images_rbd_ceph_conf='/etc/ceph/ceph.conf', - rbd_user='cinder', - rbd_secret_uuid=secret_uuid, - libvirt_inject_password='false', - libvirt_inject_key='false', - libvirt_inject_partition='-2', - )), - ] - - for update in updates: - file_name = update['name'] - options = update['options'] - config_str = misc.get_file(devstack_node, file_name, sudo=True) - config_stream = StringIO(config_str) - backup_config(devstack_node, file_name) - new_config_stream = update_config(file_name, config_stream, options) - misc.sudo_write_file(devstack_node, file_name, new_config_stream) - - -def set_apache_servername(node): - # Apache complains: "Could not reliably determine the server's fully - # qualified domain name, using 127.0.0.1 for ServerName" - # So, let's make sure it knows its name. - log.info("Setting Apache ServerName...") - - hostname = node.hostname - config_file = '/etc/apache2/conf.d/servername' - misc.sudo_write_file(node, config_file, - "ServerName {name}".format(name=hostname)) - - -def start_devstack(devstack_node): - log.info("Patching devstack start script...") - # This causes screen to start headless - otherwise rejoin-stack.sh fails - # because there is no terminal attached. - cmd = "cd devstack && sed -ie 's/screen -c/screen -dm -c/' rejoin-stack.sh" - devstack_node.run(args=cmd) - - log.info("Starting devstack...") - cmd = "cd devstack && ./rejoin-stack.sh" - devstack_node.run(args=cmd) - - # This was added because I was getting timeouts on Cinder requests - which - # were trying to access Keystone on port 5000. A more robust way to handle - # this would be to introduce a wait-loop on devstack_node that checks to - # see if a service is listening on port 5000. 
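[editor's note] The comment above suggests a more robust readiness check than the fixed sleep that follows. A minimal sketch of such a wait loop; in practice it would still have to run on devstack_node (for example via a small remote helper), and the host, port, and timeouts here are illustrative:

    import socket
    import time

    def wait_for_port(host, port, timeout=300, interval=5):
        """Return once something is accepting TCP connections on host:port."""
        deadline = time.time() + timeout
        while time.time() < deadline:
            try:
                sock = socket.create_connection((host, port), timeout=interval)
                sock.close()
                return
            except socket.error:
                time.sleep(interval)
        raise RuntimeError("%s:%d did not come up within %ds" % (host, port, timeout))

    # e.g. wait for Keystone before issuing Cinder requests
    # wait_for_port("127.0.0.1", 5000)
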
- log.info("Waiting 30s for devstack to start...") - time.sleep(30) - - -def restart_apache(node): - node.run(args=['sudo', '/etc/init.d/apache2', 'restart'], wait=True) - - -@contextlib.contextmanager -def exercise(ctx, config): - log.info("Running devstack exercises...") - - if config is None: - config = {} - if not isinstance(config, dict): - raise TypeError("config must be a dict") - - devstack_node = ctx.cluster.only(is_devstack_node).remotes.keys()[0] - - # TODO: save the log *and* preserve failures - #devstack_archive_dir = create_devstack_archive(ctx, devstack_node) - - try: - #cmd = "cd devstack && ./exercise.sh 2>&1 | tee {dir}/exercise.log".format( # noqa - # dir=devstack_archive_dir) - cmd = "cd devstack && ./exercise.sh" - devstack_node.run(args=cmd, wait=True) - yield - finally: - pass - - -def create_devstack_archive(ctx, devstack_node): - test_dir = misc.get_testdir(ctx) - devstack_archive_dir = "{test_dir}/archive/devstack".format( - test_dir=test_dir) - devstack_node.run(args="mkdir -p " + devstack_archive_dir) - return devstack_archive_dir - - -@contextlib.contextmanager -def smoke(ctx, config): - log.info("Running a basic smoketest...") - - devstack_node = ctx.cluster.only(is_devstack_node).remotes.keys()[0] - an_osd_node = ctx.cluster.only(is_osd_node).remotes.keys()[0] - - try: - create_volume(devstack_node, an_osd_node, 'smoke0', 1) - yield - finally: - pass - - -def create_volume(devstack_node, ceph_node, vol_name, size): - """ - :param size: The size of the volume, in GB - """ - size = str(size) - log.info("Creating a {size}GB volume named {name}...".format( - name=vol_name, - size=size)) - args = ['source', 'devstack/openrc', run.Raw('&&'), 'cinder', 'create', - '--display-name', vol_name, size] - out_stream = StringIO() - devstack_node.run(args=args, stdout=out_stream, wait=True) - vol_info = parse_os_table(out_stream.getvalue()) - log.debug("Volume info: %s", str(vol_info)) - - out_stream = StringIO() - try: - ceph_node.run(args="rbd --id cinder ls -l volumes", stdout=out_stream, - wait=True) - except run.CommandFailedError: - log.debug("Original rbd call failed; retrying without '--id cinder'") - ceph_node.run(args="rbd ls -l volumes", stdout=out_stream, - wait=True) - - assert vol_info['id'] in out_stream.getvalue(), \ - "Volume not found on Ceph cluster" - assert vol_info['size'] == size, \ - "Volume size on Ceph cluster is different than specified" - return vol_info['id'] - - -def parse_os_table(table_str): - out_dict = dict() - for line in table_str.split('\n'): - if line.startswith('|'): - items = line.split() - out_dict[items[1]] = items[3] - return out_dict diff --git a/src/ceph/qa/tasks/die_on_err.py b/src/ceph/qa/tasks/die_on_err.py deleted file mode 100644 index bf422ae..0000000 --- a/src/ceph/qa/tasks/die_on_err.py +++ /dev/null @@ -1,70 +0,0 @@ -""" -Raise exceptions on osd coredumps or test err directories -""" -import contextlib -import logging -import time -from teuthology.orchestra import run - -import ceph_manager -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - Die if {testdir}/err exists or if an OSD dumps core - """ - if config is None: - config = {} - - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd') - log.info('num_osds is %s' % num_osds) - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - 
logger=log.getChild('ceph_manager'), - ) - - while len(manager.get_osd_status()['up']) < num_osds: - time.sleep(10) - - testdir = teuthology.get_testdir(ctx) - - while True: - for i in range(num_osds): - (osd_remote,) = ctx.cluster.only('osd.%d' % i).remotes.iterkeys() - p = osd_remote.run( - args = [ 'test', '-e', '{tdir}/err'.format(tdir=testdir) ], - wait=True, - check_status=False, - ) - exit_status = p.exitstatus - - if exit_status == 0: - log.info("osd %d has an error" % i) - raise Exception("osd %d error" % i) - - log_path = '/var/log/ceph/osd.%d.log' % (i) - - p = osd_remote.run( - args = [ - 'tail', '-1', log_path, - run.Raw('|'), - 'grep', '-q', 'end dump' - ], - wait=True, - check_status=False, - ) - exit_status = p.exitstatus - - if exit_status == 0: - log.info("osd %d dumped core" % i) - raise Exception("osd %d dumped core" % i) - - time.sleep(5) diff --git a/src/ceph/qa/tasks/divergent_priors.py b/src/ceph/qa/tasks/divergent_priors.py deleted file mode 100644 index 12ea933..0000000 --- a/src/ceph/qa/tasks/divergent_priors.py +++ /dev/null @@ -1,160 +0,0 @@ -""" -Special case divergence test -""" -import logging -import time - -from teuthology import misc as teuthology -from util.rados import rados - - -log = logging.getLogger(__name__) - - -def task(ctx, config): - """ - Test handling of divergent entries with prior_version - prior to log_tail - - overrides: - ceph: - conf: - osd: - debug osd: 5 - - Requires 3 osds on a single test node. - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'divergent_priors task only accepts a dict for configuration' - - manager = ctx.managers['ceph'] - - while len(manager.get_osd_status()['up']) < 3: - time.sleep(10) - manager.flush_pg_stats([0, 1, 2]) - manager.raw_cluster_cmd('osd', 'set', 'noout') - manager.raw_cluster_cmd('osd', 'set', 'noin') - manager.raw_cluster_cmd('osd', 'set', 'nodown') - manager.wait_for_clean() - - # something that is always there - dummyfile = '/etc/fstab' - dummyfile2 = '/etc/resolv.conf' - - # create 1 pg pool - log.info('creating foo') - manager.raw_cluster_cmd('osd', 'pool', 'create', 'foo', '1') - - osds = [0, 1, 2] - for i in osds: - manager.set_config(i, osd_min_pg_log_entries=10) - manager.set_config(i, osd_max_pg_log_entries=10) - manager.set_config(i, osd_pg_log_trim_min=5) - - # determine primary - divergent = manager.get_pg_primary('foo', 0) - log.info("primary and soon to be divergent is %d", divergent) - non_divergent = list(osds) - non_divergent.remove(divergent) - - log.info('writing initial objects') - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - # write 100 objects - for i in range(100): - rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile]) - - manager.wait_for_clean() - - # blackhole non_divergent - log.info("blackholing osds %s", str(non_divergent)) - for i in non_divergent: - manager.set_config(i, objectstore_blackhole=1) - - DIVERGENT_WRITE = 5 - DIVERGENT_REMOVE = 5 - # Write some soon to be divergent - log.info('writing divergent objects') - for i in range(DIVERGENT_WRITE): - rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, - dummyfile2], wait=False) - # Remove some soon to be divergent - log.info('remove divergent objects') - for i in range(DIVERGENT_REMOVE): - rados(ctx, mon, ['-p', 'foo', 'rm', - 'existing_%d' % (i + DIVERGENT_WRITE)], wait=False) - time.sleep(10) - mon.run( - args=['killall', '-9', 'rados'], - wait=True, - check_status=False) - - # kill all the osds but 
leave divergent in - log.info('killing all the osds') - for i in osds: - manager.kill_osd(i) - for i in osds: - manager.mark_down_osd(i) - for i in non_divergent: - manager.mark_out_osd(i) - - # bring up non-divergent - log.info("bringing up non_divergent %s", str(non_divergent)) - for i in non_divergent: - manager.revive_osd(i) - for i in non_divergent: - manager.mark_in_osd(i) - - # write 1 non-divergent object (ensure that old divergent one is divergent) - objname = "existing_%d" % (DIVERGENT_WRITE + DIVERGENT_REMOVE) - log.info('writing non-divergent object ' + objname) - rados(ctx, mon, ['-p', 'foo', 'put', objname, dummyfile2]) - - manager.wait_for_recovery() - - # ensure no recovery of up osds first - log.info('delay recovery') - for i in non_divergent: - manager.wait_run_admin_socket( - 'osd', i, ['set_recovery_delay', '100000']) - - # bring in our divergent friend - log.info("revive divergent %d", divergent) - manager.raw_cluster_cmd('osd', 'set', 'noup') - manager.revive_osd(divergent) - - log.info('delay recovery divergent') - manager.wait_run_admin_socket( - 'osd', divergent, ['set_recovery_delay', '100000']) - - manager.raw_cluster_cmd('osd', 'unset', 'noup') - while len(manager.get_osd_status()['up']) < 3: - time.sleep(10) - - log.info('wait for peering') - rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile]) - - # At this point the divergent_priors should have been detected - - log.info("killing divergent %d", divergent) - manager.kill_osd(divergent) - log.info("reviving divergent %d", divergent) - manager.revive_osd(divergent) - - time.sleep(20) - - log.info('allowing recovery') - # Set osd_recovery_delay_start back to 0 and kick the queue - for i in osds: - manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'debug', - 'kick_recovery_wq', ' 0') - - log.info('reading divergent objects') - for i in range(DIVERGENT_WRITE + DIVERGENT_REMOVE): - exit_status = rados(ctx, mon, ['-p', 'foo', 'get', 'existing_%d' % i, - '/tmp/existing']) - assert exit_status is 0 - - log.info("success") diff --git a/src/ceph/qa/tasks/divergent_priors2.py b/src/ceph/qa/tasks/divergent_priors2.py deleted file mode 100644 index 0ed7532..0000000 --- a/src/ceph/qa/tasks/divergent_priors2.py +++ /dev/null @@ -1,190 +0,0 @@ -""" -Special case divergence test with ceph-objectstore-tool export/remove/import -""" -import logging -import time -from cStringIO import StringIO - -from teuthology import misc as teuthology -from util.rados import rados -import os - - -log = logging.getLogger(__name__) - - -def task(ctx, config): - """ - Test handling of divergent entries with prior_version - prior to log_tail and a ceph-objectstore-tool export/import - - overrides: - ceph: - conf: - osd: - debug osd: 5 - - Requires 3 osds on a single test node. 
- """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'divergent_priors task only accepts a dict for configuration' - - manager = ctx.managers['ceph'] - - while len(manager.get_osd_status()['up']) < 3: - time.sleep(10) - manager.flush_pg_stats([0, 1, 2]) - manager.raw_cluster_cmd('osd', 'set', 'noout') - manager.raw_cluster_cmd('osd', 'set', 'noin') - manager.raw_cluster_cmd('osd', 'set', 'nodown') - manager.wait_for_clean() - - # something that is always there - dummyfile = '/etc/fstab' - dummyfile2 = '/etc/resolv.conf' - testdir = teuthology.get_testdir(ctx) - - # create 1 pg pool - log.info('creating foo') - manager.raw_cluster_cmd('osd', 'pool', 'create', 'foo', '1') - - osds = [0, 1, 2] - for i in osds: - manager.set_config(i, osd_min_pg_log_entries=10) - manager.set_config(i, osd_max_pg_log_entries=10) - manager.set_config(i, osd_pg_log_trim_min=5) - - # determine primary - divergent = manager.get_pg_primary('foo', 0) - log.info("primary and soon to be divergent is %d", divergent) - non_divergent = list(osds) - non_divergent.remove(divergent) - - log.info('writing initial objects') - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - # write 100 objects - for i in range(100): - rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile]) - - manager.wait_for_clean() - - # blackhole non_divergent - log.info("blackholing osds %s", str(non_divergent)) - for i in non_divergent: - manager.set_config(i, objectstore_blackhole=1) - - DIVERGENT_WRITE = 5 - DIVERGENT_REMOVE = 5 - # Write some soon to be divergent - log.info('writing divergent objects') - for i in range(DIVERGENT_WRITE): - rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, - dummyfile2], wait=False) - # Remove some soon to be divergent - log.info('remove divergent objects') - for i in range(DIVERGENT_REMOVE): - rados(ctx, mon, ['-p', 'foo', 'rm', - 'existing_%d' % (i + DIVERGENT_WRITE)], wait=False) - time.sleep(10) - mon.run( - args=['killall', '-9', 'rados'], - wait=True, - check_status=False) - - # kill all the osds but leave divergent in - log.info('killing all the osds') - for i in osds: - manager.kill_osd(i) - for i in osds: - manager.mark_down_osd(i) - for i in non_divergent: - manager.mark_out_osd(i) - - # bring up non-divergent - log.info("bringing up non_divergent %s", str(non_divergent)) - for i in non_divergent: - manager.revive_osd(i) - for i in non_divergent: - manager.mark_in_osd(i) - - # write 1 non-divergent object (ensure that old divergent one is divergent) - objname = "existing_%d" % (DIVERGENT_WRITE + DIVERGENT_REMOVE) - log.info('writing non-divergent object ' + objname) - rados(ctx, mon, ['-p', 'foo', 'put', objname, dummyfile2]) - - manager.wait_for_recovery() - - # ensure no recovery of up osds first - log.info('delay recovery') - for i in non_divergent: - manager.wait_run_admin_socket( - 'osd', i, ['set_recovery_delay', '100000']) - - # bring in our divergent friend - log.info("revive divergent %d", divergent) - manager.raw_cluster_cmd('osd', 'set', 'noup') - manager.revive_osd(divergent) - - log.info('delay recovery divergent') - manager.wait_run_admin_socket( - 'osd', divergent, ['set_recovery_delay', '100000']) - - manager.raw_cluster_cmd('osd', 'unset', 'noup') - while len(manager.get_osd_status()['up']) < 3: - time.sleep(10) - - log.info('wait for peering') - rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile]) - - # At this point the divergent_priors should have been detected - - log.info("killing 
divergent %d", divergent) - manager.kill_osd(divergent) - - # Export a pg - (exp_remote,) = ctx.\ - cluster.only('osd.{o}'.format(o=divergent)).remotes.iterkeys() - FSPATH = manager.get_filepath() - JPATH = os.path.join(FSPATH, "journal") - prefix = ("sudo adjust-ulimits ceph-objectstore-tool " - "--data-path {fpath} --journal-path {jpath} " - "--log-file=" - "/var/log/ceph/objectstore_tool.$$.log ". - format(fpath=FSPATH, jpath=JPATH)) - pid = os.getpid() - expfile = os.path.join(testdir, "exp.{pid}.out".format(pid=pid)) - cmd = ((prefix + "--op export-remove --pgid 2.0 --file {file}"). - format(id=divergent, file=expfile)) - proc = exp_remote.run(args=cmd, wait=True, - check_status=False, stdout=StringIO()) - assert proc.exitstatus == 0 - - cmd = ((prefix + "--op import --file {file}"). - format(id=divergent, file=expfile)) - proc = exp_remote.run(args=cmd, wait=True, - check_status=False, stdout=StringIO()) - assert proc.exitstatus == 0 - - log.info("reviving divergent %d", divergent) - manager.revive_osd(divergent) - manager.wait_run_admin_socket('osd', divergent, ['dump_ops_in_flight']) - time.sleep(20); - - log.info('allowing recovery') - # Set osd_recovery_delay_start back to 0 and kick the queue - for i in osds: - manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'debug', - 'kick_recovery_wq', ' 0') - - log.info('reading divergent objects') - for i in range(DIVERGENT_WRITE + DIVERGENT_REMOVE): - exit_status = rados(ctx, mon, ['-p', 'foo', 'get', 'existing_%d' % i, - '/tmp/existing']) - assert exit_status is 0 - - cmd = 'rm {file}'.format(file=expfile) - exp_remote.run(args=cmd, wait=True) - log.info("success") diff --git a/src/ceph/qa/tasks/dnsmasq.py b/src/ceph/qa/tasks/dnsmasq.py deleted file mode 100644 index ee01b17..0000000 --- a/src/ceph/qa/tasks/dnsmasq.py +++ /dev/null @@ -1,102 +0,0 @@ -""" -Task for dnsmasq configuration -""" -import contextlib -import logging - -from teuthology import misc -from teuthology.exceptions import ConfigError -from teuthology import contextutil -from util import get_remote_for_role - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def setup_dnsmasq(remote, cnames): - """ configure dnsmasq on the given remote, adding each cname given """ - log.info('Configuring dnsmasq on remote %s..', remote.name) - - # back up existing resolv.conf - resolv_conf = misc.get_file(remote, '/etc/resolv.conf') - # point resolv.conf to local dnsmasq - misc.sudo_write_file(remote, '/etc/resolv.conf', - "nameserver 127.0.0.1\n") - - # add address entries to /etc/dnsmasq.d/ceph - dnsmasq = "server=8.8.8.8\nserver=8.8.4.4\n" - address_template = "address=/{cname}/{ip_address}\n" - for cname, ip_address in cnames.iteritems(): - dnsmasq += address_template.format(cname=cname, ip_address=ip_address) - misc.sudo_write_file(remote, '/etc/dnsmasq.d/ceph', dnsmasq) - - remote.run(args=['cat', '/etc/dnsmasq.d/ceph']) - # restart dnsmasq - remote.run(args=['sudo', 'systemctl', 'restart', 'dnsmasq']) - remote.run(args=['sudo', 'systemctl', 'status', 'dnsmasq']) - # verify dns name is set - remote.run(args=['ping', '-c', '4', cnames.keys()[0]]) - - yield - - log.info('Removing dnsmasq configuration from remote %s..', remote.name) - # restore resolv.conf - misc.sudo_write_file(remote, '/etc/resolv.conf', resolv_conf) - # restart dnsmasq - remote.run(args=['sudo', 'systemctl', 'restart', 'dnsmasq']) - -@contextlib.contextmanager -def task(ctx, config): - """ - Configures dnsmasq to add cnames for teuthology remotes. 
The task expects a - dictionary, where each key is a role. If all cnames for that role use the - same address as that role, the cnames can be given as a list. For example, - this entry configures dnsmasq on the remote associated with client.0, adding - two cnames for the ip address associated with client.0: - - - dnsmasq: - client.0: - - client0.example.com - - c0.example.com - - If the addresses do not all match the given role, a dictionary can be given - to specify the ip address by its target role. For example: - - - dnsmasq: - client.0: - client.0.example.com: client.0 - client.1.example.com: client.1 - """ - # apply overrides - overrides = config.get('overrides', {}) - misc.deep_merge(config, overrides.get('dnsmasq', {})) - - # multiple roles may map to the same remote, so collect names by remote - remote_names = {} - for role, cnames in config.iteritems(): - remote = get_remote_for_role(ctx, role) - if remote is None: - raise ConfigError('no remote for role %s' % role) - - names = remote_names.get(remote, {}) - - if isinstance(cnames, list): - # when given a list of cnames, point to local ip - for cname in cnames: - names[cname] = remote.ip_address - elif isinstance(cnames, dict): - # when given a dict, look up the remote ip for each - for cname, client in cnames.iteritems(): - r = get_remote_for_role(ctx, client) - if r is None: - raise ConfigError('no remote for role %s' % client) - names[cname] = r.ip_address - - remote_names[remote] = names - - # run a subtask for each unique remote - subtasks = [] - for remote, cnames in remote_names.iteritems(): - subtasks.extend([ lambda r=remote, cn=cnames: setup_dnsmasq(r, cn) ]) - - with contextutil.nested(*subtasks): - yield diff --git a/src/ceph/qa/tasks/dump_stuck.py b/src/ceph/qa/tasks/dump_stuck.py deleted file mode 100644 index 39429d2..0000000 --- a/src/ceph/qa/tasks/dump_stuck.py +++ /dev/null @@ -1,162 +0,0 @@ -""" -Dump_stuck command -""" -import logging -import re -import time - -import ceph_manager -from teuthology import misc as teuthology - - -log = logging.getLogger(__name__) - -def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10): - """ - Do checks. Make sure get_stuck_pgs return the right amout of information, then - extract health information from the raw_cluster_cmd and compare the results with - values passed in. This passes if all asserts pass. - - :param num_manager: Ceph manager - :param num_inactive: number of inaactive pages that are stuck - :param num_unclean: number of unclean pages that are stuck - :paran num_stale: number of stale pages that are stuck - :param timeout: timeout value for get_stuck_pgs calls - """ - inactive = manager.get_stuck_pgs('inactive', timeout) - unclean = manager.get_stuck_pgs('unclean', timeout) - stale = manager.get_stuck_pgs('stale', timeout) - log.info('inactive %s / %d, unclean %s / %d, stale %s / %d', - len(inactive), num_inactive, - len(unclean), num_unclean, - len(stale), num_stale) - assert len(inactive) == num_inactive - assert len(unclean) == num_unclean - assert len(stale) == num_stale - -def task(ctx, config): - """ - Test the dump_stuck command. 
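    The 'stuck' counts compared by check_stuck() above come from
    manager.get_stuck_pgs(), which is assumed (illustrative note, not taken
    from this file) to wrap a cluster query along these lines:

        manager.raw_cluster_cmd('pg', 'dump_stuck', 'inactive',
                                '--format=json')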
- - :param ctx: Context - :param config: Configuration - """ - assert config is None, \ - 'dump_stuck requires no configuration' - assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \ - 'dump_stuck requires exactly 2 osds' - - timeout = 60 - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - manager.flush_pg_stats([0, 1]) - manager.wait_for_clean(timeout) - - manager.raw_cluster_cmd('tell', 'mon.0', 'injectargs', '--', -# '--mon-osd-report-timeout 90', - '--mon-pg-stuck-threshold 10') - - # all active+clean - check_stuck( - manager, - num_inactive=0, - num_unclean=0, - num_stale=0, - ) - num_pgs = manager.get_num_pgs() - - manager.mark_out_osd(0) - time.sleep(timeout) - manager.flush_pg_stats([1]) - manager.wait_for_recovery(timeout) - - # all active+clean+remapped - check_stuck( - manager, - num_inactive=0, - num_unclean=0, - num_stale=0, - ) - - manager.mark_in_osd(0) - manager.flush_pg_stats([0, 1]) - manager.wait_for_clean(timeout) - - # all active+clean - check_stuck( - manager, - num_inactive=0, - num_unclean=0, - num_stale=0, - ) - - log.info('stopping first osd') - manager.kill_osd(0) - manager.mark_down_osd(0) - manager.wait_for_active(timeout) - - log.info('waiting for all to be unclean') - starttime = time.time() - done = False - while not done: - try: - check_stuck( - manager, - num_inactive=0, - num_unclean=num_pgs, - num_stale=0, - ) - done = True - except AssertionError: - # wait up to 15 minutes to become stale - if time.time() - starttime > 900: - raise - - - log.info('stopping second osd') - manager.kill_osd(1) - manager.mark_down_osd(1) - - log.info('waiting for all to be stale') - starttime = time.time() - done = False - while not done: - try: - check_stuck( - manager, - num_inactive=0, - num_unclean=num_pgs, - num_stale=num_pgs, - ) - done = True - except AssertionError: - # wait up to 15 minutes to become stale - if time.time() - starttime > 900: - raise - - log.info('reviving') - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'): - manager.revive_osd(id_) - manager.mark_in_osd(id_) - while True: - try: - manager.flush_pg_stats([0, 1]) - break - except Exception: - log.exception('osds must not be started yet, waiting...') - time.sleep(1) - manager.wait_for_clean(timeout) - - check_stuck( - manager, - num_inactive=0, - num_unclean=0, - num_stale=0, - ) diff --git a/src/ceph/qa/tasks/ec_lost_unfound.py b/src/ceph/qa/tasks/ec_lost_unfound.py deleted file mode 100644 index cc0bdb2..0000000 --- a/src/ceph/qa/tasks/ec_lost_unfound.py +++ /dev/null @@ -1,158 +0,0 @@ -""" -Lost_unfound -""" -from teuthology.orchestra import run -import logging -import ceph_manager -from teuthology import misc as teuthology -from util.rados import rados -import time - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Test handling of lost objects on an ec pool. 
- - A pretty rigid cluster is brought up andtested by this task - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'lost_unfound task only accepts a dict for configuration' - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - manager.wait_for_clean() - - profile = config.get('erasure_code_profile', { - 'k': '2', - 'm': '2', - 'crush-failure-domain': 'osd' - }) - profile_name = profile.get('name', 'lost_unfound') - manager.create_erasure_code_profile(profile_name, profile) - pool = manager.create_pool_with_unique_name( - erasure_code_profile_name=profile_name, - min_size=2) - - # something that is always there, readable and never empty - dummyfile = '/etc/group' - - # kludge to make sure they get a map - rados(ctx, mon, ['-p', pool, 'put', 'dummy', dummyfile]) - - manager.flush_pg_stats([0, 1]) - manager.wait_for_recovery() - - # create old objects - for f in range(1, 10): - rados(ctx, mon, ['-p', pool, 'put', 'existing_%d' % f, dummyfile]) - rados(ctx, mon, ['-p', pool, 'put', 'existed_%d' % f, dummyfile]) - rados(ctx, mon, ['-p', pool, 'rm', 'existed_%d' % f]) - - # delay recovery, and make the pg log very long (to prevent backfill) - manager.raw_cluster_cmd( - 'tell', 'osd.1', - 'injectargs', - '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000' - ) - - manager.kill_osd(0) - manager.mark_down_osd(0) - manager.kill_osd(3) - manager.mark_down_osd(3) - - for f in range(1, 10): - rados(ctx, mon, ['-p', pool, 'put', 'new_%d' % f, dummyfile]) - rados(ctx, mon, ['-p', pool, 'put', 'existed_%d' % f, dummyfile]) - rados(ctx, mon, ['-p', pool, 'put', 'existing_%d' % f, dummyfile]) - - # take out osd.1 and a necessary shard of those objects. 
- manager.kill_osd(1) - manager.mark_down_osd(1) - manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it') - manager.revive_osd(0) - manager.wait_till_osd_is_up(0) - manager.revive_osd(3) - manager.wait_till_osd_is_up(3) - - manager.flush_pg_stats([0, 2, 3]) - manager.wait_till_active() - manager.flush_pg_stats([0, 2, 3]) - - # verify that there are unfound objects - unfound = manager.get_num_unfound_objects() - log.info("there are %d unfound objects" % unfound) - assert unfound - - testdir = teuthology.get_testdir(ctx) - procs = [] - if config.get('parallel_bench', True): - procs.append(mon.run( - args=[ - "/bin/sh", "-c", - " ".join(['adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage', - 'rados', - '--no-log-to-stderr', - '--name', 'client.admin', - '-b', str(4<<10), - '-p' , pool, - '-t', '20', - 'bench', '240', 'write', - ]).format(tdir=testdir), - ], - logger=log.getChild('radosbench.{id}'.format(id='client.admin')), - stdin=run.PIPE, - wait=False - )) - time.sleep(10) - - # mark stuff lost - pgs = manager.get_pg_stats() - for pg in pgs: - if pg['stat_sum']['num_objects_unfound'] > 0: - # verify that i can list them direct from the osd - log.info('listing missing/lost in %s state %s', pg['pgid'], - pg['state']); - m = manager.list_pg_missing(pg['pgid']) - log.info('%s' % m) - assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound'] - - log.info("reverting unfound in %s", pg['pgid']) - manager.raw_cluster_cmd('pg', pg['pgid'], - 'mark_unfound_lost', 'delete') - else: - log.info("no unfound in %s", pg['pgid']) - - manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5') - manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5') - manager.raw_cluster_cmd('tell', 'osd.3', 'debug', 'kick_recovery_wq', '5') - manager.flush_pg_stats([0, 2, 3]) - manager.wait_for_recovery() - - if not config.get('parallel_bench', True): - time.sleep(20) - - # verify result - for f in range(1, 10): - err = rados(ctx, mon, ['-p', pool, 'get', 'new_%d' % f, '-']) - assert err - err = rados(ctx, mon, ['-p', pool, 'get', 'existed_%d' % f, '-']) - assert err - err = rados(ctx, mon, ['-p', pool, 'get', 'existing_%d' % f, '-']) - assert err - - # see if osd.1 can cope - manager.revive_osd(1) - manager.wait_till_osd_is_up(1) - manager.wait_for_clean() - run.wait(procs) diff --git a/src/ceph/qa/tasks/exec_on_cleanup.py b/src/ceph/qa/tasks/exec_on_cleanup.py deleted file mode 100644 index e3c09d5..0000000 --- a/src/ceph/qa/tasks/exec_on_cleanup.py +++ /dev/null @@ -1,62 +0,0 @@ -""" -Exececute custom commands during unwind/cleanup -""" -import logging -import contextlib - -from teuthology import misc as teuthology -from teuthology import contextutil - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - Execute commands on a given role - - tasks: - - ceph: - - kclient: [client.a] - - exec: - client.a: - - "echo 'module libceph +p' > /sys/kernel/debug/dynamic_debug/control" - - "echo 'module ceph +p' > /sys/kernel/debug/dynamic_debug/control" - - interactive: - - It stops and fails with the first command that does not return on success. It means - that if the first command fails, the second won't run at all. - - To avoid confusion it is recommended to explicitly enclose the commands in - double quotes. For instance if the command is false (without double quotes) it will - be interpreted as a boolean by the YAML parser. 
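    For example (illustrative snippet, not from the original file):

        - exec:
            client.0:
              - false       # parsed as the YAML boolean False, not a command string
              - "false"     # quoted: passed through as the shell command `false`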
- - :param ctx: Context - :param config: Configuration - """ - try: - yield - finally: - log.info('Executing custom commands...') - assert isinstance(config, dict), "task exec got invalid config" - - testdir = teuthology.get_testdir(ctx) - - if 'all' in config and len(config) == 1: - a = config['all'] - roles = teuthology.all_roles(ctx.cluster) - config = dict((id_, a) for id_ in roles) - - for role, ls in config.iteritems(): - (remote,) = ctx.cluster.only(role).remotes.iterkeys() - log.info('Running commands on role %s host %s', role, remote.name) - for c in ls: - c.replace('$TESTDIR', testdir) - remote.run( - args=[ - 'sudo', - 'TESTDIR={tdir}'.format(tdir=testdir), - 'bash', - '-c', - c], - ) - diff --git a/src/ceph/qa/tasks/filestore_idempotent.py b/src/ceph/qa/tasks/filestore_idempotent.py deleted file mode 100644 index 4e2a228..0000000 --- a/src/ceph/qa/tasks/filestore_idempotent.py +++ /dev/null @@ -1,81 +0,0 @@ -""" -Filestore/filejournal handler -""" -import logging -from teuthology.orchestra import run -import random - -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Test filestore/filejournal handling of non-idempotent events. - - Currently this is a kludge; we require the ceph task preceeds us just - so that we get the tarball installed to run the test binary. - - :param ctx: Context - :param config: Configuration - """ - assert config is None or isinstance(config, list) \ - or isinstance(config, dict), \ - "task only supports a list or dictionary for configuration" - all_clients = ['client.{id}'.format(id=id_) - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')] - if config is None: - config = all_clients - if isinstance(config, list): - config = dict.fromkeys(config) - clients = config.keys() - - # just use the first client... - client = clients[0]; - (remote,) = ctx.cluster.only(client).remotes.iterkeys() - - testdir = teuthology.get_testdir(ctx) - - dir = '%s/ceph.data/test.%s' % (testdir, client) - - seed = str(int(random.uniform(1,100))) - - try: - log.info('creating a working dir') - remote.run(args=['mkdir', dir]) - remote.run( - args=[ - 'cd', dir, - run.Raw('&&'), - 'wget','-q', '-Orun_seed_to.sh', - 'http://git.ceph.com/?p=ceph.git;a=blob_plain;f=src/test/objectstore/run_seed_to.sh;hb=HEAD', - run.Raw('&&'), - 'wget','-q', '-Orun_seed_to_range.sh', - 'http://git.ceph.com/?p=ceph.git;a=blob_plain;f=src/test/objectstore/run_seed_to_range.sh;hb=HEAD', - run.Raw('&&'), - 'chmod', '+x', 'run_seed_to.sh', 'run_seed_to_range.sh', - ]); - - log.info('running a series of tests') - proc = remote.run( - args=[ - 'cd', dir, - run.Raw('&&'), - './run_seed_to_range.sh', seed, '50', '300', - ], - wait=False, - check_status=False) - result = proc.wait() - - if result != 0: - remote.run( - args=[ - 'cp', '-a', dir, '{tdir}/archive/idempotent_failure'.format(tdir=testdir), - ]) - raise Exception("./run_seed_to_range.sh errored out") - - finally: - remote.run(args=[ - 'rm', '-rf', '--', dir - ]) - diff --git a/src/ceph/qa/tasks/kclient.py b/src/ceph/qa/tasks/kclient.py deleted file mode 100644 index 7cc7ada..0000000 --- a/src/ceph/qa/tasks/kclient.py +++ /dev/null @@ -1,137 +0,0 @@ -""" -Mount/unmount a ``kernel`` client. 
-""" -import contextlib -import logging - -from teuthology.misc import deep_merge -from teuthology.orchestra.run import CommandFailedError -from teuthology import misc -from teuthology.contextutil import MaxWhileTries -from cephfs.kernel_mount import KernelMount - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - Mount/unmount a ``kernel`` client. - - The config is optional and defaults to mounting on all clients. If - a config is given, it is expected to be a list of clients to do - this operation on. This lets you e.g. set up one client with - ``ceph-fuse`` and another with ``kclient``. - - Example that mounts all clients:: - - tasks: - - ceph: - - kclient: - - interactive: - - Example that uses both ``kclient` and ``ceph-fuse``:: - - tasks: - - ceph: - - ceph-fuse: [client.0] - - kclient: [client.1] - - interactive: - - - Pass a dictionary instead of lists to specify per-client config: - - tasks: - -kclient: - client.0: - debug: true - - :param ctx: Context - :param config: Configuration - """ - log.info('Mounting kernel clients...') - assert config is None or isinstance(config, list) or isinstance(config, dict), \ - "task kclient got invalid config" - - if config is None: - config = ['client.{id}'.format(id=id_) - for id_ in misc.all_roles_of_type(ctx.cluster, 'client')] - - if isinstance(config, list): - client_roles = config - config = dict([r, dict()] for r in client_roles) - elif isinstance(config, dict): - client_roles = filter(lambda x: 'client.' in x, config.keys()) - else: - raise ValueError("Invalid config object: {0} ({1})".format(config, config.__class__)) - - # config has been converted to a dict by this point - overrides = ctx.config.get('overrides', {}) - deep_merge(config, overrides.get('kclient', {})) - - clients = list(misc.get_clients(ctx=ctx, roles=client_roles)) - - test_dir = misc.get_testdir(ctx) - - # Assemble mon addresses - remotes_and_roles = ctx.cluster.remotes.items() - roles = [roles for (remote_, roles) in remotes_and_roles] - ips = [remote_.ssh.get_transport().getpeername()[0] - for (remote_, _) in remotes_and_roles] - mons = misc.get_mons(roles, ips).values() - - mounts = {} - for id_, remote in clients: - client_config = config.get("client.%s" % id_) - if client_config is None: - client_config = {} - - if config.get("disabled", False) or not client_config.get('mounted', True): - continue - - kernel_mount = KernelMount( - mons, - test_dir, - id_, - remote, - ctx.teuthology_config.get('ipmi_user', None), - ctx.teuthology_config.get('ipmi_password', None), - ctx.teuthology_config.get('ipmi_domain', None) - ) - - mounts[id_] = kernel_mount - - if client_config.get('debug', False): - remote.run(args=["sudo", "bash", "-c", "echo 'module ceph +p' > /sys/kernel/debug/dynamic_debug/control"]) - remote.run(args=["sudo", "bash", "-c", "echo 'module libceph +p' > /sys/kernel/debug/dynamic_debug/control"]) - - kernel_mount.mount() - - - def umount_all(): - log.info('Unmounting kernel clients...') - - forced = False - for mount in mounts.values(): - if mount.is_mounted(): - try: - mount.umount() - except (CommandFailedError, MaxWhileTries): - log.warn("Ordinary umount failed, forcing...") - forced = True - mount.umount_wait(force=True) - - return forced - - ctx.mounts = mounts - try: - yield mounts - except: - umount_all() # ignore forced retval, we are already in error handling - finally: - - forced = umount_all() - if forced: - # The context managers within the kclient manager worked (i.e. 
- # the test workload passed) but for some reason we couldn't - # umount, so turn this into a test failure. - raise RuntimeError("Kernel mounts did not umount cleanly") diff --git a/src/ceph/qa/tasks/locktest.py b/src/ceph/qa/tasks/locktest.py deleted file mode 100755 index 9de5ba4..0000000 --- a/src/ceph/qa/tasks/locktest.py +++ /dev/null @@ -1,134 +0,0 @@ -""" -locktests -""" -import logging - -from teuthology.orchestra import run -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Run locktests, from the xfstests suite, on the given - clients. Whether the clients are ceph-fuse or kernel does not - matter, and the two clients can refer to the same mount. - - The config is a list of two clients to run the locktest on. The - first client will be the host. - - For example: - tasks: - - ceph: - - ceph-fuse: [client.0, client.1] - - locktest: - [client.0, client.1] - - This task does not yield; there would be little point. - - :param ctx: Context - :param config: Configuration - """ - - assert isinstance(config, list) - log.info('fetching and building locktests...') - (host,) = ctx.cluster.only(config[0]).remotes - (client,) = ctx.cluster.only(config[1]).remotes - ( _, _, host_id) = config[0].partition('.') - ( _, _, client_id) = config[1].partition('.') - testdir = teuthology.get_testdir(ctx) - hostmnt = '{tdir}/mnt.{id}'.format(tdir=testdir, id=host_id) - clientmnt = '{tdir}/mnt.{id}'.format(tdir=testdir, id=client_id) - - try: - for client_name in config: - log.info('building on {client_}'.format(client_=client_name)) - ctx.cluster.only(client_name).run( - args=[ - # explicitly does not support multiple autotest tasks - # in a single run; the result archival would conflict - 'mkdir', '{tdir}/archive/locktest'.format(tdir=testdir), - run.Raw('&&'), - 'mkdir', '{tdir}/locktest'.format(tdir=testdir), - run.Raw('&&'), - 'wget', - '-nv', - 'https://raw.github.com/gregsfortytwo/xfstests-ceph/master/src/locktest.c', - '-O', '{tdir}/locktest/locktest.c'.format(tdir=testdir), - run.Raw('&&'), - 'g++', '{tdir}/locktest/locktest.c'.format(tdir=testdir), - '-o', '{tdir}/locktest/locktest'.format(tdir=testdir) - ], - logger=log.getChild('locktest_client.{id}'.format(id=client_name)), - ) - - log.info('built locktest on each client') - - host.run(args=['sudo', 'touch', - '{mnt}/locktestfile'.format(mnt=hostmnt), - run.Raw('&&'), - 'sudo', 'chown', 'ubuntu.ubuntu', - '{mnt}/locktestfile'.format(mnt=hostmnt) - ] - ) - - log.info('starting on host') - hostproc = host.run( - args=[ - '{tdir}/locktest/locktest'.format(tdir=testdir), - '-p', '6788', - '-d', - '{mnt}/locktestfile'.format(mnt=hostmnt), - ], - wait=False, - logger=log.getChild('locktest.host'), - ) - log.info('starting on client') - (_,_,hostaddr) = host.name.partition('@') - clientproc = client.run( - args=[ - '{tdir}/locktest/locktest'.format(tdir=testdir), - '-p', '6788', - '-d', - '-h', hostaddr, - '{mnt}/locktestfile'.format(mnt=clientmnt), - ], - logger=log.getChild('locktest.client'), - wait=False - ) - - hostresult = hostproc.wait() - clientresult = clientproc.wait() - if (hostresult != 0) or (clientresult != 0): - raise Exception("Did not pass locking test!") - log.info('finished locktest executable with results {r} and {s}'. 
\ - format(r=hostresult, s=clientresult)) - - finally: - log.info('cleaning up host dir') - host.run( - args=[ - 'mkdir', '-p', '{tdir}/locktest'.format(tdir=testdir), - run.Raw('&&'), - 'rm', '-f', '{tdir}/locktest/locktest.c'.format(tdir=testdir), - run.Raw('&&'), - 'rm', '-f', '{tdir}/locktest/locktest'.format(tdir=testdir), - run.Raw('&&'), - 'rmdir', '{tdir}/locktest' - ], - logger=log.getChild('.{id}'.format(id=config[0])), - ) - log.info('cleaning up client dir') - client.run( - args=[ - 'mkdir', '-p', '{tdir}/locktest'.format(tdir=testdir), - run.Raw('&&'), - 'rm', '-f', '{tdir}/locktest/locktest.c'.format(tdir=testdir), - run.Raw('&&'), - 'rm', '-f', '{tdir}/locktest/locktest'.format(tdir=testdir), - run.Raw('&&'), - 'rmdir', '{tdir}/locktest'.format(tdir=testdir) - ], - logger=log.getChild('.{id}'.format(\ - id=config[1])), - ) diff --git a/src/ceph/qa/tasks/logrotate.conf b/src/ceph/qa/tasks/logrotate.conf deleted file mode 100644 index b0cb801..0000000 --- a/src/ceph/qa/tasks/logrotate.conf +++ /dev/null @@ -1,13 +0,0 @@ -/var/log/ceph/*{daemon_type}*.log {{ - rotate 100 - size {max_size} - compress - sharedscripts - postrotate - killall {daemon_type} -1 || true - endscript - missingok - notifempty - su root root -}} - diff --git a/src/ceph/qa/tasks/lost_unfound.py b/src/ceph/qa/tasks/lost_unfound.py deleted file mode 100644 index 1cc588b..0000000 --- a/src/ceph/qa/tasks/lost_unfound.py +++ /dev/null @@ -1,176 +0,0 @@ -""" -Lost_unfound -""" -import logging -import time -import ceph_manager -from teuthology import misc as teuthology -from teuthology.orchestra import run -from util.rados import rados - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Test handling of lost objects. - - A pretty rigid cluseter is brought up andtested by this task - """ - POOL = 'unfound_pool' - if config is None: - config = {} - assert isinstance(config, dict), \ - 'lost_unfound task only accepts a dict for configuration' - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - while len(manager.get_osd_status()['up']) < 3: - time.sleep(10) - - manager.wait_for_clean() - - manager.create_pool(POOL) - - # something that is always there - dummyfile = '/etc/fstab' - - # take an osd out until the very end - manager.kill_osd(2) - manager.mark_down_osd(2) - manager.mark_out_osd(2) - - # kludge to make sure they get a map - rados(ctx, mon, ['-p', POOL, 'put', 'dummy', dummyfile]) - - manager.flush_pg_stats([0, 1]) - manager.wait_for_recovery() - - # create old objects - for f in range(1, 10): - rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile]) - rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile]) - rados(ctx, mon, ['-p', POOL, 'rm', 'existed_%d' % f]) - - # delay recovery, and make the pg log very long (to prevent backfill) - manager.raw_cluster_cmd( - 'tell', 'osd.1', - 'injectargs', - '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000' - ) - - manager.kill_osd(0) - manager.mark_down_osd(0) - - for f in range(1, 10): - rados(ctx, mon, ['-p', POOL, 'put', 'new_%d' % f, dummyfile]) - rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile]) - rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile]) - - # bring osd.0 back up, let it peer, but don't replicate the new - # objects... 
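    # The extend() below appends '--osd-recovery-delay-start 1000' to the
    # stored restart arguments of the osd.0 daemon wrapper, so that when
    # revive_osd(0) restarts it the OSD comes up and peers but holds off
    # recovery, leaving the objects written while it was down unreplicated.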
- log.info('osd.0 command_args is %s' % 'foo') - log.info(ctx.daemons.get_daemon('osd', 0).command_args) - ctx.daemons.get_daemon('osd', 0).command_kwargs['args'].extend([ - '--osd-recovery-delay-start', '1000' - ]) - manager.revive_osd(0) - manager.mark_in_osd(0) - manager.wait_till_osd_is_up(0) - - manager.flush_pg_stats([1, 0]) - manager.wait_till_active() - - # take out osd.1 and the only copy of those objects. - manager.kill_osd(1) - manager.mark_down_osd(1) - manager.mark_out_osd(1) - manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it') - - # bring up osd.2 so that things would otherwise, in theory, recovery fully - manager.revive_osd(2) - manager.mark_in_osd(2) - manager.wait_till_osd_is_up(2) - - manager.flush_pg_stats([0, 2]) - manager.wait_till_active() - manager.flush_pg_stats([0, 2]) - - # verify that there are unfound objects - unfound = manager.get_num_unfound_objects() - log.info("there are %d unfound objects" % unfound) - assert unfound - - testdir = teuthology.get_testdir(ctx) - procs = [] - if config.get('parallel_bench', True): - procs.append(mon.run( - args=[ - "/bin/sh", "-c", - " ".join(['adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage', - 'rados', - '--no-log-to-stderr', - '--name', 'client.admin', - '-b', str(4<<10), - '-p' , POOL, - '-t', '20', - 'bench', '240', 'write', - ]).format(tdir=testdir), - ], - logger=log.getChild('radosbench.{id}'.format(id='client.admin')), - stdin=run.PIPE, - wait=False - )) - time.sleep(10) - - # mark stuff lost - pgs = manager.get_pg_stats() - for pg in pgs: - if pg['stat_sum']['num_objects_unfound'] > 0: - primary = 'osd.%d' % pg['acting'][0] - - # verify that i can list them direct from the osd - log.info('listing missing/lost in %s state %s', pg['pgid'], - pg['state']); - m = manager.list_pg_missing(pg['pgid']) - #log.info('%s' % m) - assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound'] - num_unfound=0 - for o in m['objects']: - if len(o['locations']) == 0: - num_unfound += 1 - assert m['num_unfound'] == num_unfound - - log.info("reverting unfound in %s on %s", pg['pgid'], primary) - manager.raw_cluster_cmd('pg', pg['pgid'], - 'mark_unfound_lost', 'revert') - else: - log.info("no unfound in %s", pg['pgid']) - - manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5') - manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5') - manager.flush_pg_stats([0, 2]) - manager.wait_for_recovery() - - # verify result - for f in range(1, 10): - err = rados(ctx, mon, ['-p', POOL, 'get', 'new_%d' % f, '-']) - assert err - err = rados(ctx, mon, ['-p', POOL, 'get', 'existed_%d' % f, '-']) - assert err - err = rados(ctx, mon, ['-p', POOL, 'get', 'existing_%d' % f, '-']) - assert not err - - # see if osd.1 can cope - manager.revive_osd(1) - manager.mark_in_osd(1) - manager.wait_till_osd_is_up(1) - manager.wait_for_clean() - run.wait(procs) diff --git a/src/ceph/qa/tasks/manypools.py b/src/ceph/qa/tasks/manypools.py deleted file mode 100644 index 1ddcba5..0000000 --- a/src/ceph/qa/tasks/manypools.py +++ /dev/null @@ -1,73 +0,0 @@ -""" -Force pg creation on all osds -""" -from teuthology import misc as teuthology -from teuthology.orchestra import run -import logging - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Create the specified number of pools and write 16 objects to them (thereby forcing - the PG creation on each OSD). This task creates pools from all the clients, - in parallel. 
It is easy to add other daemon types which have the appropriate - permissions, but I don't think anything else does. - The config is just the number of pools to create. I recommend setting - "mon create pg interval" to a very low value in your ceph config to speed - this up. - - You probably want to do this to look at memory consumption, and - maybe to test how performance changes with the number of PGs. For example: - - tasks: - - ceph: - config: - mon: - mon create pg interval: 1 - - manypools: 3000 - - radosbench: - clients: [client.0] - time: 360 - """ - - log.info('creating {n} pools'.format(n=config)) - - poolnum = int(config) - creator_remotes = [] - client_roles = teuthology.all_roles_of_type(ctx.cluster, 'client') - log.info('got client_roles={client_roles_}'.format(client_roles_=client_roles)) - for role in client_roles: - log.info('role={role_}'.format(role_=role)) - (creator_remote, ) = ctx.cluster.only('client.{id}'.format(id=role)).remotes.iterkeys() - creator_remotes.append((creator_remote, 'client.{id}'.format(id=role))) - - remaining_pools = poolnum - poolprocs=dict() - while (remaining_pools > 0): - log.info('{n} pools remaining to create'.format(n=remaining_pools)) - for remote, role_ in creator_remotes: - poolnum = remaining_pools - remaining_pools -= 1 - if remaining_pools < 0: - continue - log.info('creating pool{num} on {role}'.format(num=poolnum, role=role_)) - proc = remote.run( - args=[ - 'rados', - '--name', role_, - 'mkpool', 'pool{num}'.format(num=poolnum), '-1', - run.Raw('&&'), - 'rados', - '--name', role_, - '--pool', 'pool{num}'.format(num=poolnum), - 'bench', '0', 'write', '-t', '16', '--block-size', '1' - ], - wait = False - ) - log.info('waiting for pool and object creates') - poolprocs[remote] = proc - - run.wait(poolprocs.itervalues()) - - log.info('created all {n} pools and wrote 16 objects to each'.format(n=poolnum)) diff --git a/src/ceph/qa/tasks/mds_creation_failure.py b/src/ceph/qa/tasks/mds_creation_failure.py deleted file mode 100644 index d1de156..0000000 --- a/src/ceph/qa/tasks/mds_creation_failure.py +++ /dev/null @@ -1,85 +0,0 @@ - -import logging -import contextlib -import time -import ceph_manager -from teuthology import misc -from teuthology.orchestra.run import CommandFailedError, Raw - -log = logging.getLogger(__name__) - - -@contextlib.contextmanager -def task(ctx, config): - """ - Go through filesystem creation with a synthetic failure in an MDS - in its 'up:creating' state, to exercise the retry behaviour. 
- """ - # Grab handles to the teuthology objects of interest - mdslist = list(misc.all_roles_of_type(ctx.cluster, 'mds')) - if len(mdslist) != 1: - # Require exactly one MDS, the code path for creation failure when - # a standby is available is different - raise RuntimeError("This task requires exactly one MDS") - - mds_id = mdslist[0] - (mds_remote,) = ctx.cluster.only('mds.{_id}'.format(_id=mds_id)).remotes.iterkeys() - manager = ceph_manager.CephManager( - mds_remote, ctx=ctx, logger=log.getChild('ceph_manager'), - ) - - # Stop MDS - manager.raw_cluster_cmd('mds', 'set', "max_mds", "0") - mds = ctx.daemons.get_daemon('mds', mds_id) - mds.stop() - manager.raw_cluster_cmd('mds', 'fail', mds_id) - - # Reset the filesystem so that next start will go into CREATING - manager.raw_cluster_cmd('fs', 'rm', "default", "--yes-i-really-mean-it") - manager.raw_cluster_cmd('fs', 'new', "default", "metadata", "data") - - # Start the MDS with mds_kill_create_at set, it will crash during creation - mds.restart_with_args(["--mds_kill_create_at=1"]) - try: - mds.wait_for_exit() - except CommandFailedError as e: - if e.exitstatus == 1: - log.info("MDS creation killed as expected") - else: - log.error("Unexpected status code %s" % e.exitstatus) - raise - - # Since I have intentionally caused a crash, I will clean up the resulting core - # file to avoid task.internal.coredump seeing it as a failure. - log.info("Removing core file from synthetic MDS failure") - mds_remote.run(args=['rm', '-f', Raw("{archive}/coredump/*.core".format(archive=misc.get_archive_dir(ctx)))]) - - # It should have left the MDS map state still in CREATING - status = manager.get_mds_status(mds_id) - assert status['state'] == 'up:creating' - - # Start the MDS again without the kill flag set, it should proceed with creation successfully - mds.restart() - - # Wait for state ACTIVE - t = 0 - create_timeout = 120 - while True: - status = manager.get_mds_status(mds_id) - if status['state'] == 'up:active': - log.info("MDS creation completed successfully") - break - elif status['state'] == 'up:creating': - log.info("MDS still in creating state") - if t > create_timeout: - log.error("Creating did not complete within %ss" % create_timeout) - raise RuntimeError("Creating did not complete within %ss" % create_timeout) - t += 1 - time.sleep(1) - else: - log.error("Unexpected MDS state: %s" % status['state']) - assert(status['state'] in ['up:active', 'up:creating']) - - # The system should be back up in a happy healthy state, go ahead and run any further tasks - # inside this context. - yield diff --git a/src/ceph/qa/tasks/mds_thrash.py b/src/ceph/qa/tasks/mds_thrash.py deleted file mode 100644 index 75d236d..0000000 --- a/src/ceph/qa/tasks/mds_thrash.py +++ /dev/null @@ -1,555 +0,0 @@ -""" -Thrash mds by simulating failures -""" -import logging -import contextlib -import ceph_manager -import itertools -import random -import signal -import time - -from gevent import sleep -from gevent.greenlet import Greenlet -from gevent.event import Event -from teuthology import misc as teuthology - -from tasks.cephfs.filesystem import MDSCluster, Filesystem - -log = logging.getLogger(__name__) - -class DaemonWatchdog(Greenlet): - """ - DaemonWatchdog:: - - Watch Ceph daemons for failures. If an extended failure is detected (i.e. - not intentional), then the watchdog will unmount file systems and send - SIGTERM to all daemons. The duration of an extended failure is configurable - with watchdog_daemon_timeout. 
- - watchdog_daemon_timeout [default: 300]: number of seconds a daemon - is allowed to be failed before the watchdog will bark. - """ - - def __init__(self, ctx, manager, config, thrashers): - Greenlet.__init__(self) - self.ctx = ctx - self.config = config - self.e = None - self.logger = log.getChild('daemon_watchdog') - self.manager = manager - self.name = 'watchdog' - self.stopping = Event() - self.thrashers = thrashers - - def _run(self): - try: - self.watch() - except Exception as e: - # See _run exception comment for MDSThrasher - self.e = e - self.logger.exception("exception:") - # allow successful completion so gevent doesn't see an exception... - - def log(self, x): - """Write data to logger""" - self.logger.info(x) - - def stop(self): - self.stopping.set() - - def bark(self): - self.log("BARK! unmounting mounts and killing all daemons") - for mount in self.ctx.mounts.values(): - try: - mount.umount_wait(force=True) - except: - self.logger.exception("ignoring exception:") - daemons = [] - daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('mds', cluster=self.manager.cluster))) - daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('mon', cluster=self.manager.cluster))) - for daemon in daemons: - try: - daemon.signal(signal.SIGTERM) - except: - self.logger.exception("ignoring exception:") - - def watch(self): - self.log("watchdog starting") - daemon_timeout = int(self.config.get('watchdog_daemon_timeout', 300)) - daemon_failure_time = {} - while not self.stopping.is_set(): - bark = False - now = time.time() - - mons = self.ctx.daemons.iter_daemons_of_role('mon', cluster=self.manager.cluster) - mdss = self.ctx.daemons.iter_daemons_of_role('mds', cluster=self.manager.cluster) - clients = self.ctx.daemons.iter_daemons_of_role('client', cluster=self.manager.cluster) - - #for daemon in mons: - # self.log("mon daemon {role}.{id}: running={r}".format(role=daemon.role, id=daemon.id_, r=daemon.running() and not daemon.proc.finished)) - #for daemon in mdss: - # self.log("mds daemon {role}.{id}: running={r}".format(role=daemon.role, id=daemon.id_, r=daemon.running() and not daemon.proc.finished)) - - daemon_failures = [] - daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, mons)) - daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, mdss)) - for daemon in daemon_failures: - name = daemon.role + '.' + daemon.id_ - dt = daemon_failure_time.setdefault(name, (daemon, now)) - assert dt[0] is daemon - delta = now-dt[1] - self.log("daemon {name} is failed for ~{t:.0f}s".format(name=name, t=delta)) - if delta > daemon_timeout: - bark = True - - # If a daemon is no longer failed, remove it from tracking: - for name in daemon_failure_time.keys(): - if name not in [d.role + '.' + d.id_ for d in daemon_failures]: - self.log("daemon {name} has been restored".format(name=name)) - del daemon_failure_time[name] - - for thrasher in self.thrashers: - if thrasher.e is not None: - self.log("thrasher on fs.{name} failed".format(name=thrasher.fs.name)) - bark = True - - if bark: - self.bark() - return - - sleep(5) - - self.log("watchdog finished") - -class MDSThrasher(Greenlet): - """ - MDSThrasher:: - - The MDSThrasher thrashes MDSs during execution of other tasks (workunits, etc). - - The config is optional. 
Many of the config parameters are a a maximum value - to use when selecting a random value from a range. To always use the maximum - value, set no_random to true. The config is a dict containing some or all of: - - max_thrash: [default: 1] the maximum number of active MDSs per FS that will be thrashed at - any given time. - - max_thrash_delay: [default: 30] maximum number of seconds to delay before - thrashing again. - - max_replay_thrash_delay: [default: 4] maximum number of seconds to delay while in - the replay state before thrashing. - - max_revive_delay: [default: 10] maximum number of seconds to delay before - bringing back a thrashed MDS. - - randomize: [default: true] enables randomization and use the max/min values - - seed: [no default] seed the random number generator - - thrash_in_replay: [default: 0.0] likelihood that the MDS will be thrashed - during replay. Value should be between 0.0 and 1.0. - - thrash_max_mds: [default: 0.05] likelihood that the max_mds of the mds - cluster will be modified to a value [1, current) or (current, starting - max_mds]. When reduced, randomly selected MDSs other than rank 0 will be - deactivated to reach the new max_mds. Value should be between 0.0 and 1.0. - - thrash_while_stopping: [default: false] thrash an MDS while there - are MDS in up:stopping (because max_mds was changed and some - MDS were deactivated). - - thrash_weights: allows specific MDSs to be thrashed more/less frequently. - This option overrides anything specified by max_thrash. This option is a - dict containing mds.x: weight pairs. For example, [mds.a: 0.7, mds.b: - 0.3, mds.c: 0.0]. Each weight is a value from 0.0 to 1.0. Any MDSs not - specified will be automatically given a weight of 0.0 (not thrashed). - For a given MDS, by default the trasher delays for up to - max_thrash_delay, trashes, waits for the MDS to recover, and iterates. - If a non-zero weight is specified for an MDS, for each iteration the - thrasher chooses whether to thrash during that iteration based on a - random value [0-1] not exceeding the weight of that MDS. - - Examples:: - - - The following example sets the likelihood that mds.a will be thrashed - to 80%, mds.b to 20%, and other MDSs will not be thrashed. It also sets the - likelihood that an MDS will be thrashed in replay to 40%. - Thrash weights do not have to sum to 1. 
- - tasks: - - ceph: - - mds_thrash: - thrash_weights: - - mds.a: 0.8 - - mds.b: 0.2 - thrash_in_replay: 0.4 - - ceph-fuse: - - workunit: - clients: - all: [suites/fsx.sh] - - The following example disables randomization, and uses the max delay values: - - tasks: - - ceph: - - mds_thrash: - max_thrash_delay: 10 - max_revive_delay: 1 - max_replay_thrash_delay: 4 - - """ - - def __init__(self, ctx, manager, config, fs, max_mds): - Greenlet.__init__(self) - - self.config = config - self.ctx = ctx - self.e = None - self.logger = log.getChild('fs.[{f}]'.format(f = fs.name)) - self.fs = fs - self.manager = manager - self.max_mds = max_mds - self.name = 'thrasher.fs.[{f}]'.format(f = fs.name) - self.stopping = Event() - - self.randomize = bool(self.config.get('randomize', True)) - self.thrash_max_mds = float(self.config.get('thrash_max_mds', 0.05)) - self.max_thrash = int(self.config.get('max_thrash', 1)) - self.max_thrash_delay = float(self.config.get('thrash_delay', 120.0)) - self.thrash_in_replay = float(self.config.get('thrash_in_replay', False)) - assert self.thrash_in_replay >= 0.0 and self.thrash_in_replay <= 1.0, 'thrash_in_replay ({v}) must be between [0.0, 1.0]'.format( - v=self.thrash_in_replay) - self.max_replay_thrash_delay = float(self.config.get('max_replay_thrash_delay', 4.0)) - self.max_revive_delay = float(self.config.get('max_revive_delay', 10.0)) - - def _run(self): - try: - self.do_thrash() - except Exception as e: - # Log exceptions here so we get the full backtrace (gevent loses them). - # Also allow succesful completion as gevent exception handling is a broken mess: - # - # 2017-02-03T14:34:01.259 CRITICAL:root: File "gevent.libev.corecext.pyx", line 367, in gevent.libev.corecext.loop.handle_error (src/gevent/libev/gevent.corecext.c:5051) - # File "/home/teuthworker/src/git.ceph.com_git_teuthology_master/virtualenv/local/lib/python2.7/site-packages/gevent/hub.py", line 558, in handle_error - # self.print_exception(context, type, value, tb) - # File "/home/teuthworker/src/git.ceph.com_git_teuthology_master/virtualenv/local/lib/python2.7/site-packages/gevent/hub.py", line 605, in print_exception - # traceback.print_exception(type, value, tb, file=errstream) - # File "/usr/lib/python2.7/traceback.py", line 124, in print_exception - # _print(file, 'Traceback (most recent call last):') - # File "/usr/lib/python2.7/traceback.py", line 13, in _print - # file.write(str+terminator) - # 2017-02-03T14:34:01.261 CRITICAL:root:IOError - self.e = e - self.logger.exception("exception:") - # allow successful completion so gevent doesn't see an exception... - - def log(self, x): - """Write data to logger assigned to this MDThrasher""" - self.logger.info(x) - - def stop(self): - self.stopping.set() - - def kill_mds(self, mds): - if self.config.get('powercycle'): - (remote,) = (self.ctx.cluster.only('mds.{m}'.format(m=mds)). - remotes.iterkeys()) - self.log('kill_mds on mds.{m} doing powercycle of {s}'. - format(m=mds, s=remote.name)) - self._assert_ipmi(remote) - remote.console.power_off() - else: - self.ctx.daemons.get_daemon('mds', mds).stop() - - @staticmethod - def _assert_ipmi(remote): - assert remote.console.has_ipmi_credentials, ( - "powercycling requested but RemoteConsole is not " - "initialized. Check ipmi config.") - - def revive_mds(self, mds, standby_for_rank=None): - """ - Revive mds -- do an ipmpi powercycle (if indicated by the config) - and then restart (using --hot-standby if specified. 
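The _run() wrapper above is a deliberate idiom shared by the watchdog and the thrasher: catch everything, log the full backtrace, stash the exception on self.e, and let the greenlet finish normally so gevent's error handling never sees it; the joining task re-raises afterwards. A minimal sketch of the pattern (assumes gevent; class and method names are illustrative):

    import logging
    from gevent import Greenlet

    log = logging.getLogger(__name__)

    class SafeGreenlet(Greenlet):
        def __init__(self):
            Greenlet.__init__(self)
            self.e = None  # inspected by the caller after the greenlet stops

        def _run(self):
            try:
                self.work()
            except Exception as e:
                self.e = e
                log.exception("exception:")
                # return normally so the gevent hub never handles the exception

        def work(self):
            raise NotImplementedError

The teardown side then does roughly: stop the greenlet, and if `self.e` is set, raise a RuntimeError so the failure is not silently swallowed.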
- """ - if self.config.get('powercycle'): - (remote,) = (self.ctx.cluster.only('mds.{m}'.format(m=mds)). - remotes.iterkeys()) - self.log('revive_mds on mds.{m} doing powercycle of {s}'. - format(m=mds, s=remote.name)) - self._assert_ipmi(remote) - remote.console.power_on() - self.manager.make_admin_daemon_dir(self.ctx, remote) - args = [] - if standby_for_rank: - args.extend(['--hot-standby', standby_for_rank]) - self.ctx.daemons.get_daemon('mds', mds).restart(*args) - - def wait_for_stable(self, rank = None, gid = None): - self.log('waiting for mds cluster to stabilize...') - for itercount in itertools.count(): - status = self.fs.status() - max_mds = status.get_fsmap(self.fs.id)['mdsmap']['max_mds'] - ranks = list(status.get_ranks(self.fs.id)) - stopping = filter(lambda info: "up:stopping" == info['state'], ranks) - actives = filter(lambda info: "up:active" == info['state'] and "laggy_since" not in info, ranks) - - if not bool(self.config.get('thrash_while_stopping', False)) and len(stopping) > 0: - if itercount % 5 == 0: - self.log('cluster is considered unstable while MDS are in up:stopping (!thrash_while_stopping)') - else: - if rank is not None: - try: - info = status.get_rank(self.fs.id, rank) - if info['gid'] != gid and "up:active" == info['state']: - self.log('mds.{name} has gained rank={rank}, replacing gid={gid}'.format(name = info['name'], rank = rank, gid = gid)) - return status - except: - pass # no rank present - if len(actives) >= max_mds: - # no replacement can occur! - self.log("cluster has %d actives (max_mds is %d), no MDS can replace rank %d".format(len(actives), max_mds, rank)) - return status - else: - if len(actives) >= max_mds: - self.log('mds cluster has {count} alive and active, now stable!'.format(count = len(actives))) - return status, None - if itercount > 300/2: # 5 minutes - raise RuntimeError('timeout waiting for cluster to stabilize') - elif itercount % 5 == 0: - self.log('mds map: {status}'.format(status=status)) - else: - self.log('no change') - sleep(2) - - def do_thrash(self): - """ - Perform the random thrashing action - """ - - self.log('starting mds_do_thrash for fs {fs}'.format(fs = self.fs.name)) - stats = { - "max_mds": 0, - "deactivate": 0, - "kill": 0, - } - - while not self.stopping.is_set(): - delay = self.max_thrash_delay - if self.randomize: - delay = random.randrange(0.0, self.max_thrash_delay) - - if delay > 0.0: - self.log('waiting for {delay} secs before thrashing'.format(delay=delay)) - self.stopping.wait(delay) - if self.stopping.is_set(): - continue - - status = self.fs.status() - - if random.random() <= self.thrash_max_mds: - max_mds = status.get_fsmap(self.fs.id)['mdsmap']['max_mds'] - options = range(1, max_mds)+range(max_mds+1, self.max_mds+1) - if len(options) > 0: - sample = random.sample(options, 1) - new_max_mds = sample[0] - self.log('thrashing max_mds: %d -> %d' % (max_mds, new_max_mds)) - self.fs.set_max_mds(new_max_mds) - stats['max_mds'] += 1 - - targets = filter(lambda r: r['rank'] >= new_max_mds, status.get_ranks(self.fs.id)) - if len(targets) > 0: - # deactivate mds in decending order - targets = sorted(targets, key=lambda r: r['rank'], reverse=True) - for target in targets: - self.log("deactivating rank %d" % target['rank']) - self.fs.deactivate(target['rank']) - stats['deactivate'] += 1 - status = self.wait_for_stable()[0] - else: - status = self.wait_for_stable()[0] - - count = 0 - for info in status.get_ranks(self.fs.id): - name = info['name'] - label = 'mds.' 
+ name - rank = info['rank'] - gid = info['gid'] - - # if thrash_weights isn't specified and we've reached max_thrash, - # we're done - count = count + 1 - if 'thrash_weights' not in self.config and count > self.max_thrash: - break - - weight = 1.0 - if 'thrash_weights' in self.config: - weight = self.config['thrash_weights'].get(label, '0.0') - skip = random.randrange(0.0, 1.0) - if weight <= skip: - self.log('skipping thrash iteration with skip ({skip}) > weight ({weight})'.format(skip=skip, weight=weight)) - continue - - self.log('kill {label} (rank={rank})'.format(label=label, rank=rank)) - self.kill_mds(name) - stats['kill'] += 1 - - # wait for mon to report killed mds as crashed - last_laggy_since = None - itercount = 0 - while True: - status = self.fs.status() - info = status.get_mds(name) - if not info: - break - if 'laggy_since' in info: - last_laggy_since = info['laggy_since'] - break - if any([(f == name) for f in status.get_fsmap(self.fs.id)['mdsmap']['failed']]): - break - self.log( - 'waiting till mds map indicates {label} is laggy/crashed, in failed state, or {label} is removed from mdsmap'.format( - label=label)) - itercount = itercount + 1 - if itercount > 10: - self.log('mds map: {status}'.format(status=status)) - sleep(2) - - if last_laggy_since: - self.log( - '{label} reported laggy/crashed since: {since}'.format(label=label, since=last_laggy_since)) - else: - self.log('{label} down, removed from mdsmap'.format(label=label, since=last_laggy_since)) - - # wait for a standby mds to takeover and become active - status = self.wait_for_stable(rank, gid) - - # wait for a while before restarting old active to become new - # standby - delay = self.max_revive_delay - if self.randomize: - delay = random.randrange(0.0, self.max_revive_delay) - - self.log('waiting for {delay} secs before reviving {label}'.format( - delay=delay, label=label)) - sleep(delay) - - self.log('reviving {label}'.format(label=label)) - self.revive_mds(name) - - for itercount in itertools.count(): - if itercount > 300/2: # 5 minutes - raise RuntimeError('timeout waiting for MDS to revive') - status = self.fs.status() - info = status.get_mds(name) - if info and info['state'] in ('up:standby', 'up:standby-replay', 'up:active'): - self.log('{label} reported in {state} state'.format(label=label, state=info['state'])) - break - self.log( - 'waiting till mds map indicates {label} is in active, standby or standby-replay'.format(label=label)) - sleep(2) - - for stat in stats: - self.log("stat['{key}'] = {value}".format(key = stat, value = stats[stat])) - - # don't do replay thrashing right now -# for info in status.get_replays(self.fs.id): -# # this might race with replay -> active transition... -# if status['state'] == 'up:replay' and random.randrange(0.0, 1.0) < self.thrash_in_replay: -# delay = self.max_replay_thrash_delay -# if self.randomize: -# delay = random.randrange(0.0, self.max_replay_thrash_delay) -# sleep(delay) -# self.log('kill replaying mds.{id}'.format(id=self.to_kill)) -# self.kill_mds(self.to_kill) -# -# delay = self.max_revive_delay -# if self.randomize: -# delay = random.randrange(0.0, self.max_revive_delay) -# -# self.log('waiting for {delay} secs before reviving mds.{id}'.format( -# delay=delay, id=self.to_kill)) -# sleep(delay) -# -# self.log('revive mds.{id}'.format(id=self.to_kill)) -# self.revive_mds(self.to_kill) - - -@contextlib.contextmanager -def task(ctx, config): - """ - Stress test the mds by thrashing while another task/workunit - is running. 
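The kill/revive flow above repeats one polling idiom several times: query self.fs.status() every two seconds and give up after roughly five minutes (the 300/2 iteration budget). A generic sketch of that pattern, assuming nothing about the teuthology API:

    import time

    def wait_until(predicate, timeout=300, interval=2, desc='condition'):
        # Poll `predicate` until it returns something truthy, raising if
        # `timeout` seconds pass first; mirrors the itercount loops above.
        deadline = time.time() + timeout
        while True:
            result = predicate()
            if result:
                return result
            if time.time() > deadline:
                raise RuntimeError('timeout waiting for %s' % desc)
            time.sleep(interval)

    # e.g. wait_until(lambda: mds_is_active('a'), desc='mds.a to go active'),
    # where mds_is_active is a hypothetical status check.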
- - Please refer to MDSThrasher class for further information on the - available options. - """ - - mds_cluster = MDSCluster(ctx) - - if config is None: - config = {} - assert isinstance(config, dict), \ - 'mds_thrash task only accepts a dict for configuration' - mdslist = list(teuthology.all_roles_of_type(ctx.cluster, 'mds')) - assert len(mdslist) > 1, \ - 'mds_thrash task requires at least 2 metadata servers' - - # choose random seed - if 'seed' in config: - seed = int(config['seed']) - else: - seed = int(time.time()) - log.info('mds thrasher using random seed: {seed}'.format(seed=seed)) - random.seed(seed) - - (first,) = ctx.cluster.only('mds.{_id}'.format(_id=mdslist[0])).remotes.iterkeys() - manager = ceph_manager.CephManager( - first, ctx=ctx, logger=log.getChild('ceph_manager'), - ) - - # make sure everyone is in active, standby, or standby-replay - log.info('Wait for all MDSs to reach steady state...') - status = mds_cluster.status() - while True: - steady = True - for info in status.get_all(): - state = info['state'] - if state not in ('up:active', 'up:standby', 'up:standby-replay'): - steady = False - break - if steady: - break - sleep(2) - status = mds_cluster.status() - log.info('Ready to start thrashing') - - thrashers = [] - - watchdog = DaemonWatchdog(ctx, manager, config, thrashers) - watchdog.start() - - manager.wait_for_clean() - assert manager.is_clean() - for fs in status.get_filesystems(): - thrasher = MDSThrasher(ctx, manager, config, Filesystem(ctx, fs['id']), fs['mdsmap']['max_mds']) - thrasher.start() - thrashers.append(thrasher) - - try: - log.debug('Yielding') - yield - finally: - log.info('joining mds_thrashers') - for thrasher in thrashers: - thrasher.stop() - if thrasher.e: - raise RuntimeError('error during thrashing') - thrasher.join() - log.info('done joining') - - watchdog.stop() - watchdog.join() diff --git a/src/ceph/qa/tasks/metadata.yaml b/src/ceph/qa/tasks/metadata.yaml deleted file mode 100644 index ccdc3b0..0000000 --- a/src/ceph/qa/tasks/metadata.yaml +++ /dev/null @@ -1,2 +0,0 @@ -instance-id: test -local-hostname: test diff --git a/src/ceph/qa/tasks/mgr/__init__.py b/src/ceph/qa/tasks/mgr/__init__.py deleted file mode 100644 index e69de29..0000000 --- a/src/ceph/qa/tasks/mgr/__init__.py +++ /dev/null diff --git a/src/ceph/qa/tasks/mgr/mgr_test_case.py b/src/ceph/qa/tasks/mgr/mgr_test_case.py deleted file mode 100644 index ec3f98d..0000000 --- a/src/ceph/qa/tasks/mgr/mgr_test_case.py +++ /dev/null @@ -1,170 +0,0 @@ - -from unittest import case -import json -import logging - -from teuthology import misc -from tasks.ceph_test_case import CephTestCase - -# TODO move definition of CephCluster away from the CephFS stuff -from tasks.cephfs.filesystem import CephCluster - - -log = logging.getLogger(__name__) - - -class MgrCluster(CephCluster): - def __init__(self, ctx): - super(MgrCluster, self).__init__(ctx) - self.mgr_ids = list(misc.all_roles_of_type(ctx.cluster, 'mgr')) - - if len(self.mgr_ids) == 0: - raise RuntimeError( - "This task requires at least one manager daemon") - - self.mgr_daemons = dict( - [(mgr_id, self._ctx.daemons.get_daemon('mgr', mgr_id)) for mgr_id - in self.mgr_ids]) - - def mgr_stop(self, mgr_id): - self.mgr_daemons[mgr_id].stop() - - def mgr_fail(self, mgr_id): - self.mon_manager.raw_cluster_cmd("mgr", "fail", mgr_id) - - def mgr_restart(self, mgr_id): - self.mgr_daemons[mgr_id].restart() - - def get_mgr_map(self): - status = json.loads( - self.mon_manager.raw_cluster_cmd("status", "--format=json-pretty")) - - return 
status["mgrmap"] - - def get_active_id(self): - return self.get_mgr_map()["active_name"] - - def get_standby_ids(self): - return [s['name'] for s in self.get_mgr_map()["standbys"]] - - def set_module_localized_conf(self, module, mgr_id, key, val): - self.mon_manager.raw_cluster_cmd("config-key", "set", - "mgr/{0}/{1}/{2}".format( - module, mgr_id, key - ), val) - - -class MgrTestCase(CephTestCase): - MGRS_REQUIRED = 1 - - def setUp(self): - super(MgrTestCase, self).setUp() - - # The test runner should have populated this - assert self.mgr_cluster is not None - - if len(self.mgr_cluster.mgr_ids) < self.MGRS_REQUIRED: - raise case.SkipTest("Only have {0} manager daemons, " - "{1} are required".format( - len(self.mgr_cluster.mgr_ids), self.MGRS_REQUIRED)) - - # Restart all the daemons - for daemon in self.mgr_cluster.mgr_daemons.values(): - daemon.stop() - - for mgr_id in self.mgr_cluster.mgr_ids: - self.mgr_cluster.mgr_fail(mgr_id) - - for daemon in self.mgr_cluster.mgr_daemons.values(): - daemon.restart() - - # Wait for an active to come up - self.wait_until_true(lambda: self.mgr_cluster.get_active_id() != "", - timeout=20) - - expect_standbys = set(self.mgr_cluster.mgr_ids) \ - - {self.mgr_cluster.get_active_id()} - self.wait_until_true( - lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys, - timeout=20) - - def _load_module(self, module_name): - loaded = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd( - "mgr", "module", "ls"))['enabled_modules'] - if module_name in loaded: - # The enable command is idempotent, but our wait for a restart - # isn't, so let's return now if it's already loaded - return - - initial_gid = self.mgr_cluster.get_mgr_map()['active_gid'] - self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "enable", - module_name) - - # Wait for the module to load - def has_restarted(): - mgr_map = self.mgr_cluster.get_mgr_map() - done = mgr_map['active_gid'] != initial_gid and mgr_map['available'] - if done: - log.info("Restarted after module load (new active {0}/{1})".format( - mgr_map['active_name'] , mgr_map['active_gid'])) - return done - self.wait_until_true(has_restarted, timeout=30) - - - def _get_uri(self, service_name): - # Little dict hack so that I can assign into this from - # the get_or_none function - mgr_map = {'x': None} - - def _get_or_none(): - mgr_map['x'] = self.mgr_cluster.get_mgr_map() - result = mgr_map['x']['services'].get(service_name, None) - return result - - self.wait_until_true(lambda: _get_or_none() is not None, 30) - - uri = mgr_map['x']['services'][service_name] - - log.info("Found {0} at {1} (daemon {2}/{3})".format( - service_name, uri, mgr_map['x']['active_name'], - mgr_map['x']['active_gid'])) - - return uri - - - def _assign_ports(self, module_name, config_name, min_port=7789): - """ - To avoid the need to run lots of hosts in teuthology tests to - get different URLs per mgr, we will hand out different ports - to each mgr here. - - This is already taken care of for us when running in a vstart - environment. - """ - # Start handing out ports well above Ceph's range. 
- assign_port = min_port - - for mgr_id in self.mgr_cluster.mgr_ids: - self.mgr_cluster.mgr_stop(mgr_id) - self.mgr_cluster.mgr_fail(mgr_id) - - for mgr_id in self.mgr_cluster.mgr_ids: - log.info("Using port {0} for {1} on mgr.{2}".format( - assign_port, module_name, mgr_id - )) - self.mgr_cluster.set_module_localized_conf(module_name, mgr_id, - config_name, - str(assign_port)) - assign_port += 1 - - for mgr_id in self.mgr_cluster.mgr_ids: - self.mgr_cluster.mgr_restart(mgr_id) - - def is_available(): - mgr_map = self.mgr_cluster.get_mgr_map() - done = mgr_map['available'] - if done: - log.info("Available after assign ports (new active {0}/{1})".format( - mgr_map['active_name'] , mgr_map['active_gid'])) - return done - self.wait_until_true(is_available, timeout=30) diff --git a/src/ceph/qa/tasks/mgr/test_dashboard.py b/src/ceph/qa/tasks/mgr/test_dashboard.py deleted file mode 100644 index 3b8a2cc..0000000 --- a/src/ceph/qa/tasks/mgr/test_dashboard.py +++ /dev/null @@ -1,70 +0,0 @@ - - -from mgr_test_case import MgrTestCase - -import logging -import requests - - -log = logging.getLogger(__name__) - - -class TestDashboard(MgrTestCase): - MGRS_REQUIRED = 3 - - def test_standby(self): - self._assign_ports("dashboard", "server_port") - self._load_module("dashboard") - - original_active = self.mgr_cluster.get_active_id() - - original_uri = self._get_uri("dashboard") - log.info("Originally running at {0}".format(original_uri)) - - self.mgr_cluster.mgr_fail(original_active) - - failed_over_uri = self._get_uri("dashboard") - log.info("After failover running at {0}".format(original_uri)) - - self.assertNotEqual(original_uri, failed_over_uri) - - # The original active daemon should have come back up as a standby - # and be doing redirects to the new active daemon - r = requests.get(original_uri, allow_redirects=False) - self.assertEqual(r.status_code, 303) - self.assertEqual(r.headers['Location'], failed_over_uri) - - def test_urls(self): - self._assign_ports("dashboard", "server_port") - self._load_module("dashboard") - - base_uri = self._get_uri("dashboard") - - # This is a very simple smoke test to check that the dashboard can - # give us a 200 response to requests. We're not testing that - # the content is correct or even renders! - - urls = [ - "/health", - "/servers", - "/osd/", - "/osd/perf/0", - "/rbd_mirroring", - "/rbd_iscsi" - ] - - failures = [] - - for url in urls: - r = requests.get(base_uri + url, allow_redirects=False) - if r.status_code >= 300 and r.status_code < 400: - log.error("Unexpected redirect to: {0} (from {1})".format( - r.headers['Location'], base_uri)) - if r.status_code != 200: - failures.append(url) - - log.info("{0}: {1} ({2} bytes)".format( - url, r.status_code, len(r.content) - )) - - self.assertListEqual(failures, []) diff --git a/src/ceph/qa/tasks/mgr/test_failover.py b/src/ceph/qa/tasks/mgr/test_failover.py deleted file mode 100644 index 0dd9cb7..0000000 --- a/src/ceph/qa/tasks/mgr/test_failover.py +++ /dev/null @@ -1,144 +0,0 @@ - -import logging -import json - -from tasks.mgr.mgr_test_case import MgrTestCase - - -log = logging.getLogger(__name__) - - -class TestFailover(MgrTestCase): - MGRS_REQUIRED = 2 - - def test_timeout(self): - """ - That when an active mgr stops responding, a standby is promoted - after mon_mgr_beacon_grace. 
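test_standby above leans on one dashboard behaviour: after failover the old active comes back as a standby and answers every request with a redirect to the new active. A hedged sketch of that check (URIs are illustrative):

    import requests

    def assert_redirects_to(old_uri, new_uri):
        # The previously active dashboard should answer 303 See Other and
        # point at the newly active daemon.
        r = requests.get(old_uri, allow_redirects=False)
        assert r.status_code == 303, 'expected 303, got %d' % r.status_code
        assert r.headers['Location'] == new_uri

    # e.g. assert_redirects_to('http://old-active:7789/', 'http://new-active:7789/')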
- """ - - # Query which mgr is active - original_active = self.mgr_cluster.get_active_id() - original_standbys = self.mgr_cluster.get_standby_ids() - - # Stop that daemon - self.mgr_cluster.mgr_stop(original_active) - - # Assert that the other mgr becomes active - self.wait_until_true( - lambda: self.mgr_cluster.get_active_id() in original_standbys, - timeout=60 - ) - - self.mgr_cluster.mgr_restart(original_active) - self.wait_until_true( - lambda: original_active in self.mgr_cluster.get_standby_ids(), - timeout=10 - ) - - def test_timeout_nostandby(self): - """ - That when an active mgr stop responding, and no standby is - available, the active mgr is removed from the map anyway. - """ - # Query which mgr is active - original_active = self.mgr_cluster.get_active_id() - original_standbys = self.mgr_cluster.get_standby_ids() - - for s in original_standbys: - self.mgr_cluster.mgr_stop(s) - self.mgr_cluster.mgr_fail(s) - - self.assertListEqual(self.mgr_cluster.get_standby_ids(), []) - self.assertEqual(self.mgr_cluster.get_active_id(), original_active) - - grace = int(self.mgr_cluster.get_config("mon_mgr_beacon_grace")) - log.info("Should time out in about {0} seconds".format(grace)) - - self.mgr_cluster.mgr_stop(original_active) - - # Now wait for the mon to notice the mgr is gone and remove it - # from the map. - self.wait_until_equal( - lambda: self.mgr_cluster.get_active_id(), - "", - timeout=grace * 2 - ) - - self.assertListEqual(self.mgr_cluster.get_standby_ids(), []) - self.assertEqual(self.mgr_cluster.get_active_id(), "") - - def test_explicit_fail(self): - """ - That when a user explicitly fails a daemon, a standby immediately - replaces it. - :return: - """ - # Query which mgr is active - original_active = self.mgr_cluster.get_active_id() - original_standbys = self.mgr_cluster.get_standby_ids() - - self.mgr_cluster.mgr_fail(original_active) - - # A standby should take over - self.wait_until_true( - lambda: self.mgr_cluster.get_active_id() in original_standbys, - timeout=60 - ) - - # The one we failed should come back as a standby (he isn't - # really dead) - self.wait_until_true( - lambda: original_active in self.mgr_cluster.get_standby_ids(), - timeout=10 - ) - - # Both daemons should have fully populated metadata - # (regression test for http://tracker.ceph.com/issues/21260) - meta = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd( - "mgr", "metadata")) - id_to_meta = dict([(i['id'], i) for i in meta]) - for i in [original_active] + original_standbys: - self.assertIn(i, id_to_meta) - self.assertIn('ceph_version', id_to_meta[i]) - - # We should be able to fail back over again: the exercises - # our re-initialization of the python runtime within - # a single process lifetime. - - # Get rid of any bystander standbys so that the original_active - # will be selected as next active. 
- new_active = self.mgr_cluster.get_active_id() - for daemon in original_standbys: - if daemon != new_active: - self.mgr_cluster.mgr_stop(daemon) - self.mgr_cluster.mgr_fail(daemon) - - self.assertListEqual(self.mgr_cluster.get_standby_ids(), - [original_active]) - - self.mgr_cluster.mgr_stop(new_active) - self.mgr_cluster.mgr_fail(new_active) - - self.assertEqual(self.mgr_cluster.get_active_id(), original_active) - self.assertEqual(self.mgr_cluster.get_standby_ids(), []) - - def test_standby_timeout(self): - """ - That when a standby daemon stops sending beacons, it is - removed from the list of standbys - :return: - """ - original_active = self.mgr_cluster.get_active_id() - original_standbys = self.mgr_cluster.get_standby_ids() - - victim = original_standbys[0] - self.mgr_cluster.mgr_stop(victim) - - expect_standbys = set(original_standbys) - {victim} - - self.wait_until_true( - lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys, - timeout=60 - ) - self.assertEqual(self.mgr_cluster.get_active_id(), original_active) diff --git a/src/ceph/qa/tasks/mgr/test_module_selftest.py b/src/ceph/qa/tasks/mgr/test_module_selftest.py deleted file mode 100644 index 2776fb8..0000000 --- a/src/ceph/qa/tasks/mgr/test_module_selftest.py +++ /dev/null @@ -1,74 +0,0 @@ - -import time -import requests - -from tasks.mgr.mgr_test_case import MgrTestCase - - -class TestModuleSelftest(MgrTestCase): - """ - That modules with a self-test command can be loaded and execute it - without errors. - - This is not a substitute for really testing the modules, but it - is quick and is designed to catch regressions that could occur - if data structures change in a way that breaks how the modules - touch them. - """ - MGRS_REQUIRED = 1 - - def _selftest_plugin(self, module_name): - self._load_module(module_name) - - # Execute the module's self-test routine - self.mgr_cluster.mon_manager.raw_cluster_cmd(module_name, "self-test") - - def test_zabbix(self): - self._selftest_plugin("zabbix") - - def test_prometheus(self): - self._selftest_plugin("prometheus") - - def test_influx(self): - self._selftest_plugin("influx") - - def test_selftest_run(self): - self._load_module("selftest") - self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test", "run") - - def test_selftest_command_spam(self): - # Use the selftest module to stress the mgr daemon - self._load_module("selftest") - - # Use the dashboard to test that the mgr is still able to do its job - self._assign_ports("dashboard", "server_port") - self._load_module("dashboard") - - original_active = self.mgr_cluster.get_active_id() - original_standbys = self.mgr_cluster.get_standby_ids() - - self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test", - "background", "start", - "command_spam") - - dashboard_uri = self._get_uri("dashboard") - - delay = 10 - periods = 10 - for i in range(0, periods): - t1 = time.time() - # Check that an HTTP module remains responsive - r = requests.get(dashboard_uri) - self.assertEqual(r.status_code, 200) - - # Check that a native non-module command remains responsive - self.mgr_cluster.mon_manager.raw_cluster_cmd("osd", "df") - - time.sleep(delay - (time.time() - t1)) - - self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test", - "background", "stop") - - # Check that all mgr daemons are still running - self.assertEqual(original_active, self.mgr_cluster.get_active_id()) - self.assertEqual(original_standbys, self.mgr_cluster.get_standby_ids()) diff --git a/src/ceph/qa/tasks/mon_clock_skew_check.py 
b/src/ceph/qa/tasks/mon_clock_skew_check.py deleted file mode 100644 index 547339f..0000000 --- a/src/ceph/qa/tasks/mon_clock_skew_check.py +++ /dev/null @@ -1,76 +0,0 @@ -""" -Handle clock skews in monitors. -""" -import logging -import contextlib -import ceph_manager -import time -import gevent -from StringIO import StringIO -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -class ClockSkewCheck: - """ - Check if there are any clock skews among the monitors in the - quorum. - - This task accepts the following options: - - interval amount of seconds to wait before check. (default: 30.0) - expect-skew 'true' or 'false', to indicate whether to expect a skew during - the run or not. If 'true', the test will fail if no skew is - found, and succeed if a skew is indeed found; if 'false', it's - the other way around. (default: false) - - - mon_clock_skew_check: - expect-skew: true - """ - - def __init__(self, ctx, manager, config, logger): - self.ctx = ctx - self.manager = manager - - self.stopping = False - self.logger = logger - self.config = config - - if self.config is None: - self.config = dict() - - -def task(ctx, config): - if config is None: - config = {} - assert isinstance(config, dict), \ - 'mon_clock_skew_check task only accepts a dict for configuration' - interval = float(config.get('interval', 30.0)) - expect_skew = config.get('expect-skew', False) - - log.info('Beginning mon_clock_skew_check...') - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - quorum_size = len(teuthology.get_mon_names(ctx)) - manager.wait_for_mon_quorum_size(quorum_size) - - # wait a bit - log.info('sleeping for {s} seconds'.format( - s=interval)) - time.sleep(interval) - - health = manager.get_mon_health(True) - log.info('got health %s' % health) - if expect_skew: - if 'MON_CLOCK_SKEW' not in health['checks']: - raise RuntimeError('expected MON_CLOCK_SKEW but got none') - else: - if 'MON_CLOCK_SKEW' in health['checks']: - raise RuntimeError('got MON_CLOCK_SKEW but expected none') - diff --git a/src/ceph/qa/tasks/mon_recovery.py b/src/ceph/qa/tasks/mon_recovery.py deleted file mode 100644 index bfa2cdf..0000000 --- a/src/ceph/qa/tasks/mon_recovery.py +++ /dev/null @@ -1,80 +0,0 @@ -""" -Monitor recovery -""" -import logging -import ceph_manager -from teuthology import misc as teuthology - - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Test monitor recovery. 
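The clock-skew task above reduces to a single assertion about the health report: the MON_CLOCK_SKEW check must be present exactly when a skew is expected. A small sketch of that decision (assuming the health dict carries a top-level 'checks' map, as the task's usage suggests):

    def check_skew(health, expect_skew):
        skewed = 'MON_CLOCK_SKEW' in health.get('checks', {})
        if expect_skew and not skewed:
            raise RuntimeError('expected MON_CLOCK_SKEW but got none')
        if skewed and not expect_skew:
            raise RuntimeError('got MON_CLOCK_SKEW but expected none')

    # e.g. check_skew({'checks': {'MON_CLOCK_SKEW': {}}}, expect_skew=True)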
- """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'task only accepts a dict for configuration' - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - mons = [f.split('.')[1] for f in teuthology.get_mon_names(ctx)] - log.info("mon ids = %s" % mons) - - manager.wait_for_mon_quorum_size(len(mons)) - - log.info('verifying all monitors are in the quorum') - for m in mons: - s = manager.get_mon_status(m) - assert s['state'] == 'leader' or s['state'] == 'peon' - assert len(s['quorum']) == len(mons) - - log.info('restarting each monitor in turn') - for m in mons: - # stop a monitor - manager.kill_mon(m) - manager.wait_for_mon_quorum_size(len(mons) - 1) - - # restart - manager.revive_mon(m) - manager.wait_for_mon_quorum_size(len(mons)) - - # in forward and reverse order, - rmons = mons - rmons.reverse() - for mons in mons, rmons: - log.info('stopping all monitors') - for m in mons: - manager.kill_mon(m) - - log.info('forming a minimal quorum for %s, then adding monitors' % mons) - qnum = (len(mons) / 2) + 1 - num = 0 - for m in mons: - manager.revive_mon(m) - num += 1 - if num >= qnum: - manager.wait_for_mon_quorum_size(num) - - # on both leader and non-leader ranks... - for rank in [0, 1]: - # take one out - log.info('removing mon %s' % mons[rank]) - manager.kill_mon(mons[rank]) - manager.wait_for_mon_quorum_size(len(mons) - 1) - - log.info('causing some monitor log activity') - m = 30 - for n in range(1, m): - manager.raw_cluster_cmd('log', '%d of %d' % (n, m)) - - log.info('adding mon %s back in' % mons[rank]) - manager.revive_mon(mons[rank]) - manager.wait_for_mon_quorum_size(len(mons)) diff --git a/src/ceph/qa/tasks/mon_seesaw.py b/src/ceph/qa/tasks/mon_seesaw.py deleted file mode 100644 index b101c0e..0000000 --- a/src/ceph/qa/tasks/mon_seesaw.py +++ /dev/null @@ -1,198 +0,0 @@ -from cStringIO import StringIO - -import contextlib -import logging -import random - -from teuthology import misc as teuthology -from teuthology.orchestra import run - -from ceph_manager import CephManager, write_conf - - -log = logging.getLogger(__name__) - - -def _get_mons(ctx): - return [name[len('mon.'):] for name in teuthology.get_mon_names(ctx)] - - -# teuthology prepares the monitor IPs (and ports) in get_mons(), we can -# enumerate all monitor ports ([6789..]), and find the next available one. -def _get_next_port(ctx, ip, cluster): - # assuming we have only one cluster here. 
- used = [] - for name in teuthology.get_mon_names(ctx, cluster): - addr = ctx.ceph[cluster].conf[name]['mon addr'] - mon_ip, mon_port = addr.split(':') - if mon_ip != ip: - continue - used.append(int(mon_port)) - port = 6789 - used.sort() - for p in used: - if p != port: - break - port += 1 - return port - - -def _setup_mon(ctx, manager, remote, mon, name, data_path, conf_path): - # co-locate a new monitor on remote where an existing monitor is hosted - cluster = manager.cluster - remote.run(args=['sudo', 'mkdir', '-p', data_path]) - keyring_path = '/etc/ceph/{cluster}.keyring'.format( - cluster=manager.cluster) - testdir = teuthology.get_testdir(ctx) - monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir, - cluster=cluster) - manager.raw_cluster_cmd('mon', 'getmap', '-o', monmap_path) - if manager.controller != remote: - monmap = teuthology.get_file(manager.controller, monmap_path) - teuthology.write_file(remote, monmap_path, StringIO(monmap)) - remote.run( - args=[ - 'sudo', - 'ceph-mon', - '--cluster', cluster, - '--mkfs', - '-i', mon, - '--monmap', monmap_path, - '--keyring', keyring_path]) - if manager.controller != remote: - teuthology.delete_file(remote, monmap_path) - # raw_cluster_cmd() is performed using sudo, so sudo here also. - teuthology.delete_file(manager.controller, monmap_path, sudo=True) - # update ceph.conf so that the ceph CLI is able to connect to the cluster - if conf_path: - ip = remote.ip_address - port = _get_next_port(ctx, ip, cluster) - mon_addr = '{ip}:{port}'.format(ip=ip, port=port) - ctx.ceph[cluster].conf[name] = {'mon addr': mon_addr} - write_conf(ctx, conf_path, cluster) - - -def _teardown_mon(ctx, manager, remote, name, data_path, conf_path): - cluster = manager.cluster - del ctx.ceph[cluster].conf[name] - write_conf(ctx, conf_path, cluster) - remote.run(args=['sudo', 'rm', '-rf', data_path]) - - -@contextlib.contextmanager -def _prepare_mon(ctx, manager, remote, mon): - cluster = manager.cluster - data_path = '/var/lib/ceph/mon/{cluster}-{id}'.format( - cluster=cluster, id=mon) - conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster) - name = 'mon.{0}'.format(mon) - _setup_mon(ctx, manager, remote, mon, name, data_path, conf_path) - yield - _teardown_mon(ctx, manager, remote, name, - data_path, conf_path) - - -# run_daemon() in ceph.py starts a herd of daemons of the same type, but -# _run_daemon() starts only one instance. -@contextlib.contextmanager -def _run_daemon(ctx, remote, cluster, type_, id_): - testdir = teuthology.get_testdir(ctx) - coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir) - daemon_signal = 'kill' - run_cmd = [ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - coverage_dir, - 'daemon-helper', - daemon_signal, - ] - run_cmd_tail = [ - 'ceph-%s' % (type_), - '-f', - '--cluster', cluster, - '-i', id_] - run_cmd.extend(run_cmd_tail) - ctx.daemons.add_daemon(remote, type_, id_, - cluster=cluster, - args=run_cmd, - logger=log.getChild(type_), - stdin=run.PIPE, - wait=False) - daemon = ctx.daemons.get_daemon(type_, id_, cluster) - yield daemon - daemon.stop() - - -@contextlib.contextmanager -def task(ctx, config): - """ - replace a monitor with a newly added one, and then revert this change - - How it works:: - 1. add a mon with specified id (mon.victim_prime) - 2. wait for quorum - 3. remove a monitor with specified id (mon.victim), mon.victim will commit - suicide - 4. wait for quorum - 5. <yield> - 5. add mon.a back, and start it - 6. wait for quorum - 7. 
remove mon.a_prime - - Options:: - victim the id of the mon to be removed (pick a random mon by default) - replacer the id of the new mon (use "${victim}_prime" if not specified) - """ - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - manager = CephManager(mon, ctx=ctx, logger=log.getChild('ceph_manager')) - - if config is None: - config = {} - assert isinstance(config, dict), \ - "task ceph only supports a dictionary for configuration" - overrides = ctx.config.get('overrides', {}) - teuthology.deep_merge(config, overrides.get('mon_seesaw', {})) - victim = config.get('victim', random.choice(_get_mons(ctx))) - replacer = config.get('replacer', '{0}_prime'.format(victim)) - remote = manager.find_remote('mon', victim) - quorum = manager.get_mon_quorum() - cluster = manager.cluster - log.info('replacing {victim} with {replacer}'.format(victim=victim, - replacer=replacer)) - with _prepare_mon(ctx, manager, remote, replacer): - with _run_daemon(ctx, remote, cluster, 'mon', replacer): - # replacer will join the quorum automatically - manager.wait_for_mon_quorum_size(len(quorum) + 1, 10) - # if we don't remove the victim from monmap, there is chance that - # we are leaving the new joiner with a monmap of 2 mon, and it will - # not able to reach the other one, it will be keeping probing for - # ever. - log.info('removing {mon}'.format(mon=victim)) - manager.raw_cluster_cmd('mon', 'remove', victim) - manager.wait_for_mon_quorum_size(len(quorum), 10) - # the victim will commit suicide after being removed from - # monmap, let's wait until it stops. - ctx.daemons.get_daemon('mon', victim, cluster).wait(10) - try: - # perform other tasks - yield - finally: - # bring the victim back online - # nuke the monstore of victim, otherwise it will refuse to boot - # with following message: - # - # not in monmap and have been in a quorum before; must have - # been removed - log.info('re-adding {mon}'.format(mon=victim)) - data_path = '/var/lib/ceph/mon/{cluster}-{id}'.format( - cluster=cluster, id=victim) - remote.run(args=['sudo', 'rm', '-rf', data_path]) - name = 'mon.{0}'.format(victim) - _setup_mon(ctx, manager, remote, victim, name, data_path, None) - log.info('reviving {mon}'.format(mon=victim)) - manager.revive_mon(victim) - manager.wait_for_mon_quorum_size(len(quorum) + 1, 10) - manager.raw_cluster_cmd('mon', 'remove', replacer) - manager.wait_for_mon_quorum_size(len(quorum), 10) diff --git a/src/ceph/qa/tasks/mon_thrash.py b/src/ceph/qa/tasks/mon_thrash.py deleted file mode 100644 index 0754bcd..0000000 --- a/src/ceph/qa/tasks/mon_thrash.py +++ /dev/null @@ -1,343 +0,0 @@ -""" -Monitor thrash -""" -import logging -import contextlib -import ceph_manager -import random -import time -import gevent -import json -import math -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -def _get_mons(ctx): - """ - Get monitor names from the context value. 
- """ - mons = [f[len('mon.'):] for f in teuthology.get_mon_names(ctx)] - return mons - -class MonitorThrasher: - """ - How it works:: - - - pick a monitor - - kill it - - wait for quorum to be formed - - sleep for 'revive_delay' seconds - - revive monitor - - wait for quorum to be formed - - sleep for 'thrash_delay' seconds - - Options:: - - seed Seed to use on the RNG to reproduce a previous - behaviour (default: None; i.e., not set) - revive_delay Number of seconds to wait before reviving - the monitor (default: 10) - thrash_delay Number of seconds to wait in-between - test iterations (default: 0) - thrash_store Thrash monitor store before killing the monitor being thrashed (default: False) - thrash_store_probability Probability of thrashing a monitor's store - (default: 50) - thrash_many Thrash multiple monitors instead of just one. If - 'maintain-quorum' is set to False, then we will - thrash up to as many monitors as there are - available. (default: False) - maintain_quorum Always maintain quorum, taking care on how many - monitors we kill during the thrashing. If we - happen to only have one or two monitors configured, - if this option is set to True, then we won't run - this task as we cannot guarantee maintenance of - quorum. Setting it to false however would allow the - task to run with as many as just one single monitor. - (default: True) - freeze_mon_probability: how often to freeze the mon instead of killing it, - in % (default: 0) - freeze_mon_duration: how many seconds to freeze the mon (default: 15) - scrub Scrub after each iteration (default: True) - - Note: if 'store-thrash' is set to True, then 'maintain-quorum' must also - be set to True. - - For example:: - - tasks: - - ceph: - - mon_thrash: - revive_delay: 20 - thrash_delay: 1 - thrash_store: true - thrash_store_probability: 40 - seed: 31337 - maintain_quorum: true - thrash_many: true - - ceph-fuse: - - workunit: - clients: - all: - - mon/workloadgen.sh - """ - def __init__(self, ctx, manager, config, logger): - self.ctx = ctx - self.manager = manager - self.manager.wait_for_clean() - - self.stopping = False - self.logger = logger - self.config = config - - if self.config is None: - self.config = dict() - - """ Test reproducibility """ - self.random_seed = self.config.get('seed', None) - - if self.random_seed is None: - self.random_seed = int(time.time()) - - self.rng = random.Random() - self.rng.seed(int(self.random_seed)) - - """ Monitor thrashing """ - self.revive_delay = float(self.config.get('revive_delay', 10.0)) - self.thrash_delay = float(self.config.get('thrash_delay', 0.0)) - - self.thrash_many = self.config.get('thrash_many', False) - self.maintain_quorum = self.config.get('maintain_quorum', True) - - self.scrub = self.config.get('scrub', True) - - self.freeze_mon_probability = float(self.config.get('freeze_mon_probability', 10)) - self.freeze_mon_duration = float(self.config.get('freeze_mon_duration', 15.0)) - - assert self.max_killable() > 0, \ - 'Unable to kill at least one monitor with the current config.' 
- - """ Store thrashing """ - self.store_thrash = self.config.get('store_thrash', False) - self.store_thrash_probability = int( - self.config.get('store_thrash_probability', 50)) - if self.store_thrash: - assert self.store_thrash_probability > 0, \ - 'store_thrash is set, probability must be > 0' - assert self.maintain_quorum, \ - 'store_thrash = true must imply maintain_quorum = true' - - self.thread = gevent.spawn(self.do_thrash) - - def log(self, x): - """ - locally log info messages - """ - self.logger.info(x) - - def do_join(self): - """ - Break out of this processes thrashing loop. - """ - self.stopping = True - self.thread.get() - - def should_thrash_store(self): - """ - If allowed, indicate that we should thrash a certain percentage of - the time as determined by the store_thrash_probability value. - """ - if not self.store_thrash: - return False - return self.rng.randrange(0, 101) < self.store_thrash_probability - - def thrash_store(self, mon): - """ - Thrash the monitor specified. - :param mon: monitor to thrash - """ - addr = self.ctx.ceph['ceph'].conf['mon.%s' % mon]['mon addr'] - self.log('thrashing mon.{id}@{addr} store'.format(id=mon, addr=addr)) - out = self.manager.raw_cluster_cmd('-m', addr, 'sync', 'force') - j = json.loads(out) - assert j['ret'] == 0, \ - 'error forcing store sync on mon.{id}:\n{ret}'.format( - id=mon,ret=out) - - def should_freeze_mon(self): - """ - Indicate that we should freeze a certain percentago of the time - as determined by the freeze_mon_probability value. - """ - return self.rng.randrange(0, 101) < self.freeze_mon_probability - - def freeze_mon(self, mon): - """ - Send STOP signal to freeze the monitor. - """ - log.info('Sending STOP to mon %s', mon) - self.manager.signal_mon(mon, 19) # STOP - - def unfreeze_mon(self, mon): - """ - Send CONT signal to unfreeze the monitor. - """ - log.info('Sending CONT to mon %s', mon) - self.manager.signal_mon(mon, 18) # CONT - - def kill_mon(self, mon): - """ - Kill the monitor specified - """ - self.log('killing mon.{id}'.format(id=mon)) - self.manager.kill_mon(mon) - - def revive_mon(self, mon): - """ - Revive the monitor specified - """ - self.log('killing mon.{id}'.format(id=mon)) - self.log('reviving mon.{id}'.format(id=mon)) - self.manager.revive_mon(mon) - - def max_killable(self): - """ - Return the maximum number of monitors we can kill. - """ - m = len(_get_mons(self.ctx)) - if self.maintain_quorum: - return max(math.ceil(m/2.0)-1, 0) - else: - return m - - def do_thrash(self): - """ - Cotinuously loop and thrash the monitors. 
- """ - self.log('start thrashing') - self.log('seed: {s}, revive delay: {r}, thrash delay: {t} '\ - 'thrash many: {tm}, maintain quorum: {mq} '\ - 'store thrash: {st}, probability: {stp} '\ - 'freeze mon: prob {fp} duration {fd}'.format( - s=self.random_seed,r=self.revive_delay,t=self.thrash_delay, - tm=self.thrash_many, mq=self.maintain_quorum, - st=self.store_thrash,stp=self.store_thrash_probability, - fp=self.freeze_mon_probability,fd=self.freeze_mon_duration, - )) - - while not self.stopping: - mons = _get_mons(self.ctx) - self.manager.wait_for_mon_quorum_size(len(mons)) - self.log('making sure all monitors are in the quorum') - for m in mons: - s = self.manager.get_mon_status(m) - assert s['state'] == 'leader' or s['state'] == 'peon' - assert len(s['quorum']) == len(mons) - - kill_up_to = self.rng.randrange(1, self.max_killable()+1) - mons_to_kill = self.rng.sample(mons, kill_up_to) - self.log('monitors to thrash: {m}'.format(m=mons_to_kill)) - - mons_to_freeze = [] - for mon in mons: - if mon in mons_to_kill: - continue - if self.should_freeze_mon(): - mons_to_freeze.append(mon) - self.log('monitors to freeze: {m}'.format(m=mons_to_freeze)) - - for mon in mons_to_kill: - self.log('thrashing mon.{m}'.format(m=mon)) - - """ we only thrash stores if we are maintaining quorum """ - if self.should_thrash_store() and self.maintain_quorum: - self.thrash_store(mon) - - self.kill_mon(mon) - - if mons_to_freeze: - for mon in mons_to_freeze: - self.freeze_mon(mon) - self.log('waiting for {delay} secs to unfreeze mons'.format( - delay=self.freeze_mon_duration)) - time.sleep(self.freeze_mon_duration) - for mon in mons_to_freeze: - self.unfreeze_mon(mon) - - if self.maintain_quorum: - self.manager.wait_for_mon_quorum_size(len(mons)-len(mons_to_kill)) - for m in mons: - if m in mons_to_kill: - continue - s = self.manager.get_mon_status(m) - assert s['state'] == 'leader' or s['state'] == 'peon' - assert len(s['quorum']) == len(mons)-len(mons_to_kill) - - self.log('waiting for {delay} secs before reviving monitors'.format( - delay=self.revive_delay)) - time.sleep(self.revive_delay) - - for mon in mons_to_kill: - self.revive_mon(mon) - # do more freezes - if mons_to_freeze: - for mon in mons_to_freeze: - self.freeze_mon(mon) - self.log('waiting for {delay} secs to unfreeze mons'.format( - delay=self.freeze_mon_duration)) - time.sleep(self.freeze_mon_duration) - for mon in mons_to_freeze: - self.unfreeze_mon(mon) - - self.manager.wait_for_mon_quorum_size(len(mons)) - for m in mons: - s = self.manager.get_mon_status(m) - assert s['state'] == 'leader' or s['state'] == 'peon' - assert len(s['quorum']) == len(mons) - - if self.scrub: - self.log('triggering scrub') - try: - self.manager.raw_cluster_cmd('scrub') - except Exception: - log.exception("Saw exception while triggering scrub") - - if self.thrash_delay > 0.0: - self.log('waiting for {delay} secs before continuing thrashing'.format( - delay=self.thrash_delay)) - time.sleep(self.thrash_delay) - -@contextlib.contextmanager -def task(ctx, config): - """ - Stress test the monitor by thrashing them while another task/workunit - is running. - - Please refer to MonitorThrasher class for further information on the - available options. 
- """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'mon_thrash task only accepts a dict for configuration' - assert len(_get_mons(ctx)) > 2, \ - 'mon_thrash task requires at least 3 monitors' - log.info('Beginning mon_thrash...') - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - thrash_proc = MonitorThrasher(ctx, - manager, config, - logger=log.getChild('mon_thrasher')) - try: - log.debug('Yielding') - yield - finally: - log.info('joining mon_thrasher') - thrash_proc.do_join() - mons = _get_mons(ctx) - manager.wait_for_mon_quorum_size(len(mons)) diff --git a/src/ceph/qa/tasks/multibench.py b/src/ceph/qa/tasks/multibench.py deleted file mode 100644 index 53b1aa5..0000000 --- a/src/ceph/qa/tasks/multibench.py +++ /dev/null @@ -1,60 +0,0 @@ -""" -Multibench testing -""" -import contextlib -import logging -import radosbench -import time -import copy -import gevent - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - Run multibench - - The config should be as follows: - - multibench: - time: <seconds to run total> - segments: <number of concurrent benches> - radosbench: <config for radosbench> - - example: - - tasks: - - ceph: - - multibench: - clients: [client.0] - time: 360 - - interactive: - """ - log.info('Beginning multibench...') - assert isinstance(config, dict), \ - "please list clients to run on" - - def run_one(num): - """Run test spawn from gevent""" - start = time.time() - if not config.get('radosbench'): - benchcontext = {} - else: - benchcontext = copy.copy(config.get('radosbench')) - iterations = 0 - while time.time() - start < int(config.get('time', 600)): - log.info("Starting iteration %s of segment %s"%(iterations, num)) - benchcontext['pool'] = str(num) + "-" + str(iterations) - with radosbench.task(ctx, benchcontext): - time.sleep() - iterations += 1 - log.info("Starting %s threads"%(str(config.get('segments', 3)),)) - segments = [ - gevent.spawn(run_one, i) - for i in range(0, int(config.get('segments', 3)))] - - try: - yield - finally: - [i.get() for i in segments] diff --git a/src/ceph/qa/tasks/object_source_down.py b/src/ceph/qa/tasks/object_source_down.py deleted file mode 100644 index 9705d7c..0000000 --- a/src/ceph/qa/tasks/object_source_down.py +++ /dev/null @@ -1,101 +0,0 @@ -""" -Test Object locations going down -""" -import logging -import ceph_manager -import time -from teuthology import misc as teuthology -from util.rados import rados - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Test handling of object location going down - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'lost_unfound task only accepts a dict for configuration' - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - while len(manager.get_osd_status()['up']) < 3: - time.sleep(10) - manager.wait_for_clean() - - # something that is always there - dummyfile = '/etc/fstab' - - # take 0, 1 out - manager.mark_out_osd(0) - manager.mark_out_osd(1) - manager.wait_for_clean() - - # delay recovery, and make the pg log very long (to prevent backfill) - manager.raw_cluster_cmd( - 'tell', 'osd.0', - 'injectargs', - '--osd-recovery-delay-start 10000 
--osd-min-pg-log-entries 100000000' - ) - # delay recovery, and make the pg log very long (to prevent backfill) - manager.raw_cluster_cmd( - 'tell', 'osd.1', - 'injectargs', - '--osd-recovery-delay-start 10000 --osd-min-pg-log-entries 100000000' - ) - # delay recovery, and make the pg log very long (to prevent backfill) - manager.raw_cluster_cmd( - 'tell', 'osd.2', - 'injectargs', - '--osd-recovery-delay-start 10000 --osd-min-pg-log-entries 100000000' - ) - # delay recovery, and make the pg log very long (to prevent backfill) - manager.raw_cluster_cmd( - 'tell', 'osd.3', - 'injectargs', - '--osd-recovery-delay-start 10000 --osd-min-pg-log-entries 100000000' - ) - - # kludge to make sure they get a map - rados(ctx, mon, ['-p', 'data', 'put', 'dummy', dummyfile]) - - # create old objects - for f in range(1, 10): - rados(ctx, mon, ['-p', 'data', 'put', 'existing_%d' % f, dummyfile]) - - manager.mark_out_osd(3) - manager.wait_till_active() - - manager.mark_in_osd(0) - manager.wait_till_active() - - manager.flush_pg_stats([2, 0]) - - manager.mark_out_osd(2) - manager.wait_till_active() - - # bring up 1 - manager.mark_in_osd(1) - manager.wait_till_active() - - manager.flush_pg_stats([0, 1]) - log.info("Getting unfound objects") - unfound = manager.get_num_unfound_objects() - assert not unfound - - manager.kill_osd(2) - manager.mark_down_osd(2) - manager.kill_osd(3) - manager.mark_down_osd(3) - - manager.flush_pg_stats([0, 1]) - log.info("Getting unfound objects") - unfound = manager.get_num_unfound_objects() - assert unfound diff --git a/src/ceph/qa/tasks/omapbench.py b/src/ceph/qa/tasks/omapbench.py deleted file mode 100644 index e026c74..0000000 --- a/src/ceph/qa/tasks/omapbench.py +++ /dev/null @@ -1,83 +0,0 @@ -""" -Run omapbench executable within teuthology -""" -import contextlib -import logging - -from teuthology.orchestra import run -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - Run omapbench - - The config should be as follows:: - - omapbench: - clients: [client list] - threads: <threads at once> - objects: <number of objects to write> - entries: <number of entries per object map> - keysize: <number of characters per object map key> - valsize: <number of characters per object map val> - increment: <interval to show in histogram (in ms)> - omaptype: <how the omaps should be generated> - - example:: - - tasks: - - ceph: - - omapbench: - clients: [client.0] - threads: 30 - objects: 1000 - entries: 10 - keysize: 10 - valsize: 100 - increment: 100 - omaptype: uniform - - interactive: - """ - log.info('Beginning omapbench...') - assert isinstance(config, dict), \ - "please list clients to run on" - omapbench = {} - testdir = teuthology.get_testdir(ctx) - print(str(config.get('increment',-1))) - for role in config.get('clients', ['client.0']): - assert isinstance(role, basestring) - PREFIX = 'client.' 
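    # Illustrative aside (not from the removed file): the PREFIX handling at
    # this point encodes the teuthology role naming convention -- roles look
    # like 'client.0' and the per-client id is whatever follows the 'client.'
    # prefix. A hedged sketch of that convention as a hypothetical helper:
    def client_id(role, prefix='client.'):
        assert role.startswith(prefix), 'not a client role: %r' % role
        return role[len(prefix):]
    # e.g. client_id('client.0') == '0'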
- assert role.startswith(PREFIX) - id_ = role[len(PREFIX):] - (remote,) = ctx.cluster.only(role).remotes.iterkeys() - proc = remote.run( - args=[ - "/bin/sh", "-c", - " ".join(['adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage', - 'omapbench', - '--name', role[len(PREFIX):], - '-t', str(config.get('threads', 30)), - '-o', str(config.get('objects', 1000)), - '--entries', str(config.get('entries',10)), - '--keysize', str(config.get('keysize',10)), - '--valsize', str(config.get('valsize',1000)), - '--inc', str(config.get('increment',10)), - '--omaptype', str(config.get('omaptype','uniform')) - ]).format(tdir=testdir), - ], - logger=log.getChild('omapbench.{id}'.format(id=id_)), - stdin=run.PIPE, - wait=False - ) - omapbench[id_] = proc - - try: - yield - finally: - log.info('joining omapbench') - run.wait(omapbench.itervalues()) diff --git a/src/ceph/qa/tasks/osd_backfill.py b/src/ceph/qa/tasks/osd_backfill.py deleted file mode 100644 index 04658d2..0000000 --- a/src/ceph/qa/tasks/osd_backfill.py +++ /dev/null @@ -1,104 +0,0 @@ -""" -Osd backfill test -""" -import logging -import ceph_manager -import time -from teuthology import misc as teuthology - - -log = logging.getLogger(__name__) - - -def rados_start(ctx, remote, cmd): - """ - Run a remote rados command (currently used to only write data) - """ - log.info("rados %s" % ' '.join(cmd)) - testdir = teuthology.get_testdir(ctx) - pre = [ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'rados', - ]; - pre.extend(cmd) - proc = remote.run( - args=pre, - wait=False, - ) - return proc - -def task(ctx, config): - """ - Test backfill - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'thrashosds task only accepts a dict for configuration' - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd') - log.info('num_osds is %s' % num_osds) - assert num_osds == 3 - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - while len(manager.get_osd_status()['up']) < 3: - time.sleep(10) - manager.flush_pg_stats([0, 1, 2]) - manager.wait_for_clean() - - # write some data - p = rados_start(ctx, mon, ['-p', 'rbd', 'bench', '15', 'write', '-b', '4096', - '--no-cleanup']) - err = p.wait() - log.info('err is %d' % err) - - # mark osd.0 out to trigger a rebalance/backfill - manager.mark_out_osd(0) - - # also mark it down to it won't be included in pg_temps - manager.kill_osd(0) - manager.mark_down_osd(0) - - # wait for everything to peer and be happy... 
- manager.flush_pg_stats([1, 2]) - manager.wait_for_recovery() - - # write some new data - p = rados_start(ctx, mon, ['-p', 'rbd', 'bench', '30', 'write', '-b', '4096', - '--no-cleanup']) - - time.sleep(15) - - # blackhole + restart osd.1 - # this triggers a divergent backfill target - manager.blackhole_kill_osd(1) - time.sleep(2) - manager.revive_osd(1) - - # wait for our writes to complete + succeed - err = p.wait() - log.info('err is %d' % err) - - # wait for osd.1 and osd.2 to be up - manager.wait_till_osd_is_up(1) - manager.wait_till_osd_is_up(2) - - # cluster must recover - manager.flush_pg_stats([1, 2]) - manager.wait_for_recovery() - - # re-add osd.0 - manager.revive_osd(0) - manager.flush_pg_stats([1, 2]) - manager.wait_for_clean() - - diff --git a/src/ceph/qa/tasks/osd_failsafe_enospc.py b/src/ceph/qa/tasks/osd_failsafe_enospc.py deleted file mode 100644 index 6910854..0000000 --- a/src/ceph/qa/tasks/osd_failsafe_enospc.py +++ /dev/null @@ -1,218 +0,0 @@ -""" -Handle osdfailsafe configuration settings (nearfull ratio and full ratio) -""" -from cStringIO import StringIO -import logging -import time - -from teuthology.orchestra import run -from util.rados import rados -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Test handling of osd_failsafe_nearfull_ratio and osd_failsafe_full_ratio - configuration settings - - In order for test to pass must use log-whitelist as follows - - tasks: - - chef: - - install: - - ceph: - log-whitelist: ['OSD near full', 'OSD full dropping all updates'] - - osd_failsafe_enospc: - - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'osd_failsafe_enospc task only accepts a dict for configuration' - - # Give 2 seconds for injectargs + osd_op_complaint_time (30) + 2 * osd_heartbeat_interval (6) + 6 padding - sleep_time = 50 - - # something that is always there - dummyfile = '/etc/fstab' - dummyfile2 = '/etc/resolv.conf' - - manager = ctx.managers['ceph'] - - # create 1 pg pool with 1 rep which can only be on osd.0 - osds = manager.get_osd_dump() - for osd in osds: - if osd['osd'] != 0: - manager.mark_out_osd(osd['osd']) - - log.info('creating pool foo') - manager.create_pool("foo") - manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'size', '1') - - # State NONE -> NEAR - log.info('1. Verify warning messages when exceeding nearfull_ratio') - - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - proc = mon.run( - args=[ - 'sudo', - 'daemon-helper', - 'kill', - 'ceph', '-w' - ], - stdin=run.PIPE, - stdout=StringIO(), - wait=False, - ) - - manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .00001') - - time.sleep(sleep_time) - proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w - proc.wait() - - lines = proc.stdout.getvalue().split('\n') - - count = len(filter(lambda line: '[WRN] OSD near full' in line, lines)) - assert count == 2, 'Incorrect number of warning messages expected 2 got %d' % count - count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines)) - assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count - - # State NEAR -> FULL - log.info('2. 
Verify error messages when exceeding full_ratio') - - proc = mon.run( - args=[ - 'sudo', - 'daemon-helper', - 'kill', - 'ceph', '-w' - ], - stdin=run.PIPE, - stdout=StringIO(), - wait=False, - ) - - manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001') - - time.sleep(sleep_time) - proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w - proc.wait() - - lines = proc.stdout.getvalue().split('\n') - - count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines)) - assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count - - log.info('3. Verify write failure when exceeding full_ratio') - - # Write data should fail - ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile1', dummyfile]) - assert ret != 0, 'Expected write failure but it succeeded with exit status 0' - - # Put back default - manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97') - time.sleep(10) - - # State FULL -> NEAR - log.info('4. Verify write success when NOT exceeding full_ratio') - - # Write should succeed - ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile2', dummyfile2]) - assert ret == 0, 'Expected write to succeed, but got exit status %d' % ret - - log.info('5. Verify warning messages again when exceeding nearfull_ratio') - - proc = mon.run( - args=[ - 'sudo', - 'daemon-helper', - 'kill', - 'ceph', '-w' - ], - stdin=run.PIPE, - stdout=StringIO(), - wait=False, - ) - - time.sleep(sleep_time) - proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w - proc.wait() - - lines = proc.stdout.getvalue().split('\n') - - count = len(filter(lambda line: '[WRN] OSD near full' in line, lines)) - assert count == 1 or count == 2, 'Incorrect number of warning messages expected 1 or 2 got %d' % count - count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines)) - assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count - - manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .90') - time.sleep(10) - - # State NONE -> FULL - log.info('6. Verify error messages again when exceeding full_ratio') - - proc = mon.run( - args=[ - 'sudo', - 'daemon-helper', - 'kill', - 'ceph', '-w' - ], - stdin=run.PIPE, - stdout=StringIO(), - wait=False, - ) - - manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001') - - time.sleep(sleep_time) - proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w - proc.wait() - - lines = proc.stdout.getvalue().split('\n') - - count = len(filter(lambda line: '[WRN] OSD near full' in line, lines)) - assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count - count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines)) - assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count - - # State FULL -> NONE - log.info('7. 
Verify no messages settings back to default') - - manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97') - time.sleep(10) - - proc = mon.run( - args=[ - 'sudo', - 'daemon-helper', - 'kill', - 'ceph', '-w' - ], - stdin=run.PIPE, - stdout=StringIO(), - wait=False, - ) - - time.sleep(sleep_time) - proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w - proc.wait() - - lines = proc.stdout.getvalue().split('\n') - - count = len(filter(lambda line: '[WRN] OSD near full' in line, lines)) - assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count - count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines)) - assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count - - log.info('Test Passed') - - # Bring all OSDs back in - manager.remove_pool("foo") - for osd in osds: - if osd['osd'] != 0: - manager.mark_in_osd(osd['osd']) diff --git a/src/ceph/qa/tasks/osd_max_pg_per_osd.py b/src/ceph/qa/tasks/osd_max_pg_per_osd.py deleted file mode 100644 index b4e2aa4..0000000 --- a/src/ceph/qa/tasks/osd_max_pg_per_osd.py +++ /dev/null @@ -1,126 +0,0 @@ -import logging -import random - - -log = logging.getLogger(__name__) - - -def pg_num_in_all_states(pgs, *states): - return sum(1 for state in pgs.itervalues() - if all(s in state for s in states)) - - -def pg_num_in_any_state(pgs, *states): - return sum(1 for state in pgs.itervalues() - if any(s in state for s in states)) - - -def test_create_from_mon(ctx, config): - """ - osd should stop creating new pools if the number of pg it servers - exceeds the max-pg-per-osd setting, and it should resume the previously - suspended pg creations once the its pg number drops down below the setting - How it works:: - 1. set the hard limit of pg-per-osd to "2" - 2. create pool.a with pg_num=2 - # all pgs should be active+clean - 2. create pool.b with pg_num=2 - # new pgs belonging to this pool should be unknown (the primary osd - reaches the limit) or creating (replica osd reaches the limit) - 3. remove pool.a - 4. all pg belonging to pool.b should be active+clean - """ - pg_num = config.get('pg_num', 2) - manager = ctx.managers['ceph'] - log.info('1. creating pool.a') - pool_a = manager.create_pool_with_unique_name(pg_num) - manager.wait_for_clean() - assert manager.get_num_active_clean() == pg_num - - log.info('2. creating pool.b') - pool_b = manager.create_pool_with_unique_name(pg_num) - pg_states = manager.wait_till_pg_convergence(300) - pg_created = pg_num_in_all_states(pg_states, 'active', 'clean') - assert pg_created == pg_num - pg_pending = pg_num_in_any_state(pg_states, 'unknown', 'creating') - assert pg_pending == pg_num - - log.info('3. removing pool.a') - manager.remove_pool(pool_a) - pg_states = manager.wait_till_pg_convergence(300) - assert len(pg_states) == pg_num - pg_created = pg_num_in_all_states(pg_states, 'active', 'clean') - assert pg_created == pg_num - - # cleanup - manager.remove_pool(pool_b) - - -def test_create_from_peer(ctx, config): - """ - osd should stop creating new pools if the number of pg it servers - exceeds the max-pg-per-osd setting, and it should resume the previously - suspended pg creations once the its pg number drops down below the setting - - How it works:: - 0. create 4 OSDs. - 1. create pool.a with pg_num=1, size=2 - pg will be mapped to osd.0, and osd.1, and it should be active+clean - 2. create pool.b with pg_num=1, size=2. 
- if the pgs stuck in creating, delete the pool since the pool and try - again, eventually we'll get the pool to land on the other 2 osds that - aren't occupied by pool.a. (this will also verify that pgs for deleted - pools get cleaned out of the creating wait list.) - 3. mark an osd out. verify that some pgs get stuck stale or peering. - 4. delete a pool, verify pgs go active. - """ - pg_num = config.get('pg_num', 1) - pool_size = config.get('pool_size', 2) - from_primary = config.get('from_primary', True) - - manager = ctx.managers['ceph'] - log.info('1. creating pool.a') - pool_a = manager.create_pool_with_unique_name(pg_num) - manager.wait_for_clean() - assert manager.get_num_active_clean() == pg_num - - log.info('2. creating pool.b') - while True: - pool_b = manager.create_pool_with_unique_name(pg_num) - pg_states = manager.wait_till_pg_convergence(300) - pg_created = pg_num_in_all_states(pg_states, 'active', 'clean') - assert pg_created >= pg_num - pg_pending = pg_num_in_any_state(pg_states, 'unknown', 'creating') - assert pg_pending == pg_num * 2 - pg_created - if pg_created == pg_num * 2: - break - manager.remove_pool(pool_b) - - log.info('3. mark an osd out') - pg_stats = manager.get_pg_stats() - pg = random.choice(pg_stats) - if from_primary: - victim = pg['acting'][-1] - else: - victim = pg['acting'][0] - manager.mark_out_osd(victim) - pg_states = manager.wait_till_pg_convergence(300) - pg_stuck = pg_num_in_any_state(pg_states, 'activating', 'stale', 'peering') - assert pg_stuck > 0 - - log.info('4. removing pool.b') - manager.remove_pool(pool_b) - manager.wait_for_clean(30) - - # cleanup - manager.remove_pool(pool_a) - - -def task(ctx, config): - assert isinstance(config, dict), \ - 'osd_max_pg_per_osd task only accepts a dict for config' - manager = ctx.managers['ceph'] - if config.get('test_create_from_mon', True): - test_create_from_mon(ctx, config) - else: - test_create_from_peer(ctx, config) diff --git a/src/ceph/qa/tasks/osd_recovery.py b/src/ceph/qa/tasks/osd_recovery.py deleted file mode 100644 index 41e86d6..0000000 --- a/src/ceph/qa/tasks/osd_recovery.py +++ /dev/null @@ -1,193 +0,0 @@ -""" -osd recovery -""" -import logging -import ceph_manager -import time -from teuthology import misc as teuthology - - -log = logging.getLogger(__name__) - - -def rados_start(testdir, remote, cmd): - """ - Run a remote rados command (currently used to only write data) - """ - log.info("rados %s" % ' '.join(cmd)) - pre = [ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'rados', - ]; - pre.extend(cmd) - proc = remote.run( - args=pre, - wait=False, - ) - return proc - -def task(ctx, config): - """ - Test (non-backfill) recovery - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'task only accepts a dict for configuration' - testdir = teuthology.get_testdir(ctx) - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd') - log.info('num_osds is %s' % num_osds) - assert num_osds == 3 - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - while len(manager.get_osd_status()['up']) < 3: - time.sleep(10) - manager.flush_pg_stats([0, 1, 2]) - manager.wait_for_clean() - - # test some osdmap flags - manager.raw_cluster_cmd('osd', 'set', 'noin') - manager.raw_cluster_cmd('osd', 'set', 'noout') - manager.raw_cluster_cmd('osd', 'set', 'noup') - 
manager.raw_cluster_cmd('osd', 'set', 'nodown') - manager.raw_cluster_cmd('osd', 'unset', 'noin') - manager.raw_cluster_cmd('osd', 'unset', 'noout') - manager.raw_cluster_cmd('osd', 'unset', 'noup') - manager.raw_cluster_cmd('osd', 'unset', 'nodown') - - # write some new data - p = rados_start(testdir, mon, ['-p', 'rbd', 'bench', '20', 'write', '-b', '4096', - '--no-cleanup']) - - time.sleep(15) - - # trigger a divergent target: - # blackhole + restart osd.1 (shorter log) - manager.blackhole_kill_osd(1) - # kill osd.2 (longer log... we'll make it divergent below) - manager.kill_osd(2) - time.sleep(2) - manager.revive_osd(1) - - # wait for our writes to complete + succeed - err = p.wait() - log.info('err is %d' % err) - - # cluster must repeer - manager.flush_pg_stats([0, 1]) - manager.wait_for_active_or_down() - - # write some more (make sure osd.2 really is divergent) - p = rados_start(testdir, mon, ['-p', 'rbd', 'bench', '15', 'write', '-b', '4096']) - p.wait() - - # revive divergent osd - manager.revive_osd(2) - - while len(manager.get_osd_status()['up']) < 3: - log.info('waiting a bit...') - time.sleep(2) - log.info('3 are up!') - - # cluster must recover - manager.flush_pg_stats([0, 1, 2]) - manager.wait_for_clean() - - -def test_incomplete_pgs(ctx, config): - """ - Test handling of incomplete pgs. Requires 4 osds. - """ - testdir = teuthology.get_testdir(ctx) - if config is None: - config = {} - assert isinstance(config, dict), \ - 'task only accepts a dict for configuration' - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd') - log.info('num_osds is %s' % num_osds) - assert num_osds == 4 - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - while len(manager.get_osd_status()['up']) < 4: - time.sleep(10) - - manager.flush_pg_stats([0, 1, 2, 3]) - manager.wait_for_clean() - - log.info('Testing incomplete pgs...') - - for i in range(4): - manager.set_config( - i, - osd_recovery_delay_start=1000) - - # move data off of osd.0, osd.1 - manager.raw_cluster_cmd('osd', 'out', '0', '1') - manager.flush_pg_stats([0, 1, 2, 3], [0, 1]) - manager.wait_for_clean() - - # lots of objects in rbd (no pg log, will backfill) - p = rados_start(testdir, mon, - ['-p', 'rbd', 'bench', '20', 'write', '-b', '1', - '--no-cleanup']) - p.wait() - - # few objects in rbd pool (with pg log, normal recovery) - for f in range(1, 20): - p = rados_start(testdir, mon, ['-p', 'rbd', 'put', - 'foo.%d' % f, '/etc/passwd']) - p.wait() - - # move it back - manager.raw_cluster_cmd('osd', 'in', '0', '1') - manager.raw_cluster_cmd('osd', 'out', '2', '3') - time.sleep(10) - manager.flush_pg_stats([0, 1, 2, 3], [2, 3]) - time.sleep(10) - manager.wait_for_active() - - assert not manager.is_clean() - assert not manager.is_recovered() - - # kill 2 + 3 - log.info('stopping 2,3') - manager.kill_osd(2) - manager.kill_osd(3) - log.info('...') - manager.raw_cluster_cmd('osd', 'down', '2', '3') - manager.flush_pg_stats([0, 1]) - manager.wait_for_active_or_down() - - assert manager.get_num_down() > 0 - - # revive 2 + 3 - manager.revive_osd(2) - manager.revive_osd(3) - while len(manager.get_osd_status()['up']) < 4: - log.info('waiting a bit...') - time.sleep(2) - log.info('all are up!') - - for i in range(4): - manager.kick_recovery_wq(i) - - # cluster must recover - manager.wait_for_clean() diff --git a/src/ceph/qa/tasks/peer.py b/src/ceph/qa/tasks/peer.py deleted 
file mode 100644 index 9850da1..0000000 --- a/src/ceph/qa/tasks/peer.py +++ /dev/null @@ -1,90 +0,0 @@ -""" -Peer test (Single test, not much configurable here) -""" -import logging -import json -import time - -import ceph_manager -from teuthology import misc as teuthology -from util.rados import rados - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Test peering. - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'peer task only accepts a dict for configuration' - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - while len(manager.get_osd_status()['up']) < 3: - time.sleep(10) - manager.flush_pg_stats([0, 1, 2]) - manager.wait_for_clean() - - for i in range(3): - manager.set_config( - i, - osd_recovery_delay_start=120) - - # take on osd down - manager.kill_osd(2) - manager.mark_down_osd(2) - - # kludge to make sure they get a map - rados(ctx, mon, ['-p', 'data', 'get', 'dummy', '-']) - - manager.flush_pg_stats([0, 1]) - manager.wait_for_recovery() - - # kill another and revive 2, so that some pgs can't peer. - manager.kill_osd(1) - manager.mark_down_osd(1) - manager.revive_osd(2) - manager.wait_till_osd_is_up(2) - - manager.flush_pg_stats([0, 2]) - - manager.wait_for_active_or_down() - - manager.flush_pg_stats([0, 2]) - - # look for down pgs - num_down_pgs = 0 - pgs = manager.get_pg_stats() - for pg in pgs: - out = manager.raw_cluster_cmd('pg', pg['pgid'], 'query') - log.debug("out string %s",out) - j = json.loads(out) - log.info("pg is %s, query json is %s", pg, j) - - if pg['state'].count('down'): - num_down_pgs += 1 - # verify that it is blocked on osd.1 - rs = j['recovery_state'] - assert len(rs) >= 2 - assert rs[0]['name'] == 'Started/Primary/Peering/Down' - assert rs[1]['name'] == 'Started/Primary/Peering' - assert rs[1]['blocked'] - assert rs[1]['down_osds_we_would_probe'] == [1] - assert len(rs[1]['peering_blocked_by']) == 1 - assert rs[1]['peering_blocked_by'][0]['osd'] == 1 - - assert num_down_pgs > 0 - - # bring it all back - manager.revive_osd(1) - manager.wait_till_osd_is_up(1) - manager.flush_pg_stats([0, 1, 2]) - manager.wait_for_clean() diff --git a/src/ceph/qa/tasks/peering_speed_test.py b/src/ceph/qa/tasks/peering_speed_test.py deleted file mode 100644 index ab53238..0000000 --- a/src/ceph/qa/tasks/peering_speed_test.py +++ /dev/null @@ -1,87 +0,0 @@ -""" -Remotely run peering tests. -""" -import logging -import time - -log = logging.getLogger(__name__) - -from args import argify - -POOLNAME = "POOLNAME" -ARGS = [ - ('num_pgs', 'number of pgs to create', 256, int), - ('max_time', 'seconds to complete peering', 0, int), - ('runs', 'trials to run', 10, int), - ('num_objects', 'objects to create', 256 * 1024, int), - ('object_size', 'size in bytes for objects', 64, int), - ('creation_time_limit', 'time limit for pool population', 60*60, int), - ('create_threads', 'concurrent writes for create', 256, int) - ] - -def setup(ctx, config): - """ - Setup peering test on remotes. - """ - manager = ctx.managers['ceph'] - manager.clear_pools() - manager.create_pool(POOLNAME, config.num_pgs) - log.info("populating pool") - manager.rados_write_objects( - POOLNAME, - config.num_objects, - config.object_size, - config.creation_time_limit, - config.create_threads) - log.info("done populating pool") - -def do_run(ctx, config): - """ - Perform the test. 
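Mark osd.0 in, write roughly one object per PG, and record the elapsed time at two points: when the writes complete (peering done) and when the cluster is clean again (recovery done). If max_time is set, the run fails when peering took longer than that. osd.0 is then marked out again and the two durations are returned as time_to_active and time_to_clean.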
- """ - start = time.time() - # mark in osd - manager = ctx.managers['ceph'] - manager.mark_in_osd(0) - log.info("writing out objects") - manager.rados_write_objects( - POOLNAME, - config.num_pgs, # write 1 object per pg or so - 1, - config.creation_time_limit, - config.num_pgs, # lots of concurrency - cleanup = True) - peering_end = time.time() - - log.info("peering done, waiting on recovery") - manager.wait_for_clean() - - log.info("recovery done") - recovery_end = time.time() - if config.max_time: - assert(peering_end - start < config.max_time) - manager.mark_out_osd(0) - manager.wait_for_clean() - return { - 'time_to_active': peering_end - start, - 'time_to_clean': recovery_end - start - } - -@argify("peering_speed_test", ARGS) -def task(ctx, config): - """ - Peering speed test - """ - setup(ctx, config) - manager = ctx.managers['ceph'] - manager.mark_out_osd(0) - manager.wait_for_clean() - ret = [] - for i in range(config.runs): - log.info("Run {i}".format(i = i)) - ret.append(do_run(ctx, config)) - - manager.mark_in_osd(0) - ctx.summary['recovery_times'] = { - 'runs': ret - } diff --git a/src/ceph/qa/tasks/populate_rbd_pool.py b/src/ceph/qa/tasks/populate_rbd_pool.py deleted file mode 100644 index db67d60..0000000 --- a/src/ceph/qa/tasks/populate_rbd_pool.py +++ /dev/null @@ -1,82 +0,0 @@ -""" -Populate rbd pools -""" -import contextlib -import logging - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - Populate <num_pools> pools with prefix <pool_prefix> with <num_images> - rbd images at <num_snaps> snaps - - The config could be as follows:: - - populate_rbd_pool: - client: <client> - pool_prefix: foo - num_pools: 5 - num_images: 10 - num_snaps: 3 - image_size: 10737418240 - """ - if config is None: - config = {} - client = config.get("client", "client.0") - pool_prefix = config.get("pool_prefix", "foo") - num_pools = config.get("num_pools", 2) - num_images = config.get("num_images", 20) - num_snaps = config.get("num_snaps", 4) - image_size = config.get("image_size", 100) - write_size = config.get("write_size", 1024*1024) - write_threads = config.get("write_threads", 10) - write_total_per_snap = config.get("write_total_per_snap", 1024*1024*30) - - (remote,) = ctx.cluster.only(client).remotes.iterkeys() - - for poolid in range(num_pools): - poolname = "%s-%s" % (pool_prefix, str(poolid)) - log.info("Creating pool %s" % (poolname,)) - ctx.managers['ceph'].create_pool(poolname) - for imageid in range(num_images): - imagename = "rbd-%s" % (str(imageid),) - log.info("Creating imagename %s" % (imagename,)) - remote.run( - args = [ - "rbd", - "create", - imagename, - "--image-format", "1", - "--size", str(image_size), - "--pool", str(poolname)]) - def bench_run(): - remote.run( - args = [ - "rbd", - "bench-write", - imagename, - "--pool", poolname, - "--io-size", str(write_size), - "--io-threads", str(write_threads), - "--io-total", str(write_total_per_snap), - "--io-pattern", "rand"]) - log.info("imagename %s first bench" % (imagename,)) - bench_run() - for snapid in range(num_snaps): - snapname = "snap-%s" % (str(snapid),) - log.info("imagename %s creating snap %s" % (imagename, snapname)) - remote.run( - args = [ - "rbd", "snap", "create", - "--pool", poolname, - "--snap", snapname, - imagename - ]) - bench_run() - - try: - yield - finally: - log.info('done') diff --git a/src/ceph/qa/tasks/qemu.py b/src/ceph/qa/tasks/qemu.py deleted file mode 100644 index 82252e1..0000000 --- a/src/ceph/qa/tasks/qemu.py +++ /dev/null @@ -1,577 +0,0 @@ -""" 
-Qemu task -""" -from cStringIO import StringIO - -import contextlib -import logging -import os -import yaml - -from teuthology import misc as teuthology -from teuthology import contextutil -from tasks import rbd -from teuthology.orchestra import run -from teuthology.config import config as teuth_config - -log = logging.getLogger(__name__) - -DEFAULT_NUM_DISKS = 2 -DEFAULT_IMAGE_URL = 'http://download.ceph.com/qa/ubuntu-12.04.qcow2' -DEFAULT_IMAGE_SIZE = 10240 # in megabytes -DEFAULT_CPUS = 1 -DEFAULT_MEM = 4096 # in megabytes - -def create_images(ctx, config, managers): - for client, client_config in config.iteritems(): - disks = client_config.get('disks', DEFAULT_NUM_DISKS) - if not isinstance(disks, list): - disks = [{} for n in range(int(disks))] - clone = client_config.get('clone', False) - assert disks, 'at least one rbd device must be used' - for i, disk in enumerate(disks[1:]): - create_config = { - client: { - 'image_name': '{client}.{num}'.format(client=client, - num=i + 1), - 'image_format': 2 if clone else 1, - 'image_size': (disk or {}).get('image_size', - DEFAULT_IMAGE_SIZE), - } - } - managers.append( - lambda create_config=create_config: - rbd.create_image(ctx=ctx, config=create_config) - ) - -def create_clones(ctx, config, managers): - for client, client_config in config.iteritems(): - clone = client_config.get('clone', False) - if clone: - num_disks = client_config.get('disks', DEFAULT_NUM_DISKS) - if isinstance(num_disks, list): - num_disks = len(num_disks) - for i in xrange(num_disks): - create_config = { - client: { - 'image_name': - '{client}.{num}-clone'.format(client=client, num=i), - 'parent_name': - '{client}.{num}'.format(client=client, num=i), - } - } - managers.append( - lambda create_config=create_config: - rbd.clone_image(ctx=ctx, config=create_config) - ) - -@contextlib.contextmanager -def create_dirs(ctx, config): - """ - Handle directory creation and cleanup - """ - testdir = teuthology.get_testdir(ctx) - for client, client_config in config.iteritems(): - assert 'test' in client_config, 'You must specify a test to run' - (remote,) = ctx.cluster.only(client).remotes.keys() - remote.run( - args=[ - 'install', '-d', '-m0755', '--', - '{tdir}/qemu'.format(tdir=testdir), - '{tdir}/archive/qemu'.format(tdir=testdir), - ] - ) - try: - yield - finally: - for client, client_config in config.iteritems(): - assert 'test' in client_config, 'You must specify a test to run' - (remote,) = ctx.cluster.only(client).remotes.keys() - remote.run( - args=[ - 'rmdir', '{tdir}/qemu'.format(tdir=testdir), run.Raw('||'), 'true', - ] - ) - -@contextlib.contextmanager -def generate_iso(ctx, config): - """Execute system commands to generate iso""" - log.info('generating iso...') - testdir = teuthology.get_testdir(ctx) - - # use ctx.config instead of config, because config has been - # through teuthology.replace_all_with_clients() - refspec = ctx.config.get('branch') - if refspec is None: - refspec = ctx.config.get('tag') - if refspec is None: - refspec = ctx.config.get('sha1') - if refspec is None: - refspec = 'HEAD' - - # hack: the git_url is always ceph-ci or ceph - git_url = teuth_config.get_ceph_git_url() - repo_name = 'ceph.git' - if git_url.count('ceph-ci'): - repo_name = 'ceph-ci.git' - - for client, client_config in config.iteritems(): - assert 'test' in client_config, 'You must specify a test to run' - test_url = client_config['test'].format(repo=repo_name, branch=refspec) - (remote,) = ctx.cluster.only(client).remotes.keys() - src_dir = os.path.dirname(__file__) - 
userdata_path = os.path.join(testdir, 'qemu', 'userdata.' + client) - metadata_path = os.path.join(testdir, 'qemu', 'metadata.' + client) - - with file(os.path.join(src_dir, 'userdata_setup.yaml'), 'rb') as f: - test_setup = ''.join(f.readlines()) - # configuring the commands to setup the nfs mount - mnt_dir = "/export/{client}".format(client=client) - test_setup = test_setup.format( - mnt_dir=mnt_dir - ) - - with file(os.path.join(src_dir, 'userdata_teardown.yaml'), 'rb') as f: - test_teardown = ''.join(f.readlines()) - - user_data = test_setup - if client_config.get('type', 'filesystem') == 'filesystem': - num_disks = client_config.get('disks', DEFAULT_NUM_DISKS) - if isinstance(num_disks, list): - num_disks = len(num_disks) - for i in xrange(1, num_disks): - dev_letter = chr(ord('a') + i) - user_data += """ -- | - #!/bin/bash - mkdir /mnt/test_{dev_letter} - mkfs -t xfs /dev/vd{dev_letter} - mount -t xfs /dev/vd{dev_letter} /mnt/test_{dev_letter} -""".format(dev_letter=dev_letter) - - user_data += """ -- | - #!/bin/bash - test -d /etc/ceph || mkdir /etc/ceph - cp /mnt/cdrom/ceph.* /etc/ceph/ -""" - - cloud_config_archive = client_config.get('cloud_config_archive', []) - if cloud_config_archive: - user_data += yaml.safe_dump(cloud_config_archive, default_style='|', - default_flow_style=False) - - # this may change later to pass the directories as args to the - # script or something. xfstests needs that. - user_data += """ -- | - #!/bin/bash - test -d /mnt/test_b && cd /mnt/test_b - /mnt/cdrom/test.sh > /mnt/log/test.log 2>&1 && touch /mnt/log/success -""" + test_teardown - - user_data = user_data.format( - ceph_branch=ctx.config.get('branch'), - ceph_sha1=ctx.config.get('sha1')) - teuthology.write_file(remote, userdata_path, StringIO(user_data)) - - with file(os.path.join(src_dir, 'metadata.yaml'), 'rb') as f: - teuthology.write_file(remote, metadata_path, f) - - test_file = '{tdir}/qemu/{client}.test.sh'.format(tdir=testdir, client=client) - - log.info('fetching test %s for %s', test_url, client) - remote.run( - args=[ - 'wget', '-nv', '-O', test_file, - test_url, - run.Raw('&&'), - 'chmod', '755', test_file, - ], - ) - remote.run( - args=[ - 'genisoimage', '-quiet', '-input-charset', 'utf-8', - '-volid', 'cidata', '-joliet', '-rock', - '-o', '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client), - '-graft-points', - 'user-data={userdata}'.format(userdata=userdata_path), - 'meta-data={metadata}'.format(metadata=metadata_path), - 'ceph.conf=/etc/ceph/ceph.conf', - 'ceph.keyring=/etc/ceph/ceph.keyring', - 'test.sh={file}'.format(file=test_file), - ], - ) - try: - yield - finally: - for client in config.iterkeys(): - (remote,) = ctx.cluster.only(client).remotes.keys() - remote.run( - args=[ - 'rm', '-f', - '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client), - os.path.join(testdir, 'qemu', 'userdata.' + client), - os.path.join(testdir, 'qemu', 'metadata.' 
+ client), - '{tdir}/qemu/{client}.test.sh'.format(tdir=testdir, client=client), - ], - ) - -@contextlib.contextmanager -def download_image(ctx, config): - """Downland base image, remove image file when done""" - log.info('downloading base image') - testdir = teuthology.get_testdir(ctx) - for client, client_config in config.iteritems(): - (remote,) = ctx.cluster.only(client).remotes.keys() - base_file = '{tdir}/qemu/base.{client}.qcow2'.format(tdir=testdir, client=client) - image_url = client_config.get('image_url', DEFAULT_IMAGE_URL) - remote.run( - args=[ - 'wget', '-nv', '-O', base_file, image_url, - ] - ) - - disks = client_config.get('disks', None) - if not isinstance(disks, list): - disks = [{}] - image_name = '{client}.0'.format(client=client) - image_size = (disks[0] or {}).get('image_size', DEFAULT_IMAGE_SIZE) - remote.run( - args=[ - 'qemu-img', 'convert', '-f', 'qcow2', '-O', 'raw', - base_file, 'rbd:rbd/{image_name}'.format(image_name=image_name) - ] - ) - remote.run( - args=[ - 'rbd', 'resize', - '--size={image_size}M'.format(image_size=image_size), - image_name, - ] - ) - try: - yield - finally: - log.debug('cleaning up base image files') - for client in config.iterkeys(): - base_file = '{tdir}/qemu/base.{client}.qcow2'.format( - tdir=testdir, - client=client, - ) - (remote,) = ctx.cluster.only(client).remotes.keys() - remote.run( - args=[ - 'rm', '-f', base_file, - ], - ) - - -def _setup_nfs_mount(remote, client, mount_dir): - """ - Sets up an nfs mount on the remote that the guest can use to - store logs. This nfs mount is also used to touch a file - at the end of the test to indiciate if the test was successful - or not. - """ - export_dir = "/export/{client}".format(client=client) - log.info("Creating the nfs export directory...") - remote.run(args=[ - 'sudo', 'mkdir', '-p', export_dir, - ]) - log.info("Mounting the test directory...") - remote.run(args=[ - 'sudo', 'mount', '--bind', mount_dir, export_dir, - ]) - log.info("Adding mount to /etc/exports...") - export = "{dir} *(rw,no_root_squash,no_subtree_check,insecure)".format( - dir=export_dir - ) - remote.run(args=[ - 'sudo', 'sed', '-i', '/^\/export\//d', "/etc/exports", - ]) - remote.run(args=[ - 'echo', export, run.Raw("|"), - 'sudo', 'tee', '-a', "/etc/exports", - ]) - log.info("Restarting NFS...") - if remote.os.package_type == "deb": - remote.run(args=['sudo', 'service', 'nfs-kernel-server', 'restart']) - else: - remote.run(args=['sudo', 'systemctl', 'restart', 'nfs']) - - -def _teardown_nfs_mount(remote, client): - """ - Tears down the nfs mount on the remote used for logging and reporting the - status of the tests being ran in the guest. 
- """ - log.info("Tearing down the nfs mount for {remote}".format(remote=remote)) - export_dir = "/export/{client}".format(client=client) - log.info("Stopping NFS...") - if remote.os.package_type == "deb": - remote.run(args=[ - 'sudo', 'service', 'nfs-kernel-server', 'stop' - ]) - else: - remote.run(args=[ - 'sudo', 'systemctl', 'stop', 'nfs' - ]) - log.info("Unmounting exported directory...") - remote.run(args=[ - 'sudo', 'umount', export_dir - ]) - log.info("Deleting exported directory...") - remote.run(args=[ - 'sudo', 'rm', '-r', '/export' - ]) - log.info("Deleting export from /etc/exports...") - remote.run(args=[ - 'sudo', 'sed', '-i', '$ d', '/etc/exports' - ]) - log.info("Starting NFS...") - if remote.os.package_type == "deb": - remote.run(args=[ - 'sudo', 'service', 'nfs-kernel-server', 'start' - ]) - else: - remote.run(args=[ - 'sudo', 'systemctl', 'start', 'nfs' - ]) - - -@contextlib.contextmanager -def run_qemu(ctx, config): - """Setup kvm environment and start qemu""" - procs = [] - testdir = teuthology.get_testdir(ctx) - for client, client_config in config.iteritems(): - (remote,) = ctx.cluster.only(client).remotes.keys() - log_dir = '{tdir}/archive/qemu/{client}'.format(tdir=testdir, client=client) - remote.run( - args=[ - 'mkdir', log_dir, run.Raw('&&'), - 'sudo', 'modprobe', 'kvm', - ] - ) - - # make an nfs mount to use for logging and to - # allow to test to tell teuthology the tests outcome - _setup_nfs_mount(remote, client, log_dir) - - # Hack to make sure /dev/kvm permissions are set correctly - # See http://tracker.ceph.com/issues/17977 and - # https://bugzilla.redhat.com/show_bug.cgi?id=1333159 - remote.run(args='sudo udevadm control --reload') - remote.run(args='sudo udevadm trigger /dev/kvm') - remote.run(args='ls -l /dev/kvm') - - qemu_cmd = 'qemu-system-x86_64' - if remote.os.package_type == "rpm": - qemu_cmd = "/usr/libexec/qemu-kvm" - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'daemon-helper', - 'term', - qemu_cmd, '-enable-kvm', '-nographic', '-cpu', 'host', - '-smp', str(client_config.get('cpus', DEFAULT_CPUS)), - '-m', str(client_config.get('memory', DEFAULT_MEM)), - # cd holding metadata for cloud-init - '-cdrom', '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client), - ] - - cachemode = 'none' - ceph_config = ctx.ceph['ceph'].conf.get('global', {}) - ceph_config.update(ctx.ceph['ceph'].conf.get('client', {})) - ceph_config.update(ctx.ceph['ceph'].conf.get(client, {})) - if ceph_config.get('rbd cache', True): - if ceph_config.get('rbd cache max dirty', 1) > 0: - cachemode = 'writeback' - else: - cachemode = 'writethrough' - - clone = client_config.get('clone', False) - num_disks = client_config.get('disks', DEFAULT_NUM_DISKS) - if isinstance(num_disks, list): - num_disks = len(num_disks) - for i in xrange(num_disks): - suffix = '-clone' if clone else '' - args.extend([ - '-drive', - 'file=rbd:rbd/{img}:id={id},format=raw,if=virtio,cache={cachemode}'.format( - img='{client}.{num}{suffix}'.format(client=client, num=i, - suffix=suffix), - id=client[len('client.'):], - cachemode=cachemode, - ), - ]) - - log.info('starting qemu...') - procs.append( - remote.run( - args=args, - logger=log.getChild(client), - stdin=run.PIPE, - wait=False, - ) - ) - - try: - yield - finally: - log.info('waiting for qemu tests to finish...') - run.wait(procs) - - log.debug('checking that qemu tests succeeded...') - for client in config.iterkeys(): - (remote,) = ctx.cluster.only(client).remotes.keys() - - # ensure we have 
permissions to all the logs - log_dir = '{tdir}/archive/qemu/{client}'.format(tdir=testdir, - client=client) - remote.run( - args=[ - 'sudo', 'chmod', 'a+rw', '-R', log_dir - ] - ) - - # teardown nfs mount - _teardown_nfs_mount(remote, client) - # check for test status - remote.run( - args=[ - 'test', '-f', - '{tdir}/archive/qemu/{client}/success'.format( - tdir=testdir, - client=client - ), - ], - ) - - -@contextlib.contextmanager -def task(ctx, config): - """ - Run a test inside of QEMU on top of rbd. Only one test - is supported per client. - - For example, you can specify which clients to run on:: - - tasks: - - ceph: - - qemu: - client.0: - test: http://download.ceph.com/qa/test.sh - client.1: - test: http://download.ceph.com/qa/test2.sh - - Or use the same settings on all clients: - - tasks: - - ceph: - - qemu: - all: - test: http://download.ceph.com/qa/test.sh - - For tests that don't need a filesystem, set type to block:: - - tasks: - - ceph: - - qemu: - client.0: - test: http://download.ceph.com/qa/test.sh - type: block - - The test should be configured to run on /dev/vdb and later - devices. - - If you want to run a test that uses more than one rbd image, - specify how many images to use:: - - tasks: - - ceph: - - qemu: - client.0: - test: http://download.ceph.com/qa/test.sh - type: block - disks: 2 - - - or - - - tasks: - - ceph: - - qemu: - client.0: - test: http://ceph.com/qa/test.sh - type: block - disks: - - image_size: 1024 - - image_size: 2048 - - You can set the amount of CPUs and memory the VM has (default is 1 CPU and - 4096 MB):: - - tasks: - - ceph: - - qemu: - client.0: - test: http://download.ceph.com/qa/test.sh - cpus: 4 - memory: 512 # megabytes - - If you want to run a test against a cloned rbd image, set clone to true:: - - tasks: - - ceph: - - qemu: - client.0: - test: http://download.ceph.com/qa/test.sh - clone: true - - If you need to configure additional cloud-config options, set cloud_config - to the required data set:: - - tasks: - - ceph - - qemu: - client.0: - test: http://ceph.com/qa/test.sh - cloud_config_archive: - - | - #/bin/bash - touch foo1 - - content: | - test data - type: text/plain - filename: /tmp/data - - If you need to override the default cloud image, set image_url: - - tasks: - - ceph - - qemu: - client.0: - test: http://ceph.com/qa/test.sh - image_url: https://cloud-images.ubuntu.com/releases/16.04/release/ubuntu-16.04-server-cloudimg-amd64-disk1.img - """ - assert isinstance(config, dict), \ - "task qemu only supports a dictionary for configuration" - - config = teuthology.replace_all_with_clients(ctx.cluster, config) - - managers = [] - create_images(ctx=ctx, config=config, managers=managers) - managers.extend([ - lambda: create_dirs(ctx=ctx, config=config), - lambda: generate_iso(ctx=ctx, config=config), - lambda: download_image(ctx=ctx, config=config), - ]) - create_clones(ctx=ctx, config=config, managers=managers) - managers.append( - lambda: run_qemu(ctx=ctx, config=config), - ) - - with contextutil.nested(*managers): - yield diff --git a/src/ceph/qa/tasks/rados.py b/src/ceph/qa/tasks/rados.py deleted file mode 100644 index 3ab93d6..0000000 --- a/src/ceph/qa/tasks/rados.py +++ /dev/null @@ -1,266 +0,0 @@ -""" -Rados modle-based integration tests -""" -import contextlib -import logging -import gevent -from teuthology import misc as teuthology - -from teuthology.orchestra import run - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - Run RadosModel-based integration tests. 
- - The config should be as follows:: - - rados: - clients: [client list] - ops: <number of ops> - objects: <number of objects to use> - max_in_flight: <max number of operations in flight> - object_size: <size of objects in bytes> - min_stride_size: <minimum write stride size in bytes> - max_stride_size: <maximum write stride size in bytes> - op_weights: <dictionary mapping operation type to integer weight> - runs: <number of times to run> - the pool is remade between runs - ec_pool: use an ec pool - erasure_code_profile: profile to use with the erasure coded pool - fast_read: enable ec_pool's fast_read - min_size: set the min_size of created pool - pool_snaps: use pool snapshots instead of selfmanaged snapshots - write_fadvise_dontneed: write behavior like with LIBRADOS_OP_FLAG_FADVISE_DONTNEED. - This mean data don't access in the near future. - Let osd backend don't keep data in cache. - - For example:: - - tasks: - - ceph: - - rados: - clients: [client.0] - ops: 1000 - max_seconds: 0 # 0 for no limit - objects: 25 - max_in_flight: 16 - object_size: 4000000 - min_stride_size: 1024 - max_stride_size: 4096 - op_weights: - read: 20 - write: 10 - delete: 2 - snap_create: 3 - rollback: 2 - snap_remove: 0 - ec_pool: create an ec pool, defaults to False - erasure_code_use_overwrites: test overwrites, default false - erasure_code_profile: - name: teuthologyprofile - k: 2 - m: 1 - crush-failure-domain: osd - pool_snaps: true - write_fadvise_dontneed: true - runs: 10 - - interactive: - - Optionally, you can provide the pool name to run against: - - tasks: - - ceph: - - exec: - client.0: - - ceph osd pool create foo - - rados: - clients: [client.0] - pools: [foo] - ... - - Alternatively, you can provide a pool prefix: - - tasks: - - ceph: - - exec: - client.0: - - ceph osd pool create foo.client.0 - - rados: - clients: [client.0] - pool_prefix: foo - ... - - The tests are run asynchronously, they are not complete when the task - returns. For instance: - - - rados: - clients: [client.0] - pools: [ecbase] - ops: 4000 - objects: 500 - op_weights: - read: 100 - write: 100 - delete: 50 - copy_from: 50 - - print: "**** done rados ec-cache-agent (part 2)" - - will run the print task immediately after the rados tasks begins but - not after it completes. 
To make the rados task a blocking / sequential - task, use: - - - sequential: - - rados: - clients: [client.0] - pools: [ecbase] - ops: 4000 - objects: 500 - op_weights: - read: 100 - write: 100 - delete: 50 - copy_from: 50 - - print: "**** done rados ec-cache-agent (part 2)" - - """ - log.info('Beginning rados...') - assert isinstance(config, dict), \ - "please list clients to run on" - - object_size = int(config.get('object_size', 4000000)) - op_weights = config.get('op_weights', {}) - testdir = teuthology.get_testdir(ctx) - args = [ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'ceph_test_rados'] - if config.get('ec_pool', False): - args.extend(['--no-omap']) - if not config.get('erasure_code_use_overwrites', False): - args.extend(['--ec-pool']) - if config.get('write_fadvise_dontneed', False): - args.extend(['--write-fadvise-dontneed']) - if config.get('set_redirect', False): - args.extend(['--set_redirect']) - if config.get('pool_snaps', False): - args.extend(['--pool-snaps']) - args.extend([ - '--max-ops', str(config.get('ops', 10000)), - '--objects', str(config.get('objects', 500)), - '--max-in-flight', str(config.get('max_in_flight', 16)), - '--size', str(object_size), - '--min-stride-size', str(config.get('min_stride_size', object_size / 10)), - '--max-stride-size', str(config.get('max_stride_size', object_size / 5)), - '--max-seconds', str(config.get('max_seconds', 0)) - ]) - - weights = {} - weights['read'] = 100 - weights['write'] = 100 - weights['delete'] = 10 - # Parallel of the op_types in test/osd/TestRados.cc - for field in [ - # read handled above - # write handled above - # delete handled above - "snap_create", - "snap_remove", - "rollback", - "setattr", - "rmattr", - "watch", - "copy_from", - "hit_set_list", - "is_dirty", - "undirty", - "cache_flush", - "cache_try_flush", - "cache_evict", - "append", - "write", - "read", - "delete" - ]: - if field in op_weights: - weights[field] = op_weights[field] - - if config.get('write_append_excl', True): - if 'write' in weights: - weights['write'] = weights['write'] / 2 - weights['write_excl'] = weights['write'] - - if 'append' in weights: - weights['append'] = weights['append'] / 2 - weights['append_excl'] = weights['append'] - - for op, weight in weights.iteritems(): - args.extend([ - '--op', op, str(weight) - ]) - - - def thread(): - """Thread spawned by gevent""" - clients = ['client.{id}'.format(id=id_) for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')] - log.info('clients are %s' % clients) - manager = ctx.managers['ceph'] - if config.get('ec_pool', False): - profile = config.get('erasure_code_profile', {}) - profile_name = profile.get('name', 'teuthologyprofile') - manager.create_erasure_code_profile(profile_name, profile) - else: - profile_name = None - for i in range(int(config.get('runs', '1'))): - log.info("starting run %s out of %s", str(i), config.get('runs', '1')) - tests = {} - existing_pools = config.get('pools', []) - created_pools = [] - for role in config.get('clients', clients): - assert isinstance(role, basestring) - PREFIX = 'client.' 
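# Each configured client is a role string of the form 'client.<id>'; the
# numeric suffix extracted below becomes CEPH_CLIENT_ID for the spawned
# ceph_test_rados process and tags its logger (e.g. 'client.0' -> id_ = '0').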
- assert role.startswith(PREFIX) - id_ = role[len(PREFIX):] - - pool = config.get('pool', None) - if not pool and existing_pools: - pool = existing_pools.pop() - else: - pool = manager.create_pool_with_unique_name( - erasure_code_profile_name=profile_name, - erasure_code_use_overwrites= - config.get('erasure_code_use_overwrites', False) - ) - created_pools.append(pool) - if config.get('fast_read', False): - manager.raw_cluster_cmd( - 'osd', 'pool', 'set', pool, 'fast_read', 'true') - min_size = config.get('min_size', None); - if min_size is not None: - manager.raw_cluster_cmd( - 'osd', 'pool', 'set', pool, 'min_size', str(min_size)) - - (remote,) = ctx.cluster.only(role).remotes.iterkeys() - proc = remote.run( - args=["CEPH_CLIENT_ID={id_}".format(id_=id_)] + args + - ["--pool", pool], - logger=log.getChild("rados.{id}".format(id=id_)), - stdin=run.PIPE, - wait=False - ) - tests[id_] = proc - run.wait(tests.itervalues()) - - for pool in created_pools: - manager.wait_snap_trimming_complete(pool); - manager.remove_pool(pool) - - running = gevent.spawn(thread) - - try: - yield - finally: - log.info('joining rados') - running.get() diff --git a/src/ceph/qa/tasks/radosbench.py b/src/ceph/qa/tasks/radosbench.py deleted file mode 100644 index 530a6f1..0000000 --- a/src/ceph/qa/tasks/radosbench.py +++ /dev/null @@ -1,135 +0,0 @@ -""" -Rados benchmarking -""" -import contextlib -import logging - -from teuthology.orchestra import run -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - Run radosbench - - The config should be as follows: - - radosbench: - clients: [client list] - time: <seconds to run> - pool: <pool to use> - size: write size to use - objectsize: object size to use - unique_pool: use a unique pool, defaults to False - ec_pool: create an ec pool, defaults to False - create_pool: create pool, defaults to True - erasure_code_profile: - name: teuthologyprofile - k: 2 - m: 1 - crush-failure-domain: osd - cleanup: false (defaults to true) - type: <write|seq|rand> (defaults to write) - example: - - tasks: - - ceph: - - radosbench: - clients: [client.0] - time: 360 - - interactive: - """ - log.info('Beginning radosbench...') - assert isinstance(config, dict), \ - "please list clients to run on" - radosbench = {} - - testdir = teuthology.get_testdir(ctx) - manager = ctx.managers['ceph'] - runtype = config.get('type', 'write') - - create_pool = config.get('create_pool', True) - for role in config.get('clients', ['client.0']): - assert isinstance(role, basestring) - PREFIX = 'client.' 
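# For each client role: resolve the remote host, optionally create an
# erasure-code profile and a pool, pre-populate the pool when the run type is
# a read ('seq' or 'rand') run, then launch 'rados bench' in the background and
# keep the process handle so it can be joined (with a timeout) on task exit.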
- assert role.startswith(PREFIX) - id_ = role[len(PREFIX):] - (remote,) = ctx.cluster.only(role).remotes.iterkeys() - - if config.get('ec_pool', False): - profile = config.get('erasure_code_profile', {}) - profile_name = profile.get('name', 'teuthologyprofile') - manager.create_erasure_code_profile(profile_name, profile) - else: - profile_name = None - - cleanup = [] - if not config.get('cleanup', True): - cleanup = ['--no-cleanup'] - - pool = config.get('pool', 'data') - if create_pool: - if pool != 'data': - manager.create_pool(pool, erasure_code_profile_name=profile_name) - else: - pool = manager.create_pool_with_unique_name(erasure_code_profile_name=profile_name) - - osize = config.get('objectsize', 0) - if osize is 0: - objectsize = [] - else: - objectsize = ['-o', str(osize)] - size = ['-b', str(config.get('size', 4<<20))] - # If doing a reading run then populate data - if runtype != "write": - proc = remote.run( - args=[ - "/bin/sh", "-c", - " ".join(['adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage', - 'rados', - '--no-log-to-stderr', - '--name', role] - + size + objectsize + - ['-p' , pool, - 'bench', str(60), "write", "--no-cleanup" - ]).format(tdir=testdir), - ], - logger=log.getChild('radosbench.{id}'.format(id=id_)), - wait=True - ) - size = [] - objectsize = [] - - proc = remote.run( - args=[ - "/bin/sh", "-c", - " ".join(['adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage', - 'rados', - '--no-log-to-stderr', - '--name', role] - + size + objectsize + - ['-p' , pool, - 'bench', str(config.get('time', 360)), runtype, - ] + cleanup).format(tdir=testdir), - ], - logger=log.getChild('radosbench.{id}'.format(id=id_)), - stdin=run.PIPE, - wait=False - ) - radosbench[id_] = proc - - try: - yield - finally: - timeout = config.get('time', 360) * 30 + 300 - log.info('joining radosbench (timing out after %ss)', timeout) - run.wait(radosbench.itervalues(), timeout=timeout) - - if pool is not 'data' and create_pool: - manager.remove_pool(pool) diff --git a/src/ceph/qa/tasks/radosbenchsweep.py b/src/ceph/qa/tasks/radosbenchsweep.py deleted file mode 100644 index cda106a..0000000 --- a/src/ceph/qa/tasks/radosbenchsweep.py +++ /dev/null @@ -1,221 +0,0 @@ -""" -Rados benchmarking sweep -""" -import contextlib -import logging -import re - -from cStringIO import StringIO -from itertools import product - -from teuthology.orchestra import run -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - - -@contextlib.contextmanager -def task(ctx, config): - """ - Execute a radosbench parameter sweep - - Puts radosbench in a loop, taking values from the given config at each - iteration. If given, the min and max values below create a range, e.g. - min_replicas=1 and max_replicas=3 implies executing with 1-3 replicas. 
- - Parameters: - - clients: [client list] - time: seconds to run (default=120) - sizes: [list of object sizes] (default=[4M]) - mode: <write|read|seq> (default=write) - repetitions: execute the same configuration multiple times (default=1) - min_num_replicas: minimum number of replicas to use (default = 3) - max_num_replicas: maximum number of replicas to use (default = 3) - min_num_osds: the minimum number of OSDs in a pool (default=all) - max_num_osds: the maximum number of OSDs in a pool (default=all) - file: name of CSV-formatted output file (default='radosbench.csv') - columns: columns to include (default=all) - - rep: execution number (takes values from 'repetitions') - - num_osd: number of osds for pool - - num_replica: number of replicas - - avg_throughput: throughput - - avg_latency: latency - - stdev_throughput: - - stdev_latency: - - Example: - - radsobenchsweep: - columns: [rep, num_osd, num_replica, avg_throughput, stdev_throughput] - """ - log.info('Beginning radosbenchsweep...') - assert isinstance(config, dict), 'expecting dictionary for configuration' - - # get and validate config values - # { - - # only one client supported for now - if len(config.get('clients', [])) != 1: - raise Exception("Only one client can be specified") - - # only write mode - if config.get('mode', 'write') != 'write': - raise Exception("Only 'write' mode supported for now.") - - # OSDs - total_osds_in_cluster = teuthology.num_instances_of_type(ctx.cluster, 'osd') - min_num_osds = config.get('min_num_osds', total_osds_in_cluster) - max_num_osds = config.get('max_num_osds', total_osds_in_cluster) - - if max_num_osds > total_osds_in_cluster: - raise Exception('max_num_osds cannot be greater than total in cluster') - if min_num_osds < 1: - raise Exception('min_num_osds cannot be less than 1') - if min_num_osds > max_num_osds: - raise Exception('min_num_osds cannot be greater than max_num_osd') - osds = range(0, (total_osds_in_cluster + 1)) - - # replicas - min_num_replicas = config.get('min_num_replicas', 3) - max_num_replicas = config.get('max_num_replicas', 3) - - if min_num_replicas < 1: - raise Exception('min_num_replicas cannot be less than 1') - if min_num_replicas > max_num_replicas: - raise Exception('min_num_replicas cannot be greater than max_replicas') - if max_num_replicas > max_num_osds: - raise Exception('max_num_replicas cannot be greater than max_num_osds') - replicas = range(min_num_replicas, (max_num_replicas + 1)) - - # object size - sizes = config.get('size', [4 << 20]) - - # repetitions - reps = range(config.get('repetitions', 1)) - - # file - fname = config.get('file', 'radosbench.csv') - f = open('{}/{}'.format(ctx.archive, fname), 'w') - f.write(get_csv_header(config) + '\n') - # } - - # set default pools size=1 to avoid 'unhealthy' issues - ctx.manager.set_pool_property('data', 'size', 1) - ctx.manager.set_pool_property('metadata', 'size', 1) - ctx.manager.set_pool_property('rbd', 'size', 1) - - current_osds_out = 0 - - # sweep through all parameters - for osds_out, size, replica, rep in product(osds, sizes, replicas, reps): - - osds_in = total_osds_in_cluster - osds_out - - if osds_in == 0: - # we're done - break - - if current_osds_out != osds_out: - # take an osd out - ctx.manager.raw_cluster_cmd( - 'osd', 'reweight', str(osds_out-1), '0.0') - wait_until_healthy(ctx, config) - current_osds_out = osds_out - - if osds_in not in range(min_num_osds, (max_num_osds + 1)): - # no need to execute with a number of osds that wasn't requested - continue - - if osds_in < replica: - # 
cannot execute with more replicas than available osds - continue - - run_radosbench(ctx, config, f, osds_in, size, replica, rep) - - f.close() - - yield - - -def get_csv_header(conf): - all_columns = [ - 'rep', 'num_osd', 'num_replica', 'avg_throughput', - 'avg_latency', 'stdev_throughput', 'stdev_latency' - ] - given_columns = conf.get('columns', None) - if given_columns and len(given_columns) != 0: - for column in given_columns: - if column not in all_columns: - raise Exception('Unknown column ' + column) - return ','.join(conf['columns']) - else: - conf['columns'] = all_columns - return ','.join(all_columns) - - -def run_radosbench(ctx, config, f, num_osds, size, replica, rep): - pool = ctx.manager.create_pool_with_unique_name() - - ctx.manager.set_pool_property(pool, 'size', replica) - - wait_until_healthy(ctx, config) - - log.info('Executing with parameters: ') - log.info(' num_osd =' + str(num_osds)) - log.info(' size =' + str(size)) - log.info(' num_replicas =' + str(replica)) - log.info(' repetition =' + str(rep)) - - for role in config.get('clients', ['client.0']): - assert isinstance(role, basestring) - PREFIX = 'client.' - assert role.startswith(PREFIX) - id_ = role[len(PREFIX):] - (remote,) = ctx.cluster.only(role).remotes.iterkeys() - - proc = remote.run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{}/archive/coverage'.format(teuthology.get_testdir(ctx)), - 'rados', - '--no-log-to-stderr', - '--name', role, - '-b', str(size), - '-p', pool, - 'bench', str(config.get('time', 120)), 'write', - ], - logger=log.getChild('radosbench.{id}'.format(id=id_)), - stdin=run.PIPE, - stdout=StringIO(), - wait=False - ) - - # parse output to get summary and format it as CSV - proc.wait() - out = proc.stdout.getvalue() - all_values = { - 'stdev_throughput': re.sub(r'Stddev Bandwidth: ', '', re.search( - r'Stddev Bandwidth:.*', out).group(0)), - 'stdev_latency': re.sub(r'Stddev Latency: ', '', re.search( - r'Stddev Latency:.*', out).group(0)), - 'avg_throughput': re.sub(r'Bandwidth \(MB/sec\): ', '', re.search( - r'Bandwidth \(MB/sec\):.*', out).group(0)), - 'avg_latency': re.sub(r'Average Latency: ', '', re.search( - r'Average Latency:.*', out).group(0)), - 'rep': str(rep), - 'num_osd': str(num_osds), - 'num_replica': str(replica) - } - values_to_write = [] - for column in config['columns']: - values_to_write.extend([all_values[column]]) - f.write(','.join(values_to_write) + '\n') - - ctx.manager.remove_pool(pool) - - -def wait_until_healthy(ctx, config): - first_mon = teuthology.get_first_mon(ctx, config) - (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys() - teuthology.wait_until_healthy(ctx, mon_remote) diff --git a/src/ceph/qa/tasks/radosgw_admin.py b/src/ceph/qa/tasks/radosgw_admin.py deleted file mode 100644 index 8e744e3..0000000 --- a/src/ceph/qa/tasks/radosgw_admin.py +++ /dev/null @@ -1,955 +0,0 @@ -""" -Rgw admin testing against a running instance -""" -# The test cases in this file have been annotated for inventory. 
-# To extract the inventory (in csv format) use the command: -# -# grep '^ *# TESTCASE' | sed 's/^ *# TESTCASE //' -# -# to run this standalone: -# python qa/tasks/radosgw_admin.py [USER] HOSTNAME -# - -import copy -import json -import logging -import time -import datetime -import Queue -import bunch - -import sys - -from cStringIO import StringIO - -import boto.exception -import boto.s3.connection -import boto.s3.acl -from boto.utils import RequestHook - -import httplib2 - -import util.rgw as rgw_utils - -from util.rgw import rgwadmin, get_user_summary, get_user_successful_ops - -log = logging.getLogger(__name__) - -def usage_acc_findentry2(entries, user, add=True): - for e in entries: - if e['user'] == user: - return e - if not add: - return None - e = {'user': user, 'buckets': []} - entries.append(e) - return e -def usage_acc_findsum2(summaries, user, add=True): - for e in summaries: - if e['user'] == user: - return e - if not add: - return None - e = {'user': user, 'categories': [], - 'total': {'bytes_received': 0, - 'bytes_sent': 0, 'ops': 0, 'successful_ops': 0 }} - summaries.append(e) - return e -def usage_acc_update2(x, out, b_in, err): - x['bytes_sent'] += b_in - x['bytes_received'] += out - x['ops'] += 1 - if not err: - x['successful_ops'] += 1 -def usage_acc_validate_fields(r, x, x2, what): - q=[] - for field in ['bytes_sent', 'bytes_received', 'ops', 'successful_ops']: - try: - if x2[field] < x[field]: - q.append("field %s: %d < %d" % (field, x2[field], x[field])) - except Exception as ex: - r.append( "missing/bad field " + field + " in " + what + " " + str(ex)) - return - if len(q) > 0: - r.append("incomplete counts in " + what + ": " + ", ".join(q)) -class usage_acc: - def __init__(self): - self.results = {'entries': [], 'summary': []} - def findentry(self, user): - return usage_acc_findentry2(self.results['entries'], user) - def findsum(self, user): - return usage_acc_findsum2(self.results['summary'], user) - def e2b(self, e, bucket, add=True): - for b in e['buckets']: - if b['bucket'] == bucket: - return b - if not add: - return None - b = {'bucket': bucket, 'categories': []} - e['buckets'].append(b) - return b - def c2x(self, c, cat, add=True): - for x in c: - if x['category'] == cat: - return x - if not add: - return None - x = {'bytes_received': 0, 'category': cat, - 'bytes_sent': 0, 'ops': 0, 'successful_ops': 0 } - c.append(x) - return x - def update(self, c, cat, user, out, b_in, err): - x = self.c2x(c, cat) - usage_acc_update2(x, out, b_in, err) - if not err and cat == 'create_bucket' and not x.has_key('owner'): - x['owner'] = user - def make_entry(self, cat, bucket, user, out, b_in, err): - if cat == 'create_bucket' and err: - return - e = self.findentry(user) - b = self.e2b(e, bucket) - self.update(b['categories'], cat, user, out, b_in, err) - s = self.findsum(user) - x = self.c2x(s['categories'], cat) - usage_acc_update2(x, out, b_in, err) - x = s['total'] - usage_acc_update2(x, out, b_in, err) - def generate_make_entry(self): - return lambda cat,bucket,user,out,b_in,err: self.make_entry(cat, bucket, user, out, b_in, err) - def get_usage(self): - return self.results - def compare_results(self, results): - if not results.has_key('entries') or not results.has_key('summary'): - return ['Missing entries or summary'] - r = [] - for e in self.results['entries']: - try: - e2 = usage_acc_findentry2(results['entries'], e['user'], False) - except Exception as ex: - r.append("malformed entry looking for user " - + e['user'] + " " + str(ex)) - break - if e2 == None: - 
r.append("missing entry for user " + e['user']) - continue - for b in e['buckets']: - c = b['categories'] - if b['bucket'] == 'nosuchbucket': - print "got here" - try: - b2 = self.e2b(e2, b['bucket'], False) - if b2 != None: - c2 = b2['categories'] - except Exception as ex: - r.append("malformed entry looking for bucket " - + b['bucket'] + " in user " + e['user'] + " " + str(ex)) - break - if b2 == None: - r.append("can't find bucket " + b['bucket'] - + " in user " + e['user']) - continue - for x in c: - try: - x2 = self.c2x(c2, x['category'], False) - except Exception as ex: - r.append("malformed entry looking for " - + x['category'] + " in bucket " + b['bucket'] - + " user " + e['user'] + " " + str(ex)) - break - usage_acc_validate_fields(r, x, x2, "entry: category " - + x['category'] + " bucket " + b['bucket'] - + " in user " + e['user']) - for s in self.results['summary']: - c = s['categories'] - try: - s2 = usage_acc_findsum2(results['summary'], s['user'], False) - except Exception as ex: - r.append("malformed summary looking for user " + e['user'] - + " " + str(ex)) - break - if s2 == None: - r.append("missing summary for user " + e['user'] + " " + str(ex)) - continue - try: - c2 = s2['categories'] - except Exception as ex: - r.append("malformed summary missing categories for user " - + e['user'] + " " + str(ex)) - break - for x in c: - try: - x2 = self.c2x(c2, x['category'], False) - except Exception as ex: - r.append("malformed summary looking for " - + x['category'] + " user " + e['user'] + " " + str(ex)) - break - usage_acc_validate_fields(r, x, x2, "summary: category " - + x['category'] + " in user " + e['user']) - x = s['total'] - try: - x2 = s2['total'] - except Exception as ex: - r.append("malformed summary looking for totals for user " - + e['user'] + " " + str(ex)) - break - usage_acc_validate_fields(r, x, x2, "summary: totals for user" + e['user']) - return r - -def ignore_this_entry(cat, bucket, user, out, b_in, err): - pass -class requestlog_queue(): - def __init__(self, add): - self.q = Queue.Queue(1000) - self.adder = add - def handle_request_data(self, request, response, error=False): - now = datetime.datetime.now() - if error: - pass - elif response.status < 200 or response.status >= 400: - error = True - self.q.put(bunch.Bunch({'t': now, 'o': request, 'i': response, 'e': error})) - def clear(self): - with self.q.mutex: - self.q.queue.clear() - def log_and_clear(self, cat, bucket, user, add_entry = None): - while not self.q.empty(): - j = self.q.get() - bytes_out = 0 - if 'Content-Length' in j.o.headers: - bytes_out = int(j.o.headers['Content-Length']) - bytes_in = 0 - if 'content-length' in j.i.msg.dict: - bytes_in = int(j.i.msg.dict['content-length']) - log.info('RL: %s %s %s bytes_out=%d bytes_in=%d failed=%r' - % (cat, bucket, user, bytes_out, bytes_in, j.e)) - if add_entry == None: - add_entry = self.adder - add_entry(cat, bucket, user, bytes_out, bytes_in, j.e) - -def create_presigned_url(conn, method, bucket_name, key_name, expiration): - return conn.generate_url(expires_in=expiration, - method=method, - bucket=bucket_name, - key=key_name, - query_auth=True, - ) - -def send_raw_http_request(conn, method, bucket_name, key_name, follow_redirects = False): - url = create_presigned_url(conn, method, bucket_name, key_name, 3600) - print url - h = httplib2.Http() - h.follow_redirects = follow_redirects - return h.request(url, method) - - -def get_acl(key): - """ - Helper function to get the xml acl from a key, ensuring that the xml - version tag is removed from the 
acl response - """ - raw_acl = key.get_xml_acl() - - def remove_version(string): - return string.split( - '<?xml version="1.0" encoding="UTF-8"?>' - )[-1] - - def remove_newlines(string): - return string.strip('\n') - - return remove_version( - remove_newlines(raw_acl) - ) - -def task(ctx, config): - """ - Test radosgw-admin functionality against a running rgw instance. - """ - global log - - assert ctx.rgw.config, \ - "radosgw_admin task needs a config passed from the rgw task" - config = ctx.rgw.config - log.debug('config is: %r', config) - - clients_from_config = config.keys() - - # choose first client as default - client = clients_from_config[0] - - # once the client is chosen, pull the host name and assigned port out of - # the role_endpoints that were assigned by the rgw task - (remote_host, remote_port) = ctx.rgw.role_endpoints[client] - - ## - user1='foo' - user2='fud' - subuser1='foo:foo1' - subuser2='foo:foo2' - display_name1='Foo' - display_name2='Fud' - email='foo@foo.com' - email2='bar@bar.com' - access_key='9te6NH5mcdcq0Tc5i8i1' - secret_key='Ny4IOauQoL18Gp2zM7lC1vLmoawgqcYP/YGcWfXu' - access_key2='p5YnriCv1nAtykxBrupQ' - secret_key2='Q8Tk6Q/27hfbFSYdSkPtUqhqx1GgzvpXa4WARozh' - swift_secret1='gpS2G9RREMrnbqlp29PP2D36kgPR1tm72n5fPYfL' - swift_secret2='ri2VJQcKSYATOY6uaDUX7pxgkW+W1YmC6OCxPHwy' - - bucket_name='myfoo' - bucket_name2='mybar' - - # connect to rgw - connection = boto.s3.connection.S3Connection( - aws_access_key_id=access_key, - aws_secret_access_key=secret_key, - is_secure=False, - port=remote_port, - host=remote_host, - calling_format=boto.s3.connection.OrdinaryCallingFormat(), - ) - connection2 = boto.s3.connection.S3Connection( - aws_access_key_id=access_key2, - aws_secret_access_key=secret_key2, - is_secure=False, - port=remote_port, - host=remote_host, - calling_format=boto.s3.connection.OrdinaryCallingFormat(), - ) - - acc = usage_acc() - rl = requestlog_queue(acc.generate_make_entry()) - connection.set_request_hook(rl) - connection2.set_request_hook(rl) - - # legend (test cases can be easily grep-ed out) - # TESTCASE 'testname','object','method','operation','assertion' - - # TESTCASE 'usage-show0' 'usage' 'show' 'all usage' 'succeeds' - (err, summary0) = rgwadmin(ctx, client, ['usage', 'show'], check_status=True) - - # TESTCASE 'info-nosuch','user','info','non-existent user','fails' - (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1]) - assert err - - # TESTCASE 'create-ok','user','create','w/all valid info','succeeds' - (err, out) = rgwadmin(ctx, client, [ - 'user', 'create', - '--uid', user1, - '--display-name', display_name1, - '--email', email, - '--access-key', access_key, - '--secret', secret_key, - '--max-buckets', '4' - ], - check_status=True) - - # TESTCASE 'duplicate email','user','create','existing user email','fails' - (err, out) = rgwadmin(ctx, client, [ - 'user', 'create', - '--uid', user2, - '--display-name', display_name2, - '--email', email, - ]) - assert err - - # TESTCASE 'info-existing','user','info','existing user','returns correct info' - (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True) - assert out['user_id'] == user1 - assert out['email'] == email - assert out['display_name'] == display_name1 - assert len(out['keys']) == 1 - assert out['keys'][0]['access_key'] == access_key - assert out['keys'][0]['secret_key'] == secret_key - assert not out['suspended'] - - # TESTCASE 'suspend-ok','user','suspend','active user','succeeds' - (err, out) = rgwadmin(ctx, client, ['user', 'suspend', 
'--uid', user1], - check_status=True) - - # TESTCASE 'suspend-suspended','user','suspend','suspended user','succeeds w/advisory' - (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True) - assert out['suspended'] - - # TESTCASE 're-enable','user','enable','suspended user','succeeds' - (err, out) = rgwadmin(ctx, client, ['user', 'enable', '--uid', user1], check_status=True) - - # TESTCASE 'info-re-enabled','user','info','re-enabled user','no longer suspended' - (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True) - assert not out['suspended'] - - # TESTCASE 'add-keys','key','create','w/valid info','succeeds' - (err, out) = rgwadmin(ctx, client, [ - 'key', 'create', '--uid', user1, - '--access-key', access_key2, '--secret', secret_key2, - ], check_status=True) - - # TESTCASE 'info-new-key','user','info','after key addition','returns all keys' - (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], - check_status=True) - assert len(out['keys']) == 2 - assert out['keys'][0]['access_key'] == access_key2 or out['keys'][1]['access_key'] == access_key2 - assert out['keys'][0]['secret_key'] == secret_key2 or out['keys'][1]['secret_key'] == secret_key2 - - # TESTCASE 'rm-key','key','rm','newly added key','succeeds, key is removed' - (err, out) = rgwadmin(ctx, client, [ - 'key', 'rm', '--uid', user1, - '--access-key', access_key2, - ], check_status=True) - assert len(out['keys']) == 1 - assert out['keys'][0]['access_key'] == access_key - assert out['keys'][0]['secret_key'] == secret_key - - # TESTCASE 'add-swift-key','key','create','swift key','succeeds' - subuser_access = 'full' - subuser_perm = 'full-control' - - (err, out) = rgwadmin(ctx, client, [ - 'subuser', 'create', '--subuser', subuser1, - '--access', subuser_access - ], check_status=True) - - # TESTCASE 'add-swift-key','key','create','swift key','succeeds' - (err, out) = rgwadmin(ctx, client, [ - 'subuser', 'modify', '--subuser', subuser1, - '--secret', swift_secret1, - '--key-type', 'swift', - ], check_status=True) - - # TESTCASE 'subuser-perm-mask', 'subuser', 'info', 'test subuser perm mask durability', 'succeeds' - (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1]) - - assert out['subusers'][0]['permissions'] == subuser_perm - - # TESTCASE 'info-swift-key','user','info','after key addition','returns all keys' - (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True) - assert len(out['swift_keys']) == 1 - assert out['swift_keys'][0]['user'] == subuser1 - assert out['swift_keys'][0]['secret_key'] == swift_secret1 - - # TESTCASE 'add-swift-subuser','key','create','swift sub-user key','succeeds' - (err, out) = rgwadmin(ctx, client, [ - 'subuser', 'create', '--subuser', subuser2, - '--secret', swift_secret2, - '--key-type', 'swift', - ], check_status=True) - - # TESTCASE 'info-swift-subuser','user','info','after key addition','returns all sub-users/keys' - (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True) - assert len(out['swift_keys']) == 2 - assert out['swift_keys'][0]['user'] == subuser2 or out['swift_keys'][1]['user'] == subuser2 - assert out['swift_keys'][0]['secret_key'] == swift_secret2 or out['swift_keys'][1]['secret_key'] == swift_secret2 - - # TESTCASE 'rm-swift-key1','key','rm','subuser','succeeds, one key is removed' - (err, out) = rgwadmin(ctx, client, [ - 'key', 'rm', '--subuser', subuser1, - '--key-type', 'swift', - ], check_status=True) - assert 
len(out['swift_keys']) == 1 - - # TESTCASE 'rm-subuser','subuser','rm','subuser','success, subuser is removed' - (err, out) = rgwadmin(ctx, client, [ - 'subuser', 'rm', '--subuser', subuser1, - ], check_status=True) - assert len(out['subusers']) == 1 - - # TESTCASE 'rm-subuser-with-keys','subuser','rm','subuser','succeeds, second subser and key is removed' - (err, out) = rgwadmin(ctx, client, [ - 'subuser', 'rm', '--subuser', subuser2, - '--key-type', 'swift', '--purge-keys', - ], check_status=True) - assert len(out['swift_keys']) == 0 - assert len(out['subusers']) == 0 - - # TESTCASE 'bucket-stats','bucket','stats','no session/buckets','succeeds, empty list' - (err, out) = rgwadmin(ctx, client, ['bucket', 'stats', '--uid', user1], - check_status=True) - assert len(out) == 0 - - # TESTCASE 'bucket-stats2','bucket','stats','no buckets','succeeds, empty list' - (err, out) = rgwadmin(ctx, client, ['bucket', 'list', '--uid', user1], check_status=True) - assert len(out) == 0 - - # create a first bucket - bucket = connection.create_bucket(bucket_name) - - rl.log_and_clear("create_bucket", bucket_name, user1) - - # TESTCASE 'bucket-list','bucket','list','one bucket','succeeds, expected list' - (err, out) = rgwadmin(ctx, client, ['bucket', 'list', '--uid', user1], check_status=True) - assert len(out) == 1 - assert out[0] == bucket_name - - bucket_list = connection.get_all_buckets() - assert len(bucket_list) == 1 - assert bucket_list[0].name == bucket_name - - rl.log_and_clear("list_buckets", '', user1) - - # TESTCASE 'bucket-list-all','bucket','list','all buckets','succeeds, expected list' - (err, out) = rgwadmin(ctx, client, ['bucket', 'list'], check_status=True) - assert len(out) >= 1 - assert bucket_name in out; - - # TESTCASE 'max-bucket-limit,'bucket','create','4 buckets','5th bucket fails due to max buckets == 4' - bucket2 = connection.create_bucket(bucket_name + '2') - rl.log_and_clear("create_bucket", bucket_name + '2', user1) - bucket3 = connection.create_bucket(bucket_name + '3') - rl.log_and_clear("create_bucket", bucket_name + '3', user1) - bucket4 = connection.create_bucket(bucket_name + '4') - rl.log_and_clear("create_bucket", bucket_name + '4', user1) - # the 5th should fail. - failed = False - try: - connection.create_bucket(bucket_name + '5') - except Exception: - failed = True - assert failed - rl.log_and_clear("create_bucket", bucket_name + '5', user1) - - # delete the buckets - bucket2.delete() - rl.log_and_clear("delete_bucket", bucket_name + '2', user1) - bucket3.delete() - rl.log_and_clear("delete_bucket", bucket_name + '3', user1) - bucket4.delete() - rl.log_and_clear("delete_bucket", bucket_name + '4', user1) - - # TESTCASE 'bucket-stats3','bucket','stats','new empty bucket','succeeds, empty list' - (err, out) = rgwadmin(ctx, client, [ - 'bucket', 'stats', '--bucket', bucket_name], check_status=True) - assert out['owner'] == user1 - bucket_id = out['id'] - - # TESTCASE 'bucket-stats4','bucket','stats','new empty bucket','succeeds, expected bucket ID' - (err, out) = rgwadmin(ctx, client, ['bucket', 'stats', '--uid', user1], check_status=True) - assert len(out) == 1 - assert out[0]['id'] == bucket_id # does it return the same ID twice in a row? 
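The (err, out) = rgwadmin(...) pattern used throughout these test cases runs the radosgw-admin CLI with JSON output and hands back the exit status plus the parsed result (the local rgwadmin() helper defined in radosgw_admin_rest.py below does exactly this). As a minimal standalone sketch of the same idea, assuming radosgw-admin is on PATH and with run_rgwadmin as an invented illustrative name:

    import json
    import subprocess

    def run_rgwadmin(*args):
        # Run radosgw-admin with JSON output; return (exit_status, parsed_json_or_raw).
        cmd = ['radosgw-admin', '--format', 'json'] + list(args)
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = proc.communicate()
        parsed = None
        if proc.returncode == 0 and out:
            try:
                parsed = json.loads(out)
            except ValueError:
                # some subcommands emit non-JSON text; keep the raw output
                parsed = out
        return proc.returncode, parsed

    # e.g. err, stats = run_rgwadmin('bucket', 'stats', '--bucket', 'myfoo')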
- - # use some space - key = boto.s3.key.Key(bucket) - key.set_contents_from_string('one') - rl.log_and_clear("put_obj", bucket_name, user1) - - # TESTCASE 'bucket-stats5','bucket','stats','after creating key','succeeds, lists one non-empty object' - (err, out) = rgwadmin(ctx, client, [ - 'bucket', 'stats', '--bucket', bucket_name], check_status=True) - assert out['id'] == bucket_id - assert out['usage']['rgw.main']['num_objects'] == 1 - assert out['usage']['rgw.main']['size_kb'] > 0 - - # reclaim it - key.delete() - rl.log_and_clear("delete_obj", bucket_name, user1) - - # TESTCASE 'bucket unlink', 'bucket', 'unlink', 'unlink bucket from user', 'fails', 'access denied error' - (err, out) = rgwadmin(ctx, client, - ['bucket', 'unlink', '--uid', user1, '--bucket', bucket_name], - check_status=True) - - # create a second user to link the bucket to - (err, out) = rgwadmin(ctx, client, [ - 'user', 'create', - '--uid', user2, - '--display-name', display_name2, - '--access-key', access_key2, - '--secret', secret_key2, - '--max-buckets', '1', - ], - check_status=True) - - # try creating an object with the first user before the bucket is relinked - denied = False - key = boto.s3.key.Key(bucket) - - try: - key.set_contents_from_string('two') - except boto.exception.S3ResponseError: - denied = True - - assert not denied - rl.log_and_clear("put_obj", bucket_name, user1) - - # delete the object - key.delete() - rl.log_and_clear("delete_obj", bucket_name, user1) - - # link the bucket to another user - (err, out) = rgwadmin(ctx, client, ['metadata', 'get', 'bucket:{n}'.format(n=bucket_name)], - check_status=True) - - bucket_data = out['data'] - assert bucket_data['bucket']['name'] == bucket_name - - bucket_id = bucket_data['bucket']['bucket_id'] - - # link the bucket to another user - (err, out) = rgwadmin(ctx, client, ['bucket', 'link', '--uid', user2, '--bucket', bucket_name, '--bucket-id', bucket_id], - check_status=True) - - # try to remove user, should fail (has a linked bucket) - (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user2]) - assert err - - # TESTCASE 'bucket unlink', 'bucket', 'unlink', 'unlink bucket from user', 'succeeds, bucket unlinked' - (err, out) = rgwadmin(ctx, client, ['bucket', 'unlink', '--uid', user2, '--bucket', bucket_name], - check_status=True) - - # relink the bucket to the first user and delete the second user - (err, out) = rgwadmin(ctx, client, - ['bucket', 'link', '--uid', user1, '--bucket', bucket_name, '--bucket-id', bucket_id], - check_status=True) - - (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user2], - check_status=True) - - # TESTCASE 'object-rm', 'object', 'rm', 'remove object', 'succeeds, object is removed' - - # upload an object - object_name = 'four' - key = boto.s3.key.Key(bucket, object_name) - key.set_contents_from_string(object_name) - rl.log_and_clear("put_obj", bucket_name, user1) - - # fetch it too (for usage stats presently) - s = key.get_contents_as_string() - rl.log_and_clear("get_obj", bucket_name, user1) - assert s == object_name - # list bucket too (for usage stats presently) - keys = list(bucket.list()) - rl.log_and_clear("list_bucket", bucket_name, user1) - assert len(keys) == 1 - assert keys[0].name == object_name - - # now delete it - (err, out) = rgwadmin(ctx, client, - ['object', 'rm', '--bucket', bucket_name, '--object', object_name], - check_status=True) - - # TESTCASE 'bucket-stats6','bucket','stats','after deleting key','succeeds, lists one no objects' - (err, out) = rgwadmin(ctx, client, [ - 'bucket', 
'stats', '--bucket', bucket_name], - check_status=True) - assert out['id'] == bucket_id - assert out['usage']['rgw.main']['num_objects'] == 0 - - # list log objects - # TESTCASE 'log-list','log','list','after activity','succeeds, lists one no objects' - (err, out) = rgwadmin(ctx, client, ['log', 'list'], check_status=True) - assert len(out) > 0 - - for obj in out: - # TESTCASE 'log-show','log','show','after activity','returns expected info' - if obj[:4] == 'meta' or obj[:4] == 'data' or obj[:18] == 'obj_delete_at_hint': - continue - - (err, rgwlog) = rgwadmin(ctx, client, ['log', 'show', '--object', obj], - check_status=True) - assert len(rgwlog) > 0 - - # exempt bucket_name2 from checking as it was only used for multi-region tests - assert rgwlog['bucket'].find(bucket_name) == 0 or rgwlog['bucket'].find(bucket_name2) == 0 - assert rgwlog['bucket'] != bucket_name or rgwlog['bucket_id'] == bucket_id - assert rgwlog['bucket_owner'] == user1 or rgwlog['bucket'] == bucket_name + '5' or rgwlog['bucket'] == bucket_name2 - for entry in rgwlog['log_entries']: - log.debug('checking log entry: ', entry) - assert entry['bucket'] == rgwlog['bucket'] - possible_buckets = [bucket_name + '5', bucket_name2] - user = entry['user'] - assert user == user1 or user.endswith('system-user') or \ - rgwlog['bucket'] in possible_buckets - - # TESTCASE 'log-rm','log','rm','delete log objects','succeeds' - (err, out) = rgwadmin(ctx, client, ['log', 'rm', '--object', obj], - check_status=True) - - # TODO: show log by bucket+date - - # TESTCASE 'user-suspend2','user','suspend','existing user','succeeds' - (err, out) = rgwadmin(ctx, client, ['user', 'suspend', '--uid', user1], - check_status=True) - - # TESTCASE 'user-suspend3','user','suspend','suspended user','cannot write objects' - denied = False - try: - key = boto.s3.key.Key(bucket) - key.set_contents_from_string('five') - except boto.exception.S3ResponseError as e: - denied = True - assert e.status == 403 - - assert denied - rl.log_and_clear("put_obj", bucket_name, user1) - - # TESTCASE 'user-renable2','user','enable','suspended user','succeeds' - (err, out) = rgwadmin(ctx, client, ['user', 'enable', '--uid', user1], - check_status=True) - - # TESTCASE 'user-renable3','user','enable','reenabled user','can write objects' - key = boto.s3.key.Key(bucket) - key.set_contents_from_string('six') - rl.log_and_clear("put_obj", bucket_name, user1) - - # TESTCASE 'gc-list', 'gc', 'list', 'get list of objects ready for garbage collection' - - # create an object large enough to be split into multiple parts - test_string = 'foo'*10000000 - - big_key = boto.s3.key.Key(bucket) - big_key.set_contents_from_string(test_string) - rl.log_and_clear("put_obj", bucket_name, user1) - - # now delete the head - big_key.delete() - rl.log_and_clear("delete_obj", bucket_name, user1) - - # wait a bit to give the garbage collector time to cycle - time.sleep(15) - - (err, out) = rgwadmin(ctx, client, ['gc', 'list']) - - assert len(out) > 0 - - # TESTCASE 'gc-process', 'gc', 'process', 'manually collect garbage' - (err, out) = rgwadmin(ctx, client, ['gc', 'process'], check_status=True) - - #confirm - (err, out) = rgwadmin(ctx, client, ['gc', 'list']) - - assert len(out) == 0 - - # TESTCASE 'rm-user-buckets','user','rm','existing user','fails, still has buckets' - (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user1]) - assert err - - # delete should fail because ``key`` still exists - try: - bucket.delete() - except boto.exception.S3ResponseError as e: - assert e.status == 409 - 
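- # (Deleting a non-empty bucket is reported by S3/RGW as 409 Conflict / BucketNotEmpty,
- # which is what the assert on e.status above verifies; the key and bucket are only
- # removed for real in the steps that follow.)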
rl.log_and_clear("delete_bucket", bucket_name, user1) - - key.delete() - rl.log_and_clear("delete_obj", bucket_name, user1) - bucket.delete() - rl.log_and_clear("delete_bucket", bucket_name, user1) - - # TESTCASE 'policy', 'bucket', 'policy', 'get bucket policy', 'returns S3 policy' - bucket = connection.create_bucket(bucket_name) - rl.log_and_clear("create_bucket", bucket_name, user1) - - # create an object - key = boto.s3.key.Key(bucket) - key.set_contents_from_string('seven') - rl.log_and_clear("put_obj", bucket_name, user1) - - # should be private already but guarantee it - key.set_acl('private') - rl.log_and_clear("put_acls", bucket_name, user1) - - (err, out) = rgwadmin(ctx, client, - ['policy', '--bucket', bucket.name, '--object', key.key], - check_status=True, format='xml') - - acl = get_acl(key) - rl.log_and_clear("get_acls", bucket_name, user1) - - assert acl == out.strip('\n') - - # add another grantee by making the object public read - key.set_acl('public-read') - rl.log_and_clear("put_acls", bucket_name, user1) - - (err, out) = rgwadmin(ctx, client, - ['policy', '--bucket', bucket.name, '--object', key.key], - check_status=True, format='xml') - - acl = get_acl(key) - rl.log_and_clear("get_acls", bucket_name, user1) - - assert acl == out.strip('\n') - - # TESTCASE 'rm-bucket', 'bucket', 'rm', 'bucket with objects', 'succeeds' - bucket = connection.create_bucket(bucket_name) - rl.log_and_clear("create_bucket", bucket_name, user1) - key_name = ['eight', 'nine', 'ten', 'eleven'] - for i in range(4): - key = boto.s3.key.Key(bucket) - key.set_contents_from_string(key_name[i]) - rl.log_and_clear("put_obj", bucket_name, user1) - - (err, out) = rgwadmin(ctx, client, - ['bucket', 'rm', '--bucket', bucket_name, '--purge-objects'], - check_status=True) - - # TESTCASE 'caps-add', 'caps', 'add', 'add user cap', 'succeeds' - caps='user=read' - (err, out) = rgwadmin(ctx, client, ['caps', 'add', '--uid', user1, '--caps', caps]) - - assert out['caps'][0]['perm'] == 'read' - - # TESTCASE 'caps-rm', 'caps', 'rm', 'remove existing cap from user', 'succeeds' - (err, out) = rgwadmin(ctx, client, ['caps', 'rm', '--uid', user1, '--caps', caps]) - - assert not out['caps'] - - # TESTCASE 'rm-user','user','rm','existing user','fails, still has buckets' - bucket = connection.create_bucket(bucket_name) - rl.log_and_clear("create_bucket", bucket_name, user1) - key = boto.s3.key.Key(bucket) - - (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user1]) - assert err - - # TESTCASE 'rm-user2', 'user', 'rm', 'user with data', 'succeeds' - bucket = connection.create_bucket(bucket_name) - rl.log_and_clear("create_bucket", bucket_name, user1) - key = boto.s3.key.Key(bucket) - key.set_contents_from_string('twelve') - rl.log_and_clear("put_obj", bucket_name, user1) - - time.sleep(35) - - # need to wait for all usage data to get flushed, should take up to 30 seconds - timestamp = time.time() - while time.time() - timestamp <= (2 * 60): # wait up to 20 minutes - (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--categories', 'delete_obj']) # one of the operations we did is delete_obj, should be present. 
- if get_user_successful_ops(out, user1) > 0: - break - time.sleep(1) - - assert time.time() - timestamp <= (20 * 60) - - # TESTCASE 'usage-show' 'usage' 'show' 'all usage' 'succeeds' - (err, out) = rgwadmin(ctx, client, ['usage', 'show'], check_status=True) - assert len(out['entries']) > 0 - assert len(out['summary']) > 0 - - r = acc.compare_results(out) - if len(r) != 0: - sys.stderr.write(("\n".join(r))+"\n") - assert(len(r) == 0) - - user_summary = get_user_summary(out, user1) - - total = user_summary['total'] - assert total['successful_ops'] > 0 - - # TESTCASE 'usage-show2' 'usage' 'show' 'user usage' 'succeeds' - (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1], - check_status=True) - assert len(out['entries']) > 0 - assert len(out['summary']) > 0 - user_summary = out['summary'][0] - for entry in user_summary['categories']: - assert entry['successful_ops'] > 0 - assert user_summary['user'] == user1 - - # TESTCASE 'usage-show3' 'usage' 'show' 'user usage categories' 'succeeds' - test_categories = ['create_bucket', 'put_obj', 'delete_obj', 'delete_bucket'] - for cat in test_categories: - (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1, '--categories', cat], - check_status=True) - assert len(out['summary']) > 0 - user_summary = out['summary'][0] - assert user_summary['user'] == user1 - assert len(user_summary['categories']) == 1 - entry = user_summary['categories'][0] - assert entry['category'] == cat - assert entry['successful_ops'] > 0 - - # should be all through with connection. (anything using connection - # should be BEFORE the usage stuff above.) - rl.log_and_clear("(before-close)", '-', '-', ignore_this_entry) - connection.close() - connection = None - - # the usage flush interval is 30 seconds, wait that much an then some - # to make sure everything has been flushed - time.sleep(35) - - # TESTCASE 'usage-trim' 'usage' 'trim' 'user usage' 'succeeds, usage removed' - (err, out) = rgwadmin(ctx, client, ['usage', 'trim', '--uid', user1], - check_status=True) - (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1], - check_status=True) - assert len(out['entries']) == 0 - assert len(out['summary']) == 0 - - (err, out) = rgwadmin(ctx, client, - ['user', 'rm', '--uid', user1, '--purge-data' ], - check_status=True) - - # TESTCASE 'rm-user3','user','rm','deleted user','fails' - (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1]) - assert err - - # TESTCASE 'zone-info', 'zone', 'get', 'get zone info', 'succeeds, has default placement rule' - # - - (err, out) = rgwadmin(ctx, client, ['zone', 'get','--rgw-zone','default']) - orig_placement_pools = len(out['placement_pools']) - - # removed this test, it is not correct to assume that zone has default placement, it really - # depends on how we set it up before - # - # assert len(out) > 0 - # assert len(out['placement_pools']) == 1 - - # default_rule = out['placement_pools'][0] - # assert default_rule['key'] == 'default-placement' - - rule={'key': 'new-placement', 'val': {'data_pool': '.rgw.buckets.2', 'index_pool': '.rgw.buckets.index.2'}} - - out['placement_pools'].append(rule) - - (err, out) = rgwadmin(ctx, client, ['zone', 'set'], - stdin=StringIO(json.dumps(out)), - check_status=True) - - (err, out) = rgwadmin(ctx, client, ['zone', 'get','--rgw-zone','default']) - assert len(out) > 0 - assert len(out['placement_pools']) == orig_placement_pools + 1 - - zonecmd = ['zone', 'placement', 'rm', - '--rgw-zone', 'default', - '--placement-id', 'new-placement'] - - (err, out) = 
rgwadmin(ctx, client, zonecmd, check_status=True) - -import sys -from tasks.radosgw_admin import task -from teuthology.config import config -from teuthology.orchestra import cluster, remote -import argparse; - -def main(): - if len(sys.argv) == 3: - user = sys.argv[1] + "@" - host = sys.argv[2] - elif len(sys.argv) == 2: - user = "" - host = sys.argv[1] - else: - sys.stderr.write("usage: radosgw_admin.py [user] host\n") - exit(1) - client0 = remote.Remote(user + host) - ctx = config - ctx.cluster=cluster.Cluster(remotes=[(client0, - [ 'ceph.client.rgw.%s' % (host), ]),]) - - ctx.rgw = argparse.Namespace() - endpoints = {} - endpoints['ceph.client.rgw.%s' % host] = (host, 80) - ctx.rgw.role_endpoints = endpoints - ctx.rgw.realm = None - ctx.rgw.regions = {'region0': { 'api name': 'api1', - 'is master': True, 'master zone': 'r0z0', - 'zones': ['r0z0', 'r0z1'] }} - ctx.rgw.config = {'ceph.client.rgw.%s' % host: {'system user': {'name': '%s-system-user' % host}}} - task(config, None) - exit() - -if __name__ == '__main__': - main() diff --git a/src/ceph/qa/tasks/radosgw_admin_rest.py b/src/ceph/qa/tasks/radosgw_admin_rest.py deleted file mode 100644 index 7bd72d1..0000000 --- a/src/ceph/qa/tasks/radosgw_admin_rest.py +++ /dev/null @@ -1,668 +0,0 @@ -""" -Run a series of rgw admin commands through the rest interface. - -The test cases in this file have been annotated for inventory. -To extract the inventory (in csv format) use the command: - - grep '^ *# TESTCASE' | sed 's/^ *# TESTCASE //' - -""" -from cStringIO import StringIO -import logging -import json - -import boto.exception -import boto.s3.connection -import boto.s3.acl - -import requests -import time - -from boto.connection import AWSAuthConnection -from teuthology import misc as teuthology -from util.rgw import get_user_summary, get_user_successful_ops - -log = logging.getLogger(__name__) - -def rgwadmin(ctx, client, cmd): - """ - Perform rgw admin command - - :param client: client - :param cmd: command to execute. - :return: command exit status, json result. - """ - log.info('radosgw-admin: %s' % cmd) - testdir = teuthology.get_testdir(ctx) - pre = [ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'radosgw-admin', - '--log-to-stderr', - '--format', 'json', - ] - pre.extend(cmd) - (remote,) = ctx.cluster.only(client).remotes.iterkeys() - proc = remote.run( - args=pre, - check_status=False, - stdout=StringIO(), - stderr=StringIO(), - ) - r = proc.exitstatus - out = proc.stdout.getvalue() - j = None - if not r and out != '': - try: - j = json.loads(out) - log.info(' json result: %s' % j) - except ValueError: - j = out - log.info(' raw result: %s' % j) - return (r, j) - - -def rgwadmin_rest(connection, cmd, params=None, headers=None, raw=False): - """ - perform a rest command - """ - log.info('radosgw-admin-rest: %s %s' % (cmd, params)) - put_cmds = ['create', 'link', 'add'] - post_cmds = ['unlink', 'modify'] - delete_cmds = ['trim', 'rm', 'process'] - get_cmds = ['check', 'info', 'show', 'list'] - - bucket_sub_resources = ['object', 'policy', 'index'] - user_sub_resources = ['subuser', 'key', 'caps'] - zone_sub_resources = ['pool', 'log', 'garbage'] - - def get_cmd_method_and_handler(cmd): - """ - Get the rest command and handler from information in cmd and - from the imported requests object. 
- """ - if cmd[1] in put_cmds: - return 'PUT', requests.put - elif cmd[1] in delete_cmds: - return 'DELETE', requests.delete - elif cmd[1] in post_cmds: - return 'POST', requests.post - elif cmd[1] in get_cmds: - return 'GET', requests.get - - def get_resource(cmd): - """ - Get the name of the resource from information in cmd. - """ - if cmd[0] == 'bucket' or cmd[0] in bucket_sub_resources: - if cmd[0] == 'bucket': - return 'bucket', '' - else: - return 'bucket', cmd[0] - elif cmd[0] == 'user' or cmd[0] in user_sub_resources: - if cmd[0] == 'user': - return 'user', '' - else: - return 'user', cmd[0] - elif cmd[0] == 'usage': - return 'usage', '' - elif cmd[0] == 'zone' or cmd[0] in zone_sub_resources: - if cmd[0] == 'zone': - return 'zone', '' - else: - return 'zone', cmd[0] - - def build_admin_request(conn, method, resource = '', headers=None, data='', - query_args=None, params=None): - """ - Build an administative request adapted from the build_request() - method of boto.connection - """ - - path = conn.calling_format.build_path_base('admin', resource) - auth_path = conn.calling_format.build_auth_path('admin', resource) - host = conn.calling_format.build_host(conn.server_name(), 'admin') - if query_args: - path += '?' + query_args - boto.log.debug('path=%s' % path) - auth_path += '?' + query_args - boto.log.debug('auth_path=%s' % auth_path) - return AWSAuthConnection.build_base_http_request(conn, method, path, - auth_path, params, headers, data, host) - - method, handler = get_cmd_method_and_handler(cmd) - resource, query_args = get_resource(cmd) - request = build_admin_request(connection, method, resource, - query_args=query_args, headers=headers) - - url = '{protocol}://{host}{path}'.format(protocol=request.protocol, - host=request.host, path=request.path) - - request.authorize(connection=connection) - result = handler(url, params=params, headers=request.headers) - - if raw: - log.info(' text result: %s' % result.txt) - return result.status_code, result.txt - else: - log.info(' json result: %s' % result.json()) - return result.status_code, result.json() - - -def task(ctx, config): - """ - Test radosgw-admin functionality through the RESTful interface - """ - assert config is None or isinstance(config, list) \ - or isinstance(config, dict), \ - "task s3tests only supports a list or dictionary for configuration" - all_clients = ['client.{id}'.format(id=id_) - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')] - if config is None: - config = all_clients - if isinstance(config, list): - config = dict.fromkeys(config) - clients = config.keys() - - # just use the first client... - client = clients[0] - - ## - admin_user = 'ada' - admin_display_name = 'Ms. 
Admin User' - admin_access_key = 'MH1WC2XQ1S8UISFDZC8W' - admin_secret_key = 'dQyrTPA0s248YeN5bBv4ukvKU0kh54LWWywkrpoG' - admin_caps = 'users=read, write; usage=read, write; buckets=read, write; zone=read, write' - - user1 = 'foo' - user2 = 'fud' - subuser1 = 'foo:foo1' - subuser2 = 'foo:foo2' - display_name1 = 'Foo' - display_name2 = 'Fud' - email = 'foo@foo.com' - access_key = '9te6NH5mcdcq0Tc5i8i1' - secret_key = 'Ny4IOauQoL18Gp2zM7lC1vLmoawgqcYP/YGcWfXu' - access_key2 = 'p5YnriCv1nAtykxBrupQ' - secret_key2 = 'Q8Tk6Q/27hfbFSYdSkPtUqhqx1GgzvpXa4WARozh' - swift_secret1 = 'gpS2G9RREMrnbqlp29PP2D36kgPR1tm72n5fPYfL' - swift_secret2 = 'ri2VJQcKSYATOY6uaDUX7pxgkW+W1YmC6OCxPHwy' - - bucket_name = 'myfoo' - - # legend (test cases can be easily grep-ed out) - # TESTCASE 'testname','object','method','operation','assertion' - # TESTCASE 'create-admin-user','user','create','administrative user','succeeds' - (err, out) = rgwadmin(ctx, client, [ - 'user', 'create', - '--uid', admin_user, - '--display-name', admin_display_name, - '--access-key', admin_access_key, - '--secret', admin_secret_key, - '--max-buckets', '0', - '--caps', admin_caps - ]) - logging.error(out) - logging.error(err) - assert not err - - (remote,) = ctx.cluster.only(client).remotes.iterkeys() - remote_host = remote.name.split('@')[1] - admin_conn = boto.s3.connection.S3Connection( - aws_access_key_id=admin_access_key, - aws_secret_access_key=admin_secret_key, - is_secure=False, - port=7280, - host=remote_host, - calling_format=boto.s3.connection.OrdinaryCallingFormat(), - ) - - # TESTCASE 'info-nosuch','user','info','non-existent user','fails' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {"uid": user1}) - assert ret == 404 - - # TESTCASE 'create-ok','user','create','w/all valid info','succeeds' - (ret, out) = rgwadmin_rest(admin_conn, - ['user', 'create'], - {'uid' : user1, - 'display-name' : display_name1, - 'email' : email, - 'access-key' : access_key, - 'secret-key' : secret_key, - 'max-buckets' : '4' - }) - - assert ret == 200 - - # TESTCASE 'info-existing','user','info','existing user','returns correct info' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1}) - - assert out['user_id'] == user1 - assert out['email'] == email - assert out['display_name'] == display_name1 - assert len(out['keys']) == 1 - assert out['keys'][0]['access_key'] == access_key - assert out['keys'][0]['secret_key'] == secret_key - assert not out['suspended'] - - # TESTCASE 'suspend-ok','user','suspend','active user','succeeds' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' : user1, 'suspended' : True}) - assert ret == 200 - - # TESTCASE 'suspend-suspended','user','suspend','suspended user','succeeds w/advisory' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1}) - assert ret == 200 - assert out['suspended'] - - # TESTCASE 're-enable','user','enable','suspended user','succeeds' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' : user1, 'suspended' : 'false'}) - assert not err - - # TESTCASE 'info-re-enabled','user','info','re-enabled user','no longer suspended' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1}) - assert ret == 200 - assert not out['suspended'] - - # TESTCASE 'add-keys','key','create','w/valid info','succeeds' - (ret, out) = rgwadmin_rest(admin_conn, - ['key', 'create'], - {'uid' : user1, - 'access-key' : access_key2, - 'secret-key' : secret_key2 - }) - - - assert ret == 200 - - # TESTCASE 
'info-new-key','user','info','after key addition','returns all keys' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1}) - assert ret == 200 - assert len(out['keys']) == 2 - assert out['keys'][0]['access_key'] == access_key2 or out['keys'][1]['access_key'] == access_key2 - assert out['keys'][0]['secret_key'] == secret_key2 or out['keys'][1]['secret_key'] == secret_key2 - - # TESTCASE 'rm-key','key','rm','newly added key','succeeds, key is removed' - (ret, out) = rgwadmin_rest(admin_conn, - ['key', 'rm'], - {'uid' : user1, - 'access-key' : access_key2 - }) - - assert ret == 200 - - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1}) - - assert len(out['keys']) == 1 - assert out['keys'][0]['access_key'] == access_key - assert out['keys'][0]['secret_key'] == secret_key - - # TESTCASE 'add-swift-key','key','create','swift key','succeeds' - (ret, out) = rgwadmin_rest(admin_conn, - ['subuser', 'create'], - {'subuser' : subuser1, - 'secret-key' : swift_secret1, - 'key-type' : 'swift' - }) - - assert ret == 200 - - # TESTCASE 'info-swift-key','user','info','after key addition','returns all keys' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1}) - assert ret == 200 - assert len(out['swift_keys']) == 1 - assert out['swift_keys'][0]['user'] == subuser1 - assert out['swift_keys'][0]['secret_key'] == swift_secret1 - - # TESTCASE 'add-swift-subuser','key','create','swift sub-user key','succeeds' - (ret, out) = rgwadmin_rest(admin_conn, - ['subuser', 'create'], - {'subuser' : subuser2, - 'secret-key' : swift_secret2, - 'key-type' : 'swift' - }) - - assert ret == 200 - - # TESTCASE 'info-swift-subuser','user','info','after key addition','returns all sub-users/keys' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1}) - assert ret == 200 - assert len(out['swift_keys']) == 2 - assert out['swift_keys'][0]['user'] == subuser2 or out['swift_keys'][1]['user'] == subuser2 - assert out['swift_keys'][0]['secret_key'] == swift_secret2 or out['swift_keys'][1]['secret_key'] == swift_secret2 - - # TESTCASE 'rm-swift-key1','key','rm','subuser','succeeds, one key is removed' - (ret, out) = rgwadmin_rest(admin_conn, - ['key', 'rm'], - {'subuser' : subuser1, - 'key-type' :'swift' - }) - - assert ret == 200 - - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1}) - assert len(out['swift_keys']) == 1 - - # TESTCASE 'rm-subuser','subuser','rm','subuser','success, subuser is removed' - (ret, out) = rgwadmin_rest(admin_conn, - ['subuser', 'rm'], - {'subuser' : subuser1 - }) - - assert ret == 200 - - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1}) - assert len(out['subusers']) == 1 - - # TESTCASE 'rm-subuser-with-keys','subuser','rm','subuser','succeeds, second subser and key is removed' - (ret, out) = rgwadmin_rest(admin_conn, - ['subuser', 'rm'], - {'subuser' : subuser2, - 'key-type' : 'swift', - '{purge-keys' :True - }) - - assert ret == 200 - - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1}) - assert len(out['swift_keys']) == 0 - assert len(out['subusers']) == 0 - - # TESTCASE 'bucket-stats','bucket','info','no session/buckets','succeeds, empty list' - (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'uid' : user1}) - assert ret == 200 - assert len(out) == 0 - - # connect to rgw - connection = boto.s3.connection.S3Connection( - aws_access_key_id=access_key, - aws_secret_access_key=secret_key, - is_secure=False, - port=7280, - host=remote_host, - 
calling_format=boto.s3.connection.OrdinaryCallingFormat(), - ) - - # TESTCASE 'bucket-stats2','bucket','stats','no buckets','succeeds, empty list' - (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'uid' : user1, 'stats' : True}) - assert ret == 200 - assert len(out) == 0 - - # create a first bucket - bucket = connection.create_bucket(bucket_name) - - # TESTCASE 'bucket-list','bucket','list','one bucket','succeeds, expected list' - (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'uid' : user1}) - assert ret == 200 - assert len(out) == 1 - assert out[0] == bucket_name - - # TESTCASE 'bucket-stats3','bucket','stats','new empty bucket','succeeds, empty list' - (ret, out) = rgwadmin_rest(admin_conn, - ['bucket', 'info'], {'bucket' : bucket_name, 'stats' : True}) - - assert ret == 200 - assert out['owner'] == user1 - bucket_id = out['id'] - - # TESTCASE 'bucket-stats4','bucket','stats','new empty bucket','succeeds, expected bucket ID' - (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'uid' : user1, 'stats' : True}) - assert ret == 200 - assert len(out) == 1 - assert out[0]['id'] == bucket_id # does it return the same ID twice in a row? - - # use some space - key = boto.s3.key.Key(bucket) - key.set_contents_from_string('one') - - # TESTCASE 'bucket-stats5','bucket','stats','after creating key','succeeds, lists one non-empty object' - (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'bucket' : bucket_name, 'stats' : True}) - assert ret == 200 - assert out['id'] == bucket_id - assert out['usage']['rgw.main']['num_objects'] == 1 - assert out['usage']['rgw.main']['size_kb'] > 0 - - # reclaim it - key.delete() - - # TESTCASE 'bucket unlink', 'bucket', 'unlink', 'unlink bucket from user', 'fails', 'access denied error' - (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'unlink'], {'uid' : user1, 'bucket' : bucket_name}) - - assert ret == 200 - - # create a second user to link the bucket to - (ret, out) = rgwadmin_rest(admin_conn, - ['user', 'create'], - {'uid' : user2, - 'display-name' : display_name2, - 'access-key' : access_key2, - 'secret-key' : secret_key2, - 'max-buckets' : '1', - }) - - assert ret == 200 - - # try creating an object with the first user before the bucket is relinked - denied = False - key = boto.s3.key.Key(bucket) - - try: - key.set_contents_from_string('two') - except boto.exception.S3ResponseError: - denied = True - - assert not denied - - # delete the object - key.delete() - - # link the bucket to another user - (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'link'], {'uid' : user2, 'bucket' : bucket_name}) - - assert ret == 200 - - # try creating an object with the first user which should cause an error - key = boto.s3.key.Key(bucket) - - try: - key.set_contents_from_string('three') - except boto.exception.S3ResponseError: - denied = True - - assert denied - - # relink the bucket to the first user and delete the second user - (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'link'], {'uid' : user1, 'bucket' : bucket_name}) - assert ret == 200 - - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'rm'], {'uid' : user2}) - assert ret == 200 - - # TESTCASE 'object-rm', 'object', 'rm', 'remove object', 'succeeds, object is removed' - - # upload an object - object_name = 'four' - key = boto.s3.key.Key(bucket, object_name) - key.set_contents_from_string(object_name) - - # now delete it - (ret, out) = rgwadmin_rest(admin_conn, ['object', 'rm'], {'bucket' : bucket_name, 'object' : object_name}) - assert ret == 200 - - # TESTCASE 
'bucket-stats6','bucket','stats','after deleting key','succeeds, lists one no objects' - (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'bucket' : bucket_name, 'stats' : True}) - assert ret == 200 - assert out['id'] == bucket_id - assert out['usage']['rgw.main']['num_objects'] == 0 - - # create a bucket for deletion stats - useless_bucket = connection.create_bucket('useless_bucket') - useless_key = useless_bucket.new_key('useless_key') - useless_key.set_contents_from_string('useless string') - - # delete it - useless_key.delete() - useless_bucket.delete() - - # wait for the statistics to flush - time.sleep(60) - - # need to wait for all usage data to get flushed, should take up to 30 seconds - timestamp = time.time() - while time.time() - timestamp <= (20 * 60): # wait up to 20 minutes - (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'], {'categories' : 'delete_obj'}) # last operation we did is delete obj, wait for it to flush - - if get_user_successful_ops(out, user1) > 0: - break - time.sleep(1) - - assert time.time() - timestamp <= (20 * 60) - - # TESTCASE 'usage-show' 'usage' 'show' 'all usage' 'succeeds' - (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show']) - assert ret == 200 - assert len(out['entries']) > 0 - assert len(out['summary']) > 0 - user_summary = get_user_summary(out, user1) - total = user_summary['total'] - assert total['successful_ops'] > 0 - - # TESTCASE 'usage-show2' 'usage' 'show' 'user usage' 'succeeds' - (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'], {'uid' : user1}) - assert ret == 200 - assert len(out['entries']) > 0 - assert len(out['summary']) > 0 - user_summary = out['summary'][0] - for entry in user_summary['categories']: - assert entry['successful_ops'] > 0 - assert user_summary['user'] == user1 - - # TESTCASE 'usage-show3' 'usage' 'show' 'user usage categories' 'succeeds' - test_categories = ['create_bucket', 'put_obj', 'delete_obj', 'delete_bucket'] - for cat in test_categories: - (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'], {'uid' : user1, 'categories' : cat}) - assert ret == 200 - assert len(out['summary']) > 0 - user_summary = out['summary'][0] - assert user_summary['user'] == user1 - assert len(user_summary['categories']) == 1 - entry = user_summary['categories'][0] - assert entry['category'] == cat - assert entry['successful_ops'] > 0 - - # TESTCASE 'usage-trim' 'usage' 'trim' 'user usage' 'succeeds, usage removed' - (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'trim'], {'uid' : user1}) - assert ret == 200 - (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'], {'uid' : user1}) - assert ret == 200 - assert len(out['entries']) == 0 - assert len(out['summary']) == 0 - - # TESTCASE 'user-suspend2','user','suspend','existing user','succeeds' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' : user1, 'suspended' : True}) - assert ret == 200 - - # TESTCASE 'user-suspend3','user','suspend','suspended user','cannot write objects' - try: - key = boto.s3.key.Key(bucket) - key.set_contents_from_string('five') - except boto.exception.S3ResponseError as e: - assert e.status == 403 - - # TESTCASE 'user-renable2','user','enable','suspended user','succeeds' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' : user1, 'suspended' : 'false'}) - assert ret == 200 - - # TESTCASE 'user-renable3','user','enable','reenabled user','can write objects' - key = boto.s3.key.Key(bucket) - key.set_contents_from_string('six') - - # TESTCASE 'garbage-list', 'garbage', 'list', 'get list of 
objects ready for garbage collection' - - # create an object large enough to be split into multiple parts - test_string = 'foo'*10000000 - - big_key = boto.s3.key.Key(bucket) - big_key.set_contents_from_string(test_string) - - # now delete the head - big_key.delete() - - # TESTCASE 'rm-user-buckets','user','rm','existing user','fails, still has buckets' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'rm'], {'uid' : user1}) - assert ret == 409 - - # delete should fail because ``key`` still exists - try: - bucket.delete() - except boto.exception.S3ResponseError as e: - assert e.status == 409 - - key.delete() - bucket.delete() - - # TESTCASE 'policy', 'bucket', 'policy', 'get bucket policy', 'returns S3 policy' - bucket = connection.create_bucket(bucket_name) - - # create an object - key = boto.s3.key.Key(bucket) - key.set_contents_from_string('seven') - - # should be private already but guarantee it - key.set_acl('private') - - (ret, out) = rgwadmin_rest(admin_conn, ['policy', 'show'], {'bucket' : bucket.name, 'object' : key.key}) - assert ret == 200 - - acl = key.get_xml_acl() - assert acl == out.strip('\n') - - # add another grantee by making the object public read - key.set_acl('public-read') - - (ret, out) = rgwadmin_rest(admin_conn, ['policy', 'show'], {'bucket' : bucket.name, 'object' : key.key}) - assert ret == 200 - - acl = key.get_xml_acl() - assert acl == out.strip('\n') - - # TESTCASE 'rm-bucket', 'bucket', 'rm', 'bucket with objects', 'succeeds' - bucket = connection.create_bucket(bucket_name) - key_name = ['eight', 'nine', 'ten', 'eleven'] - for i in range(4): - key = boto.s3.key.Key(bucket) - key.set_contents_from_string(key_name[i]) - - (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'rm'], {'bucket' : bucket_name, 'purge-objects' : True}) - assert ret == 200 - - # TESTCASE 'caps-add', 'caps', 'add', 'add user cap', 'succeeds' - caps = 'usage=read' - (ret, out) = rgwadmin_rest(admin_conn, ['caps', 'add'], {'uid' : user1, 'user-caps' : caps}) - assert ret == 200 - assert out[0]['perm'] == 'read' - - # TESTCASE 'caps-rm', 'caps', 'rm', 'remove existing cap from user', 'succeeds' - (ret, out) = rgwadmin_rest(admin_conn, ['caps', 'rm'], {'uid' : user1, 'user-caps' : caps}) - assert ret == 200 - assert not out - - # TESTCASE 'rm-user','user','rm','existing user','fails, still has buckets' - bucket = connection.create_bucket(bucket_name) - key = boto.s3.key.Key(bucket) - - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'rm'], {'uid' : user1}) - assert ret == 409 - - # TESTCASE 'rm-user2', 'user', 'rm', user with data', 'succeeds' - bucket = connection.create_bucket(bucket_name) - key = boto.s3.key.Key(bucket) - key.set_contents_from_string('twelve') - - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'rm'], {'uid' : user1, 'purge-data' : True}) - assert ret == 200 - - # TESTCASE 'rm-user3','user','info','deleted user','fails' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1}) - assert ret == 404 - diff --git a/src/ceph/qa/tasks/rbd.py b/src/ceph/qa/tasks/rbd.py deleted file mode 100644 index d45636a..0000000 --- a/src/ceph/qa/tasks/rbd.py +++ /dev/null @@ -1,612 +0,0 @@ -""" -Rbd testing task -""" -import contextlib -import logging -import os -import tempfile - -from cStringIO import StringIO -from teuthology.orchestra import run -from teuthology import misc as teuthology -from teuthology import contextutil -from teuthology.parallel import parallel -from teuthology.task.common_fs_utils import generic_mkfs -from teuthology.task.common_fs_utils import 
generic_mount -from teuthology.task.common_fs_utils import default_image_name - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def create_image(ctx, config): - """ - Create an rbd image. - - For example:: - - tasks: - - ceph: - - rbd.create_image: - client.0: - image_name: testimage - image_size: 100 - image_format: 1 - client.1: - - Image size is expressed as a number of megabytes; default value - is 10240. - - Image format value must be either 1 or 2; default value is 1. - - """ - assert isinstance(config, dict) or isinstance(config, list), \ - "task create_image only supports a list or dictionary for configuration" - - if isinstance(config, dict): - images = config.items() - else: - images = [(role, None) for role in config] - - testdir = teuthology.get_testdir(ctx) - for role, properties in images: - if properties is None: - properties = {} - name = properties.get('image_name', default_image_name(role)) - size = properties.get('image_size', 10240) - fmt = properties.get('image_format', 1) - (remote,) = ctx.cluster.only(role).remotes.keys() - log.info('Creating image {name} with size {size}'.format(name=name, - size=size)) - args = [ - 'adjust-ulimits', - 'ceph-coverage'.format(tdir=testdir), - '{tdir}/archive/coverage'.format(tdir=testdir), - 'rbd', - '-p', 'rbd', - 'create', - '--size', str(size), - name, - ] - # omit format option if using the default (format 1) - # since old versions of don't support it - if int(fmt) != 1: - args += ['--image-format', str(fmt)] - remote.run(args=args) - try: - yield - finally: - log.info('Deleting rbd images...') - for role, properties in images: - if properties is None: - properties = {} - name = properties.get('image_name', default_image_name(role)) - (remote,) = ctx.cluster.only(role).remotes.keys() - remote.run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'rbd', - '-p', 'rbd', - 'rm', - name, - ], - ) - -@contextlib.contextmanager -def clone_image(ctx, config): - """ - Clones a parent imag - - For example:: - - tasks: - - ceph: - - rbd.clone_image: - client.0: - parent_name: testimage - image_name: cloneimage - """ - assert isinstance(config, dict) or isinstance(config, list), \ - "task clone_image only supports a list or dictionary for configuration" - - if isinstance(config, dict): - images = config.items() - else: - images = [(role, None) for role in config] - - testdir = teuthology.get_testdir(ctx) - for role, properties in images: - if properties is None: - properties = {} - - name = properties.get('image_name', default_image_name(role)) - parent_name = properties.get('parent_name') - assert parent_name is not None, \ - "parent_name is required" - parent_spec = '{name}@{snap}'.format(name=parent_name, snap=name) - - (remote,) = ctx.cluster.only(role).remotes.keys() - log.info('Clone image {parent} to {child}'.format(parent=parent_name, - child=name)) - for cmd in [('snap', 'create', parent_spec), - ('snap', 'protect', parent_spec), - ('clone', parent_spec, name)]: - args = [ - 'adjust-ulimits', - 'ceph-coverage'.format(tdir=testdir), - '{tdir}/archive/coverage'.format(tdir=testdir), - 'rbd', '-p', 'rbd' - ] - args.extend(cmd) - remote.run(args=args) - - try: - yield - finally: - log.info('Deleting rbd clones...') - for role, properties in images: - if properties is None: - properties = {} - name = properties.get('image_name', default_image_name(role)) - parent_name = properties.get('parent_name') - parent_spec = '{name}@{snap}'.format(name=parent_name, snap=name) - - 
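- # (The cleanup below must run in this order: the clone is removed first, because a
- # protected snapshot cannot be unprotected while child images still reference it,
- # and only an unprotected snapshot can then be deleted with 'snap rm'.)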
(remote,) = ctx.cluster.only(role).remotes.keys() - - for cmd in [('rm', name), - ('snap', 'unprotect', parent_spec), - ('snap', 'rm', parent_spec)]: - args = [ - 'adjust-ulimits', - 'ceph-coverage'.format(tdir=testdir), - '{tdir}/archive/coverage'.format(tdir=testdir), - 'rbd', '-p', 'rbd' - ] - args.extend(cmd) - remote.run(args=args) - -@contextlib.contextmanager -def modprobe(ctx, config): - """ - Load the rbd kernel module.. - - For example:: - - tasks: - - ceph: - - rbd.create_image: [client.0] - - rbd.modprobe: [client.0] - """ - log.info('Loading rbd kernel module...') - for role in config: - (remote,) = ctx.cluster.only(role).remotes.keys() - remote.run( - args=[ - 'sudo', - 'modprobe', - 'rbd', - ], - ) - try: - yield - finally: - log.info('Unloading rbd kernel module...') - for role in config: - (remote,) = ctx.cluster.only(role).remotes.keys() - remote.run( - args=[ - 'sudo', - 'modprobe', - '-r', - 'rbd', - # force errors to be ignored; necessary if more - # than one device was created, which may mean - # the module isn't quite ready to go the first - # time through. - run.Raw('||'), - 'true', - ], - ) - -@contextlib.contextmanager -def dev_create(ctx, config): - """ - Map block devices to rbd images. - - For example:: - - tasks: - - ceph: - - rbd.create_image: [client.0] - - rbd.modprobe: [client.0] - - rbd.dev_create: - client.0: testimage.client.0 - """ - assert isinstance(config, dict) or isinstance(config, list), \ - "task dev_create only supports a list or dictionary for configuration" - - if isinstance(config, dict): - role_images = config.items() - else: - role_images = [(role, None) for role in config] - - log.info('Creating rbd block devices...') - - testdir = teuthology.get_testdir(ctx) - - for role, image in role_images: - if image is None: - image = default_image_name(role) - (remote,) = ctx.cluster.only(role).remotes.keys() - - remote.run( - args=[ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'rbd', - '--user', role.rsplit('.')[-1], - '-p', 'rbd', - 'map', - image, - run.Raw('&&'), - # wait for the symlink to be created by udev - 'while', 'test', '!', '-e', '/dev/rbd/rbd/{image}'.format(image=image), run.Raw(';'), 'do', - 'sleep', '1', run.Raw(';'), - 'done', - ], - ) - try: - yield - finally: - log.info('Unmapping rbd devices...') - for role, image in role_images: - if image is None: - image = default_image_name(role) - (remote,) = ctx.cluster.only(role).remotes.keys() - remote.run( - args=[ - 'LD_LIBRARY_PATH={tdir}/binary/usr/local/lib'.format(tdir=testdir), - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'rbd', - '-p', 'rbd', - 'unmap', - '/dev/rbd/rbd/{imgname}'.format(imgname=image), - run.Raw('&&'), - # wait for the symlink to be deleted by udev - 'while', 'test', '-e', '/dev/rbd/rbd/{image}'.format(image=image), - run.Raw(';'), - 'do', - 'sleep', '1', run.Raw(';'), - 'done', - ], - ) - - -def rbd_devname_rtn(ctx, image): - return '/dev/rbd/rbd/{image}'.format(image=image) - -def canonical_path(ctx, role, path): - """ - Determine the canonical path for a given path on the host - representing the given role. A canonical path contains no - . or .. components, and includes no symbolic links. 
- """ - version_fp = StringIO() - ctx.cluster.only(role).run( - args=[ 'readlink', '-f', path ], - stdout=version_fp, - ) - canonical_path = version_fp.getvalue().rstrip('\n') - version_fp.close() - return canonical_path - -@contextlib.contextmanager -def run_xfstests(ctx, config): - """ - Run xfstests over specified devices. - - Warning: both the test and scratch devices specified will be - overwritten. Normally xfstests modifies (but does not destroy) - the test device, but for now the run script used here re-makes - both filesystems. - - Note: Only one instance of xfstests can run on a single host at - a time, although this is not enforced. - - This task in its current form needs some improvement. For - example, it assumes all roles provided in the config are - clients, and that the config provided is a list of key/value - pairs. For now please use the xfstests() interface, below. - - For example:: - - tasks: - - ceph: - - rbd.run_xfstests: - client.0: - count: 2 - test_dev: 'test_dev' - scratch_dev: 'scratch_dev' - fs_type: 'xfs' - tests: 'generic/100 xfs/003 xfs/005 xfs/006 generic/015' - exclude: - - generic/42 - randomize: true - """ - with parallel() as p: - for role, properties in config.items(): - p.spawn(run_xfstests_one_client, ctx, role, properties) - yield - -def run_xfstests_one_client(ctx, role, properties): - """ - Spawned routine to handle xfs tests for a single client - """ - testdir = teuthology.get_testdir(ctx) - try: - count = properties.get('count') - test_dev = properties.get('test_dev') - assert test_dev is not None, \ - "task run_xfstests requires test_dev to be defined" - test_dev = canonical_path(ctx, role, test_dev) - - scratch_dev = properties.get('scratch_dev') - assert scratch_dev is not None, \ - "task run_xfstests requires scratch_dev to be defined" - scratch_dev = canonical_path(ctx, role, scratch_dev) - - fs_type = properties.get('fs_type') - tests = properties.get('tests') - exclude_list = properties.get('exclude') - randomize = properties.get('randomize') - - (remote,) = ctx.cluster.only(role).remotes.keys() - - # Fetch the test script - test_root = teuthology.get_testdir(ctx) - test_script = 'run_xfstests.sh' - test_path = os.path.join(test_root, test_script) - - xfstests_url = properties.get('xfstests_url') - assert xfstests_url is not None, \ - "task run_xfstests requires xfstests_url to be defined" - - xfstests_krbd_url = xfstests_url + '/' + test_script - - log.info('Fetching {script} for {role} from {url}'.format( - script=test_script, - role=role, - url=xfstests_krbd_url)) - - args = [ 'wget', '-O', test_path, '--', xfstests_krbd_url ] - remote.run(args=args) - - log.info('Running xfstests on {role}:'.format(role=role)) - log.info(' iteration count: {count}:'.format(count=count)) - log.info(' test device: {dev}'.format(dev=test_dev)) - log.info(' scratch device: {dev}'.format(dev=scratch_dev)) - log.info(' using fs_type: {fs_type}'.format(fs_type=fs_type)) - log.info(' tests to run: {tests}'.format(tests=tests)) - log.info(' exclude list: {}'.format(' '.join(exclude_list))) - log.info(' randomize: {randomize}'.format(randomize=randomize)) - - if exclude_list: - with tempfile.NamedTemporaryFile(bufsize=0, prefix='exclude') as exclude_file: - for test in exclude_list: - exclude_file.write("{}\n".format(test)) - remote.put_file(exclude_file.name, exclude_file.name) - - # Note that the device paths are interpreted using - # readlink -f <path> in order to get their canonical - # pathname (so it matches what the kernel remembers). 
- args = [ - '/usr/bin/sudo', - 'TESTDIR={tdir}'.format(tdir=testdir), - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - '/bin/bash', - test_path, - '-c', str(count), - '-f', fs_type, - '-t', test_dev, - '-s', scratch_dev, - ] - if exclude_list: - args.extend(['-x', exclude_file.name]) - if randomize: - args.append('-r') - if tests: - args.extend(['--', tests]) - remote.run(args=args, logger=log.getChild(role)) - finally: - log.info('Removing {script} on {role}'.format(script=test_script, - role=role)) - remote.run(args=['rm', '-f', test_path]) - -@contextlib.contextmanager -def xfstests(ctx, config): - """ - Run xfstests over rbd devices. This interface sets up all - required configuration automatically if not otherwise specified. - Note that only one instance of xfstests can run on a single host - at a time. By default, the set of tests specified is run once. - If a (non-zero) count value is supplied, the complete set of - tests will be run that number of times. - - For example:: - - tasks: - - ceph: - # Image sizes are in MB - - rbd.xfstests: - client.0: - count: 3 - test_image: 'test_image' - test_size: 250 - test_format: 2 - scratch_image: 'scratch_image' - scratch_size: 250 - scratch_format: 1 - fs_type: 'xfs' - tests: 'generic/100 xfs/003 xfs/005 xfs/006 generic/015' - exclude: - - generic/42 - randomize: true - xfstests_branch: master - xfstests_url: 'https://raw.github.com/ceph/branch/master/qa' - """ - if config is None: - config = { 'all': None } - assert isinstance(config, dict) or isinstance(config, list), \ - "task xfstests only supports a list or dictionary for configuration" - if isinstance(config, dict): - config = teuthology.replace_all_with_clients(ctx.cluster, config) - runs = config.items() - else: - runs = [(role, None) for role in config] - - running_xfstests = {} - for role, properties in runs: - assert role.startswith('client.'), \ - "task xfstests can only run on client nodes" - for host, roles_for_host in ctx.cluster.remotes.items(): - if role in roles_for_host: - assert host not in running_xfstests, \ - "task xfstests allows only one instance at a time per host" - running_xfstests[host] = True - - images_config = {} - scratch_config = {} - modprobe_config = {} - image_map_config = {} - scratch_map_config = {} - xfstests_config = {} - for role, properties in runs: - if properties is None: - properties = {} - - test_image = properties.get('test_image', 'test_image.{role}'.format(role=role)) - test_size = properties.get('test_size', 10000) # 10G - test_fmt = properties.get('test_format', 1) - scratch_image = properties.get('scratch_image', 'scratch_image.{role}'.format(role=role)) - scratch_size = properties.get('scratch_size', 10000) # 10G - scratch_fmt = properties.get('scratch_format', 1) - - images_config[role] = dict( - image_name=test_image, - image_size=test_size, - image_format=test_fmt, - ) - - scratch_config[role] = dict( - image_name=scratch_image, - image_size=scratch_size, - image_format=scratch_fmt, - ) - - xfstests_branch = properties.get('xfstests_branch', 'master') - xfstests_url = properties.get('xfstests_url', 'https://raw.github.com/ceph/ceph/{branch}/qa'.format(branch=xfstests_branch)) - - xfstests_config[role] = dict( - count=properties.get('count', 1), - test_dev='/dev/rbd/rbd/{image}'.format(image=test_image), - scratch_dev='/dev/rbd/rbd/{image}'.format(image=scratch_image), - fs_type=properties.get('fs_type', 'xfs'), - randomize=properties.get('randomize', False), - tests=properties.get('tests'), - 
exclude=properties.get('exclude', []), - xfstests_url=xfstests_url, - ) - - log.info('Setting up xfstests using RBD images:') - log.info(' test ({size} MB): {image}'.format(size=test_size, - image=test_image)) - log.info(' scratch ({size} MB): {image}'.format(size=scratch_size, - image=scratch_image)) - modprobe_config[role] = None - image_map_config[role] = test_image - scratch_map_config[role] = scratch_image - - with contextutil.nested( - lambda: create_image(ctx=ctx, config=images_config), - lambda: create_image(ctx=ctx, config=scratch_config), - lambda: modprobe(ctx=ctx, config=modprobe_config), - lambda: dev_create(ctx=ctx, config=image_map_config), - lambda: dev_create(ctx=ctx, config=scratch_map_config), - lambda: run_xfstests(ctx=ctx, config=xfstests_config), - ): - yield - - -@contextlib.contextmanager -def task(ctx, config): - """ - Create and mount an rbd image. - - For example, you can specify which clients to run on:: - - tasks: - - ceph: - - rbd: [client.0, client.1] - - There are a few image options:: - - tasks: - - ceph: - - rbd: - client.0: # uses defaults - client.1: - image_name: foo - image_size: 2048 - image_format: 2 - fs_type: xfs - - To use default options on all clients:: - - tasks: - - ceph: - - rbd: - all: - - To create 20GiB images and format them with xfs on all clients:: - - tasks: - - ceph: - - rbd: - all: - image_size: 20480 - fs_type: xfs - """ - if config is None: - config = { 'all': None } - norm_config = config - if isinstance(config, dict): - norm_config = teuthology.replace_all_with_clients(ctx.cluster, config) - if isinstance(norm_config, dict): - role_images = {} - for role, properties in norm_config.iteritems(): - if properties is None: - properties = {} - role_images[role] = properties.get('image_name') - else: - role_images = norm_config - - log.debug('rbd config is: %s', norm_config) - - with contextutil.nested( - lambda: create_image(ctx=ctx, config=norm_config), - lambda: modprobe(ctx=ctx, config=norm_config), - lambda: dev_create(ctx=ctx, config=role_images), - lambda: generic_mkfs(ctx=ctx, config=norm_config, - devname_rtn=rbd_devname_rtn), - lambda: generic_mount(ctx=ctx, config=role_images, - devname_rtn=rbd_devname_rtn), - ): - yield diff --git a/src/ceph/qa/tasks/rbd_fio.py b/src/ceph/qa/tasks/rbd_fio.py deleted file mode 100644 index 663e8f5..0000000 --- a/src/ceph/qa/tasks/rbd_fio.py +++ /dev/null @@ -1,226 +0,0 @@ -""" - Long running fio tests on rbd mapped devices for format/features provided in config - Many fio parameters can be configured so that this task can be used along with thrash/power-cut tests - and exercise IO on full disk for all format/features - - This test should not be run on VM due to heavy use of resource - -""" -import contextlib -import json -import logging -import os -import StringIO - -from teuthology.parallel import parallel -from teuthology import misc as teuthology -from tempfile import NamedTemporaryFile -from teuthology.orchestra import run -from teuthology.packaging import install_package, remove_package - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - client.0: - fio-io-size: 100g or 80% or 100m - fio-version: 2.2.9 - formats: [2] - features: [[layering],[striping],[layering,exclusive-lock,object-map]] - test-clone-io: 1 #remove this option to not run create rbd clone and not run io on clone - io-engine: "sync or rbd or any io-engine" - rw: randrw - client.1: - fio-io-size: 100g - fio-version: 2.2.9 - rw: read - image-size:20480 - -or - all: - 
fio-io-size: 400g - rw: randrw - formats: [2] - features: [[layering],[striping]] - io-engine: libaio - - Create rbd image + device and exercise IO for format/features provided in config file - Config can be per client or one config can be used for all clients, fio jobs are run in parallel for client provided - - """ - if config.get('all'): - client_config = config['all'] - clients = ctx.cluster.only(teuthology.is_type('client')) - rbd_test_dir = teuthology.get_testdir(ctx) + "/rbd_fio_test" - for remote,role in clients.remotes.iteritems(): - if 'client_config' in locals(): - with parallel() as p: - p.spawn(run_fio, remote, client_config, rbd_test_dir) - else: - for client_config in config: - if client_config in role: - with parallel() as p: - p.spawn(run_fio, remote, config[client_config], rbd_test_dir) - - yield - - -def get_ioengine_package_name(ioengine, remote): - system_type = teuthology.get_system_type(remote) - if ioengine == 'rbd': - return 'librbd1-devel' if system_type == 'rpm' else 'librbd-dev' - elif ioengine == 'libaio': - return 'libaio-devel' if system_type == 'rpm' else 'libaio-dev' - else: - return None - - -def run_rbd_map(remote, image, iodepth): - iodepth = max(iodepth, 128) # RBD_QUEUE_DEPTH_DEFAULT - out = StringIO.StringIO() - remote.run(args=['sudo', 'rbd', 'map', '-o', 'queue_depth={}'.format(iodepth), image], stdout=out) - dev = out.getvalue().rstrip('\n') - teuthology.sudo_write_file( - remote, - '/sys/block/{}/queue/nr_requests'.format(os.path.basename(dev)), - str(iodepth)) - return dev - - -def run_fio(remote, config, rbd_test_dir): - """ - create fio config file with options based on above config - get the fio from github, generate binary, and use it to run on - the generated fio config file - """ - fio_config=NamedTemporaryFile(prefix='fio_rbd_', dir='/tmp/', delete=False) - fio_config.write('[global]\n') - if config.get('io-engine'): - ioengine=config['io-engine'] - fio_config.write('ioengine={ioe}\n'.format(ioe=ioengine)) - else: - fio_config.write('ioengine=sync\n') - if config.get('bs'): - bs=config['bs'] - fio_config.write('bs={bs}\n'.format(bs=bs)) - else: - fio_config.write('bs=4k\n') - iodepth = config.get('io-depth', 2) - fio_config.write('iodepth={iod}\n'.format(iod=iodepth)) - if config.get('fio-io-size'): - size=config['fio-io-size'] - fio_config.write('size={size}\n'.format(size=size)) - else: - fio_config.write('size=100m\n') - - fio_config.write('time_based\n') - if config.get('runtime'): - runtime=config['runtime'] - fio_config.write('runtime={runtime}\n'.format(runtime=runtime)) - else: - fio_config.write('runtime=1800\n') - fio_config.write('allow_file_create=0\n') - image_size=10240 - if config.get('image_size'): - image_size=config['image_size'] - - formats=[1,2] - features=[['layering'],['striping'],['exclusive-lock','object-map']] - fio_version='2.21' - if config.get('formats'): - formats=config['formats'] - if config.get('features'): - features=config['features'] - if config.get('fio-version'): - fio_version=config['fio-version'] - - # handle package required for ioengine, if any - sn=remote.shortname - ioengine_pkg = get_ioengine_package_name(ioengine, remote) - if ioengine_pkg: - install_package(ioengine_pkg, remote) - - fio_config.write('norandommap\n') - if ioengine == 'rbd': - fio_config.write('clientname=admin\n') - fio_config.write('pool=rbd\n') - fio_config.write('invalidate=0\n') - elif ioengine == 'libaio': - fio_config.write('direct=1\n') - for frmt in formats: - for feature in features: - log.info("Creating rbd images on 
{sn}".format(sn=sn)) - feature_name = '-'.join(feature) - rbd_name = 'i{i}f{f}{sn}'.format(i=frmt,f=feature_name,sn=sn) - rbd_snap_name = 'i{i}f{f}{sn}@i{i}f{f}{sn}Snap'.format(i=frmt,f=feature_name,sn=sn) - rbd_clone_name = 'i{i}f{f}{sn}Clone'.format(i=frmt,f=feature_name,sn=sn) - create_args=['rbd', 'create', - '--size', '{size}'.format(size=image_size), - '--image', rbd_name, - '--image-format', '{f}'.format(f=frmt)] - map(lambda x: create_args.extend(['--image-feature', x]), feature) - remote.run(args=create_args) - remote.run(args=['rbd', 'info', rbd_name]) - if ioengine != 'rbd': - rbd_dev = run_rbd_map(remote, rbd_name, iodepth) - if config.get('test-clone-io'): - log.info("Testing clones using fio") - remote.run(args=['rbd', 'snap', 'create', rbd_snap_name]) - remote.run(args=['rbd', 'snap', 'protect', rbd_snap_name]) - remote.run(args=['rbd', 'clone', rbd_snap_name, rbd_clone_name]) - rbd_clone_dev = run_rbd_map(remote, rbd_clone_name, iodepth) - fio_config.write('[{rbd_dev}]\n'.format(rbd_dev=rbd_dev)) - if config.get('rw'): - rw=config['rw'] - fio_config.write('rw={rw}\n'.format(rw=rw)) - else: - fio_config .write('rw=randrw\n') - fio_config.write('filename={rbd_dev}\n'.format(rbd_dev=rbd_dev)) - if config.get('test-clone-io'): - fio_config.write('[{rbd_clone_dev}]\n'.format(rbd_clone_dev=rbd_clone_dev)) - fio_config.write('rw={rw}\n'.format(rw=rw)) - fio_config.write('filename={rbd_clone_dev}\n'.format(rbd_clone_dev=rbd_clone_dev)) - else: - if config.get('test-clone-io'): - log.info("Testing clones using fio") - remote.run(args=['rbd', 'snap', 'create', rbd_snap_name]) - remote.run(args=['rbd', 'snap', 'protect', rbd_snap_name]) - remote.run(args=['rbd', 'clone', rbd_snap_name, rbd_clone_name]) - fio_config.write('[{img_name}]\n'.format(img_name=rbd_name)) - if config.get('rw'): - rw=config['rw'] - fio_config.write('rw={rw}\n'.format(rw=rw)) - else: - fio_config.write('rw=randrw\n') - fio_config.write('rbdname={img_name}\n'.format(img_name=rbd_name)) - if config.get('test-clone-io'): - fio_config.write('[{clone_img_name}]\n'.format(clone_img_name=rbd_clone_name)) - fio_config.write('rw={rw}\n'.format(rw=rw)) - fio_config.write('rbdname={clone_img_name}\n'.format(clone_img_name=rbd_clone_name)) - - - fio_config.close() - remote.put_file(fio_config.name,fio_config.name) - try: - log.info("Running rbd feature - fio test on {sn}".format(sn=sn)) - fio = "https://github.com/axboe/fio/archive/fio-" + fio_version + ".tar.gz" - remote.run(args=['mkdir', run.Raw(rbd_test_dir),]) - remote.run(args=['cd' , run.Raw(rbd_test_dir), - run.Raw(';'), 'wget' , fio , run.Raw(';'), run.Raw('tar -xvf fio*tar.gz'), run.Raw(';'), - run.Raw('cd fio-fio*'), 'configure', run.Raw(';') ,'make']) - remote.run(args=['ceph', '-s']) - remote.run(args=[run.Raw('{tdir}/fio-fio-{v}/fio --showcmd {f}'.format(tdir=rbd_test_dir,v=fio_version,f=fio_config.name))]) - remote.run(args=['sudo', run.Raw('{tdir}/fio-fio-{v}/fio {f}'.format(tdir=rbd_test_dir,v=fio_version,f=fio_config.name))]) - remote.run(args=['ceph', '-s']) - finally: - out=StringIO.StringIO() - remote.run(args=['rbd','showmapped', '--format=json'], stdout=out) - mapped_images = json.loads(out.getvalue()) - if mapped_images: - log.info("Unmapping rbd images on {sn}".format(sn=sn)) - for image in mapped_images.itervalues(): - remote.run(args=['sudo', 'rbd', 'unmap', str(image['device'])]) - log.info("Cleaning up fio install") - remote.run(args=['rm','-rf', run.Raw(rbd_test_dir)]) - if ioengine_pkg: - remove_package(ioengine_pkg, remote) diff --git 
a/src/ceph/qa/tasks/rbd_fsx.py b/src/ceph/qa/tasks/rbd_fsx.py deleted file mode 100644 index ab1a47f..0000000 --- a/src/ceph/qa/tasks/rbd_fsx.py +++ /dev/null @@ -1,102 +0,0 @@ -""" -Run fsx on an rbd image -""" -import contextlib -import logging - -from teuthology.parallel import parallel -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - Run fsx on an rbd image. - - Currently this requires running as client.admin - to create a pool. - - Specify which clients to run on as a list:: - - tasks: - ceph: - rbd_fsx: - clients: [client.0, client.1] - - You can optionally change some properties of fsx: - - tasks: - ceph: - rbd_fsx: - clients: <list of clients> - seed: <random seed number, or 0 to use the time> - ops: <number of operations to do> - size: <maximum image size in bytes> - valgrind: [--tool=<valgrind tool>] - """ - log.info('starting rbd_fsx...') - with parallel() as p: - for role in config['clients']: - p.spawn(_run_one_client, ctx, config, role) - yield - -def _run_one_client(ctx, config, role): - """Spawned task that runs the client""" - krbd = config.get('krbd', False) - nbd = config.get('nbd', False) - testdir = teuthology.get_testdir(ctx) - (remote,) = ctx.cluster.only(role).remotes.iterkeys() - - args = [] - if krbd or nbd: - args.append('sudo') # rbd(-nbd) map/unmap need privileges - args.extend([ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir) - ]) - - overrides = ctx.config.get('overrides', {}) - teuthology.deep_merge(config, overrides.get('rbd_fsx', {})) - - if config.get('valgrind'): - args = teuthology.get_valgrind_args( - testdir, - 'fsx_{id}'.format(id=role), - args, - config.get('valgrind') - ) - - args.extend([ - 'ceph_test_librbd_fsx', - '-d', # debug output for all operations - '-W', '-R', # mmap doesn't work with rbd - '-p', str(config.get('progress_interval', 100)), # show progress - '-P', '{tdir}/archive'.format(tdir=testdir), - '-r', str(config.get('readbdy',1)), - '-w', str(config.get('writebdy',1)), - '-t', str(config.get('truncbdy',1)), - '-h', str(config.get('holebdy',1)), - '-l', str(config.get('size', 250000000)), - '-S', str(config.get('seed', 0)), - '-N', str(config.get('ops', 1000)), - ]) - if krbd: - args.append('-K') # -K enables krbd mode - if nbd: - args.append('-M') # -M enables nbd mode - if config.get('direct_io', False): - args.append('-Z') # -Z use direct IO - if not config.get('randomized_striping', True): - args.append('-U') # -U disables randomized striping - if not config.get('punch_holes', True): - args.append('-H') # -H disables discard ops - if config.get('journal_replay', False): - args.append('-j') # -j replay all IO events from journal - args.extend([ - 'pool_{pool}'.format(pool=role), - 'image_{image}'.format(image=role), - ]) - - remote.run(args=args) diff --git a/src/ceph/qa/tasks/rbd_mirror.py b/src/ceph/qa/tasks/rbd_mirror.py deleted file mode 100644 index 851b64f..0000000 --- a/src/ceph/qa/tasks/rbd_mirror.py +++ /dev/null @@ -1,117 +0,0 @@ -""" -Task for running rbd mirroring daemons and configuring mirroring -""" - -import logging - -from teuthology.orchestra import run -from teuthology import misc -from teuthology.exceptions import ConfigError -from teuthology.task import Task -from util import get_remote_for_role - -log = logging.getLogger(__name__) - - -class RBDMirror(Task): - """ - Run an rbd-mirror daemon to sync rbd images between clusters. 
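To make the flag handling in the rbd_fsx helper above easier to follow, this is roughly the argument list _run_one_client() assembles for a krbd run with the default knobs; the role and test directory values are hypothetical placeholders:

    role = 'client.0'                    # hypothetical
    testdir = '/home/ubuntu/cephtest'    # hypothetical

    args = [
        'sudo',                          # krbd map/unmap needs privileges
        'adjust-ulimits', 'ceph-coverage', testdir + '/archive/coverage',
        'ceph_test_librbd_fsx',
        '-d',                            # debug output for all operations
        '-W', '-R',                      # mmap doesn't work with rbd
        '-p', '100',                     # progress interval
        '-P', testdir + '/archive',
        '-r', '1', '-w', '1', '-t', '1', '-h', '1',
        '-l', '250000000',               # maximum image size in bytes
        '-S', '0',                       # seed (0 = use the time)
        '-N', '1000',                    # number of operations
        '-K',                            # krbd mode
        'pool_' + role,
        'image_' + role,
    ]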
- - This requires two clients (one from each cluster) on the same host - to connect with. The pool configuration should be adjusted by later - test scripts to include the remote client and cluster name. This task - just needs to know how to connect to the local cluster. - - For example: - - roles: - - [primary.mon.a, primary.osd.0, primary.osd.1, primary.osd.2] - - [secondary.mon.a, secondary.osd.0, secondary.osd.1, secondary.osd.2] - - [primary.client.mirror, secondary.client.mirror] - tasks: - - ceph: - cluster: primary - - ceph: - cluster: secondary - - rbd-mirror: - client: primary.client.mirror - - To mirror back to the primary cluster as well, add another - rbd_mirror instance: - - - rbd-mirror: - client: secondary.client.mirror - - Possible options for this task are: - - client: role - ceph client to connect as - valgrind: [--tool=<valgrind tool>] - none by default - coverage: bool - whether this run may be collecting coverage data - """ - def __init__(self, ctx, config): - super(RBDMirror, self).__init__(ctx, config) - self.log = log - - def setup(self): - super(RBDMirror, self).setup() - try: - self.client = self.config['client'] - except KeyError: - raise ConfigError('rbd-mirror requires a client to connect with') - - self.cluster_name, type_, self.client_id = misc.split_role(self.client) - - if type_ != 'client': - msg = 'client role ({0}) must be a client'.format(self.client) - raise ConfigError(msg) - - self.remote = get_remote_for_role(self.ctx, self.client) - - def begin(self): - super(RBDMirror, self).begin() - testdir = misc.get_testdir(self.ctx) - daemon_signal = 'kill' - if 'coverage' in self.config or 'valgrind' in self.config: - daemon_signal = 'term' - - args = [ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'daemon-helper', - daemon_signal, - ] - - if 'valgrind' in self.config: - args = misc.get_valgrind_args( - testdir, - 'rbd-mirror-{id}'.format(id=self.client), - args, - self.config.get('valgrind') - ) - - args.extend([ - 'rbd-mirror', '--foreground', - '--cluster', - self.cluster_name, - '--id', - self.client_id, - ]) - - self.ctx.daemons.add_daemon( - self.remote, 'rbd-mirror', self.client, - cluster=self.cluster_name, - args=args, - logger=self.log.getChild(self.client), - stdin=run.PIPE, - wait=False, - ) - - def end(self): - mirror_daemon = self.ctx.daemons.get_daemon('rbd-mirror', - self.client, - self.cluster_name) - mirror_daemon.stop() - super(RBDMirror, self).end() - -task = RBDMirror diff --git a/src/ceph/qa/tasks/rebuild_mondb.py b/src/ceph/qa/tasks/rebuild_mondb.py deleted file mode 100644 index 900bd16..0000000 --- a/src/ceph/qa/tasks/rebuild_mondb.py +++ /dev/null @@ -1,216 +0,0 @@ -""" -Test if we can recover the leveldb from OSD after where all leveldbs are -corrupted -""" - -import logging -import os.path -import shutil -import tempfile - -import ceph_manager -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - - -def _push_directory(path, remote, remote_dir): - """ - local_temp_path=`mktemp` - tar czf $local_temp_path $path - ssh remote mkdir -p remote_dir - remote_temp_path=`mktemp` - scp $local_temp_path $remote_temp_path - rm $local_temp_path - tar xzf $remote_temp_path -C $remote_dir - ssh remote:$remote_temp_path - """ - fd, local_temp_path = tempfile.mkstemp(suffix='.tgz', - prefix='rebuild_mondb-') - os.close(fd) - cmd = ' '.join(['tar', 'cz', - '-f', local_temp_path, - '-C', path, - '--', '.']) - teuthology.sh(cmd) - _, fname = os.path.split(local_temp_path) - fd, 
remote_temp_path = tempfile.mkstemp(suffix='.tgz', - prefix='rebuild_mondb-') - os.close(fd) - remote.put_file(local_temp_path, remote_temp_path) - os.remove(local_temp_path) - remote.run(args=['sudo', - 'tar', 'xz', - '-C', remote_dir, - '-f', remote_temp_path]) - remote.run(args=['sudo', 'rm', '-fr', remote_temp_path]) - - -def _nuke_mons(manager, mons, mon_id): - assert mons - is_mon = teuthology.is_type('mon') - for remote, roles in mons.remotes.iteritems(): - for role in roles: - if not is_mon(role): - continue - cluster, _, m = teuthology.split_role(role) - log.info('killing {cluster}:mon.{mon}'.format( - cluster=cluster, - mon=m)) - manager.kill_mon(m) - mon_data = os.path.join('/var/lib/ceph/mon/', - '{0}-{1}'.format(cluster, m)) - if m == mon_id: - # so we will only need to recreate the store.db for the - # first mon, would be easier than mkfs on it then replace - # the its store.db with the recovered one - store_dir = os.path.join(mon_data, 'store.db') - remote.run(args=['sudo', 'rm', '-r', store_dir]) - else: - remote.run(args=['sudo', 'rm', '-r', mon_data]) - - -def _rebuild_db(ctx, manager, cluster_name, mon, mon_id, keyring_path): - local_mstore = tempfile.mkdtemp() - - # collect the maps from all OSDs - is_osd = teuthology.is_type('osd') - osds = ctx.cluster.only(is_osd) - assert osds - for osd, roles in osds.remotes.iteritems(): - for role in roles: - if not is_osd(role): - continue - cluster, _, osd_id = teuthology.split_role(role) - assert cluster_name == cluster - log.info('collecting maps from {cluster}:osd.{osd}'.format( - cluster=cluster, - osd=osd_id)) - # push leveldb to OSD - osd_mstore = os.path.join(teuthology.get_testdir(ctx), 'mon-store') - osd.run(args=['sudo', 'mkdir', '-m', 'o+x', '-p', osd_mstore]) - - _push_directory(local_mstore, osd, osd_mstore) - log.info('rm -rf {0}'.format(local_mstore)) - shutil.rmtree(local_mstore) - # update leveldb with OSD data - options = '--op update-mon-db --mon-store-path {0}' - log.info('cot {0}'.format(osd_mstore)) - manager.objectstore_tool(pool=None, - options=options.format(osd_mstore), - args='', - osd=osd_id, - do_revive=False) - # pull the updated mon db - log.info('pull dir {0} -> {1}'.format(osd_mstore, local_mstore)) - local_mstore = tempfile.mkdtemp() - teuthology.pull_directory(osd, osd_mstore, local_mstore) - log.info('rm -rf osd:{0}'.format(osd_mstore)) - osd.run(args=['sudo', 'rm', '-fr', osd_mstore]) - - # recover the first_mon with re-built mon db - # pull from recovered leveldb from client - mon_store_dir = os.path.join('/var/lib/ceph/mon', - '{0}-{1}'.format(cluster_name, mon_id)) - _push_directory(local_mstore, mon, mon_store_dir) - mon.run(args=['sudo', 'chown', '-R', 'ceph:ceph', mon_store_dir]) - shutil.rmtree(local_mstore) - - # fill up the caps in the keyring file - mon.run(args=['sudo', - 'ceph-authtool', keyring_path, - '-n', 'mon.', - '--cap', 'mon', 'allow *']) - mon.run(args=['sudo', - 'ceph-authtool', keyring_path, - '-n', 'client.admin', - '--cap', 'mon', 'allow *', - '--cap', 'osd', 'allow *', - '--cap', 'mds', 'allow *', - '--cap', 'mgr', 'allow *']) - mon.run(args=['sudo', '-u', 'ceph', - 'ceph-monstore-tool', mon_store_dir, - 'rebuild', '--', '--keyring', - keyring_path]) - - -def _revive_mons(manager, mons, recovered, keyring_path): - # revive monitors - # the initial monmap is in the ceph.conf, so we are good. 
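Condensed, the recovery that _rebuild_db() automates across hosts amounts to three steps. A single-host sketch, with hypothetical paths and OSD ids, run with the OSDs stopped while ceph-objectstore-tool touches their stores:

    import subprocess

    mon_store = '/tmp/mon-store'            # scratch store being rebuilt
    keyring = '/etc/ceph/ceph.keyring'      # keyring that receives the caps

    # 1. fold every OSD's copy of the cluster maps into the scratch store
    for osd_id in (0, 1, 2):                # hypothetical OSD ids
        subprocess.check_call([
            'sudo', 'ceph-objectstore-tool',
            '--data-path', '/var/lib/ceph/osd/ceph-%d' % osd_id,
            '--op', 'update-mon-db',
            '--mon-store-path', mon_store])

    # 2. make sure mon. (and client.admin, omitted here) carry the expected caps
    subprocess.check_call(['sudo', 'ceph-authtool', keyring,
                           '-n', 'mon.', '--cap', 'mon', 'allow *'])

    # 3. rebuild store.db from the collected maps
    subprocess.check_call(['sudo', 'ceph-monstore-tool', mon_store,
                           'rebuild', '--', '--keyring', keyring])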
- n_mons = 0 - is_mon = teuthology.is_type('mon') - for remote, roles in mons.remotes.iteritems(): - for role in roles: - if not is_mon(role): - continue - cluster, _, m = teuthology.split_role(role) - if recovered != m: - log.info('running mkfs on {cluster}:mon.{mon}'.format( - cluster=cluster, - mon=m)) - remote.run( - args=[ - 'sudo', - 'ceph-mon', - '--cluster', cluster, - '--mkfs', - '-i', m, - '--keyring', keyring_path]) - log.info('reviving mon.{0}'.format(m)) - manager.revive_mon(m) - n_mons += 1 - manager.wait_for_mon_quorum_size(n_mons, timeout=30) - - -def _revive_mgrs(ctx, manager): - is_mgr = teuthology.is_type('mgr') - mgrs = ctx.cluster.only(is_mgr) - for _, roles in mgrs.remotes.iteritems(): - for role in roles: - if not is_mgr(role): - continue - _, _, mgr_id = teuthology.split_role(role) - log.info('reviving mgr.{0}'.format(mgr_id)) - manager.revive_mgr(mgr_id) - - -def _revive_osds(ctx, manager): - is_osd = teuthology.is_type('osd') - osds = ctx.cluster.only(is_osd) - for _, roles in osds.remotes.iteritems(): - for role in roles: - if not is_osd(role): - continue - _, _, osd_id = teuthology.split_role(role) - log.info('reviving osd.{0}'.format(osd_id)) - manager.revive_osd(osd_id) - - -def task(ctx, config): - """ - Test monitor recovery from OSD - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'task only accepts a dict for configuration' - - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager')) - - mons = ctx.cluster.only(teuthology.is_type('mon')) - # note down the first cluster_name and mon_id - # we will recover it later on - cluster_name, _, mon_id = teuthology.split_role(first_mon) - _nuke_mons(manager, mons, mon_id) - default_keyring = '/etc/ceph/{cluster}.keyring'.format( - cluster=cluster_name) - keyring_path = config.get('keyring_path', default_keyring) - _rebuild_db(ctx, manager, cluster_name, mon, mon_id, keyring_path) - _revive_mons(manager, mons, mon_id, keyring_path) - _revive_mgrs(ctx, manager) - _revive_osds(ctx, manager) diff --git a/src/ceph/qa/tasks/recovery_bench.py b/src/ceph/qa/tasks/recovery_bench.py deleted file mode 100644 index 5eb9fd2..0000000 --- a/src/ceph/qa/tasks/recovery_bench.py +++ /dev/null @@ -1,208 +0,0 @@ -""" -Recovery system benchmarking -""" -from cStringIO import StringIO - -import contextlib -import gevent -import json -import logging -import random -import time - -import ceph_manager -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - Benchmark the recovery system. - - Generates objects with smalliobench, runs it normally to get a - baseline performance measurement, then marks an OSD out and reruns - to measure performance during recovery. 
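The non-recovered monitors above are simply recreated empty before being revived, after which they join quorum from the rebuilt one. The mkfs invocation is equivalent to running the following on each of them; cluster name, mon id and keyring path are hypothetical placeholders:

    import subprocess

    subprocess.check_call([
        'sudo', 'ceph-mon',
        '--cluster', 'ceph',            # hypothetical cluster name
        '--mkfs',
        '-i', 'b',                      # hypothetical mon id
        '--keyring', '/etc/ceph/ceph.keyring'])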
- - The config should be as follows: - - recovery_bench: - duration: <seconds for each measurement run> - num_objects: <number of objects> - io_size: <io size in bytes> - - example: - - tasks: - - ceph: - - recovery_bench: - duration: 60 - num_objects: 500 - io_size: 4096 - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'recovery_bench task only accepts a dict for configuration' - - log.info('Beginning recovery bench...') - - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd') - while len(manager.get_osd_status()['up']) < num_osds: - time.sleep(10) - - bench_proc = RecoveryBencher( - manager, - config, - ) - try: - yield - finally: - log.info('joining recovery bencher') - bench_proc.do_join() - -class RecoveryBencher: - """ - RecoveryBencher - """ - def __init__(self, manager, config): - self.ceph_manager = manager - self.ceph_manager.wait_for_clean() - - osd_status = self.ceph_manager.get_osd_status() - self.osds = osd_status['up'] - - self.config = config - if self.config is None: - self.config = dict() - - else: - def tmp(x): - """ - Local wrapper to print value. - """ - print x - self.log = tmp - - log.info("spawning thread") - - self.thread = gevent.spawn(self.do_bench) - - def do_join(self): - """ - Join the recovery bencher. This is called after the main - task exits. - """ - self.thread.get() - - def do_bench(self): - """ - Do the benchmarking. - """ - duration = self.config.get("duration", 60) - num_objects = self.config.get("num_objects", 500) - io_size = self.config.get("io_size", 4096) - - osd = str(random.choice(self.osds)) - (osd_remote,) = self.ceph_manager.ctx.cluster.only('osd.%s' % osd).remotes.iterkeys() - - testdir = teuthology.get_testdir(self.ceph_manager.ctx) - - # create the objects - osd_remote.run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'smalliobench'.format(tdir=testdir), - '--use-prefix', 'recovery_bench', - '--init-only', '1', - '--num-objects', str(num_objects), - '--io-size', str(io_size), - ], - wait=True, - ) - - # baseline bench - log.info('non-recovery (baseline)') - p = osd_remote.run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'smalliobench', - '--use-prefix', 'recovery_bench', - '--do-not-init', '1', - '--duration', str(duration), - '--io-size', str(io_size), - ], - stdout=StringIO(), - stderr=StringIO(), - wait=True, - ) - self.process_samples(p.stderr.getvalue()) - - self.ceph_manager.raw_cluster_cmd('osd', 'out', osd) - time.sleep(5) - - # recovery bench - log.info('recovery active') - p = osd_remote.run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'smalliobench', - '--use-prefix', 'recovery_bench', - '--do-not-init', '1', - '--duration', str(duration), - '--io-size', str(io_size), - ], - stdout=StringIO(), - stderr=StringIO(), - wait=True, - ) - self.process_samples(p.stderr.getvalue()) - - self.ceph_manager.raw_cluster_cmd('osd', 'in', osd) - - def process_samples(self, input): - """ - Extract samples from the input and process the results - - :param input: input lines in JSON format - """ - lat = {} - for line in input.split('\n'): - try: - sample = json.loads(line) - samples = lat.setdefault(sample['type'], []) - 
samples.append(float(sample['latency'])) - except Exception: - pass - - for type in lat: - samples = lat[type] - samples.sort() - - num = len(samples) - - # median - if num & 1 == 1: # odd number of samples - median = samples[num / 2] - else: - median = (samples[num / 2] + samples[num / 2 - 1]) / 2 - - # 99% - ninety_nine = samples[int(num * 0.99)] - - log.info("%s: median %f, 99%% %f" % (type, median, ninety_nine)) diff --git a/src/ceph/qa/tasks/reg11184.py b/src/ceph/qa/tasks/reg11184.py deleted file mode 100644 index f248623..0000000 --- a/src/ceph/qa/tasks/reg11184.py +++ /dev/null @@ -1,241 +0,0 @@ -""" -Special regression test for tracker #11184 - -Synopsis: osd/SnapMapper.cc: 282: FAILED assert(check(oid)) - -This is accomplished by moving a pg that wasn't part of split and still include -divergent priors. -""" -import logging -import time -from cStringIO import StringIO - -from teuthology.orchestra import run -from teuthology import misc as teuthology -from util.rados import rados -import os - - -log = logging.getLogger(__name__) - - -def task(ctx, config): - """ - Test handling of divergent entries during export / import - to regression test tracker #11184 - - overrides: - ceph: - conf: - osd: - debug osd: 5 - - Requires 3 osds on a single test node. - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'divergent_priors task only accepts a dict for configuration' - - manager = ctx.managers['ceph'] - - while len(manager.get_osd_status()['up']) < 3: - time.sleep(10) - osds = [0, 1, 2] - manager.flush_pg_stats(osds) - manager.raw_cluster_cmd('osd', 'set', 'noout') - manager.raw_cluster_cmd('osd', 'set', 'noin') - manager.raw_cluster_cmd('osd', 'set', 'nodown') - manager.wait_for_clean() - - # something that is always there - dummyfile = '/etc/fstab' - dummyfile2 = '/etc/resolv.conf' - testdir = teuthology.get_testdir(ctx) - - # create 1 pg pool - log.info('creating foo') - manager.raw_cluster_cmd('osd', 'pool', 'create', 'foo', '1') - manager.raw_cluster_cmd( - 'osd', 'pool', 'application', 'enable', - 'foo', 'rados', run.Raw('||'), 'true') - - # Remove extra pool to simlify log output - manager.raw_cluster_cmd('osd', 'pool', 'delete', 'rbd', 'rbd', '--yes-i-really-really-mean-it') - - for i in osds: - manager.set_config(i, osd_min_pg_log_entries=10) - manager.set_config(i, osd_max_pg_log_entries=10) - manager.set_config(i, osd_pg_log_trim_min=5) - - # determine primary - divergent = manager.get_pg_primary('foo', 0) - log.info("primary and soon to be divergent is %d", divergent) - non_divergent = list(osds) - non_divergent.remove(divergent) - - log.info('writing initial objects') - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - # write 100 objects - for i in range(100): - rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile]) - - manager.wait_for_clean() - - # blackhole non_divergent - log.info("blackholing osds %s", str(non_divergent)) - for i in non_divergent: - manager.set_config(i, objectstore_blackhole=1) - - DIVERGENT_WRITE = 5 - DIVERGENT_REMOVE = 5 - # Write some soon to be divergent - log.info('writing divergent objects') - for i in range(DIVERGENT_WRITE): - rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, - dummyfile2], wait=False) - # Remove some soon to be divergent - log.info('remove divergent objects') - for i in range(DIVERGENT_REMOVE): - rados(ctx, mon, ['-p', 'foo', 'rm', - 'existing_%d' % (i + DIVERGENT_WRITE)], wait=False) - time.sleep(10) - mon.run( - 
args=['killall', '-9', 'rados'], - wait=True, - check_status=False) - - # kill all the osds but leave divergent in - log.info('killing all the osds') - for i in osds: - manager.kill_osd(i) - for i in osds: - manager.mark_down_osd(i) - for i in non_divergent: - manager.mark_out_osd(i) - - # bring up non-divergent - log.info("bringing up non_divergent %s", str(non_divergent)) - for i in non_divergent: - manager.revive_osd(i) - for i in non_divergent: - manager.mark_in_osd(i) - - # write 1 non-divergent object (ensure that old divergent one is divergent) - objname = "existing_%d" % (DIVERGENT_WRITE + DIVERGENT_REMOVE) - log.info('writing non-divergent object ' + objname) - rados(ctx, mon, ['-p', 'foo', 'put', objname, dummyfile2]) - - manager.wait_for_recovery() - - # ensure no recovery of up osds first - log.info('delay recovery') - for i in non_divergent: - manager.wait_run_admin_socket( - 'osd', i, ['set_recovery_delay', '100000']) - - # bring in our divergent friend - log.info("revive divergent %d", divergent) - manager.raw_cluster_cmd('osd', 'set', 'noup') - manager.revive_osd(divergent) - - log.info('delay recovery divergent') - manager.wait_run_admin_socket( - 'osd', divergent, ['set_recovery_delay', '100000']) - - manager.raw_cluster_cmd('osd', 'unset', 'noup') - while len(manager.get_osd_status()['up']) < 3: - time.sleep(10) - - log.info('wait for peering') - rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile]) - - # At this point the divergent_priors should have been detected - - log.info("killing divergent %d", divergent) - manager.kill_osd(divergent) - - # Split pgs for pool foo - manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'pg_num', '2') - time.sleep(5) - - manager.raw_cluster_cmd('pg','dump') - - # Export a pg - (exp_remote,) = ctx.\ - cluster.only('osd.{o}'.format(o=divergent)).remotes.iterkeys() - FSPATH = manager.get_filepath() - JPATH = os.path.join(FSPATH, "journal") - prefix = ("sudo adjust-ulimits ceph-objectstore-tool " - "--data-path {fpath} --journal-path {jpath} " - "--log-file=" - "/var/log/ceph/objectstore_tool.$$.log ". - format(fpath=FSPATH, jpath=JPATH)) - pid = os.getpid() - expfile = os.path.join(testdir, "exp.{pid}.out".format(pid=pid)) - cmd = ((prefix + "--op export-remove --pgid 2.0 --file {file}"). - format(id=divergent, file=expfile)) - proc = exp_remote.run(args=cmd, wait=True, - check_status=False, stdout=StringIO()) - assert proc.exitstatus == 0 - - # Kill one of non-divergent OSDs - log.info('killing osd.%d' % non_divergent[0]) - manager.kill_osd(non_divergent[0]) - manager.mark_down_osd(non_divergent[0]) - # manager.mark_out_osd(non_divergent[0]) - - # An empty collection for pg 2.0 might need to be cleaned up - cmd = ((prefix + "--force --op remove --pgid 2.0"). - format(id=non_divergent[0])) - proc = exp_remote.run(args=cmd, wait=True, - check_status=False, stdout=StringIO()) - - cmd = ((prefix + "--op import --file {file}"). 
- format(id=non_divergent[0], file=expfile)) - proc = exp_remote.run(args=cmd, wait=True, - check_status=False, stdout=StringIO()) - assert proc.exitstatus == 0 - - # bring in our divergent friend and other node - log.info("revive divergent %d", divergent) - manager.revive_osd(divergent) - manager.mark_in_osd(divergent) - log.info("revive %d", non_divergent[0]) - manager.revive_osd(non_divergent[0]) - - while len(manager.get_osd_status()['up']) < 3: - time.sleep(10) - - log.info('delay recovery divergent') - manager.set_config(divergent, osd_recovery_delay_start=100000) - log.info('mark divergent in') - manager.mark_in_osd(divergent) - - log.info('wait for peering') - rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile]) - - log.info("killing divergent %d", divergent) - manager.kill_osd(divergent) - log.info("reviving divergent %d", divergent) - manager.revive_osd(divergent) - time.sleep(3) - - log.info('allowing recovery') - # Set osd_recovery_delay_start back to 0 and kick the queue - for i in osds: - manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'debug', - 'kick_recovery_wq', ' 0') - - log.info('reading divergent objects') - for i in range(DIVERGENT_WRITE + DIVERGENT_REMOVE): - exit_status = rados(ctx, mon, ['-p', 'foo', 'get', 'existing_%d' % i, - '/tmp/existing']) - assert exit_status is 0 - - (remote,) = ctx.\ - cluster.only('osd.{o}'.format(o=divergent)).remotes.iterkeys() - cmd = 'rm {file}'.format(file=expfile) - remote.run(args=cmd, wait=True) - log.info("success") diff --git a/src/ceph/qa/tasks/rep_lost_unfound_delete.py b/src/ceph/qa/tasks/rep_lost_unfound_delete.py deleted file mode 100644 index 4e5678d..0000000 --- a/src/ceph/qa/tasks/rep_lost_unfound_delete.py +++ /dev/null @@ -1,177 +0,0 @@ -""" -Lost_unfound -""" -import logging -from teuthology.orchestra import run -import ceph_manager -import time -from teuthology import misc as teuthology -from util.rados import rados - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Test handling of lost objects. 
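The ceph-objectstore-tool round trip above (export the PG from the divergent OSD, then import it into a surviving one) boils down to three commands. A sketch with hypothetical filestore paths, to be run while the OSDs in question are stopped:

    data_path = '/var/lib/ceph/osd/ceph-0'        # hypothetical OSD data path
    journal_path = data_path + '/journal'
    prefix = ('sudo adjust-ulimits ceph-objectstore-tool '
              '--data-path {d} --journal-path {j} ').format(d=data_path,
                                                            j=journal_path)

    export_cmd = prefix + '--op export-remove --pgid 2.0 --file /tmp/exp.out'
    cleanup_cmd = prefix + '--force --op remove --pgid 2.0'   # clear any empty 2.0 on the target
    import_cmd = prefix + '--op import --file /tmp/exp.out'

    # the task runs export_cmd on the divergent OSD's host and the other two
    # commands on the surviving OSD's host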
- - A pretty rigid cluseter is brought up andtested by this task - """ - POOL = 'unfounddel_pool' - if config is None: - config = {} - assert isinstance(config, dict), \ - 'lost_unfound task only accepts a dict for configuration' - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - while len(manager.get_osd_status()['up']) < 3: - time.sleep(10) - manager.flush_pg_stats([0, 1, 2]) - manager.wait_for_clean() - - manager.create_pool(POOL) - - # something that is always there - dummyfile = '/etc/fstab' - - # take an osd out until the very end - manager.kill_osd(2) - manager.mark_down_osd(2) - manager.mark_out_osd(2) - - # kludge to make sure they get a map - rados(ctx, mon, ['-p', POOL, 'put', 'dummy', dummyfile]) - - manager.flush_pg_stats([0, 1]) - manager.wait_for_recovery() - - # create old objects - for f in range(1, 10): - rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile]) - rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile]) - rados(ctx, mon, ['-p', POOL, 'rm', 'existed_%d' % f]) - - # delay recovery, and make the pg log very long (to prevent backfill) - manager.raw_cluster_cmd( - 'tell', 'osd.1', - 'injectargs', - '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000' - ) - - manager.kill_osd(0) - manager.mark_down_osd(0) - - for f in range(1, 10): - rados(ctx, mon, ['-p', POOL, 'put', 'new_%d' % f, dummyfile]) - rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile]) - rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile]) - - # bring osd.0 back up, let it peer, but don't replicate the new - # objects... - log.info('osd.0 command_args is %s' % 'foo') - log.info(ctx.daemons.get_daemon('osd', 0).command_args) - ctx.daemons.get_daemon('osd', 0).command_kwargs['args'].extend([ - '--osd-recovery-delay-start', '1000' - ]) - manager.revive_osd(0) - manager.mark_in_osd(0) - manager.wait_till_osd_is_up(0) - - manager.flush_pg_stats([0, 1]) - manager.wait_till_active() - - # take out osd.1 and the only copy of those objects. 
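The injectargs call above is the crux of the setup: parking recovery and inflating the minimum pg log length keeps the soon-to-be-missing objects in the log instead of letting backfill paper over them. The equivalent direct CLI call, as a sketch with the same values the task injects into osd.1:

    import subprocess

    subprocess.check_call([
        'ceph', 'tell', 'osd.1', 'injectargs',
        '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000'])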
- manager.kill_osd(1) - manager.mark_down_osd(1) - manager.mark_out_osd(1) - manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it') - - # bring up osd.2 so that things would otherwise, in theory, recovery fully - manager.revive_osd(2) - manager.mark_in_osd(2) - manager.wait_till_osd_is_up(2) - - manager.flush_pg_stats([0, 2]) - manager.wait_till_active() - manager.flush_pg_stats([0, 2]) - - # verify that there are unfound objects - unfound = manager.get_num_unfound_objects() - log.info("there are %d unfound objects" % unfound) - assert unfound - - testdir = teuthology.get_testdir(ctx) - procs = [] - if config.get('parallel_bench', True): - procs.append(mon.run( - args=[ - "/bin/sh", "-c", - " ".join(['adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage', - 'rados', - '--no-log-to-stderr', - '--name', 'client.admin', - '-b', str(4<<10), - '-p' , POOL, - '-t', '20', - 'bench', '240', 'write', - ]).format(tdir=testdir), - ], - logger=log.getChild('radosbench.{id}'.format(id='client.admin')), - stdin=run.PIPE, - wait=False - )) - time.sleep(10) - - # mark stuff lost - pgs = manager.get_pg_stats() - for pg in pgs: - if pg['stat_sum']['num_objects_unfound'] > 0: - primary = 'osd.%d' % pg['acting'][0] - - # verify that i can list them direct from the osd - log.info('listing missing/lost in %s state %s', pg['pgid'], - pg['state']); - m = manager.list_pg_missing(pg['pgid']) - #log.info('%s' % m) - assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound'] - num_unfound=0 - for o in m['objects']: - if len(o['locations']) == 0: - num_unfound += 1 - assert m['num_unfound'] == num_unfound - - log.info("reverting unfound in %s on %s", pg['pgid'], primary) - manager.raw_cluster_cmd('pg', pg['pgid'], - 'mark_unfound_lost', 'delete') - else: - log.info("no unfound in %s", pg['pgid']) - - manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5') - manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5') - manager.flush_pg_stats([0, 2]) - manager.wait_for_recovery() - - # verify result - for f in range(1, 10): - err = rados(ctx, mon, ['-p', POOL, 'get', 'new_%d' % f, '-']) - assert err - err = rados(ctx, mon, ['-p', POOL, 'get', 'existed_%d' % f, '-']) - assert err - err = rados(ctx, mon, ['-p', POOL, 'get', 'existing_%d' % f, '-']) - assert err - - # see if osd.1 can cope - manager.revive_osd(1) - manager.mark_in_osd(1) - manager.wait_till_osd_is_up(1) - manager.wait_for_clean() - run.wait(procs) - diff --git a/src/ceph/qa/tasks/repair_test.py b/src/ceph/qa/tasks/repair_test.py deleted file mode 100644 index 5a63bd6..0000000 --- a/src/ceph/qa/tasks/repair_test.py +++ /dev/null @@ -1,308 +0,0 @@ -""" -Test pool repairing after objects are damaged. -""" -import logging -import time - -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - - -def choose_primary(manager, pool, num): - """ - Return primary to test on. - """ - log.info("Choosing primary") - return manager.get_pg_primary(pool, num) - - -def choose_replica(manager, pool, num): - """ - Return replica to test on. 
- """ - log.info("Choosing replica") - return manager.get_pg_replica(pool, num) - - -def trunc(manager, osd, pool, obj): - """ - truncate an object - """ - log.info("truncating object") - return manager.osd_admin_socket( - osd, - ['truncobj', pool, obj, '1']) - - -def dataerr(manager, osd, pool, obj): - """ - cause an error in the data - """ - log.info("injecting data err on object") - return manager.osd_admin_socket( - osd, - ['injectdataerr', pool, obj]) - - -def mdataerr(manager, osd, pool, obj): - """ - cause an error in the mdata - """ - log.info("injecting mdata err on object") - return manager.osd_admin_socket( - osd, - ['injectmdataerr', pool, obj]) - - -def omaperr(manager, osd, pool, obj): - """ - Cause an omap error. - """ - log.info("injecting omap err on object") - return manager.osd_admin_socket(osd, ['setomapval', pool, obj, - 'badkey', 'badval']) - - -def repair_test_1(manager, corrupter, chooser, scrub_type): - """ - Creates an object in the pool, corrupts it, - scrubs it, and verifies that the pool is inconsistent. It then repairs - the pool, rescrubs it, and verifies that the pool is consistent - - :param corrupter: error generating function (truncate, data-error, or - meta-data error, for example). - :param chooser: osd type chooser (primary or replica) - :param scrub_type: regular scrub or deep-scrub - """ - pool = "repair_pool_1" - manager.wait_for_clean() - with manager.pool(pool, 1): - - log.info("starting repair test type 1") - victim_osd = chooser(manager, pool, 0) - - # create object - log.info("doing put") - manager.do_put(pool, 'repair_test_obj', '/etc/hosts') - - # corrupt object - log.info("corrupting object") - corrupter(manager, victim_osd, pool, 'repair_test_obj') - - # verify inconsistent - log.info("scrubbing") - manager.do_pg_scrub(pool, 0, scrub_type) - - manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s) - - # repair - log.info("repairing") - manager.do_pg_scrub(pool, 0, "repair") - - log.info("re-scrubbing") - manager.do_pg_scrub(pool, 0, scrub_type) - - # verify consistent - manager.with_pg_state(pool, 0, lambda s: 'inconsistent' not in s) - log.info("done") - - -def repair_test_2(ctx, manager, config, chooser): - """ - First creates a set of objects and - sets the omap value. It then corrupts an object, does both a scrub - and a deep-scrub, and then corrupts more objects. After that, it - repairs the pool and makes sure that the pool is consistent some - time after a deep-scrub. - - :param chooser: primary or replica selection routine. 
- """ - pool = "repair_pool_2" - manager.wait_for_clean() - with manager.pool(pool, 1): - log.info("starting repair test type 2") - victim_osd = chooser(manager, pool, 0) - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - # create object - log.info("doing put and setomapval") - manager.do_put(pool, 'file1', '/etc/hosts') - manager.do_rados(mon, ['-p', pool, 'setomapval', 'file1', - 'key', 'val']) - manager.do_put(pool, 'file2', '/etc/hosts') - manager.do_put(pool, 'file3', '/etc/hosts') - manager.do_put(pool, 'file4', '/etc/hosts') - manager.do_put(pool, 'file5', '/etc/hosts') - manager.do_rados(mon, ['-p', pool, 'setomapval', 'file5', - 'key', 'val']) - manager.do_put(pool, 'file6', '/etc/hosts') - - # corrupt object - log.info("corrupting object") - omaperr(manager, victim_osd, pool, 'file1') - - # verify inconsistent - log.info("scrubbing") - manager.do_pg_scrub(pool, 0, 'deep-scrub') - - manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s) - - # Regression test for bug #4778, should still - # be inconsistent after scrub - manager.do_pg_scrub(pool, 0, 'scrub') - - manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s) - - # Additional corruptions including 2 types for file1 - log.info("corrupting more objects") - dataerr(manager, victim_osd, pool, 'file1') - mdataerr(manager, victim_osd, pool, 'file2') - trunc(manager, victim_osd, pool, 'file3') - omaperr(manager, victim_osd, pool, 'file6') - - # see still inconsistent - log.info("scrubbing") - manager.do_pg_scrub(pool, 0, 'deep-scrub') - - manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s) - - # repair - log.info("repairing") - manager.do_pg_scrub(pool, 0, "repair") - - # Let repair clear inconsistent flag - time.sleep(10) - - # verify consistent - manager.with_pg_state(pool, 0, lambda s: 'inconsistent' not in s) - - # In the future repair might determine state of - # inconsistency itself, verify with a deep-scrub - log.info("scrubbing") - manager.do_pg_scrub(pool, 0, 'deep-scrub') - - # verify consistent - manager.with_pg_state(pool, 0, lambda s: 'inconsistent' not in s) - - log.info("done") - - -def hinfoerr(manager, victim, pool, obj): - """ - cause an error in the hinfo_key - """ - log.info("remove the hinfo_key") - manager.objectstore_tool(pool, - options='', - args='rm-attr hinfo_key', - object_name=obj, - osd=victim) - - -def repair_test_erasure_code(manager, corrupter, victim, scrub_type): - """ - Creates an object in the pool, corrupts it, - scrubs it, and verifies that the pool is inconsistent. It then repairs - the pool, rescrubs it, and verifies that the pool is consistent - - :param corrupter: error generating function. 
- :param chooser: osd type chooser (primary or replica) - :param scrub_type: regular scrub or deep-scrub - """ - pool = "repair_pool_3" - manager.wait_for_clean() - with manager.pool(pool_name=pool, pg_num=1, - erasure_code_profile_name='default'): - - log.info("starting repair test for erasure code") - - # create object - log.info("doing put") - manager.do_put(pool, 'repair_test_obj', '/etc/hosts') - - # corrupt object - log.info("corrupting object") - corrupter(manager, victim, pool, 'repair_test_obj') - - # verify inconsistent - log.info("scrubbing") - manager.do_pg_scrub(pool, 0, scrub_type) - - manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s) - - # repair - log.info("repairing") - manager.do_pg_scrub(pool, 0, "repair") - - log.info("re-scrubbing") - manager.do_pg_scrub(pool, 0, scrub_type) - - # verify consistent - manager.with_pg_state(pool, 0, lambda s: 'inconsistent' not in s) - log.info("done") - - -def task(ctx, config): - """ - Test [deep] repair in several situations: - Repair [Truncate, Data EIO, MData EIO] on [Primary|Replica] - - The config should be as follows: - - Must include the log-whitelist below - Must enable filestore_debug_inject_read_err config - - example: - - tasks: - - chef: - - install: - - ceph: - log-whitelist: - - 'candidate had a stat error' - - 'candidate had a read error' - - 'deep-scrub 0 missing, 1 inconsistent objects' - - 'deep-scrub 0 missing, 4 inconsistent objects' - - 'deep-scrub [0-9]+ errors' - - '!= omap_digest' - - '!= data_digest' - - 'repair 0 missing, 1 inconsistent objects' - - 'repair 0 missing, 4 inconsistent objects' - - 'repair [0-9]+ errors, [0-9]+ fixed' - - 'scrub 0 missing, 1 inconsistent objects' - - 'scrub [0-9]+ errors' - - 'size 1 != size' - - 'attr name mismatch' - - 'Regular scrub request, deep-scrub details will be lost' - conf: - osd: - filestore debug inject read err: true - - repair_test: - - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'repair_test task only accepts a dict for config' - - manager = ctx.managers['ceph'] - manager.wait_for_all_osds_up() - - manager.raw_cluster_cmd('osd', 'set', 'noscrub') - manager.raw_cluster_cmd('osd', 'set', 'nodeep-scrub') - - repair_test_1(manager, mdataerr, choose_primary, "scrub") - repair_test_1(manager, mdataerr, choose_replica, "scrub") - repair_test_1(manager, dataerr, choose_primary, "deep-scrub") - repair_test_1(manager, dataerr, choose_replica, "deep-scrub") - repair_test_1(manager, trunc, choose_primary, "scrub") - repair_test_1(manager, trunc, choose_replica, "scrub") - repair_test_2(ctx, manager, config, choose_primary) - repair_test_2(ctx, manager, config, choose_replica) - - repair_test_erasure_code(manager, hinfoerr, 'primary', "deep-scrub") - - manager.raw_cluster_cmd('osd', 'unset', 'noscrub') - manager.raw_cluster_cmd('osd', 'unset', 'nodeep-scrub') diff --git a/src/ceph/qa/tasks/resolve_stuck_peering.py b/src/ceph/qa/tasks/resolve_stuck_peering.py deleted file mode 100644 index bdf86e9..0000000 --- a/src/ceph/qa/tasks/resolve_stuck_peering.py +++ /dev/null @@ -1,112 +0,0 @@ -""" -Resolve stuck peering -""" -import logging -import time - -from teuthology import misc as teuthology -from util.rados import rados - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Test handling resolve stuck peering - - requires 3 osds on a single test node - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'Resolve stuck peering only accepts a dict for config' - - manager = ctx.managers['ceph'] 
- - while len(manager.get_osd_status()['up']) < 3: - time.sleep(10) - - - manager.wait_for_clean() - - dummyfile = '/etc/fstab' - dummyfile1 = '/etc/resolv.conf' - - #create 1 PG pool - pool='foo' - log.info('creating pool foo') - manager.raw_cluster_cmd('osd', 'pool', 'create', '%s' % pool, '1') - - #set min_size of the pool to 1 - #so that we can continue with I/O - #when 2 osds are down - manager.set_pool_property(pool, "min_size", 1) - - osds = [0, 1, 2] - - primary = manager.get_pg_primary('foo', 0) - log.info("primary osd is %d", primary) - - others = list(osds) - others.remove(primary) - - log.info('writing initial objects') - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - #create few objects - for i in range(100): - rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile]) - - manager.wait_for_clean() - - #kill other osds except primary - log.info('killing other osds except primary') - for i in others: - manager.kill_osd(i) - for i in others: - manager.mark_down_osd(i) - - - for i in range(100): - rados(ctx, mon, ['-p', 'foo', 'put', 'new_%d' % i, dummyfile1]) - - #kill primary osd - manager.kill_osd(primary) - manager.mark_down_osd(primary) - - #revive other 2 osds - for i in others: - manager.revive_osd(i) - - #make sure that pg is down - #Assuming pg number for single pg pool will start from 0 - pgnum=0 - pgstr = manager.get_pgid(pool, pgnum) - stats = manager.get_single_pg_stats(pgstr) - print stats['state'] - - timeout=60 - start=time.time() - - while 'down' not in stats['state']: - assert time.time() - start < timeout, \ - 'failed to reach down state before timeout expired' - stats = manager.get_single_pg_stats(pgstr) - - #mark primary as lost - manager.raw_cluster_cmd('osd', 'lost', '%d' % primary,\ - '--yes-i-really-mean-it') - - - #expect the pg status to be active+undersized+degraded - #pg should recover and become active+clean within timeout - stats = manager.get_single_pg_stats(pgstr) - print stats['state'] - - timeout=10 - start=time.time() - - while manager.get_num_down(): - assert time.time() - start < timeout, \ - 'failed to recover before timeout expired' - - manager.revive_osd(primary) diff --git a/src/ceph/qa/tasks/rest_api.py b/src/ceph/qa/tasks/rest_api.py deleted file mode 100644 index e86f77e..0000000 --- a/src/ceph/qa/tasks/rest_api.py +++ /dev/null @@ -1,184 +0,0 @@ -""" -Rest Api -""" -import logging -import contextlib -import time - -from teuthology import misc as teuthology -from teuthology import contextutil -from teuthology.orchestra import run -from teuthology.orchestra.daemon import DaemonGroup - -log = logging.getLogger(__name__) - - -@contextlib.contextmanager -def run_rest_api_daemon(ctx, api_clients): - """ - Wrapper starts the rest api daemons - """ - if not hasattr(ctx, 'daemons'): - ctx.daemons = DaemonGroup() - remotes = ctx.cluster.only(teuthology.is_type('client')).remotes - for rems, roles in remotes.iteritems(): - for whole_id_ in roles: - if whole_id_ in api_clients: - id_ = whole_id_[len('clients'):] - run_cmd = [ - 'sudo', - 'daemon-helper', - 'kill', - 'ceph-rest-api', - '-n', - 'client.rest{id}'.format(id=id_), ] - cl_rest_id = 'client.rest{id}'.format(id=id_) - ctx.daemons.add_daemon(rems, 'restapi', - cl_rest_id, - args=run_cmd, - logger=log.getChild(cl_rest_id), - stdin=run.PIPE, - wait=False, - ) - for i in range(1, 12): - log.info('testing for ceph-rest-api try {0}'.format(i)) - run_cmd = [ - 'wget', - '-O', - '/dev/null', - '-q', - 
'http://localhost:5000/api/v0.1/status' - ] - proc = rems.run( - args=run_cmd, - check_status=False - ) - if proc.exitstatus == 0: - break - time.sleep(5) - if proc.exitstatus != 0: - raise RuntimeError('Cannot contact ceph-rest-api') - try: - yield - - finally: - """ - TO DO: destroy daemons started -- modify iter_daemons_of_role - """ - teuthology.stop_daemons_of_type(ctx, 'restapi') - -@contextlib.contextmanager -def task(ctx, config): - """ - Start up rest-api. - - To start on on all clients:: - - tasks: - - ceph: - - rest-api: - - To only run on certain clients:: - - tasks: - - ceph: - - rest-api: [client.0, client.3] - - or - - tasks: - - ceph: - - rest-api: - client.0: - client.3: - - The general flow of things here is: - 1. Find clients on which rest-api is supposed to run (api_clients) - 2. Generate keyring values - 3. Start up ceph-rest-api daemons - On cleanup: - 4. Stop the daemons - 5. Delete keyring value files. - """ - api_clients = [] - remotes = ctx.cluster.only(teuthology.is_type('client')).remotes - log.info(remotes) - if config == None: - api_clients = ['client.{id}'.format(id=id_) - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')] - else: - api_clients = config - log.info(api_clients) - testdir = teuthology.get_testdir(ctx) - coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir) - for rems, roles in remotes.iteritems(): - for whole_id_ in roles: - if whole_id_ in api_clients: - id_ = whole_id_[len('client.'):] - keyring = '/etc/ceph/ceph.client.rest{id}.keyring'.format( - id=id_) - rems.run( - args=[ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - coverage_dir, - 'ceph-authtool', - '--create-keyring', - '--gen-key', - '--name=client.rest{id}'.format(id=id_), - '--set-uid=0', - '--cap', 'mon', 'allow *', - '--cap', 'osd', 'allow *', - '--cap', 'mds', 'allow', - keyring, - run.Raw('&&'), - 'sudo', - 'chmod', - '0644', - keyring, - ], - ) - rems.run( - args=[ - 'sudo', - 'sh', - '-c', - run.Raw("'"), - "echo", - '[client.rest{id}]'.format(id=id_), - run.Raw('>>'), - "/etc/ceph/ceph.conf", - run.Raw("'") - ] - ) - rems.run( - args=[ - 'sudo', - 'sh', - '-c', - run.Raw("'"), - 'echo', - 'restapi', - 'keyring', - '=', - '/etc/ceph/ceph.client.rest{id}.keyring'.format(id=id_), - run.Raw('>>'), - '/etc/ceph/ceph.conf', - run.Raw("'"), - ] - ) - rems.run( - args=[ - 'sudo', - 'ceph', - 'auth', - 'import', - '-i', - '/etc/ceph/ceph.client.rest{id}.keyring'.format(id=id_), - ] - ) - with contextutil.nested( - lambda: run_rest_api_daemon(ctx=ctx, api_clients=api_clients),): - yield - diff --git a/src/ceph/qa/tasks/restart.py b/src/ceph/qa/tasks/restart.py deleted file mode 100644 index 697345a..0000000 --- a/src/ceph/qa/tasks/restart.py +++ /dev/null @@ -1,163 +0,0 @@ -""" -Daemon restart -""" -import logging -import pipes - -from teuthology import misc as teuthology -from teuthology.orchestra import run as tor - -from teuthology.orchestra import run -log = logging.getLogger(__name__) - -def restart_daemon(ctx, config, role, id_, *args): - """ - Handle restart (including the execution of the command parameters passed) - """ - log.info('Restarting {r}.{i} daemon...'.format(r=role, i=id_)) - daemon = ctx.daemons.get_daemon(role, id_) - log.debug('Waiting for exit of {r}.{i} daemon...'.format(r=role, i=id_)) - try: - daemon.wait_for_exit() - except tor.CommandFailedError as e: - log.debug('Command Failed: {e}'.format(e=e)) - if len(args) > 0: - confargs = ['--{k}={v}'.format(k=k, v=v) for k,v in zip(args[0::2], args[1::2])] - log.debug('Doing restart of 
{r}.{i} daemon with args: {a}...'.format(r=role, i=id_, a=confargs)) - daemon.restart_with_args(confargs) - else: - log.debug('Doing restart of {r}.{i} daemon...'.format(r=role, i=id_)) - daemon.restart() - -def get_tests(ctx, config, role, remote, testdir): - """Download restart tests""" - srcdir = '{tdir}/restart.{role}'.format(tdir=testdir, role=role) - - refspec = config.get('branch') - if refspec is None: - refspec = config.get('sha1') - if refspec is None: - refspec = config.get('tag') - if refspec is None: - refspec = 'HEAD' - log.info('Pulling restart qa/workunits from ref %s', refspec) - - remote.run( - logger=log.getChild(role), - args=[ - 'mkdir', '--', srcdir, - run.Raw('&&'), - 'git', - 'archive', - '--remote=git://git.ceph.com/ceph.git', - '%s:qa/workunits' % refspec, - run.Raw('|'), - 'tar', - '-C', srcdir, - '-x', - '-f-', - run.Raw('&&'), - 'cd', '--', srcdir, - run.Raw('&&'), - 'if', 'test', '-e', 'Makefile', run.Raw(';'), 'then', 'make', run.Raw(';'), 'fi', - run.Raw('&&'), - 'find', '-executable', '-type', 'f', '-printf', r'%P\0'.format(srcdir=srcdir), - run.Raw('>{tdir}/restarts.list'.format(tdir=testdir)), - ], - ) - restarts = sorted(teuthology.get_file( - remote, - '{tdir}/restarts.list'.format(tdir=testdir)).split('\0')) - return (srcdir, restarts) - -def task(ctx, config): - """ - Execute commands and allow daemon restart with config options. - Each process executed can output to stdout restart commands of the form: - restart <role> <id> <conf_key1> <conf_value1> <conf_key2> <conf_value2> - This will restart the daemon <role>.<id> with the specified config values once - by modifying the conf file with those values, and then replacing the old conf file - once the daemon is restarted. - This task does not kill a running daemon, it assumes the daemon will abort on an - assert specified in the config. 
- - tasks: - - install: - - ceph: - - restart: - exec: - client.0: - - test_backtraces.py - - """ - assert isinstance(config, dict), "task kill got invalid config" - - testdir = teuthology.get_testdir(ctx) - - try: - assert 'exec' in config, "config requires exec key with <role>: <command> entries" - for role, task in config['exec'].iteritems(): - log.info('restart for role {r}'.format(r=role)) - (remote,) = ctx.cluster.only(role).remotes.iterkeys() - srcdir, restarts = get_tests(ctx, config, role, remote, testdir) - log.info('Running command on role %s host %s', role, remote.name) - spec = '{spec}'.format(spec=task[0]) - log.info('Restarts list: %s', restarts) - log.info('Spec is %s', spec) - to_run = [w for w in restarts if w == task or w.find(spec) != -1] - log.info('To run: %s', to_run) - for c in to_run: - log.info('Running restart script %s...', c) - args = [ - run.Raw('TESTDIR="{tdir}"'.format(tdir=testdir)), - ] - env = config.get('env') - if env is not None: - for var, val in env.iteritems(): - quoted_val = pipes.quote(val) - env_arg = '{var}={val}'.format(var=var, val=quoted_val) - args.append(run.Raw(env_arg)) - args.extend([ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - '{srcdir}/{c}'.format( - srcdir=srcdir, - c=c, - ), - ]) - proc = remote.run( - args=args, - stdout=tor.PIPE, - stdin=tor.PIPE, - stderr=log, - wait=False, - ) - log.info('waiting for a command from script') - while True: - l = proc.stdout.readline() - if not l or l == '': - break - log.debug('script command: {c}'.format(c=l)) - ll = l.strip() - cmd = ll.split(' ') - if cmd[0] == "done": - break - assert cmd[0] == 'restart', "script sent invalid command request to kill task" - # cmd should be: restart <role> <id> <conf_key1> <conf_value1> <conf_key2> <conf_value2> - # or to clear, just: restart <role> <id> - restart_daemon(ctx, config, cmd[1], cmd[2], *cmd[3:]) - proc.stdin.writelines(['restarted\n']) - proc.stdin.flush() - try: - proc.wait() - except tor.CommandFailedError: - raise Exception('restart task got non-zero exit status from script: {s}'.format(s=c)) - finally: - log.info('Finishing %s on %s...', task, role) - remote.run( - logger=log.getChild(role), - args=[ - 'rm', '-rf', '--', '{tdir}/restarts.list'.format(tdir=testdir), srcdir, - ], - ) diff --git a/src/ceph/qa/tasks/rgw.py b/src/ceph/qa/tasks/rgw.py deleted file mode 100644 index cec0b64..0000000 --- a/src/ceph/qa/tasks/rgw.py +++ /dev/null @@ -1,241 +0,0 @@ -""" -rgw routines -""" -import argparse -import contextlib -import json -import logging -import os -import errno -import util.rgw as rgw_utils - -from teuthology.orchestra import run -from teuthology import misc as teuthology -from teuthology import contextutil -from teuthology.orchestra.run import CommandFailedError -from util.rgw import rgwadmin, wait_for_radosgw -from util.rados import (rados, create_ec_pool, - create_replicated_pool, - create_cache_pool) - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def start_rgw(ctx, config, clients): - """ - Start rgw on remote sites. - """ - log.info('Starting rgw...') - testdir = teuthology.get_testdir(ctx) - for client in clients: - (remote,) = ctx.cluster.only(client).remotes.iterkeys() - cluster_name, daemon_type, client_id = teuthology.split_role(client) - client_with_id = daemon_type + '.' + client_id - client_with_cluster = cluster_name + '.' 
+ client_with_id - - client_config = config.get(client) - if client_config is None: - client_config = {} - log.info("rgw %s config is %s", client, client_config) - cmd_prefix = [ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'daemon-helper', - 'term', - ] - - rgw_cmd = ['radosgw'] - - log.info("Using %s as radosgw frontend", ctx.rgw.frontend) - - host, port = ctx.rgw.role_endpoints[client] - rgw_cmd.extend([ - '--rgw-frontends', - '{frontend} port={port}'.format(frontend=ctx.rgw.frontend, port=port), - '-n', client_with_id, - '--cluster', cluster_name, - '-k', '/etc/ceph/{client_with_cluster}.keyring'.format(client_with_cluster=client_with_cluster), - '--log-file', - '/var/log/ceph/rgw.{client_with_cluster}.log'.format(client_with_cluster=client_with_cluster), - '--rgw_ops_log_socket_path', - '{tdir}/rgw.opslog.{client_with_cluster}.sock'.format(tdir=testdir, - client_with_cluster=client_with_cluster), - '--foreground', - run.Raw('|'), - 'sudo', - 'tee', - '/var/log/ceph/rgw.{client_with_cluster}.stdout'.format(tdir=testdir, - client_with_cluster=client_with_cluster), - run.Raw('2>&1'), - ]) - - if client_config.get('valgrind'): - cmd_prefix = teuthology.get_valgrind_args( - testdir, - client_with_cluster, - cmd_prefix, - client_config.get('valgrind') - ) - - run_cmd = list(cmd_prefix) - run_cmd.extend(rgw_cmd) - - ctx.daemons.add_daemon( - remote, 'rgw', client_with_id, - cluster=cluster_name, - args=run_cmd, - logger=log.getChild(client), - stdin=run.PIPE, - wait=False, - ) - - # XXX: add_daemon() doesn't let us wait until radosgw finishes startup - for client in config.keys(): - host, port = ctx.rgw.role_endpoints[client] - endpoint = 'http://{host}:{port}/'.format(host=host, port=port) - log.info('Polling {client} until it starts accepting connections on {endpoint}'.format(client=client, endpoint=endpoint)) - wait_for_radosgw(endpoint) - - try: - yield - finally: - for client in config.iterkeys(): - cluster_name, daemon_type, client_id = teuthology.split_role(client) - client_with_id = daemon_type + '.' + client_id - client_with_cluster = cluster_name + '.' + client_with_id - ctx.daemons.get_daemon('rgw', client_with_id, cluster_name).stop() - ctx.cluster.only(client).run( - args=[ - 'rm', - '-f', - '{tdir}/rgw.opslog.{client}.sock'.format(tdir=testdir, - client=client_with_cluster), - ], - ) - -def assign_ports(ctx, config): - """ - Assign port numberst starting with port 7280. 
- """ - port = 7280 - role_endpoints = {} - for remote, roles_for_host in ctx.cluster.remotes.iteritems(): - for role in roles_for_host: - if role in config: - role_endpoints[role] = (remote.name.split('@')[1], port) - port += 1 - - return role_endpoints - -@contextlib.contextmanager -def create_pools(ctx, clients): - """Create replicated or erasure coded data pools for rgw.""" - - log.info('Creating data pools') - for client in clients: - log.debug("Obtaining remote for client {}".format(client)) - (remote,) = ctx.cluster.only(client).remotes.iterkeys() - data_pool = '.rgw.buckets' - cluster_name, daemon_type, client_id = teuthology.split_role(client) - - if ctx.rgw.ec_data_pool: - create_ec_pool(remote, data_pool, client, 64, - ctx.rgw.erasure_code_profile, cluster_name, 'rgw') - else: - create_replicated_pool(remote, data_pool, 64, cluster_name, 'rgw') - if ctx.rgw.cache_pools: - create_cache_pool(remote, data_pool, data_pool + '.cache', 64, - 64*1024*1024, cluster_name) - log.debug('Pools created') - yield - -@contextlib.contextmanager -def configure_compression(ctx, clients, compression): - """ set a compression type in the default zone placement """ - log.info('Configuring compression type = %s', compression) - for client in clients: - # XXX: the 'default' zone and zonegroup aren't created until we run RGWRados::init_complete(). - # issue a 'radosgw-admin user list' command to trigger this - rgwadmin(ctx, client, cmd=['user', 'list'], check_status=True) - - rgwadmin(ctx, client, - cmd=['zone', 'placement', 'modify', '--rgw-zone', 'default', - '--placement-id', 'default-placement', - '--compression', compression], - check_status=True) - yield - -@contextlib.contextmanager -def task(ctx, config): - """ - For example, to run rgw on all clients:: - - tasks: - - ceph: - - rgw: - - To only run on certain clients:: - - tasks: - - ceph: - - rgw: [client.0, client.3] - - or - - tasks: - - ceph: - - rgw: - client.0: - client.3: - - To run radosgw through valgrind: - - tasks: - - ceph: - - rgw: - client.0: - valgrind: [--tool=memcheck] - client.3: - valgrind: [--tool=memcheck] - """ - if config is None: - config = dict(('client.{id}'.format(id=id_), None) - for id_ in teuthology.all_roles_of_type( - ctx.cluster, 'client')) - elif isinstance(config, list): - config = dict((name, None) for name in config) - - clients = config.keys() # http://tracker.ceph.com/issues/20417 - - overrides = ctx.config.get('overrides', {}) - teuthology.deep_merge(config, overrides.get('rgw', {})) - - role_endpoints = assign_ports(ctx, config) - ctx.rgw = argparse.Namespace() - ctx.rgw.role_endpoints = role_endpoints - - ctx.rgw.ec_data_pool = bool(config.pop('ec-data-pool', False)) - ctx.rgw.erasure_code_profile = config.pop('erasure_code_profile', {}) - ctx.rgw.cache_pools = bool(config.pop('cache-pools', False)) - ctx.rgw.frontend = config.pop('frontend', 'civetweb') - ctx.rgw.compression_type = config.pop('compression type', None) - ctx.rgw.config = config - - log.debug("config is {}".format(config)) - log.debug("client list is {}".format(clients)) - subtasks = [ - lambda: create_pools(ctx=ctx, clients=clients), - ] - if ctx.rgw.compression_type: - subtasks.extend([ - lambda: configure_compression(ctx=ctx, clients=clients, - compression=ctx.rgw.compression_type), - ]) - subtasks.extend([ - lambda: start_rgw(ctx=ctx, config=config, clients=clients), - ]) - - with contextutil.nested(*subtasks): - yield diff --git a/src/ceph/qa/tasks/rgw_logsocket.py b/src/ceph/qa/tasks/rgw_logsocket.py deleted file mode 100644 index 
6f49b00..0000000 --- a/src/ceph/qa/tasks/rgw_logsocket.py +++ /dev/null @@ -1,161 +0,0 @@ -""" -rgw s3tests logging wrappers -""" -from cStringIO import StringIO -from configobj import ConfigObj -import contextlib -import logging -import s3tests - -from teuthology import misc as teuthology -from teuthology import contextutil - -log = logging.getLogger(__name__) - - -@contextlib.contextmanager -def download(ctx, config): - """ - Run s3tests download function - """ - return s3tests.download(ctx, config) - -def _config_user(s3tests_conf, section, user): - """ - Run s3tests user config function - """ - return s3tests._config_user(s3tests_conf, section, user) - -@contextlib.contextmanager -def create_users(ctx, config): - """ - Run s3tests user create function - """ - return s3tests.create_users(ctx, config) - -@contextlib.contextmanager -def configure(ctx, config): - """ - Run s3tests user configure function - """ - return s3tests.configure(ctx, config) - -@contextlib.contextmanager -def run_tests(ctx, config): - """ - Run remote netcat tests - """ - assert isinstance(config, dict) - testdir = teuthology.get_testdir(ctx) - for client, client_config in config.iteritems(): - client_config['extra_args'] = [ - 's3tests.functional.test_s3:test_bucket_list_return_data', - ] -# args = [ -# 'S3TEST_CONF={tdir}/archive/s3-tests.{client}.conf'.format(tdir=testdir, client=client), -# '{tdir}/s3-tests/virtualenv/bin/nosetests'.format(tdir=testdir), -# '-w', -# '{tdir}/s3-tests'.format(tdir=testdir), -# '-v', -# 's3tests.functional.test_s3:test_bucket_list_return_data', -# ] -# if client_config is not None and 'extra_args' in client_config: -# args.extend(client_config['extra_args']) -# -# ctx.cluster.only(client).run( -# args=args, -# ) - - s3tests.run_tests(ctx, config) - - netcat_out = StringIO() - - for client, client_config in config.iteritems(): - ctx.cluster.only(client).run( - args = [ - 'netcat', - '-w', '5', - '-U', '{tdir}/rgw.opslog.sock'.format(tdir=testdir), - ], - stdout = netcat_out, - ) - - out = netcat_out.getvalue() - - assert len(out) > 100 - - log.info('Received', out) - - yield - - -@contextlib.contextmanager -def task(ctx, config): - """ - Run some s3-tests suite against rgw, verify opslog socket returns data - - Must restrict testing to a particular client:: - - tasks: - - ceph: - - rgw: [client.0] - - s3tests: [client.0] - - To pass extra arguments to nose (e.g. to run a certain test):: - - tasks: - - ceph: - - rgw: [client.0] - - s3tests: - client.0: - extra_args: ['test_s3:test_object_acl_grand_public_read'] - client.1: - extra_args: ['--exclude', 'test_100_continue'] - """ - assert config is None or isinstance(config, list) \ - or isinstance(config, dict), \ - "task s3tests only supports a list or dictionary for configuration" - all_clients = ['client.{id}'.format(id=id_) - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')] - if config is None: - config = all_clients - if isinstance(config, list): - config = dict.fromkeys(config) - clients = config.keys() - - overrides = ctx.config.get('overrides', {}) - # merge each client section, not the top level. 
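# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch only, not part of the removed files.
# run_tests() above shells out to 'netcat -w 5 -U <socket>' on the client to
# read the rgw ops-log socket and asserts that some data came back. The same
# check could be done in-process (on the node that hosts the socket) with the
# standard socket module; the socket path in the usage line is hypothetical.
import socket

def read_opslog(path, timeout=5.0, min_bytes=100):
    """Read from a Unix-domain ops-log socket until timeout; assert some data."""
    sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    sock.settimeout(timeout)
    data = b''
    try:
        sock.connect(path)
        while True:
            chunk = sock.recv(4096)
            if not chunk:
                break
            data += chunk
    except socket.timeout:
        pass  # like netcat's -w: stop reading on timeout, keep what we got
    finally:
        sock.close()
    assert len(data) > min_bytes, 'ops log socket returned too little data'
    return data

# e.g. read_opslog('/tmp/test/rgw.opslog.sock')  # path is an assumption
# ---------------------------------------------------------------------------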
- for (client, cconf) in config.iteritems(): - teuthology.deep_merge(cconf, overrides.get('rgw-logsocket', {})) - - log.debug('config is %s', config) - - s3tests_conf = {} - for client in clients: - s3tests_conf[client] = ConfigObj( - indent_type='', - infile={ - 'DEFAULT': - { - 'port' : 7280, - 'is_secure' : 'no', - }, - 'fixtures' : {}, - 's3 main' : {}, - 's3 alt' : {}, - } - ) - - with contextutil.nested( - lambda: download(ctx=ctx, config=config), - lambda: create_users(ctx=ctx, config=dict( - clients=clients, - s3tests_conf=s3tests_conf, - )), - lambda: configure(ctx=ctx, config=dict( - clients=config, - s3tests_conf=s3tests_conf, - )), - lambda: run_tests(ctx=ctx, config=config), - ): - yield diff --git a/src/ceph/qa/tasks/rgw_multi b/src/ceph/qa/tasks/rgw_multi deleted file mode 120000 index abfc703..0000000 --- a/src/ceph/qa/tasks/rgw_multi +++ /dev/null @@ -1 +0,0 @@ -../../src/test/rgw/rgw_multi
\ No newline at end of file diff --git a/src/ceph/qa/tasks/rgw_multisite.py b/src/ceph/qa/tasks/rgw_multisite.py deleted file mode 100644 index 74c1f3f..0000000 --- a/src/ceph/qa/tasks/rgw_multisite.py +++ /dev/null @@ -1,427 +0,0 @@ -""" -rgw multisite configuration routines -""" -import argparse -import contextlib -import logging -import random -import string -from copy import deepcopy -from util.rgw import rgwadmin, wait_for_radosgw -from util.rados import create_ec_pool, create_replicated_pool -from rgw_multi import multisite -from rgw_multi.zone_rados import RadosZone as RadosZone - -from teuthology.orchestra import run -from teuthology import misc -from teuthology.exceptions import ConfigError -from teuthology.task import Task - -log = logging.getLogger(__name__) - -class RGWMultisite(Task): - """ - Performs rgw multisite configuration to match the given realm definition. - - - rgw-multisite: - realm: - name: test-realm - is_default: true - - List one or more zonegroup definitions. These are provided as json - input to `radosgw-admin zonegroup set`, with the exception of these keys: - - * 'is_master' is passed on the command line as --master - * 'is_default' is passed on the command line as --default - * 'endpoints' given as client names are replaced with actual endpoints - - zonegroups: - - name: test-zonegroup - api_name: test-api - is_master: true - is_default: true - endpoints: [c1.client.0] - - List each of the zones to be created in this zonegroup. - - zones: - - name: test-zone1 - is_master: true - is_default: true - endpoints: [c1.client.0] - - name: test-zone2 - is_default: true - endpoints: [c2.client.0] - - A complete example: - - tasks: - - install: - - ceph: {cluster: c1} - - ceph: {cluster: c2} - - rgw: - c1.client.0: - c2.client.0: - - rgw-multisite: - realm: - name: test-realm - is_default: true - zonegroups: - - name: test-zonegroup - is_master: true - is_default: true - zones: - - name: test-zone1 - is_master: true - is_default: true - endpoints: [c1.client.0] - - name: test-zone2 - is_default: true - endpoints: [c2.client.0] - - """ - def __init__(self, ctx, config): - super(RGWMultisite, self).__init__(ctx, config) - - def setup(self): - super(RGWMultisite, self).setup() - - overrides = self.ctx.config.get('overrides', {}) - misc.deep_merge(self.config, overrides.get('rgw-multisite', {})) - - if not self.ctx.rgw: - raise ConfigError('rgw-multisite must run after the rgw task') - role_endpoints = self.ctx.rgw.role_endpoints - - # construct Clusters and Gateways for each client in the rgw task - clusters, gateways = extract_clusters_and_gateways(self.ctx, - role_endpoints) - - # get the master zone and zonegroup configuration - mz, mzg = extract_master_zone_zonegroup(self.config['zonegroups']) - cluster1 = cluster_for_zone(clusters, mz) - - # create the realm and period on the master zone's cluster - log.info('creating realm..') - realm = create_realm(cluster1, self.config['realm']) - period = realm.current_period - - creds = gen_credentials() - - # create the master zonegroup and its master zone - log.info('creating master zonegroup..') - master_zonegroup = create_zonegroup(cluster1, gateways, period, - deepcopy(mzg)) - period.master_zonegroup = master_zonegroup - - log.info('creating master zone..') - master_zone = create_zone(self.ctx, cluster1, gateways, creds, - master_zonegroup, deepcopy(mz)) - master_zonegroup.master_zone = master_zone - - period.update(master_zone, commit=True) - restart_zone_gateways(master_zone) # restart with --rgw-zone - - # create the 
admin user on the master zone - log.info('creating admin user..') - user_args = ['--display-name', 'Realm Admin', '--system'] - user_args += creds.credential_args() - admin_user = multisite.User('realm-admin') - admin_user.create(master_zone, user_args) - - # process 'zonegroups' - for zg_config in self.config['zonegroups']: - zones_config = zg_config.pop('zones') - - zonegroup = None - for zone_config in zones_config: - # get the cluster for this zone - cluster = cluster_for_zone(clusters, zone_config) - - if cluster != cluster1: # already created on master cluster - log.info('pulling realm configuration to %s', cluster.name) - realm.pull(cluster, master_zone.gateways[0], creds) - - # use the first zone's cluster to create the zonegroup - if not zonegroup: - if zg_config['name'] == master_zonegroup.name: - zonegroup = master_zonegroup - else: - log.info('creating zonegroup..') - zonegroup = create_zonegroup(cluster, gateways, - period, zg_config) - - if zone_config['name'] == master_zone.name: - # master zone was already created - zone = master_zone - else: - # create the zone and commit the period - log.info('creating zone..') - zone = create_zone(self.ctx, cluster, gateways, creds, - zonegroup, zone_config) - period.update(zone, commit=True) - - restart_zone_gateways(zone) # restart with --rgw-zone - - # attach configuration to the ctx for other tasks - self.ctx.rgw_multisite = argparse.Namespace() - self.ctx.rgw_multisite.clusters = clusters - self.ctx.rgw_multisite.gateways = gateways - self.ctx.rgw_multisite.realm = realm - self.ctx.rgw_multisite.admin_user = admin_user - - log.info('rgw multisite configuration completed') - - def end(self): - del self.ctx.rgw_multisite - -class Cluster(multisite.Cluster): - """ Issues 'radosgw-admin' commands with the rgwadmin() helper """ - def __init__(self, ctx, name, client): - super(Cluster, self).__init__() - self.ctx = ctx - self.name = name - self.client = client - - def admin(self, args = None, **kwargs): - """ radosgw-admin command """ - args = args or [] - args += ['--cluster', self.name] - args += ['--debug-rgw', '0'] - if kwargs.pop('read_only', False): - args += ['--rgw-cache-enabled', 'false'] - kwargs['decode'] = False - check_retcode = kwargs.pop('check_retcode', True) - r, s = rgwadmin(self.ctx, self.client, args, **kwargs) - if check_retcode: - assert r == 0 - return s, r - -class Gateway(multisite.Gateway): - """ Controls a radosgw instance using its daemon """ - def __init__(self, role, remote, daemon, *args, **kwargs): - super(Gateway, self).__init__(*args, **kwargs) - self.role = role - self.remote = remote - self.daemon = daemon - - def set_zone(self, zone): - """ set the zone and add its args to the daemon's command line """ - assert self.zone is None, 'zone can only be set once' - self.zone = zone - # daemon.restart_with_args() would be perfect for this, except that - # radosgw args likely include a pipe and redirect. 
zone arguments at - # the end won't actually apply to radosgw - args = self.daemon.command_kwargs.get('args', []) - try: - # insert zone args before the first | - pipe = args.index(run.Raw('|')) - args = args[0:pipe] + zone.zone_args() + args[pipe:] - except ValueError, e: - args += zone.zone_args() - self.daemon.command_kwargs['args'] = args - - def start(self, args = None): - """ (re)start the daemon """ - self.daemon.restart() - # wait until startup completes - wait_for_radosgw(self.endpoint()) - - def stop(self): - """ stop the daemon """ - self.daemon.stop() - -def extract_clusters_and_gateways(ctx, role_endpoints): - """ create cluster and gateway instances for all of the radosgw roles """ - clusters = {} - gateways = {} - for role, (host, port) in role_endpoints.iteritems(): - cluster_name, daemon_type, client_id = misc.split_role(role) - # find or create the cluster by name - cluster = clusters.get(cluster_name) - if not cluster: - clusters[cluster_name] = cluster = Cluster(ctx, cluster_name, role) - # create a gateway for this daemon - client_with_id = daemon_type + '.' + client_id # match format from rgw.py - daemon = ctx.daemons.get_daemon('rgw', client_with_id, cluster_name) - if not daemon: - raise ConfigError('no daemon for role=%s cluster=%s type=rgw id=%s' % \ - (role, cluster_name, client_id)) - (remote,) = ctx.cluster.only(role).remotes.keys() - gateways[role] = Gateway(role, remote, daemon, host, port, cluster) - return clusters, gateways - -def create_realm(cluster, config): - """ create a realm from configuration and initialize its first period """ - realm = multisite.Realm(config['name']) - args = [] - if config.get('is_default', False): - args += ['--default'] - realm.create(cluster, args) - realm.current_period = multisite.Period(realm) - return realm - -def extract_user_credentials(config): - """ extract keys from configuration """ - return multisite.Credentials(config['access_key'], config['secret_key']) - -def extract_master_zone(zonegroup_config): - """ find and return the master zone definition """ - master = None - for zone in zonegroup_config['zones']: - if not zone.get('is_master', False): - continue - if master: - raise ConfigError('zones %s and %s cannot both set \'is_master\'' % \ - (master['name'], zone['name'])) - master = zone - # continue the loop so we can detect duplicates - if not master: - raise ConfigError('one zone must set \'is_master\' in zonegroup %s' % \ - zonegroup_config['name']) - return master - -def extract_master_zone_zonegroup(zonegroups_config): - """ find and return the master zone and zonegroup definitions """ - master_zone, master_zonegroup = (None, None) - for zonegroup in zonegroups_config: - # verify that all zonegroups have a master zone set, even if they - # aren't in the master zonegroup - zone = extract_master_zone(zonegroup) - if not zonegroup.get('is_master', False): - continue - if master_zonegroup: - raise ConfigError('zonegroups %s and %s cannot both set \'is_master\'' % \ - (master_zonegroup['name'], zonegroup['name'])) - master_zonegroup = zonegroup - master_zone = zone - # continue the loop so we can detect duplicates - if not master_zonegroup: - raise ConfigError('one zonegroup must set \'is_master\'') - return master_zone, master_zonegroup - -def extract_zone_cluster_name(zone_config): - """ return the cluster (must be common to all zone endpoints) """ - cluster_name = None - endpoints = zone_config.get('endpoints') - if not endpoints: - raise ConfigError('zone %s missing \'endpoints\' list' % \ - 
zone_config['name']) - for role in endpoints: - name, _, _ = misc.split_role(role) - if not cluster_name: - cluster_name = name - elif cluster_name != name: - raise ConfigError('all zone %s endpoints must be in the same cluster' % \ - zone_config['name']) - return cluster_name - -def cluster_for_zone(clusters, zone_config): - """ return the cluster entry for the given zone """ - name = extract_zone_cluster_name(zone_config) - try: - return clusters[name] - except KeyError: - raise ConfigError('no cluster %s found' % name) - -def gen_access_key(): - return ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(16)) - -def gen_secret(): - return ''.join(random.choice(string.ascii_uppercase + string.ascii_lowercase + string.digits) for _ in range(32)) - -def gen_credentials(): - return multisite.Credentials(gen_access_key(), gen_secret()) - -def extract_gateway_endpoints(gateways, endpoints_config): - """ return a list of gateway endpoints associated with the given roles """ - endpoints = [] - for role in endpoints_config: - try: - # replace role names with their gateway's endpoint - endpoints.append(gateways[role].endpoint()) - except KeyError: - raise ConfigError('no radosgw endpoint found for role %s' % role) - return endpoints - -def is_default_arg(config): - return ['--default'] if config.pop('is_default', False) else [] - -def is_master_arg(config): - return ['--master'] if config.pop('is_master', False) else [] - -def create_zonegroup(cluster, gateways, period, config): - """ pass the zonegroup configuration to `zonegroup set` """ - config.pop('zones', None) # remove 'zones' from input to `zonegroup set` - endpoints = config.get('endpoints') - if endpoints: - # replace client names with their gateway endpoints - config['endpoints'] = extract_gateway_endpoints(gateways, endpoints) - zonegroup = multisite.ZoneGroup(config['name'], period) - # `zonegroup set` needs --default on command line, and 'is_master' in json - args = is_default_arg(config) - zonegroup.set(cluster, config, args) - period.zonegroups.append(zonegroup) - return zonegroup - -def create_zone(ctx, cluster, gateways, creds, zonegroup, config): - """ create a zone with the given configuration """ - zone = multisite.Zone(config['name'], zonegroup, cluster) - zone = RadosZone(config['name'], zonegroup, cluster) - - # collect Gateways for the zone's endpoints - endpoints = config.get('endpoints') - if not endpoints: - raise ConfigError('no \'endpoints\' for zone %s' % config['name']) - zone.gateways = [gateways[role] for role in endpoints] - for gateway in zone.gateways: - gateway.set_zone(zone) - - # format the gateway endpoints - endpoints = [g.endpoint() for g in zone.gateways] - - args = is_default_arg(config) - args += is_master_arg(config) - args += creds.credential_args() - if len(endpoints): - args += ['--endpoints', ','.join(endpoints)] - zone.create(cluster, args) - zonegroup.zones.append(zone) - - create_zone_pools(ctx, zone) - if ctx.rgw.compression_type: - configure_zone_compression(zone, ctx.rgw.compression_type) - - zonegroup.zones_by_type.setdefault(zone.tier_type(), []).append(zone) - - if zone.is_read_only(): - zonegroup.ro_zones.append(zone) - else: - zonegroup.rw_zones.append(zone) - - return zone - -def create_zone_pools(ctx, zone): - """ Create the data_pool for each placement type """ - gateway = zone.gateways[0] - cluster = zone.cluster - for pool_config in zone.data.get('placement_pools', []): - pool_name = pool_config['val']['data_pool'] - if ctx.rgw.ec_data_pool: - 
create_ec_pool(gateway.remote, pool_name, zone.name, 64, - ctx.rgw.erasure_code_profile, cluster.name, 'rgw') - else: - create_replicated_pool(gateway.remote, pool_name, 64, cluster.name, 'rgw') - -def configure_zone_compression(zone, compression): - """ Set compression type in the zone's default-placement """ - zone.json_command(zone.cluster, 'placement', ['modify', - '--placement-id', 'default-placement', - '--compression', compression - ]) - -def restart_zone_gateways(zone): - zone.stop() - zone.start() - -task = RGWMultisite diff --git a/src/ceph/qa/tasks/rgw_multisite_tests.py b/src/ceph/qa/tasks/rgw_multisite_tests.py deleted file mode 100644 index 4e6e2b3..0000000 --- a/src/ceph/qa/tasks/rgw_multisite_tests.py +++ /dev/null @@ -1,91 +0,0 @@ -""" -rgw multisite testing -""" -import logging -import sys -import nose.core -import nose.config - -from teuthology.exceptions import ConfigError -from teuthology.task import Task -from teuthology import misc - -from rgw_multi import multisite, tests - -log = logging.getLogger(__name__) - -class RGWMultisiteTests(Task): - """ - Runs the rgw_multi tests against a multisite configuration created by the - rgw-multisite task. Tests are run with nose, using any additional 'args' - provided. Overrides for tests.Config can be set in 'config'. - - - rgw-multisite-tests: - args: - - tasks.rgw_multi.tests:test_object_sync - config: - reconfigure_delay: 60 - - """ - def __init__(self, ctx, config): - super(RGWMultisiteTests, self).__init__(ctx, config) - - def setup(self): - super(RGWMultisiteTests, self).setup() - - overrides = self.ctx.config.get('overrides', {}) - misc.deep_merge(self.config, overrides.get('rgw-multisite-tests', {})) - - if not self.ctx.rgw_multisite: - raise ConfigError('rgw-multisite-tests must run after the rgw-multisite task') - realm = self.ctx.rgw_multisite.realm - master_zone = realm.meta_master_zone() - - # create the test user - log.info('creating test user..') - user = multisite.User('rgw-multisite-test-user') - user.create(master_zone, ['--display-name', 'Multisite Test User', - '--gen-access-key', '--gen-secret']) - - config = self.config.get('config', {}) - tests.init_multi(realm, user, tests.Config(**config)) - tests.realm_meta_checkpoint(realm) - - def begin(self): - # extra arguments for nose can be passed as a string or list - extra_args = self.config.get('args', []) - if not isinstance(extra_args, list): - extra_args = [extra_args] - argv = [__name__] + extra_args - - log.info("running rgw multisite tests on '%s' with args=%r", - tests.__name__, extra_args) - - # run nose tests in the rgw_multi.tests module - conf = nose.config.Config(stream=get_log_stream(), verbosity=2) - result = nose.run(defaultTest=tests.__name__, argv=argv, config=conf) - if not result: - raise RuntimeError('rgw multisite test failures') - -def get_log_stream(): - """ return a log stream for nose output """ - # XXX: this is a workaround for IOErrors when nose writes to stderr, - # copied from vstart_runner.py - class LogStream(object): - def __init__(self): - self.buffer = "" - - def write(self, data): - self.buffer += data - if "\n" in self.buffer: - lines = self.buffer.split("\n") - for line in lines[:-1]: - log.info(line) - self.buffer = lines[-1] - - def flush(self): - pass - - return LogStream() - -task = RGWMultisiteTests diff --git a/src/ceph/qa/tasks/s3a_hadoop.py b/src/ceph/qa/tasks/s3a_hadoop.py deleted file mode 100644 index c01fe1d..0000000 --- a/src/ceph/qa/tasks/s3a_hadoop.py +++ /dev/null @@ -1,343 +0,0 @@ -import contextlib 
-import logging -import time -from teuthology import misc -from teuthology.orchestra import run - -log = logging.getLogger(__name__) - - -@contextlib.contextmanager -def task(ctx, config): - """ - Run Hadoop S3A tests using Ceph - usage: - -tasks: - ceph-ansible: - s3a-hadoop: - maven-version: '3.3.9' (default) - hadoop-version: '2.7.3' - bucket-name: 's3atest' (default) - access-key: 'anykey' (uses a default value) - secret-key: 'secretkey' ( uses a default value) - """ - if config is None: - config = {} - - assert isinstance(config, dict), \ - "task only supports a dictionary for configuration" - - overrides = ctx.config.get('overrides', {}) - misc.deep_merge(config, overrides.get('s3a-hadoop', {})) - testdir = misc.get_testdir(ctx) - rgws = ctx.cluster.only(misc.is_type('rgw')) - # use the first rgw node to test s3a - rgw_node = rgws.remotes.keys()[0] - # get versions - maven_major = config.get('maven-major', 'maven-3') - maven_version = config.get('maven-version', '3.3.9') - hadoop_ver = config.get('hadoop-version', '2.7.3') - bucket_name = config.get('bucket-name', 's3atest') - access_key = config.get('access-key', 'EGAQRD2ULOIFKFSKCT4F') - secret_key = config.get( - 'secret-key', - 'zi816w1vZKfaSM85Cl0BxXTwSLyN7zB4RbTswrGb') - - # set versions for cloning the repo - apache_maven = 'apache-maven-{maven_version}-bin.tar.gz'.format( - maven_version=maven_version) - maven_link = 'http://mirror.jax.hugeserver.com/apache/maven/' + \ - '{maven_major}/{maven_version}/binaries/'.format(maven_major=maven_major, maven_version=maven_version) + apache_maven - hadoop_git = 'https://github.com/apache/hadoop' - hadoop_rel = 'hadoop-{ver} rel/release-{ver}'.format(ver=hadoop_ver) - install_prereq(rgw_node) - rgw_node.run( - args=[ - 'cd', - testdir, - run.Raw('&&'), - 'wget', - maven_link, - run.Raw('&&'), - 'tar', - '-xvf', - apache_maven, - run.Raw('&&'), - 'git', - 'clone', - run.Raw(hadoop_git), - run.Raw('&&'), - 'cd', - 'hadoop', - run.Raw('&&'), - 'git', - 'checkout', - '-b', - run.Raw(hadoop_rel) - ] - ) - dnsmasq_name = 's3.ceph.com' - configure_s3a(rgw_node, dnsmasq_name, access_key, secret_key, bucket_name, testdir) - setup_dnsmasq(rgw_node, dnsmasq_name) - fix_rgw_config(rgw_node, dnsmasq_name) - setup_user_bucket(rgw_node, dnsmasq_name, access_key, secret_key, bucket_name, testdir) - if hadoop_ver.startswith('2.8'): - # test all ITtests but skip AWS test using public bucket landsat-pds - # which is not available from within this test - test_options = '-Dit.test=ITestS3A* -Dit.test=\!ITestS3AAWSCredentialsProvider* -Dparallel-tests -Dscale -Dfs.s3a.scale.test.huge.filesize=128M verify' - else: - test_options = 'test -Dtest=S3a*,TestS3A*' - try: - run_s3atest(rgw_node, maven_version, testdir, test_options) - yield - finally: - log.info("Done s3a testing, Cleaning up") - for fil in ['apache*', 'hadoop*', 'venv*', 'create*']: - rgw_node.run(args=['rm', run.Raw('-rf'), run.Raw('{tdir}/{file}'.format(tdir=testdir, file=fil))]) - # restart and let NM restore original config - rgw_node.run(args=['sudo', 'systemctl', 'stop', 'dnsmasq']) - rgw_node.run(args=['sudo', 'systemctl', 'restart', 'network.service'], check_status=False) - rgw_node.run(args=['sudo', 'systemctl', 'status', 'network.service'], check_status=False) - - -def install_prereq(client): - """ - Install pre requisites for RHEL and CentOS - TBD: Ubuntu - """ - if client.os.name == 'rhel' or client.os.name == 'centos': - client.run( - args=[ - 'sudo', - 'yum', - 'install', - '-y', - 'protobuf-c.x86_64', - 'java', - 
'java-1.8.0-openjdk-devel', - 'dnsmasq' - ] - ) - - -def setup_dnsmasq(client, name): - """ - Setup simple dnsmasq name eg: s3.ceph.com - Local RGW host can then be used with whatever name has been setup with. - """ - resolv_conf = "nameserver 127.0.0.1\n" - dnsmasq_template = """address=/{name}/{ip_address} -server=8.8.8.8 -server=8.8.4.4 -""".format(name=name, ip_address=client.ip_address) - dnsmasq_config_path = '/etc/dnsmasq.d/ceph' - # point resolv.conf to local dnsmasq - misc.sudo_write_file( - remote=client, - path='/etc/resolv.conf', - data=resolv_conf, - ) - misc.sudo_write_file( - remote=client, - path=dnsmasq_config_path, - data=dnsmasq_template, - ) - client.run(args=['cat', dnsmasq_config_path]) - # restart dnsmasq - client.run(args=['sudo', 'systemctl', 'restart', 'dnsmasq']) - client.run(args=['sudo', 'systemctl', 'status', 'dnsmasq']) - time.sleep(5) - # verify dns name is set - client.run(args=['ping', '-c', '4', name]) - - -def fix_rgw_config(client, name): - """ - Fix RGW config in ceph.conf, we need rgw dns name entry - and also modify the port to use :80 for s3a tests to work - """ - rgw_dns_name = 'rgw dns name = {name}'.format(name=name) - ceph_conf_path = '/etc/ceph/ceph.conf' - # append rgw_dns_name - client.run( - args=[ - 'sudo', - 'sed', - run.Raw('-i'), - run.Raw("'/client.rgw*/a {rgw_name}'".format(rgw_name=rgw_dns_name)), - ceph_conf_path - - ] - ) - # listen on port 80 - client.run( - args=[ - 'sudo', - 'sed', - run.Raw('-i'), - run.Raw('s/:8080/:80/'), - ceph_conf_path - ] - ) - client.run(args=['cat', ceph_conf_path]) - client.run(args=['sudo', 'systemctl', 'restart', 'ceph-radosgw.target']) - client.run(args=['sudo', 'systemctl', 'status', 'ceph-radosgw.target']) - - -def setup_user_bucket(client, dns_name, access_key, secret_key, bucket_name, testdir): - """ - Create user with access_key and secret_key that will be - used for the s3a testdir - """ - client.run( - args=[ - 'sudo', - 'radosgw-admin', - 'user', - 'create', - run.Raw('--uid'), - 's3a', - run.Raw('--display-name=s3a cephtests'), - run.Raw('--access-key={access_key}'.format(access_key=access_key)), - run.Raw('--secret-key={secret_key}'.format(secret_key=secret_key)), - run.Raw('--email=s3a@ceph.com'), - ] - ) - client.run( - args=[ - 'virtualenv', - '{testdir}/venv'.format(testdir=testdir), - run.Raw('&&'), - run.Raw('{testdir}/venv/bin/pip'.format(testdir=testdir)), - 'install', - 'boto' - ] - ) - create_bucket = """ -#!/usr/bin/env python -import boto -import boto.s3.connection -access_key = '{access_key}' -secret_key = '{secret_key}' - -conn = boto.connect_s3( - aws_access_key_id = access_key, - aws_secret_access_key = secret_key, - host = '{dns_name}', - is_secure=False, - calling_format = boto.s3.connection.OrdinaryCallingFormat(), - ) -bucket = conn.create_bucket('{bucket_name}') -for bucket in conn.get_all_buckets(): - print bucket.name + "\t" + bucket.creation_date -""".format(access_key=access_key, secret_key=secret_key, dns_name=dns_name, bucket_name=bucket_name) - py_bucket_file = '{testdir}/create_bucket.py'.format(testdir=testdir) - misc.sudo_write_file( - remote=client, - path=py_bucket_file, - data=create_bucket, - perms='0744', - ) - client.run( - args=[ - 'cat', - '{testdir}/create_bucket.py'.format(testdir=testdir), - ] - ) - client.run( - args=[ - '{testdir}/venv/bin/python'.format(testdir=testdir), - '{testdir}/create_bucket.py'.format(testdir=testdir), - ] - ) - - -def run_s3atest(client, maven_version, testdir, test_options): - """ - Finally run the s3a test - """ - 
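# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch only, not part of the removed files.
# setup_user_bucket() above writes and runs a boto (boto2) script that creates
# the test bucket against the local rgw endpoint. The equivalent step with
# boto3, purely for comparison; the endpoint, keys and bucket name in the
# usage line are the same placeholder values used above, not real credentials.
import boto3

def create_test_bucket(endpoint, access_key, secret_key, bucket_name):
    """Create a bucket on an rgw endpoint and print the buckets that exist."""
    s3 = boto3.client(
        's3',
        endpoint_url=endpoint,                  # e.g. 'http://s3.ceph.com'
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_key,
        use_ssl=False,
    )
    s3.create_bucket(Bucket=bucket_name)
    for bucket in s3.list_buckets().get('Buckets', []):
        print(bucket['Name'], bucket['CreationDate'])

# e.g. create_test_bucket('http://s3.ceph.com', 'EGAQRD2ULOIFKFSKCT4F',
#                         'zi816w1vZKfaSM85Cl0BxXTwSLyN7zB4RbTswrGb', 's3atest')
# ---------------------------------------------------------------------------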
aws_testdir = '{testdir}/hadoop/hadoop-tools/hadoop-aws/'.format(testdir=testdir) - run_test = '{testdir}/apache-maven-{maven_version}/bin/mvn'.format(testdir=testdir, maven_version=maven_version) - client.run( - args=[ - 'cd', - run.Raw(aws_testdir), - run.Raw('&&'), - run.Raw(run_test), - run.Raw(test_options) - ] - ) - - -def configure_s3a(client, dns_name, access_key, secret_key, bucket_name, testdir): - """ - Use the template to configure s3a test, Fill in access_key, secret_key - and other details required for test. - """ - config_template = """<configuration> -<property> -<name>fs.s3a.endpoint</name> -<value>{name}</value> -</property> - -<property> -<name>fs.s3a.connection.ssl.enabled</name> -<value>false</value> -</property> - -<property> -<name>test.fs.s3n.name</name> -<value>s3n://{bucket_name}/</value> -</property> - -<property> -<name>test.fs.s3a.name</name> -<value>s3a://{bucket_name}/</value> -</property> - -<property> -<name>test.fs.s3.name</name> -<value>s3://{bucket_name}/</value> -</property> - -<property> -<name>fs.s3.awsAccessKeyId</name> -<value>{access_key}</value> -</property> - -<property> -<name>fs.s3.awsSecretAccessKey</name> -<value>{secret_key}</value> -</property> - -<property> -<name>fs.s3n.awsAccessKeyId</name> -<value>{access_key}</value> -</property> - -<property> -<name>fs.s3n.awsSecretAccessKey</name> -<value>{secret_key}</value> -</property> - -<property> -<name>fs.s3a.access.key</name> -<description>AWS access key ID. Omit for Role-based authentication.</description> -<value>{access_key}</value> -</property> - -<property> -<name>fs.s3a.secret.key</name> -<description>AWS secret key. Omit for Role-based authentication.</description> -<value>{secret_key}</value> -</property> -</configuration> -""".format(name=dns_name, bucket_name=bucket_name, access_key=access_key, secret_key=secret_key) - config_path = testdir + '/hadoop/hadoop-tools/hadoop-aws/src/test/resources/auth-keys.xml' - misc.write_file( - remote=client, - path=config_path, - data=config_template, - ) - # output for debug - client.run(args=['cat', config_path]) diff --git a/src/ceph/qa/tasks/s3readwrite.py b/src/ceph/qa/tasks/s3readwrite.py deleted file mode 100644 index 9f1507e..0000000 --- a/src/ceph/qa/tasks/s3readwrite.py +++ /dev/null @@ -1,346 +0,0 @@ -""" -Run rgw s3 readwite tests -""" -from cStringIO import StringIO -import base64 -import contextlib -import logging -import os -import random -import string -import yaml - -from teuthology import misc as teuthology -from teuthology import contextutil -from teuthology.config import config as teuth_config -from teuthology.orchestra import run -from teuthology.orchestra.connection import split_user - -log = logging.getLogger(__name__) - - -@contextlib.contextmanager -def download(ctx, config): - """ - Download the s3 tests from the git builder. - Remove downloaded s3 file upon exit. - - The context passed in should be identical to the context - passed in to the main task. 
- """ - assert isinstance(config, dict) - log.info('Downloading s3-tests...') - testdir = teuthology.get_testdir(ctx) - for (client, cconf) in config.items(): - branch = cconf.get('force-branch', None) - if not branch: - branch = cconf.get('branch', 'master') - sha1 = cconf.get('sha1') - ctx.cluster.only(client).run( - args=[ - 'git', 'clone', - '-b', branch, - teuth_config.ceph_git_base_url + 's3-tests.git', - '{tdir}/s3-tests'.format(tdir=testdir), - ], - ) - if sha1 is not None: - ctx.cluster.only(client).run( - args=[ - 'cd', '{tdir}/s3-tests'.format(tdir=testdir), - run.Raw('&&'), - 'git', 'reset', '--hard', sha1, - ], - ) - try: - yield - finally: - log.info('Removing s3-tests...') - testdir = teuthology.get_testdir(ctx) - for client in config: - ctx.cluster.only(client).run( - args=[ - 'rm', - '-rf', - '{tdir}/s3-tests'.format(tdir=testdir), - ], - ) - - -def _config_user(s3tests_conf, section, user): - """ - Configure users for this section by stashing away keys, ids, and - email addresses. - """ - s3tests_conf[section].setdefault('user_id', user) - s3tests_conf[section].setdefault('email', '{user}+test@test.test'.format(user=user)) - s3tests_conf[section].setdefault('display_name', 'Mr. {user}'.format(user=user)) - s3tests_conf[section].setdefault('access_key', ''.join(random.choice(string.uppercase) for i in xrange(20))) - s3tests_conf[section].setdefault('secret_key', base64.b64encode(os.urandom(40))) - -@contextlib.contextmanager -def create_users(ctx, config): - """ - Create a default s3 user. - """ - assert isinstance(config, dict) - log.info('Creating rgw users...') - testdir = teuthology.get_testdir(ctx) - users = {'s3': 'foo'} - cached_client_user_names = dict() - for client in config['clients']: - cached_client_user_names[client] = dict() - s3tests_conf = config['s3tests_conf'][client] - s3tests_conf.setdefault('readwrite', {}) - s3tests_conf['readwrite'].setdefault('bucket', 'rwtest-' + client + '-{random}-') - s3tests_conf['readwrite'].setdefault('readers', 10) - s3tests_conf['readwrite'].setdefault('writers', 3) - s3tests_conf['readwrite'].setdefault('duration', 300) - s3tests_conf['readwrite'].setdefault('files', {}) - rwconf = s3tests_conf['readwrite'] - rwconf['files'].setdefault('num', 10) - rwconf['files'].setdefault('size', 2000) - rwconf['files'].setdefault('stddev', 500) - for section, user in users.iteritems(): - _config_user(s3tests_conf, section, '{user}.{client}'.format(user=user, client=client)) - log.debug('creating user {user} on {client}'.format(user=s3tests_conf[section]['user_id'], - client=client)) - - # stash the 'delete_user' flag along with user name for easier cleanup - delete_this_user = True - if 'delete_user' in s3tests_conf['s3']: - delete_this_user = s3tests_conf['s3']['delete_user'] - log.debug('delete_user set to {flag} for {client}'.format(flag=delete_this_user, client=client)) - cached_client_user_names[client][section+user] = (s3tests_conf[section]['user_id'], delete_this_user) - - # skip actual user creation if the create_user flag is set to false for this client - if 'create_user' in s3tests_conf['s3'] and s3tests_conf['s3']['create_user'] == False: - log.debug('create_user set to False, skipping user creation for {client}'.format(client=client)) - continue - else: - ctx.cluster.only(client).run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'radosgw-admin', - '-n', client, - 'user', 'create', - '--uid', s3tests_conf[section]['user_id'], - '--display-name', 
s3tests_conf[section]['display_name'], - '--access-key', s3tests_conf[section]['access_key'], - '--secret', s3tests_conf[section]['secret_key'], - '--email', s3tests_conf[section]['email'], - ], - ) - try: - yield - finally: - for client in config['clients']: - for section, user in users.iteritems(): - #uid = '{user}.{client}'.format(user=user, client=client) - real_uid, delete_this_user = cached_client_user_names[client][section+user] - if delete_this_user: - ctx.cluster.only(client).run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'radosgw-admin', - '-n', client, - 'user', 'rm', - '--uid', real_uid, - '--purge-data', - ], - ) - else: - log.debug('skipping delete for user {uid} on {client}'.format(uid=real_uid, client=client)) - -@contextlib.contextmanager -def configure(ctx, config): - """ - Configure the s3-tests. This includes the running of the - bootstrap code and the updating of local conf files. - """ - assert isinstance(config, dict) - log.info('Configuring s3-readwrite-tests...') - for client, properties in config['clients'].iteritems(): - s3tests_conf = config['s3tests_conf'][client] - if properties is not None and 'rgw_server' in properties: - host = None - for target, roles in zip(ctx.config['targets'].iterkeys(), ctx.config['roles']): - log.info('roles: ' + str(roles)) - log.info('target: ' + str(target)) - if properties['rgw_server'] in roles: - _, host = split_user(target) - assert host is not None, "Invalid client specified as the rgw_server" - s3tests_conf['s3']['host'] = host - else: - s3tests_conf['s3']['host'] = 'localhost' - - def_conf = s3tests_conf['DEFAULT'] - s3tests_conf['s3'].setdefault('port', def_conf['port']) - s3tests_conf['s3'].setdefault('is_secure', def_conf['is_secure']) - - (remote,) = ctx.cluster.only(client).remotes.keys() - remote.run( - args=[ - 'cd', - '{tdir}/s3-tests'.format(tdir=teuthology.get_testdir(ctx)), - run.Raw('&&'), - './bootstrap', - ], - ) - conf_fp = StringIO() - conf = dict( - s3=s3tests_conf['s3'], - readwrite=s3tests_conf['readwrite'], - ) - yaml.safe_dump(conf, conf_fp, default_flow_style=False) - teuthology.write_file( - remote=remote, - path='{tdir}/archive/s3readwrite.{client}.config.yaml'.format(tdir=teuthology.get_testdir(ctx), client=client), - data=conf_fp.getvalue(), - ) - yield - - -@contextlib.contextmanager -def run_tests(ctx, config): - """ - Run the s3readwrite tests after everything is set up. - - :param ctx: Context passed to task - :param config: specific configuration information - """ - assert isinstance(config, dict) - testdir = teuthology.get_testdir(ctx) - for client, client_config in config.iteritems(): - (remote,) = ctx.cluster.only(client).remotes.keys() - conf = teuthology.get_file(remote, '{tdir}/archive/s3readwrite.{client}.config.yaml'.format(tdir=testdir, client=client)) - args = [ - '{tdir}/s3-tests/virtualenv/bin/s3tests-test-readwrite'.format(tdir=testdir), - ] - if client_config is not None and 'extra_args' in client_config: - args.extend(client_config['extra_args']) - - ctx.cluster.only(client).run( - args=args, - stdin=conf, - ) - yield - - -@contextlib.contextmanager -def task(ctx, config): - """ - Run the s3tests-test-readwrite suite against rgw. 
- - To run all tests on all clients:: - - tasks: - - ceph: - - rgw: - - s3readwrite: - - To restrict testing to particular clients:: - - tasks: - - ceph: - - rgw: [client.0] - - s3readwrite: [client.0] - - To run against a server on client.1:: - - tasks: - - ceph: - - rgw: [client.1] - - s3readwrite: - client.0: - rgw_server: client.1 - - To pass extra test arguments - - tasks: - - ceph: - - rgw: [client.0] - - s3readwrite: - client.0: - readwrite: - bucket: mybucket - readers: 10 - writers: 3 - duration: 600 - files: - num: 10 - size: 2000 - stddev: 500 - client.1: - ... - - To override s3 configuration - - tasks: - - ceph: - - rgw: [client.0] - - s3readwrite: - client.0: - s3: - user_id: myuserid - display_name: myname - email: my@email - access_key: myaccesskey - secret_key: mysecretkey - - """ - assert config is None or isinstance(config, list) \ - or isinstance(config, dict), \ - "task s3tests only supports a list or dictionary for configuration" - all_clients = ['client.{id}'.format(id=id_) - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')] - if config is None: - config = all_clients - if isinstance(config, list): - config = dict.fromkeys(config) - clients = config.keys() - - overrides = ctx.config.get('overrides', {}) - # merge each client section, not the top level. - for client in config.iterkeys(): - if not config[client]: - config[client] = {} - teuthology.deep_merge(config[client], overrides.get('s3readwrite', {})) - - log.debug('in s3readwrite, config is %s', config) - - s3tests_conf = {} - for client in clients: - if config[client] is None: - config[client] = {} - config[client].setdefault('s3', {}) - config[client].setdefault('readwrite', {}) - - s3tests_conf[client] = ({ - 'DEFAULT': - { - 'port' : 7280, - 'is_secure' : False, - }, - 'readwrite' : config[client]['readwrite'], - 's3' : config[client]['s3'], - }) - - with contextutil.nested( - lambda: download(ctx=ctx, config=config), - lambda: create_users(ctx=ctx, config=dict( - clients=clients, - s3tests_conf=s3tests_conf, - )), - lambda: configure(ctx=ctx, config=dict( - clients=config, - s3tests_conf=s3tests_conf, - )), - lambda: run_tests(ctx=ctx, config=config), - ): - pass - yield diff --git a/src/ceph/qa/tasks/s3roundtrip.py b/src/ceph/qa/tasks/s3roundtrip.py deleted file mode 100644 index 620b9d4..0000000 --- a/src/ceph/qa/tasks/s3roundtrip.py +++ /dev/null @@ -1,306 +0,0 @@ -""" -Run rgw roundtrip message tests -""" -from cStringIO import StringIO -import base64 -import contextlib -import logging -import os -import random -import string -import yaml - -from teuthology import misc as teuthology -from teuthology import contextutil -from teuthology.config import config as teuth_config -from teuthology.orchestra import run -from teuthology.orchestra.connection import split_user - -log = logging.getLogger(__name__) - - -@contextlib.contextmanager -def download(ctx, config): - """ - Download the s3 tests from the git builder. - Remove downloaded s3 file upon exit. - - The context passed in should be identical to the context - passed in to the main task. 
- """ - assert isinstance(config, dict) - log.info('Downloading s3-tests...') - testdir = teuthology.get_testdir(ctx) - for (client, cconf) in config.iteritems(): - branch = cconf.get('force-branch', None) - if not branch: - branch = cconf.get('branch', 'master') - ctx.cluster.only(client).run( - args=[ - 'git', 'clone', - '-b', branch, - teuth_config.ceph_git_base_url + 's3-tests.git', - '{tdir}/s3-tests'.format(tdir=testdir), - ], - ) - try: - yield - finally: - log.info('Removing s3-tests...') - for client in config: - ctx.cluster.only(client).run( - args=[ - 'rm', - '-rf', - '{tdir}/s3-tests'.format(tdir=testdir), - ], - ) - -def _config_user(s3tests_conf, section, user): - """ - Configure users for this section by stashing away keys, ids, and - email addresses. - """ - s3tests_conf[section].setdefault('user_id', user) - s3tests_conf[section].setdefault('email', '{user}+test@test.test'.format(user=user)) - s3tests_conf[section].setdefault('display_name', 'Mr. {user}'.format(user=user)) - s3tests_conf[section].setdefault('access_key', ''.join(random.choice(string.uppercase) for i in xrange(20))) - s3tests_conf[section].setdefault('secret_key', base64.b64encode(os.urandom(40))) - -@contextlib.contextmanager -def create_users(ctx, config): - """ - Create a default s3 user. - """ - assert isinstance(config, dict) - log.info('Creating rgw users...') - testdir = teuthology.get_testdir(ctx) - users = {'s3': 'foo'} - for client in config['clients']: - s3tests_conf = config['s3tests_conf'][client] - s3tests_conf.setdefault('roundtrip', {}) - s3tests_conf['roundtrip'].setdefault('bucket', 'rttest-' + client + '-{random}-') - s3tests_conf['roundtrip'].setdefault('readers', 10) - s3tests_conf['roundtrip'].setdefault('writers', 3) - s3tests_conf['roundtrip'].setdefault('duration', 300) - s3tests_conf['roundtrip'].setdefault('files', {}) - rtconf = s3tests_conf['roundtrip'] - rtconf['files'].setdefault('num', 10) - rtconf['files'].setdefault('size', 2000) - rtconf['files'].setdefault('stddev', 500) - for section, user in [('s3', 'foo')]: - _config_user(s3tests_conf, section, '{user}.{client}'.format(user=user, client=client)) - ctx.cluster.only(client).run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'radosgw-admin', - '-n', client, - 'user', 'create', - '--uid', s3tests_conf[section]['user_id'], - '--display-name', s3tests_conf[section]['display_name'], - '--access-key', s3tests_conf[section]['access_key'], - '--secret', s3tests_conf[section]['secret_key'], - '--email', s3tests_conf[section]['email'], - ], - ) - try: - yield - finally: - for client in config['clients']: - for user in users.itervalues(): - uid = '{user}.{client}'.format(user=user, client=client) - ctx.cluster.only(client).run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'radosgw-admin', - '-n', client, - 'user', 'rm', - '--uid', uid, - '--purge-data', - ], - ) - -@contextlib.contextmanager -def configure(ctx, config): - """ - Configure the s3-tests. This includes the running of the - bootstrap code and the updating of local conf files. 
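    The merged 'roundtrip' and 's3' sections are dumped as YAML to
    <testdir>/archive/s3roundtrip.<client>.config.yaml and later fed to
    s3tests-test-roundtrip on stdin by run_tests(). With the defaults set
    in create_users() the file looks roughly like this (values shown only
    for illustration)::

        roundtrip:
          bucket: rttest-client.0-{random}-
          duration: 300
          files:
            num: 10
            size: 2000
            stddev: 500
          readers: 10
          writers: 3
        s3:
          access_key: <random 20 uppercase characters>
          ...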
- """ - assert isinstance(config, dict) - log.info('Configuring s3-roundtrip-tests...') - testdir = teuthology.get_testdir(ctx) - for client, properties in config['clients'].iteritems(): - s3tests_conf = config['s3tests_conf'][client] - if properties is not None and 'rgw_server' in properties: - host = None - for target, roles in zip(ctx.config['targets'].iterkeys(), ctx.config['roles']): - log.info('roles: ' + str(roles)) - log.info('target: ' + str(target)) - if properties['rgw_server'] in roles: - _, host = split_user(target) - assert host is not None, "Invalid client specified as the rgw_server" - s3tests_conf['s3']['host'] = host - else: - s3tests_conf['s3']['host'] = 'localhost' - - def_conf = s3tests_conf['DEFAULT'] - s3tests_conf['s3'].setdefault('port', def_conf['port']) - s3tests_conf['s3'].setdefault('is_secure', def_conf['is_secure']) - - (remote,) = ctx.cluster.only(client).remotes.keys() - remote.run( - args=[ - 'cd', - '{tdir}/s3-tests'.format(tdir=testdir), - run.Raw('&&'), - './bootstrap', - ], - ) - conf_fp = StringIO() - conf = dict( - s3=s3tests_conf['s3'], - roundtrip=s3tests_conf['roundtrip'], - ) - yaml.safe_dump(conf, conf_fp, default_flow_style=False) - teuthology.write_file( - remote=remote, - path='{tdir}/archive/s3roundtrip.{client}.config.yaml'.format(tdir=testdir, client=client), - data=conf_fp.getvalue(), - ) - yield - - -@contextlib.contextmanager -def run_tests(ctx, config): - """ - Run the s3 roundtrip after everything is set up. - - :param ctx: Context passed to task - :param config: specific configuration information - """ - assert isinstance(config, dict) - testdir = teuthology.get_testdir(ctx) - for client, client_config in config.iteritems(): - (remote,) = ctx.cluster.only(client).remotes.keys() - conf = teuthology.get_file(remote, '{tdir}/archive/s3roundtrip.{client}.config.yaml'.format(tdir=testdir, client=client)) - args = [ - '{tdir}/s3-tests/virtualenv/bin/s3tests-test-roundtrip'.format(tdir=testdir), - ] - if client_config is not None and 'extra_args' in client_config: - args.extend(client_config['extra_args']) - - ctx.cluster.only(client).run( - args=args, - stdin=conf, - ) - yield - - -@contextlib.contextmanager -def task(ctx, config): - """ - Run the s3tests-test-roundtrip suite against rgw. - - To run all tests on all clients:: - - tasks: - - ceph: - - rgw: - - s3roundtrip: - - To restrict testing to particular clients:: - - tasks: - - ceph: - - rgw: [client.0] - - s3roundtrip: [client.0] - - To run against a server on client.1:: - - tasks: - - ceph: - - rgw: [client.1] - - s3roundtrip: - client.0: - rgw_server: client.1 - - To pass extra test arguments - - tasks: - - ceph: - - rgw: [client.0] - - s3roundtrip: - client.0: - roundtrip: - bucket: mybucket - readers: 10 - writers: 3 - duration: 600 - files: - num: 10 - size: 2000 - stddev: 500 - client.1: - ... 
- - To override s3 configuration - - tasks: - - ceph: - - rgw: [client.0] - - s3roundtrip: - client.0: - s3: - user_id: myuserid - display_name: myname - email: my@email - access_key: myaccesskey - secret_key: mysecretkey - - """ - assert config is None or isinstance(config, list) \ - or isinstance(config, dict), \ - "task s3tests only supports a list or dictionary for configuration" - all_clients = ['client.{id}'.format(id=id_) - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')] - if config is None: - config = all_clients - if isinstance(config, list): - config = dict.fromkeys(config) - clients = config.keys() - - s3tests_conf = {} - for client in clients: - if config[client] is None: - config[client] = {} - config[client].setdefault('s3', {}) - config[client].setdefault('roundtrip', {}) - - s3tests_conf[client] = ({ - 'DEFAULT': - { - 'port' : 7280, - 'is_secure' : False, - }, - 'roundtrip' : config[client]['roundtrip'], - 's3' : config[client]['s3'], - }) - - with contextutil.nested( - lambda: download(ctx=ctx, config=config), - lambda: create_users(ctx=ctx, config=dict( - clients=clients, - s3tests_conf=s3tests_conf, - )), - lambda: configure(ctx=ctx, config=dict( - clients=config, - s3tests_conf=s3tests_conf, - )), - lambda: run_tests(ctx=ctx, config=config), - ): - pass - yield diff --git a/src/ceph/qa/tasks/s3tests.py b/src/ceph/qa/tasks/s3tests.py deleted file mode 100644 index ef5680d..0000000 --- a/src/ceph/qa/tasks/s3tests.py +++ /dev/null @@ -1,386 +0,0 @@ -""" -Run a set of s3 tests on rgw. -""" -from cStringIO import StringIO -from configobj import ConfigObj -import base64 -import contextlib -import logging -import os -import random -import string - -from teuthology import misc as teuthology -from teuthology import contextutil -from teuthology.config import config as teuth_config -from teuthology.orchestra import run -from teuthology.orchestra.connection import split_user - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def download(ctx, config): - """ - Download the s3 tests from the git builder. - Remove downloaded s3 file upon exit. - - The context passed in should be identical to the context - passed in to the main task. 
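    Branch selection, as implemented below: an explicit 'force-branch'
    always wins; otherwise the suite branch is used as-is when it is one of
    the historical s3-tests branches (giant, firefly, firefly-original,
    hammer) and is prefixed with 'ceph-' for anything newer. An optional
    'sha1' pins the checkout with a hard reset after cloning, and
    'git_remote' can point the clone at an alternative git mirror.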
- """ - assert isinstance(config, dict) - log.info('Downloading s3-tests...') - testdir = teuthology.get_testdir(ctx) - s3_branches = [ 'giant', 'firefly', 'firefly-original', 'hammer' ] - for (client, cconf) in config.items(): - branch = cconf.get('force-branch', None) - if not branch: - ceph_branch = ctx.config.get('branch') - suite_branch = ctx.config.get('suite_branch', ceph_branch) - if suite_branch in s3_branches: - branch = cconf.get('branch', suite_branch) - else: - branch = cconf.get('branch', 'ceph-' + suite_branch) - if not branch: - raise ValueError( - "Could not determine what branch to use for s3tests!") - else: - log.info("Using branch '%s' for s3tests", branch) - sha1 = cconf.get('sha1') - git_remote = cconf.get('git_remote', None) or teuth_config.ceph_git_base_url - ctx.cluster.only(client).run( - args=[ - 'git', 'clone', - '-b', branch, - git_remote + 's3-tests.git', - '{tdir}/s3-tests'.format(tdir=testdir), - ], - ) - if sha1 is not None: - ctx.cluster.only(client).run( - args=[ - 'cd', '{tdir}/s3-tests'.format(tdir=testdir), - run.Raw('&&'), - 'git', 'reset', '--hard', sha1, - ], - ) - try: - yield - finally: - log.info('Removing s3-tests...') - testdir = teuthology.get_testdir(ctx) - for client in config: - ctx.cluster.only(client).run( - args=[ - 'rm', - '-rf', - '{tdir}/s3-tests'.format(tdir=testdir), - ], - ) - - -def _config_user(s3tests_conf, section, user): - """ - Configure users for this section by stashing away keys, ids, and - email addresses. - """ - s3tests_conf[section].setdefault('user_id', user) - s3tests_conf[section].setdefault('email', '{user}+test@test.test'.format(user=user)) - s3tests_conf[section].setdefault('display_name', 'Mr. {user}'.format(user=user)) - s3tests_conf[section].setdefault('access_key', ''.join(random.choice(string.uppercase) for i in xrange(20))) - s3tests_conf[section].setdefault('secret_key', base64.b64encode(os.urandom(40))) - - -@contextlib.contextmanager -def create_users(ctx, config): - """ - Create a main and an alternate s3 user. - """ - assert isinstance(config, dict) - log.info('Creating rgw users...') - testdir = teuthology.get_testdir(ctx) - users = {'s3 main': 'foo', 's3 alt': 'bar', 's3 tenant': 'testx$tenanteduser'} - for client in config['clients']: - s3tests_conf = config['s3tests_conf'][client] - s3tests_conf.setdefault('fixtures', {}) - s3tests_conf['fixtures'].setdefault('bucket prefix', 'test-' + client + '-{random}-') - for section, user in users.iteritems(): - _config_user(s3tests_conf, section, '{user}.{client}'.format(user=user, client=client)) - log.debug('Creating user {user} on {host}'.format(user=s3tests_conf[section]['user_id'], host=client)) - cluster_name, daemon_type, client_id = teuthology.split_role(client) - client_with_id = daemon_type + '.' + client_id - ctx.cluster.only(client).run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'radosgw-admin', - '-n', client_with_id, - 'user', 'create', - '--uid', s3tests_conf[section]['user_id'], - '--display-name', s3tests_conf[section]['display_name'], - '--access-key', s3tests_conf[section]['access_key'], - '--secret', s3tests_conf[section]['secret_key'], - '--email', s3tests_conf[section]['email'], - '--cluster', cluster_name, - ], - ) - try: - yield - finally: - for client in config['clients']: - for user in users.itervalues(): - uid = '{user}.{client}'.format(user=user, client=client) - cluster_name, daemon_type, client_id = teuthology.split_role(client) - client_with_id = daemon_type + '.' 
+ client_id - ctx.cluster.only(client).run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'radosgw-admin', - '-n', client_with_id, - 'user', 'rm', - '--uid', uid, - '--purge-data', - '--cluster', cluster_name, - ], - ) - - -@contextlib.contextmanager -def configure(ctx, config): - """ - Configure the s3-tests. This includes the running of the - bootstrap code and the updating of local conf files. - """ - assert isinstance(config, dict) - log.info('Configuring s3-tests...') - testdir = teuthology.get_testdir(ctx) - for client, properties in config['clients'].iteritems(): - s3tests_conf = config['s3tests_conf'][client] - if properties is not None and 'rgw_server' in properties: - host = None - for target, roles in zip(ctx.config['targets'].iterkeys(), ctx.config['roles']): - log.info('roles: ' + str(roles)) - log.info('target: ' + str(target)) - if properties['rgw_server'] in roles: - _, host = split_user(target) - assert host is not None, "Invalid client specified as the rgw_server" - s3tests_conf['DEFAULT']['host'] = host - else: - s3tests_conf['DEFAULT']['host'] = 'localhost' - - if properties is not None and 'slow_backend' in properties: - s3tests_conf['fixtures']['slow backend'] = properties['slow_backend'] - - (remote,) = ctx.cluster.only(client).remotes.keys() - remote.run( - args=[ - 'cd', - '{tdir}/s3-tests'.format(tdir=testdir), - run.Raw('&&'), - './bootstrap', - ], - ) - conf_fp = StringIO() - s3tests_conf.write(conf_fp) - teuthology.write_file( - remote=remote, - path='{tdir}/archive/s3-tests.{client}.conf'.format(tdir=testdir, client=client), - data=conf_fp.getvalue(), - ) - - log.info('Configuring boto...') - boto_src = os.path.join(os.path.dirname(__file__), 'boto.cfg.template') - for client, properties in config['clients'].iteritems(): - with file(boto_src, 'rb') as f: - (remote,) = ctx.cluster.only(client).remotes.keys() - conf = f.read().format( - idle_timeout=config.get('idle_timeout', 30) - ) - teuthology.write_file( - remote=remote, - path='{tdir}/boto.cfg'.format(tdir=testdir), - data=conf, - ) - - try: - yield - - finally: - log.info('Cleaning up boto...') - for client, properties in config['clients'].iteritems(): - (remote,) = ctx.cluster.only(client).remotes.keys() - remote.run( - args=[ - 'rm', - '{tdir}/boto.cfg'.format(tdir=testdir), - ], - ) - -@contextlib.contextmanager -def run_tests(ctx, config): - """ - Run the s3tests after everything is set up. - - :param ctx: Context passed to task - :param config: specific configuration information - """ - assert isinstance(config, dict) - testdir = teuthology.get_testdir(ctx) - attrs = ["!fails_on_rgw", "!lifecycle"] - for client, client_config in config.iteritems(): - args = [ - 'S3TEST_CONF={tdir}/archive/s3-tests.{client}.conf'.format(tdir=testdir, client=client), - 'BOTO_CONFIG={tdir}/boto.cfg'.format(tdir=testdir), - '{tdir}/s3-tests/virtualenv/bin/nosetests'.format(tdir=testdir), - '-w', - '{tdir}/s3-tests'.format(tdir=testdir), - '-v', - '-a', ','.join(attrs), - ] - if client_config is not None and 'extra_args' in client_config: - args.extend(client_config['extra_args']) - - ctx.cluster.only(client).run( - args=args, - label="s3 tests against rgw" - ) - yield - -@contextlib.contextmanager -def scan_for_leaked_encryption_keys(ctx, config): - """ - Scan radosgw logs for the encryption keys used by s3tests to - verify that we're not leaking secrets. 
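    The scan only runs after the wrapped tests have finished: each selected
    client's radosgw log is grepped for the fixed SSE-C customer key that
    s3tests uses, and the task raises if that key shows up in any log.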
- - :param ctx: Context passed to task - :param config: specific configuration information - """ - assert isinstance(config, dict) - - try: - yield - finally: - # x-amz-server-side-encryption-customer-key - s3test_customer_key = 'pO3upElrwuEXSoFwCfnZPdSsmt/xWeFa0N9KgDijwVs=' - - log.debug('Scanning radosgw logs for leaked encryption keys...') - procs = list() - for client, client_config in config.iteritems(): - if not client_config.get('scan_for_encryption_keys', True): - continue - cluster_name, daemon_type, client_id = teuthology.split_role(client) - client_with_cluster = '.'.join((cluster_name, daemon_type, client_id)) - (remote,) = ctx.cluster.only(client).remotes.keys() - proc = remote.run( - args=[ - 'grep', - '--binary-files=text', - s3test_customer_key, - '/var/log/ceph/rgw.{client}.log'.format(client=client_with_cluster), - ], - wait=False, - check_status=False, - ) - procs.append(proc) - - for proc in procs: - proc.wait() - if proc.returncode == 1: # 1 means no matches - continue - log.error('radosgw log is leaking encryption keys!') - raise Exception('radosgw log is leaking encryption keys') - -@contextlib.contextmanager -def task(ctx, config): - """ - Run the s3-tests suite against rgw. - - To run all tests on all clients:: - - tasks: - - ceph: - - rgw: - - s3tests: - - To restrict testing to particular clients:: - - tasks: - - ceph: - - rgw: [client.0] - - s3tests: [client.0] - - To run against a server on client.1 and increase the boto timeout to 10m:: - - tasks: - - ceph: - - rgw: [client.1] - - s3tests: - client.0: - rgw_server: client.1 - idle_timeout: 600 - - To pass extra arguments to nose (e.g. to run a certain test):: - - tasks: - - ceph: - - rgw: [client.0] - - s3tests: - client.0: - extra_args: ['test_s3:test_object_acl_grand_public_read'] - client.1: - extra_args: ['--exclude', 'test_100_continue'] - """ - assert config is None or isinstance(config, list) \ - or isinstance(config, dict), \ - "task s3tests only supports a list or dictionary for configuration" - all_clients = ['client.{id}'.format(id=id_) - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')] - if config is None: - config = all_clients - if isinstance(config, list): - config = dict.fromkeys(config) - clients = config.keys() - - overrides = ctx.config.get('overrides', {}) - # merge each client section, not the top level. 
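    # Illustrative sketch of the merge below, with hypothetical values:
    # given
    #   overrides: {s3tests: {force-branch: ceph-master}}
    # and a task entry of
    #   s3tests: {client.0: {extra_args: ['-v']}}
    # deep_merge() keeps the per-client keys and fills in the override,
    # yielding {client.0: {extra_args: ['-v'], force-branch: ceph-master}}.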
- for client in config.iterkeys(): - if not config[client]: - config[client] = {} - teuthology.deep_merge(config[client], overrides.get('s3tests', {})) - - log.debug('s3tests config is %s', config) - - s3tests_conf = {} - for client in clients: - s3tests_conf[client] = ConfigObj( - indent_type='', - infile={ - 'DEFAULT': - { - 'port' : 7280, - 'is_secure' : 'no', - }, - 'fixtures' : {}, - 's3 main' : {}, - 's3 alt' : {}, - 's3 tenant': {}, - } - ) - - with contextutil.nested( - lambda: download(ctx=ctx, config=config), - lambda: create_users(ctx=ctx, config=dict( - clients=clients, - s3tests_conf=s3tests_conf, - )), - lambda: configure(ctx=ctx, config=dict( - clients=config, - s3tests_conf=s3tests_conf, - )), - lambda: run_tests(ctx=ctx, config=config), - lambda: scan_for_leaked_encryption_keys(ctx=ctx, config=config), - ): - pass - yield diff --git a/src/ceph/qa/tasks/samba.py b/src/ceph/qa/tasks/samba.py deleted file mode 100644 index 8272e8b..0000000 --- a/src/ceph/qa/tasks/samba.py +++ /dev/null @@ -1,245 +0,0 @@ -""" -Samba -""" -import contextlib -import logging -import sys -import time - -from teuthology import misc as teuthology -from teuthology.orchestra import run -from teuthology.orchestra.daemon import DaemonGroup - -log = logging.getLogger(__name__) - - -def get_sambas(ctx, roles): - """ - Scan for roles that are samba. Yield the id of the the samba role - (samba.0, samba.1...) and the associated remote site - - :param ctx: Context - :param roles: roles for this test (extracted from yaml files) - """ - for role in roles: - assert isinstance(role, basestring) - PREFIX = 'samba.' - assert role.startswith(PREFIX) - id_ = role[len(PREFIX):] - (remote,) = ctx.cluster.only(role).remotes.iterkeys() - yield (id_, remote) - - -@contextlib.contextmanager -def task(ctx, config): - """ - Setup samba smbd with ceph vfs module. This task assumes the samba - package has already been installed via the install task. - - The config is optional and defaults to starting samba on all nodes. - If a config is given, it is expected to be a list of - samba nodes to start smbd servers on. - - Example that starts smbd on all samba nodes:: - - tasks: - - install: - - install: - project: samba - extra_packages: ['samba'] - - ceph: - - samba: - - interactive: - - Example that starts smbd on just one of the samba nodes and cifs on the other:: - - tasks: - - samba: [samba.0] - - cifs: [samba.1] - - An optional backend can be specified, and requires a path which smbd will - use as the backend storage location: - - roles: - - [osd.0, osd.1, osd.2, mon.0, mon.1, mon.2, mds.a] - - [client.0, samba.0] - - tasks: - - ceph: - - ceph-fuse: [client.0] - - samba: - samba.0: - cephfuse: "{testdir}/mnt.0" - - This mounts ceph to {testdir}/mnt.0 using fuse, and starts smbd with - a UNC of //localhost/cephfuse. Access through that UNC will be on - the ceph fuse mount point. - - If no arguments are specified in the samba - role, the default behavior is to enable the ceph UNC //localhost/ceph - and use the ceph vfs module as the smbd backend. 
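    In that default case the share is backed directly by Samba's ceph VFS
    module ("vfs objects = ceph" with ceph:config_file pointing at
    /etc/ceph/ceph.conf); ceph-fuse is only mounted briefly to fix
    ownership and permissions on the cephfs root.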
- - :param ctx: Context - :param config: Configuration - """ - log.info("Setting up smbd with ceph vfs...") - assert config is None or isinstance(config, list) or isinstance(config, dict), \ - "task samba got invalid config" - - if config is None: - config = dict(('samba.{id}'.format(id=id_), None) - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'samba')) - elif isinstance(config, list): - config = dict((name, None) for name in config) - - samba_servers = list(get_sambas(ctx=ctx, roles=config.keys())) - - testdir = teuthology.get_testdir(ctx) - - if not hasattr(ctx, 'daemons'): - ctx.daemons = DaemonGroup() - - for id_, remote in samba_servers: - - rolestr = "samba.{id_}".format(id_=id_) - - confextras = """vfs objects = ceph - ceph:config_file = /etc/ceph/ceph.conf""" - - unc = "ceph" - backend = "/" - - if config[rolestr] is not None: - # verify that there's just one parameter in role - if len(config[rolestr]) != 1: - log.error("samba config for role samba.{id_} must have only one parameter".format(id_=id_)) - raise Exception('invalid config') - confextras = "" - (unc, backendstr) = config[rolestr].items()[0] - backend = backendstr.format(testdir=testdir) - - # on first samba role, set ownership and permissions of ceph root - # so that samba tests succeed - if config[rolestr] is None and id_ == samba_servers[0][0]: - remote.run( - args=[ - 'mkdir', '-p', '/tmp/cmnt', run.Raw('&&'), - 'sudo', 'ceph-fuse', '/tmp/cmnt', run.Raw('&&'), - 'sudo', 'chown', 'ubuntu:ubuntu', '/tmp/cmnt/', run.Raw('&&'), - 'sudo', 'chmod', '1777', '/tmp/cmnt/', run.Raw('&&'), - 'sudo', 'umount', '/tmp/cmnt/', run.Raw('&&'), - 'rm', '-rf', '/tmp/cmnt', - ], - ) - else: - remote.run( - args=[ - 'sudo', 'chown', 'ubuntu:ubuntu', backend, run.Raw('&&'), - 'sudo', 'chmod', '1777', backend, - ], - ) - - teuthology.sudo_write_file(remote, "/usr/local/samba/etc/smb.conf", """ -[global] - workgroup = WORKGROUP - netbios name = DOMAIN - -[{unc}] - path = {backend} - {extras} - writeable = yes - valid users = ubuntu -""".format(extras=confextras, unc=unc, backend=backend)) - - # create ubuntu user - remote.run( - args=[ - 'sudo', '/usr/local/samba/bin/smbpasswd', '-e', 'ubuntu', - run.Raw('||'), - 'printf', run.Raw('"ubuntu\nubuntu\n"'), - run.Raw('|'), - 'sudo', '/usr/local/samba/bin/smbpasswd', '-s', '-a', 'ubuntu' - ]) - - smbd_cmd = [ - 'sudo', - 'daemon-helper', - 'term', - 'nostdin', - '/usr/local/samba/sbin/smbd', - '-F', - ] - ctx.daemons.add_daemon(remote, 'smbd', id_, - args=smbd_cmd, - logger=log.getChild("smbd.{id_}".format(id_=id_)), - stdin=run.PIPE, - wait=False, - ) - - # let smbd initialize, probably a better way... - seconds_to_sleep = 100 - log.info('Sleeping for %s seconds...' 
% seconds_to_sleep) - time.sleep(seconds_to_sleep) - log.info('Sleeping stopped...') - - try: - yield - finally: - log.info('Stopping smbd processes...') - exc_info = (None, None, None) - for d in ctx.daemons.iter_daemons_of_role('smbd'): - try: - d.stop() - except (run.CommandFailedError, - run.CommandCrashedError, - run.ConnectionLostError): - exc_info = sys.exc_info() - log.exception('Saw exception from %s.%s', d.role, d.id_) - if exc_info != (None, None, None): - raise exc_info[0], exc_info[1], exc_info[2] - - for id_, remote in samba_servers: - remote.run( - args=[ - 'sudo', - 'rm', '-rf', - '/usr/local/samba/etc/smb.conf', - '/usr/local/samba/private/*', - '/usr/local/samba/var/run/', - '/usr/local/samba/var/locks', - '/usr/local/samba/var/lock', - ], - ) - # make sure daemons are gone - try: - remote.run( - args=[ - 'while', - 'sudo', 'killall', '-9', 'smbd', - run.Raw(';'), - 'do', 'sleep', '1', - run.Raw(';'), - 'done', - ], - ) - - remote.run( - args=[ - 'sudo', - 'lsof', - backend, - ], - check_status=False - ) - remote.run( - args=[ - 'sudo', - 'fuser', - '-M', - backend, - ], - check_status=False - ) - except Exception: - log.exception("Saw exception") - pass diff --git a/src/ceph/qa/tasks/scrub.py b/src/ceph/qa/tasks/scrub.py deleted file mode 100644 index 9800d1e..0000000 --- a/src/ceph/qa/tasks/scrub.py +++ /dev/null @@ -1,117 +0,0 @@ -""" -Scrub osds -""" -import contextlib -import gevent -import logging -import random -import time - -import ceph_manager -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - Run scrub periodically. Randomly chooses an OSD to scrub. - - The config should be as follows: - - scrub: - frequency: <seconds between scrubs> - deep: <bool for deepness> - - example: - - tasks: - - ceph: - - scrub: - frequency: 30 - deep: 0 - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'scrub task only accepts a dict for configuration' - - log.info('Beginning scrub...') - - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd') - while len(manager.get_osd_status()['up']) < num_osds: - time.sleep(10) - - scrub_proc = Scrubber( - manager, - config, - ) - try: - yield - finally: - log.info('joining scrub') - scrub_proc.do_join() - -class Scrubber: - """ - Scrubbing is actually performed during initialzation - """ - def __init__(self, manager, config): - """ - Spawn scrubbing thread upon completion. 
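        The constructor blocks until the cluster reports clean, records the
        set of 'up' OSDs, and then spawns a gevent greenlet running
        do_scrub(); do_join() flags that loop to stop and waits for it.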
- """ - self.ceph_manager = manager - self.ceph_manager.wait_for_clean() - - osd_status = self.ceph_manager.get_osd_status() - self.osds = osd_status['up'] - - self.config = config - if self.config is None: - self.config = dict() - - else: - def tmp(x): - """Local display""" - print x - self.log = tmp - - self.stopping = False - - log.info("spawning thread") - - self.thread = gevent.spawn(self.do_scrub) - - def do_join(self): - """Scrubbing thread finished""" - self.stopping = True - self.thread.get() - - def do_scrub(self): - """Perform the scrub operation""" - frequency = self.config.get("frequency", 30) - deep = self.config.get("deep", 0) - - log.info("stopping %s" % self.stopping) - - while not self.stopping: - osd = str(random.choice(self.osds)) - - if deep: - cmd = 'deep-scrub' - else: - cmd = 'scrub' - - log.info('%sbing %s' % (cmd, osd)) - self.ceph_manager.raw_cluster_cmd('osd', cmd, osd) - - time.sleep(frequency) diff --git a/src/ceph/qa/tasks/scrub_test.py b/src/ceph/qa/tasks/scrub_test.py deleted file mode 100644 index a545c9b..0000000 --- a/src/ceph/qa/tasks/scrub_test.py +++ /dev/null @@ -1,412 +0,0 @@ -"""Scrub testing""" -from cStringIO import StringIO - -import contextlib -import json -import logging -import os -import time -import tempfile - -import ceph_manager -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - - -def wait_for_victim_pg(manager): - """Return a PG with some data and its acting set""" - # wait for some PG to have data that we can mess with - victim = None - while victim is None: - stats = manager.get_pg_stats() - for pg in stats: - size = pg['stat_sum']['num_bytes'] - if size > 0: - victim = pg['pgid'] - acting = pg['acting'] - return victim, acting - time.sleep(3) - - -def find_victim_object(ctx, pg, osd): - """Return a file to be fuzzed""" - (osd_remote,) = ctx.cluster.only('osd.%d' % osd).remotes.iterkeys() - data_path = os.path.join( - '/var/lib/ceph/osd', - 'ceph-{id}'.format(id=osd), - 'fuse', - '{pg}_head'.format(pg=pg), - 'all', - ) - - # fuzz time - with contextlib.closing(StringIO()) as ls_fp: - osd_remote.run( - args=['sudo', 'ls', data_path], - stdout=ls_fp, - ) - ls_out = ls_fp.getvalue() - - # find an object file we can mess with (and not the pg info object) - osdfilename = next(line for line in ls_out.split('\n') - if not line.endswith('::::head#')) - assert osdfilename is not None - - # Get actual object name from osd stored filename - objname = osdfilename.split(':')[4] - return osd_remote, os.path.join(data_path, osdfilename), objname - - -def corrupt_file(osd_remote, path): - # put a single \0 at the beginning of the file - osd_remote.run( - args=['sudo', 'dd', - 'if=/dev/zero', - 'of=%s/data' % path, - 'bs=1', 'count=1', 'conv=notrunc'] - ) - - -def get_pgnum(pgid): - pos = pgid.find('.') - assert pos != -1 - return pgid[pos+1:] - - -def deep_scrub(manager, victim, pool): - # scrub, verify inconsistent - pgnum = get_pgnum(victim) - manager.do_pg_scrub(pool, pgnum, 'deep-scrub') - - stats = manager.get_single_pg_stats(victim) - inconsistent = stats['state'].find('+inconsistent') != -1 - assert inconsistent - - -def repair(manager, victim, pool): - # repair, verify no longer inconsistent - pgnum = get_pgnum(victim) - manager.do_pg_scrub(pool, pgnum, 'repair') - - stats = manager.get_single_pg_stats(victim) - inconsistent = stats['state'].find('+inconsistent') != -1 - assert not inconsistent - - -def test_repair_corrupted_obj(ctx, manager, pg, osd_remote, obj_path, pool): - corrupt_file(osd_remote, obj_path) 
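    # The victim object's backing file now starts with a stray zero byte,
    # so the deep scrub below should flag the PG as inconsistent and the
    # subsequent repair should clear that state again (deep_scrub() and
    # repair() above assert exactly that).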
- deep_scrub(manager, pg, pool) - repair(manager, pg, pool) - - -def test_repair_bad_omap(ctx, manager, pg, osd, objname): - # Test deep-scrub with various omap modifications - # Modify omap on specific osd - log.info('fuzzing omap of %s' % objname) - manager.osd_admin_socket(osd, ['rmomapkey', 'rbd', objname, 'key']) - manager.osd_admin_socket(osd, ['setomapval', 'rbd', objname, - 'badkey', 'badval']) - manager.osd_admin_socket(osd, ['setomapheader', 'rbd', objname, 'badhdr']) - - deep_scrub(manager, pg, 'rbd') - # please note, the repair here is errnomous, it rewrites the correct omap - # digest and data digest on the replicas with the corresponding digests - # from the primary osd which is hosting the victim object, see - # find_victim_object(). - # so we need to either put this test and the end of this task or - # undo the mess-up manually before the "repair()" that just ensures - # the cleanup is sane, otherwise the succeeding tests will fail. if they - # try set "badkey" in hope to get an "inconsistent" pg with a deep-scrub. - manager.osd_admin_socket(osd, ['setomapheader', 'rbd', objname, 'hdr']) - manager.osd_admin_socket(osd, ['rmomapkey', 'rbd', objname, 'badkey']) - manager.osd_admin_socket(osd, ['setomapval', 'rbd', objname, - 'key', 'val']) - repair(manager, pg, 'rbd') - - -class MessUp: - def __init__(self, manager, osd_remote, pool, osd_id, - obj_name, obj_path, omap_key, omap_val): - self.manager = manager - self.osd = osd_remote - self.pool = pool - self.osd_id = osd_id - self.obj = obj_name - self.path = obj_path - self.omap_key = omap_key - self.omap_val = omap_val - - @contextlib.contextmanager - def _test_with_file(self, messup_cmd, *checks): - temp = tempfile.mktemp() - backup_cmd = ['sudo', 'cp', os.path.join(self.path, 'data'), temp] - self.osd.run(args=backup_cmd) - self.osd.run(args=messup_cmd.split()) - yield checks - create_cmd = ['sudo', 'mkdir', self.path] - self.osd.run(args=create_cmd, check_status=False) - restore_cmd = ['sudo', 'cp', temp, os.path.join(self.path, 'data')] - self.osd.run(args=restore_cmd) - - def remove(self): - cmd = 'sudo rmdir {path}'.format(path=self.path) - return self._test_with_file(cmd, 'missing') - - def append(self): - cmd = 'sudo dd if=/dev/zero of={path}/data bs=1 count=1 ' \ - 'conv=notrunc oflag=append'.format(path=self.path) - return self._test_with_file(cmd, - 'data_digest_mismatch', - 'size_mismatch') - - def truncate(self): - cmd = 'sudo dd if=/dev/null of={path}/data'.format(path=self.path) - return self._test_with_file(cmd, - 'data_digest_mismatch', - 'size_mismatch') - - def change_obj(self): - cmd = 'sudo dd if=/dev/zero of={path}/data bs=1 count=1 ' \ - 'conv=notrunc'.format(path=self.path) - return self._test_with_file(cmd, - 'data_digest_mismatch') - - @contextlib.contextmanager - def rm_omap(self): - cmd = ['rmomapkey', self.pool, self.obj, self.omap_key] - self.manager.osd_admin_socket(self.osd_id, cmd) - yield ('omap_digest_mismatch',) - cmd = ['setomapval', self.pool, self.obj, - self.omap_key, self.omap_val] - self.manager.osd_admin_socket(self.osd_id, cmd) - - @contextlib.contextmanager - def add_omap(self): - cmd = ['setomapval', self.pool, self.obj, 'badkey', 'badval'] - self.manager.osd_admin_socket(self.osd_id, cmd) - yield ('omap_digest_mismatch',) - cmd = ['rmomapkey', self.pool, self.obj, 'badkey'] - self.manager.osd_admin_socket(self.osd_id, cmd) - - @contextlib.contextmanager - def change_omap(self): - cmd = ['setomapval', self.pool, self.obj, self.omap_key, 'badval'] - 
self.manager.osd_admin_socket(self.osd_id, cmd) - yield ('omap_digest_mismatch',) - cmd = ['setomapval', self.pool, self.obj, self.omap_key, self.omap_val] - self.manager.osd_admin_socket(self.osd_id, cmd) - - -class InconsistentObjChecker: - """Check the returned inconsistents/inconsistent info""" - - def __init__(self, osd, acting, obj_name): - self.osd = osd - self.acting = acting - self.obj = obj_name - assert self.osd in self.acting - - def basic_checks(self, inc): - assert inc['object']['name'] == self.obj - assert inc['object']['snap'] == "head" - assert len(inc['shards']) == len(self.acting), \ - "the number of returned shard does not match with the acting set" - - def run(self, check, inc): - func = getattr(self, check) - func(inc) - - def _check_errors(self, inc, err_name): - bad_found = False - good_found = False - for shard in inc['shards']: - log.info('shard = %r' % shard) - log.info('err = %s' % err_name) - assert 'osd' in shard - osd = shard['osd'] - err = err_name in shard['errors'] - if osd == self.osd: - assert bad_found is False, \ - "multiple entries found for the given OSD" - assert err is True, \ - "Didn't find '{err}' in errors".format(err=err_name) - bad_found = True - else: - assert osd in self.acting, "shard not in acting set" - assert err is False, \ - "Expected '{err}' in errors".format(err=err_name) - good_found = True - assert bad_found is True, \ - "Shard for osd.{osd} not found".format(osd=self.osd) - assert good_found is True, \ - "No other acting shards found" - - def _check_attrs(self, inc, attr_name): - bad_attr = None - good_attr = None - for shard in inc['shards']: - log.info('shard = %r' % shard) - log.info('attr = %s' % attr_name) - assert 'osd' in shard - osd = shard['osd'] - attr = shard.get(attr_name, False) - if osd == self.osd: - assert bad_attr is None, \ - "multiple entries found for the given OSD" - bad_attr = attr - else: - assert osd in self.acting, "shard not in acting set" - assert good_attr is None or good_attr == attr, \ - "multiple good attrs found" - good_attr = attr - assert bad_attr is not None, \ - "bad {attr} not found".format(attr=attr_name) - assert good_attr is not None, \ - "good {attr} not found".format(attr=attr_name) - assert good_attr != bad_attr, \ - "bad attr is identical to the good ones: " \ - "{0} == {1}".format(good_attr, bad_attr) - - def data_digest_mismatch(self, inc): - assert 'data_digest_mismatch' in inc['errors'] - self._check_attrs(inc, 'data_digest') - - def missing(self, inc): - assert 'missing' in inc['union_shard_errors'] - self._check_errors(inc, 'missing') - - def size_mismatch(self, inc): - assert 'size_mismatch' in inc['errors'] - self._check_attrs(inc, 'size') - - def omap_digest_mismatch(self, inc): - assert 'omap_digest_mismatch' in inc['errors'] - self._check_attrs(inc, 'omap_digest') - - -def test_list_inconsistent_obj(ctx, manager, osd_remote, pg, acting, osd_id, - obj_name, obj_path): - mon = manager.controller - pool = 'rbd' - omap_key = 'key' - omap_val = 'val' - manager.do_rados(mon, ['-p', pool, 'setomapval', obj_name, - omap_key, omap_val]) - # Update missing digests, requires "osd deep scrub update digest min age: 0" - pgnum = get_pgnum(pg) - manager.do_pg_scrub(pool, pgnum, 'deep-scrub') - - messup = MessUp(manager, osd_remote, pool, osd_id, obj_name, obj_path, - omap_key, omap_val) - for test in [messup.rm_omap, messup.add_omap, messup.change_omap, - messup.append, messup.truncate, messup.change_obj, - messup.remove]: - with test() as checks: - deep_scrub(manager, pg, pool) - cmd = 
'rados list-inconsistent-pg {pool} ' \ - '--format=json'.format(pool=pool) - with contextlib.closing(StringIO()) as out: - mon.run(args=cmd.split(), stdout=out) - pgs = json.loads(out.getvalue()) - assert pgs == [pg] - - cmd = 'rados list-inconsistent-obj {pg} ' \ - '--format=json'.format(pg=pg) - with contextlib.closing(StringIO()) as out: - mon.run(args=cmd.split(), stdout=out) - objs = json.loads(out.getvalue()) - assert len(objs['inconsistents']) == 1 - - checker = InconsistentObjChecker(osd_id, acting, obj_name) - inc_obj = objs['inconsistents'][0] - log.info('inc = %r', inc_obj) - checker.basic_checks(inc_obj) - for check in checks: - checker.run(check, inc_obj) - - -def task(ctx, config): - """ - Test [deep] scrub - - tasks: - - chef: - - install: - - ceph: - log-whitelist: - - '!= data_digest' - - '!= omap_digest' - - '!= size' - - deep-scrub 0 missing, 1 inconsistent objects - - deep-scrub [0-9]+ errors - - repair 0 missing, 1 inconsistent objects - - repair [0-9]+ errors, [0-9]+ fixed - - shard [0-9]+ missing - - deep-scrub 1 missing, 1 inconsistent objects - - does not match object info size - - attr name mistmatch - - deep-scrub 1 missing, 0 inconsistent objects - - failed to pick suitable auth object - conf: - osd: - osd deep scrub update digest min age: 0 - - scrub_test: - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'scrub_test task only accepts a dict for configuration' - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd') - log.info('num_osds is %s' % num_osds) - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - while len(manager.get_osd_status()['up']) < num_osds: - time.sleep(10) - - for i in range(num_osds): - manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'injectargs', - '--', '--osd-objectstore-fuse') - manager.flush_pg_stats(range(num_osds)) - manager.wait_for_clean() - - # write some data - p = manager.do_rados(mon, ['-p', 'rbd', 'bench', '--no-cleanup', '1', - 'write', '-b', '4096']) - log.info('err is %d' % p.exitstatus) - - # wait for some PG to have data that we can mess with - pg, acting = wait_for_victim_pg(manager) - osd = acting[0] - - osd_remote, obj_path, obj_name = find_victim_object(ctx, pg, osd) - manager.do_rados(mon, ['-p', 'rbd', 'setomapval', obj_name, 'key', 'val']) - log.info('err is %d' % p.exitstatus) - manager.do_rados(mon, ['-p', 'rbd', 'setomapheader', obj_name, 'hdr']) - log.info('err is %d' % p.exitstatus) - - # Update missing digests, requires "osd deep scrub update digest min age: 0" - pgnum = get_pgnum(pg) - manager.do_pg_scrub('rbd', pgnum, 'deep-scrub') - - log.info('messing with PG %s on osd %d' % (pg, osd)) - test_repair_corrupted_obj(ctx, manager, pg, osd_remote, obj_path, 'rbd') - test_repair_bad_omap(ctx, manager, pg, osd, obj_name) - test_list_inconsistent_obj(ctx, manager, osd_remote, pg, acting, osd, - obj_name, obj_path) - log.info('test successful!') - - # shut down fuse mount - for i in range(num_osds): - manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'injectargs', - '--', '--no-osd-objectstore-fuse') - time.sleep(5) - log.info('done') diff --git a/src/ceph/qa/tasks/swift.py b/src/ceph/qa/tasks/swift.py deleted file mode 100644 index 28f75dd..0000000 --- a/src/ceph/qa/tasks/swift.py +++ /dev/null @@ -1,263 +0,0 @@ -""" -Test Swift API -""" -from cStringIO import StringIO -from configobj import ConfigObj -import base64 
-import contextlib -import logging -import os - -from teuthology import misc as teuthology -from teuthology import contextutil -from teuthology.config import config as teuth_config -from teuthology.orchestra import run -from teuthology.orchestra.connection import split_user - -log = logging.getLogger(__name__) - - -@contextlib.contextmanager -def download(ctx, config): - """ - Download the Swift API. - """ - testdir = teuthology.get_testdir(ctx) - assert isinstance(config, list) - log.info('Downloading swift...') - for client in config: - ctx.cluster.only(client).run( - args=[ - 'git', 'clone', - teuth_config.ceph_git_base_url + 'swift.git', - '{tdir}/swift'.format(tdir=testdir), - ], - ) - try: - yield - finally: - log.info('Removing swift...') - testdir = teuthology.get_testdir(ctx) - for client in config: - ctx.cluster.only(client).run( - args=[ - 'rm', - '-rf', - '{tdir}/swift'.format(tdir=testdir), - ], - ) - -def _config_user(testswift_conf, account, user, suffix): - """ - Configure a swift user - - :param account: Swift account - :param user: User name - :param suffix: user name and email suffixes. - """ - testswift_conf['func_test'].setdefault('account{s}'.format(s=suffix), account) - testswift_conf['func_test'].setdefault('username{s}'.format(s=suffix), user) - testswift_conf['func_test'].setdefault('email{s}'.format(s=suffix), '{account}+test@test.test'.format(account=account)) - testswift_conf['func_test'].setdefault('display_name{s}'.format(s=suffix), 'Mr. {account} {user}'.format(account=account, user=user)) - testswift_conf['func_test'].setdefault('password{s}'.format(s=suffix), base64.b64encode(os.urandom(40))) - -@contextlib.contextmanager -def create_users(ctx, config): - """ - Create rgw users to interact with the swift interface. 
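    Two accounts are created for every client ('foo.<client>' and
    'bar.<client>'), each with a swift-type subuser that has full access;
    both are removed again, with --purge-data, when the task unwinds.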
- """ - assert isinstance(config, dict) - log.info('Creating rgw users...') - testdir = teuthology.get_testdir(ctx) - users = {'': 'foo', '2': 'bar'} - for client in config['clients']: - cluster_name, daemon_type, client_id = teuthology.split_role(client) - testswift_conf = config['testswift_conf'][client] - for suffix, user in users.iteritems(): - _config_user(testswift_conf, '{user}.{client}'.format(user=user, client=client), user, suffix) - ctx.cluster.only(client).run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'radosgw-admin', - '-n', client, - '--cluster', cluster_name, - 'user', 'create', - '--subuser', '{account}:{user}'.format(account=testswift_conf['func_test']['account{s}'.format(s=suffix)],user=user), - '--display-name', testswift_conf['func_test']['display_name{s}'.format(s=suffix)], - '--secret', testswift_conf['func_test']['password{s}'.format(s=suffix)], - '--email', testswift_conf['func_test']['email{s}'.format(s=suffix)], - '--key-type', 'swift', - '--access', 'full', - ], - ) - try: - yield - finally: - for client in config['clients']: - for user in users.itervalues(): - uid = '{user}.{client}'.format(user=user, client=client) - cluster_name, daemon_type, client_id = teuthology.split_role(client) - ctx.cluster.only(client).run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'radosgw-admin', - '-n', client, - '--cluster', cluster_name, - 'user', 'rm', - '--uid', uid, - '--purge-data', - ], - ) - -@contextlib.contextmanager -def configure(ctx, config): - """ - Configure rgw and Swift - """ - assert isinstance(config, dict) - log.info('Configuring testswift...') - testdir = teuthology.get_testdir(ctx) - for client, properties in config['clients'].iteritems(): - log.info('client={c}'.format(c=client)) - log.info('config={c}'.format(c=config)) - testswift_conf = config['testswift_conf'][client] - if properties is not None and 'rgw_server' in properties: - host = None - for target, roles in zip(ctx.config['targets'].iterkeys(), ctx.config['roles']): - log.info('roles: ' + str(roles)) - log.info('target: ' + str(target)) - if properties['rgw_server'] in roles: - _, host = split_user(target) - assert host is not None, "Invalid client specified as the rgw_server" - testswift_conf['func_test']['auth_host'] = host - else: - testswift_conf['func_test']['auth_host'] = 'localhost' - - log.info(client) - (remote,) = ctx.cluster.only(client).remotes.keys() - remote.run( - args=[ - 'cd', - '{tdir}/swift'.format(tdir=testdir), - run.Raw('&&'), - './bootstrap', - ], - ) - conf_fp = StringIO() - testswift_conf.write(conf_fp) - teuthology.write_file( - remote=remote, - path='{tdir}/archive/testswift.{client}.conf'.format(tdir=testdir, client=client), - data=conf_fp.getvalue(), - ) - yield - - -@contextlib.contextmanager -def run_tests(ctx, config): - """ - Run an individual Swift test. 
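    The per-client conf written by configure() is exported through
    SWIFT_TEST_CONFIG_FILE and nosetests is run against
    swift/test/functional, skipping anything tagged 'fails_on_rgw';
    per-client 'extra_args' are appended to the nose command line.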
- """ - assert isinstance(config, dict) - testdir = teuthology.get_testdir(ctx) - for client, client_config in config.iteritems(): - args = [ - 'SWIFT_TEST_CONFIG_FILE={tdir}/archive/testswift.{client}.conf'.format(tdir=testdir, client=client), - '{tdir}/swift/virtualenv/bin/nosetests'.format(tdir=testdir), - '-w', - '{tdir}/swift/test/functional'.format(tdir=testdir), - '-v', - '-a', '!fails_on_rgw', - ] - if client_config is not None and 'extra_args' in client_config: - args.extend(client_config['extra_args']) - - ctx.cluster.only(client).run( - args=args, - ) - yield - -@contextlib.contextmanager -def task(ctx, config): - """ - Run the testswift suite against rgw. - - To run all tests on all clients:: - - tasks: - - ceph: - - rgw: - - testswift: - - To restrict testing to particular clients:: - - tasks: - - ceph: - - rgw: [client.0] - - testswift: [client.0] - - To run against a server on client.1:: - - tasks: - - ceph: - - rgw: [client.1] - - testswift: - client.0: - rgw_server: client.1 - - To pass extra arguments to nose (e.g. to run a certain test):: - - tasks: - - ceph: - - rgw: [client.0] - - testswift: - client.0: - extra_args: ['test.functional.tests:TestFileUTF8', '-m', 'testCopy'] - client.1: - extra_args: ['--exclude', 'TestFile'] - """ - assert config is None or isinstance(config, list) \ - or isinstance(config, dict), \ - "task testswift only supports a list or dictionary for configuration" - all_clients = ['client.{id}'.format(id=id_) - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')] - if config is None: - config = all_clients - if isinstance(config, list): - config = dict.fromkeys(config) - clients = config.keys() - - log.info('clients={c}'.format(c=clients)) - - testswift_conf = {} - for client in clients: - testswift_conf[client] = ConfigObj( - indent_type='', - infile={ - 'func_test': - { - 'auth_port' : 7280, - 'auth_ssl' : 'no', - 'auth_prefix' : '/auth/', - }, - } - ) - - with contextutil.nested( - lambda: download(ctx=ctx, config=clients), - lambda: create_users(ctx=ctx, config=dict( - clients=clients, - testswift_conf=testswift_conf, - )), - lambda: configure(ctx=ctx, config=dict( - clients=config, - testswift_conf=testswift_conf, - )), - lambda: run_tests(ctx=ctx, config=config), - ): - pass - yield diff --git a/src/ceph/qa/tasks/systemd.py b/src/ceph/qa/tasks/systemd.py deleted file mode 100644 index 50471db..0000000 --- a/src/ceph/qa/tasks/systemd.py +++ /dev/null @@ -1,142 +0,0 @@ -""" -Systemd test -""" -import contextlib -import logging -import re -import time - -from cStringIO import StringIO -from teuthology.orchestra import run -from teuthology.misc import reconnect, get_first_mon, wait_until_healthy - -log = logging.getLogger(__name__) - - -@contextlib.contextmanager -def task(ctx, config): - """ - - tasks: - ceph-deploy: - systemd: - - Test ceph systemd services can start, stop and restart and - check for any failed services and report back errors - """ - for remote, roles in ctx.cluster.remotes.iteritems(): - remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'), - 'grep', 'ceph']) - r = remote.run(args=['sudo', 'systemctl', 'list-units', run.Raw('|'), - 'grep', 'ceph'], stdout=StringIO(), - check_status=False) - log.info(r.stdout.getvalue()) - if r.stdout.getvalue().find('failed'): - log.info("Ceph services in failed state") - - # test overall service stop and start using ceph.target - # ceph.target tests are meant for ceph systemd tests - # and not actual process testing using 'ps' - log.info("Stopping all Ceph services") - 
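        # ceph.target aggregates every ceph daemon unit on the node, so
        # stopping it stops all of them at once. The find()-based checks in
        # this task only log the outcome: str.find() returns -1 (which is
        # truthy) when the pattern is absent, so they never fail the run.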
remote.run(args=['sudo', 'systemctl', 'stop', 'ceph.target']) - r = remote.run(args=['sudo', 'systemctl', 'status', 'ceph.target'], - stdout=StringIO(), check_status=False) - log.info(r.stdout.getvalue()) - log.info("Checking process status") - r = remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'), - 'grep', 'ceph'], stdout=StringIO()) - if r.stdout.getvalue().find('Active: inactive'): - log.info("Sucessfully stopped all ceph services") - else: - log.info("Failed to stop ceph services") - - log.info("Starting all Ceph services") - remote.run(args=['sudo', 'systemctl', 'start', 'ceph.target']) - r = remote.run(args=['sudo', 'systemctl', 'status', 'ceph.target'], - stdout=StringIO()) - log.info(r.stdout.getvalue()) - if r.stdout.getvalue().find('Active: active'): - log.info("Sucessfully started all Ceph services") - else: - log.info("info", "Failed to start Ceph services") - r = remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'), - 'grep', 'ceph'], stdout=StringIO()) - log.info(r.stdout.getvalue()) - time.sleep(4) - - # test individual services start stop - name = remote.shortname - mon_name = 'ceph-mon@' + name + '.service' - mds_name = 'ceph-mds@' + name + '.service' - mgr_name = 'ceph-mgr@' + name + '.service' - mon_role_name = 'mon.' + name - mds_role_name = 'mds.' + name - mgr_role_name = 'mgr.' + name - m_osd = re.search('--id (\d+) --setuser ceph', r.stdout.getvalue()) - if m_osd: - osd_service = 'ceph-osd@{m}.service'.format(m=m_osd.group(1)) - remote.run(args=['sudo', 'systemctl', 'status', - osd_service]) - remote.run(args=['sudo', 'systemctl', 'stop', - osd_service]) - time.sleep(4) # immediate check will result in deactivating state - r = remote.run(args=['sudo', 'systemctl', 'status', osd_service], - stdout=StringIO(), check_status=False) - log.info(r.stdout.getvalue()) - if r.stdout.getvalue().find('Active: inactive'): - log.info("Sucessfully stopped single osd ceph service") - else: - log.info("Failed to stop ceph osd services") - remote.run(args=['sudo', 'systemctl', 'start', - osd_service]) - time.sleep(4) - if mon_role_name in roles: - remote.run(args=['sudo', 'systemctl', 'status', mon_name]) - remote.run(args=['sudo', 'systemctl', 'stop', mon_name]) - time.sleep(4) # immediate check will result in deactivating state - r = remote.run(args=['sudo', 'systemctl', 'status', mon_name], - stdout=StringIO(), check_status=False) - if r.stdout.getvalue().find('Active: inactive'): - log.info("Sucessfully stopped single mon ceph service") - else: - log.info("Failed to stop ceph mon service") - remote.run(args=['sudo', 'systemctl', 'start', mon_name]) - time.sleep(4) - if mgr_role_name in roles: - remote.run(args=['sudo', 'systemctl', 'status', mgr_name]) - remote.run(args=['sudo', 'systemctl', 'stop', mgr_name]) - time.sleep(4) # immediate check will result in deactivating state - r = remote.run(args=['sudo', 'systemctl', 'status', mgr_name], - stdout=StringIO(), check_status=False) - if r.stdout.getvalue().find('Active: inactive'): - log.info("Sucessfully stopped single ceph mgr service") - else: - log.info("Failed to stop ceph mgr service") - remote.run(args=['sudo', 'systemctl', 'start', mgr_name]) - time.sleep(4) - if mds_role_name in roles: - remote.run(args=['sudo', 'systemctl', 'status', mds_name]) - remote.run(args=['sudo', 'systemctl', 'stop', mds_name]) - time.sleep(4) # immediate check will result in deactivating state - r = remote.run(args=['sudo', 'systemctl', 'status', mds_name], - stdout=StringIO(), check_status=False) - if r.stdout.getvalue().find('Active: 
inactive'): - log.info("Sucessfully stopped single ceph mds service") - else: - log.info("Failed to stop ceph mds service") - remote.run(args=['sudo', 'systemctl', 'start', mds_name]) - time.sleep(4) - - # reboot all nodes and verify the systemd units restart - # workunit that runs would fail if any of the systemd unit doesnt start - ctx.cluster.run(args='sudo reboot', wait=False, check_status=False) - # avoid immediate reconnect - time.sleep(120) - reconnect(ctx, 480) # reconnect all nodes - # for debug info - ctx.cluster.run(args=['sudo', 'ps', '-eaf', run.Raw('|'), - 'grep', 'ceph']) - # wait for HEALTH_OK - mon = get_first_mon(ctx, config) - (mon_remote,) = ctx.cluster.only(mon).remotes.iterkeys() - wait_until_healthy(ctx, mon_remote, use_sudo=True) - yield diff --git a/src/ceph/qa/tasks/tests/__init__.py b/src/ceph/qa/tasks/tests/__init__.py deleted file mode 100644 index e69de29..0000000 --- a/src/ceph/qa/tasks/tests/__init__.py +++ /dev/null diff --git a/src/ceph/qa/tasks/tests/test_buildpackages.py b/src/ceph/qa/tasks/tests/test_buildpackages.py deleted file mode 100644 index fed5aa0..0000000 --- a/src/ceph/qa/tasks/tests/test_buildpackages.py +++ /dev/null @@ -1,170 +0,0 @@ -# py.test -v -s tests/test_buildpackages.py - -from mock import patch, Mock - -from .. import buildpackages -from teuthology import packaging - -def test_get_tag_branch_sha1(): - gitbuilder = packaging.GitbuilderProject( - 'ceph', - { - 'os_type': 'centos', - 'os_version': '7.0', - }) - (tag, branch, sha1) = buildpackages.get_tag_branch_sha1(gitbuilder) - assert tag == None - assert branch == None - assert sha1 is not None - - gitbuilder = packaging.GitbuilderProject( - 'ceph', - { - 'os_type': 'centos', - 'os_version': '7.0', - 'sha1': 'asha1', - }) - (tag, branch, sha1) = buildpackages.get_tag_branch_sha1(gitbuilder) - assert tag == None - assert branch == None - assert sha1 == 'asha1' - - remote = Mock - remote.arch = 'x86_64' - remote.os = Mock - remote.os.name = 'ubuntu' - remote.os.version = '14.04' - remote.os.codename = 'trusty' - remote.system_type = 'deb' - ctx = Mock - ctx.cluster = Mock - ctx.cluster.remotes = {remote: ['client.0']} - - expected_tag = 'v0.94.1' - expected_sha1 = 'expectedsha1' - def check_output(cmd, shell): - assert shell == True - return expected_sha1 + " refs/tags/" + expected_tag - with patch.multiple( - buildpackages, - check_output=check_output, - ): - gitbuilder = packaging.GitbuilderProject( - 'ceph', - { - 'os_type': 'centos', - 'os_version': '7.0', - 'sha1': 'asha1', - 'all': { - 'tag': tag, - }, - }, - ctx = ctx, - remote = remote) - (tag, branch, sha1) = buildpackages.get_tag_branch_sha1(gitbuilder) - assert tag == expected_tag - assert branch == None - assert sha1 == expected_sha1 - - expected_branch = 'hammer' - expected_sha1 = 'otherexpectedsha1' - def check_output(cmd, shell): - assert shell == True - return expected_sha1 + " refs/heads/" + expected_branch - with patch.multiple( - buildpackages, - check_output=check_output, - ): - gitbuilder = packaging.GitbuilderProject( - 'ceph', - { - 'os_type': 'centos', - 'os_version': '7.0', - 'sha1': 'asha1', - 'all': { - 'branch': branch, - }, - }, - ctx = ctx, - remote = remote) - (tag, branch, sha1) = buildpackages.get_tag_branch_sha1(gitbuilder) - assert tag == None - assert branch == expected_branch - assert sha1 == expected_sha1 - -def test_lookup_configs(): - expected_system_type = 'deb' - def make_remote(): - remote = Mock() - remote.arch = 'x86_64' - remote.os = Mock() - remote.os.name = 'ubuntu' - remote.os.version 
= '14.04' - remote.os.codename = 'trusty' - remote.system_type = expected_system_type - return remote - ctx = Mock() - class cluster: - remote1 = make_remote() - remote2 = make_remote() - remotes = { - remote1: ['client.0'], - remote2: ['mon.a','osd.0'], - } - def only(self, role): - result = Mock() - if role in ('client.0',): - result.remotes = { cluster.remote1: None } - elif role in ('osd.0', 'mon.a'): - result.remotes = { cluster.remote2: None } - else: - result.remotes = None - return result - ctx.cluster = cluster() - ctx.config = { - 'roles': [ ['client.0'], ['mon.a','osd.0'] ], - } - - # nothing -> nothing - assert buildpackages.lookup_configs(ctx, {}) == [] - assert buildpackages.lookup_configs(ctx, {1:[1,2,3]}) == [] - assert buildpackages.lookup_configs(ctx, [[1,2,3]]) == [] - assert buildpackages.lookup_configs(ctx, None) == [] - - # - # the overrides applies to install and to install.upgrade - # that have no tag, branch or sha1 - # - config = { - 'overrides': { - 'install': { - 'ceph': { - 'sha1': 'overridesha1', - 'tag': 'overridetag', - 'branch': 'overridebranch', - }, - }, - }, - 'tasks': [ - { - 'install': { - 'sha1': 'installsha1', - }, - }, - { - 'install.upgrade': { - 'osd.0': { - }, - 'client.0': { - 'sha1': 'client0sha1', - }, - }, - } - ], - } - ctx.config = config - expected_configs = [{'branch': 'overridebranch', 'sha1': 'overridesha1', 'tag': 'overridetag'}, - {'project': 'ceph', 'branch': 'overridebranch', 'sha1': 'overridesha1', 'tag': 'overridetag'}, - {'project': 'ceph', 'sha1': 'client0sha1'}] - - assert buildpackages.lookup_configs(ctx, config) == expected_configs diff --git a/src/ceph/qa/tasks/tests/test_devstack.py b/src/ceph/qa/tasks/tests/test_devstack.py deleted file mode 100644 index 117b307..0000000 --- a/src/ceph/qa/tasks/tests/test_devstack.py +++ /dev/null @@ -1,48 +0,0 @@ -from textwrap import dedent - -from .. import devstack - - -class TestDevstack(object): - def test_parse_os_table(self): - table_str = dedent(""" - +---------------------+--------------------------------------+ - | Property | Value | - +---------------------+--------------------------------------+ - | attachments | [] | - | availability_zone | nova | - | bootable | false | - | created_at | 2014-02-21T17:14:47.548361 | - | display_description | None | - | display_name | NAME | - | id | ffdbd1bb-60dc-4d95-acfe-88774c09ad3e | - | metadata | {} | - | size | 1 | - | snapshot_id | None | - | source_volid | None | - | status | creating | - | volume_type | None | - +---------------------+--------------------------------------+ - """).strip() - expected = { - 'Property': 'Value', - 'attachments': '[]', - 'availability_zone': 'nova', - 'bootable': 'false', - 'created_at': '2014-02-21T17:14:47.548361', - 'display_description': 'None', - 'display_name': 'NAME', - 'id': 'ffdbd1bb-60dc-4d95-acfe-88774c09ad3e', - 'metadata': '{}', - 'size': '1', - 'snapshot_id': 'None', - 'source_volid': 'None', - 'status': 'creating', - 'volume_type': 'None'} - - vol_info = devstack.parse_os_table(table_str) - assert vol_info == expected - - - - diff --git a/src/ceph/qa/tasks/tests/test_radosgw_admin.py b/src/ceph/qa/tasks/tests/test_radosgw_admin.py deleted file mode 100644 index 59f3578..0000000 --- a/src/ceph/qa/tasks/tests/test_radosgw_admin.py +++ /dev/null @@ -1,31 +0,0 @@ -from mock import Mock - -from .. 
import radosgw_admin - -acl_with_version = """<?xml version="1.0" encoding="UTF-8"?><AccessControlPolicy xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Owner><ID>foo</ID><DisplayName>Foo</DisplayName></Owner><AccessControlList><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>foo</ID><DisplayName>Foo</DisplayName></Grantee><Permission>FULL_CONTROL</Permission></Grant></AccessControlList></AccessControlPolicy> -""" # noqa - - -acl_without_version = """<AccessControlPolicy xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Owner><ID>foo</ID><DisplayName>Foo</DisplayName></Owner><AccessControlList><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>foo</ID><DisplayName>Foo</DisplayName></Grantee><Permission>FULL_CONTROL</Permission></Grant></AccessControlList></AccessControlPolicy> -""" # noqa - - -class TestGetAcl(object): - - def setup(self): - self.key = Mock() - - def test_removes_xml_version(self): - self.key.get_xml_acl = Mock(return_value=acl_with_version) - result = radosgw_admin.get_acl(self.key) - assert result.startswith('<AccessControlPolicy') - - def test_xml_version_is_already_removed(self): - self.key.get_xml_acl = Mock(return_value=acl_without_version) - result = radosgw_admin.get_acl(self.key) - assert result.startswith('<AccessControlPolicy') - - def test_newline_gets_trimmed(self): - self.key.get_xml_acl = Mock(return_value=acl_without_version) - result = radosgw_admin.get_acl(self.key) - assert result.endswith('\n') is False diff --git a/src/ceph/qa/tasks/teuthology_integration.py b/src/ceph/qa/tasks/teuthology_integration.py deleted file mode 100644 index b5a2278..0000000 --- a/src/ceph/qa/tasks/teuthology_integration.py +++ /dev/null @@ -1,19 +0,0 @@ -import logging -from teuthology import misc -from teuthology.task import Task - -log = logging.getLogger(__name__) - - -class TeuthologyIntegration(Task): - - def begin(self): - misc.sh(""" - set -x - pip install tox - tox - # tox -e py27-integration - tox -e openstack-integration - """) - -task = TeuthologyIntegration diff --git a/src/ceph/qa/tasks/tgt.py b/src/ceph/qa/tasks/tgt.py deleted file mode 100644 index c2b322e..0000000 --- a/src/ceph/qa/tasks/tgt.py +++ /dev/null @@ -1,177 +0,0 @@ -""" -Task to handle tgt - -Assumptions made: - The ceph-extras tgt package may need to get installed. - The open-iscsi package needs to get installed. 
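The get_acl() tests above pin down only two behaviours: a leading <?xml ...?> declaration is stripped, and the trailing newline is trimmed. The real tasks.radosgw_admin.get_acl lives outside this excerpt, so the following is only a minimal sketch of that contract, using a hypothetical helper name and reusing the acl_with_version / acl_without_version fixtures defined above:

    def strip_xml_version(acl):
        # Drop a leading XML declaration, if any, then trim surrounding
        # whitespace so the result neither starts with <?xml nor ends
        # with a newline -- the two properties the tests check.
        if acl.startswith('<?xml'):
            acl = acl.split('?>', 1)[1]
        return acl.strip()

    assert strip_xml_version(acl_with_version).startswith('<AccessControlPolicy')
    assert not strip_xml_version(acl_without_version).endswith('\n')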
-""" -import logging -import contextlib - -from teuthology import misc as teuthology -from teuthology import contextutil - -log = logging.getLogger(__name__) - - -@contextlib.contextmanager -def start_tgt_remotes(ctx, start_tgtd): - """ - This subtask starts up a tgtd on the clients specified - """ - remotes = ctx.cluster.only(teuthology.is_type('client')).remotes - tgtd_list = [] - for rem, roles in remotes.iteritems(): - for _id in roles: - if _id in start_tgtd: - if not rem in tgtd_list: - tgtd_list.append(rem) - size = ctx.config.get('image_size', 10240) - rem.run( - args=[ - 'rbd', - 'create', - 'iscsi-image', - '--size', - str(size), - ]) - rem.run( - args=[ - 'sudo', - 'tgtadm', - '--lld', - 'iscsi', - '--mode', - 'target', - '--op', - 'new', - '--tid', - '1', - '--targetname', - 'rbd', - ]) - rem.run( - args=[ - 'sudo', - 'tgtadm', - '--lld', - 'iscsi', - '--mode', - 'logicalunit', - '--op', - 'new', - '--tid', - '1', - '--lun', - '1', - '--backing-store', - 'iscsi-image', - '--bstype', - 'rbd', - ]) - rem.run( - args=[ - 'sudo', - 'tgtadm', - '--lld', - 'iscsi', - '--op', - 'bind', - '--mode', - 'target', - '--tid', - '1', - '-I', - 'ALL', - ]) - try: - yield - - finally: - for rem in tgtd_list: - rem.run( - args=[ - 'sudo', - 'tgtadm', - '--lld', - 'iscsi', - '--mode', - 'target', - '--op', - 'delete', - '--force', - '--tid', - '1', - ]) - rem.run( - args=[ - 'rbd', - 'snap', - 'purge', - 'iscsi-image', - ]) - rem.run( - args=[ - 'sudo', - 'rbd', - 'rm', - 'iscsi-image', - ]) - - -@contextlib.contextmanager -def task(ctx, config): - """ - Start up tgt. - - To start on on all clients:: - - tasks: - - ceph: - - tgt: - - To start on certain clients:: - - tasks: - - ceph: - - tgt: [client.0, client.3] - - or - - tasks: - - ceph: - - tgt: - client.0: - client.3: - - An image blocksize size can also be specified:: - - tasks: - - ceph: - - tgt: - image_size = 20480 - - The general flow of things here is: - 1. Find clients on which tgt is supposed to run (start_tgtd) - 2. Remotely start up tgt daemon - On cleanup: - 3. Stop tgt daemon - - The iscsi administration is handled by the iscsi task. - """ - if config: - config = {key : val for key, val in config.items() - if key.startswith('client')} - # config at this point should only contain keys starting with 'client' - start_tgtd = [] - remotes = ctx.cluster.only(teuthology.is_type('client')).remotes - log.info(remotes) - if not config: - start_tgtd = ['client.{id}'.format(id=id_) - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')] - else: - start_tgtd = config - log.info(start_tgtd) - with contextutil.nested( - lambda: start_tgt_remotes(ctx=ctx, start_tgtd=start_tgtd),): - yield diff --git a/src/ceph/qa/tasks/thrash_pool_snaps.py b/src/ceph/qa/tasks/thrash_pool_snaps.py deleted file mode 100644 index c71c9ce..0000000 --- a/src/ceph/qa/tasks/thrash_pool_snaps.py +++ /dev/null @@ -1,61 +0,0 @@ -""" -Thrash -- Simulate random osd failures. 
-""" -import contextlib -import logging -import gevent -import time -import random - - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - "Thrash" snap creation and removal on the listed pools - - Example: - - thrash_pool_snaps: - pools: [.rgw.buckets, .rgw.buckets.index] - max_snaps: 10 - min_snaps: 5 - period: 10 - """ - stopping = False - def do_thrash(): - pools = config.get('pools', []) - max_snaps = config.get('max_snaps', 10) - min_snaps = config.get('min_snaps', 5) - period = config.get('period', 30) - snaps = [] - manager = ctx.managers['ceph'] - def remove_snap(): - assert len(snaps) > 0 - snap = random.choice(snaps) - log.info("Removing snap %s" % (snap,)) - for pool in pools: - manager.remove_pool_snap(pool, str(snap)) - snaps.remove(snap) - def add_snap(snap): - log.info("Adding snap %s" % (snap,)) - for pool in pools: - manager.add_pool_snap(pool, str(snap)) - snaps.append(snap) - index = 0 - while not stopping: - index += 1 - time.sleep(period) - if len(snaps) <= min_snaps: - add_snap(index) - elif len(snaps) >= max_snaps: - remove_snap() - else: - random.choice([lambda: add_snap(index), remove_snap])() - log.info("Stopping") - thread = gevent.spawn(do_thrash) - yield - stopping = True - thread.join() - diff --git a/src/ceph/qa/tasks/thrashosds-health.yaml b/src/ceph/qa/tasks/thrashosds-health.yaml deleted file mode 100644 index 9defe69..0000000 --- a/src/ceph/qa/tasks/thrashosds-health.yaml +++ /dev/null @@ -1,14 +0,0 @@ -overrides: - ceph: - log-whitelist: - - overall HEALTH_ - - \(OSDMAP_FLAGS\) - - \(OSD_ - - \(PG_ - - \(POOL_ - - \(CACHE_POOL_ - - \(SMALLER_PGP_NUM\) - - \(OBJECT_ - - \(REQUEST_SLOW\) - - \(TOO_FEW_PGS\) - - \(MON_DOWN\) diff --git a/src/ceph/qa/tasks/thrashosds.py b/src/ceph/qa/tasks/thrashosds.py deleted file mode 100644 index 420b735..0000000 --- a/src/ceph/qa/tasks/thrashosds.py +++ /dev/null @@ -1,204 +0,0 @@ -""" -Thrash -- Simulate random osd failures. -""" -import contextlib -import logging -import ceph_manager -from teuthology import misc as teuthology - - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - "Thrash" the OSDs by randomly marking them out/down (and then back - in) until the task is ended. This loops, and every op_delay - seconds it randomly chooses to add or remove an OSD (even odds) - unless there are fewer than min_out OSDs out of the cluster, or - more than min_in OSDs in the cluster. - - All commands are run on mon0 and it stops when __exit__ is called. - - The config is optional, and is a dict containing some or all of: - - cluster: (default 'ceph') the name of the cluster to thrash - - min_in: (default 4) the minimum number of OSDs to keep in the - cluster - - min_out: (default 0) the minimum number of OSDs to keep out of the - cluster - - op_delay: (5) the length of time to sleep between changing an - OSD's status - - min_dead: (0) minimum number of osds to leave down/dead. - - max_dead: (0) maximum number of osds to leave down/dead before waiting - for clean. This should probably be num_replicas - 1. - - clean_interval: (60) the approximate length of time to loop before - waiting until the cluster goes clean. (In reality this is used - to probabilistically choose when to wait, and the method used - makes it closer to -- but not identical to -- the half-life.) - - scrub_interval: (-1) the approximate length of time to loop before - waiting until a scrub is performed while cleaning. 
(In reality - this is used to probabilistically choose when to wait, and it - only applies to the cases where cleaning is being performed). - -1 is used to indicate that no scrubbing will be done. - - chance_down: (0.4) the probability that the thrasher will mark an - OSD down rather than marking it out. (The thrasher will not - consider that OSD out of the cluster, since presently an OSD - wrongly marked down will mark itself back up again.) This value - can be either an integer (eg, 75) or a float probability (eg - 0.75). - - chance_test_min_size: (0) chance to run test_pool_min_size, - which: - - kills all but one osd - - waits - - kills that osd - - revives all other osds - - verifies that the osds fully recover - - timeout: (360) the number of seconds to wait for the cluster - to become clean after each cluster change. If this doesn't - happen within the timeout, an exception will be raised. - - revive_timeout: (150) number of seconds to wait for an osd asok to - appear after attempting to revive the osd - - thrash_primary_affinity: (true) randomly adjust primary-affinity - - chance_pgnum_grow: (0) chance to increase a pool's size - chance_pgpnum_fix: (0) chance to adjust pgpnum to pg for a pool - pool_grow_by: (10) amount to increase pgnum by - max_pgs_per_pool_osd: (1200) don't expand pools past this size per osd - - pause_short: (3) duration of short pause - pause_long: (80) duration of long pause - pause_check_after: (50) assert osd down after this long - chance_inject_pause_short: (1) chance of injecting short stall - chance_inject_pause_long: (0) chance of injecting long stall - - clean_wait: (0) duration to wait before resuming thrashing once clean - - sighup_delay: (0.1) duration to delay between sending signal.SIGHUP to a - random live osd - - powercycle: (false) whether to power cycle the node instead - of just the osd process. Note that this assumes that a single - osd is the only important process on the node. - - bdev_inject_crash: (0) seconds to delay while inducing a synthetic crash. - the delay lets the BlockDevice "accept" more aio operations but blocks - any flush, and then eventually crashes (losing some or all ios). If 0, - no bdev failure injection is enabled. - - bdev_inject_crash_probability: (.5) probability of doing a bdev failure - injection crash vs a normal OSD kill. 
- - chance_test_backfill_full: (0) chance to simulate full disks stopping - backfill - - chance_test_map_discontinuity: (0) chance to test map discontinuity - map_discontinuity_sleep_time: (40) time to wait for map trims - - ceph_objectstore_tool: (true) whether to export/import a pg while an osd is down - chance_move_pg: (1.0) chance of moving a pg if more than 1 osd is down (default 100%) - - optrack_toggle_delay: (2.0) duration to delay between toggling op tracker - enablement to all osds - - dump_ops_enable: (true) continuously dump ops on all live osds - - noscrub_toggle_delay: (2.0) duration to delay between toggling noscrub - - disable_objectstore_tool_tests: (false) disable ceph_objectstore_tool based - tests - - chance_thrash_cluster_full: .05 - - chance_thrash_pg_upmap: 1.0 - chance_thrash_pg_upmap_items: 1.0 - - example: - - tasks: - - ceph: - - thrashosds: - cluster: ceph - chance_down: 10 - op_delay: 3 - min_in: 1 - timeout: 600 - - interactive: - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'thrashosds task only accepts a dict for configuration' - # add default value for sighup_delay - config['sighup_delay'] = config.get('sighup_delay', 0.1) - # add default value for optrack_toggle_delay - config['optrack_toggle_delay'] = config.get('optrack_toggle_delay', 2.0) - # add default value for dump_ops_enable - config['dump_ops_enable'] = config.get('dump_ops_enable', "true") - # add default value for noscrub_toggle_delay - config['noscrub_toggle_delay'] = config.get('noscrub_toggle_delay', 2.0) - # add default value for random_eio - config['random_eio'] = config.get('random_eio', 0.0) - - log.info("config is {config}".format(config=str(config))) - - overrides = ctx.config.get('overrides', {}) - log.info("overrides is {overrides}".format(overrides=str(overrides))) - teuthology.deep_merge(config, overrides.get('thrashosds', {})) - cluster = config.get('cluster', 'ceph') - - log.info("config is {config}".format(config=str(config))) - - if 'powercycle' in config: - - # sync everyone first to avoid collateral damage to / etc. 
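The chance_down note in the docstring above says the option accepts either an integer percentage (for example 75) or a float probability (0.75). The conversion is done inside ceph_manager.Thrasher rather than in this task, so the snippet below is only a sketch of that kind of normalization, under that assumption:

    def normalize_chance(value):
        # Treat anything above 1 as a percentage; anything else is
        # assumed to already be a probability in [0, 1].
        value = float(value)
        return value / 100.0 if value > 1.0 else value

    assert normalize_chance(75) == 0.75
    assert normalize_chance(0.4) == 0.4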
- log.info('Doing preliminary sync to avoid collateral damage...') - ctx.cluster.run(args=['sync']) - - if 'ipmi_user' in ctx.teuthology_config: - for remote in ctx.cluster.remotes.keys(): - log.debug('checking console status of %s' % remote.shortname) - if not remote.console.check_status(): - log.warn('Failed to get console status for %s', - remote.shortname) - - # check that all osd remotes have a valid console - osds = ctx.cluster.only(teuthology.is_type('osd', cluster)) - for remote in osds.remotes.keys(): - if not remote.console.has_ipmi_credentials: - raise Exception( - 'IPMI console required for powercycling, ' - 'but not available on osd role: {r}'.format( - r=remote.name)) - - cluster_manager = ctx.managers[cluster] - for f in ['powercycle', 'bdev_inject_crash']: - if config.get(f): - cluster_manager.config[f] = config.get(f) - - log.info('Beginning thrashosds...') - thrash_proc = ceph_manager.Thrasher( - cluster_manager, - config, - logger=log.getChild('thrasher') - ) - try: - yield - finally: - log.info('joining thrashosds') - thrash_proc.do_join() - cluster_manager.wait_for_all_osds_up() - cluster_manager.flush_all_pg_stats() - cluster_manager.wait_for_recovery(config.get('timeout', 360)) diff --git a/src/ceph/qa/tasks/userdata_setup.yaml b/src/ceph/qa/tasks/userdata_setup.yaml deleted file mode 100644 index d39695b..0000000 --- a/src/ceph/qa/tasks/userdata_setup.yaml +++ /dev/null @@ -1,25 +0,0 @@ -#cloud-config-archive - -- type: text/cloud-config - content: | - output: - all: '| tee -a /var/log/cloud-init-output.log' - -# allow passwordless access for debugging -- | - #!/bin/bash - exec passwd -d ubuntu - -- | - #!/bin/bash - - # mount a NFS share for storing logs - apt-get update - apt-get -y install nfs-common - mkdir /mnt/log - # 10.0.2.2 is the host - mount -v -t nfs -o proto=tcp 10.0.2.2:{mnt_dir} /mnt/log - - # mount the iso image that has the test script - mkdir /mnt/cdrom - mount -t auto /dev/cdrom /mnt/cdrom diff --git a/src/ceph/qa/tasks/userdata_teardown.yaml b/src/ceph/qa/tasks/userdata_teardown.yaml deleted file mode 100644 index 7f3d64f..0000000 --- a/src/ceph/qa/tasks/userdata_teardown.yaml +++ /dev/null @@ -1,11 +0,0 @@ -- | - #!/bin/bash - cp /var/log/cloud-init-output.log /mnt/log - -- | - #!/bin/bash - umount /mnt/log - -- | - #!/bin/bash - shutdown -h -P now diff --git a/src/ceph/qa/tasks/util/__init__.py b/src/ceph/qa/tasks/util/__init__.py deleted file mode 100644 index 5b8575e..0000000 --- a/src/ceph/qa/tasks/util/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -from teuthology import misc - -def get_remote(ctx, cluster, service_type, service_id): - """ - Get the Remote for the host where a particular role runs. - - :param cluster: name of the cluster the service is part of - :param service_type: e.g. 'mds', 'osd', 'client' - :param service_id: The third part of a role, e.g. 
'0' for - the role 'ceph.client.0' - :return: a Remote instance for the host where the - requested role is placed - """ - def _is_instance(role): - role_tuple = misc.split_role(role) - return role_tuple == (cluster, service_type, str(service_id)) - try: - (remote,) = ctx.cluster.only(_is_instance).remotes.keys() - except ValueError: - raise KeyError("Service {0}.{1}.{2} not found".format(cluster, - service_type, - service_id)) - return remote - -def get_remote_for_role(ctx, role): - return get_remote(ctx, *misc.split_role(role)) diff --git a/src/ceph/qa/tasks/util/rados.py b/src/ceph/qa/tasks/util/rados.py deleted file mode 100644 index a83f9e1..0000000 --- a/src/ceph/qa/tasks/util/rados.py +++ /dev/null @@ -1,87 +0,0 @@ -import logging - -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -def rados(ctx, remote, cmd, wait=True, check_status=False): - testdir = teuthology.get_testdir(ctx) - log.info("rados %s" % ' '.join(cmd)) - pre = [ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'rados', - ]; - pre.extend(cmd) - proc = remote.run( - args=pre, - check_status=check_status, - wait=wait, - ) - if wait: - return proc.exitstatus - else: - return proc - -def create_ec_pool(remote, name, profile_name, pgnum, profile={}, cluster_name="ceph", application=None): - remote.run(args=['sudo', 'ceph'] + - cmd_erasure_code_profile(profile_name, profile) + ['--cluster', cluster_name]) - remote.run(args=[ - 'sudo', 'ceph', 'osd', 'pool', 'create', name, - str(pgnum), str(pgnum), 'erasure', profile_name, '--cluster', cluster_name - ]) - if application: - remote.run(args=[ - 'sudo', 'ceph', 'osd', 'pool', 'application', 'enable', name, application, '--cluster', cluster_name - ], check_status=False) # may fail as EINVAL when run in jewel upgrade test - -def create_replicated_pool(remote, name, pgnum, cluster_name="ceph", application=None): - remote.run(args=[ - 'sudo', 'ceph', 'osd', 'pool', 'create', name, str(pgnum), str(pgnum), '--cluster', cluster_name - ]) - if application: - remote.run(args=[ - 'sudo', 'ceph', 'osd', 'pool', 'application', 'enable', name, application, '--cluster', cluster_name - ], check_status=False) - -def create_cache_pool(remote, base_name, cache_name, pgnum, size, cluster_name="ceph"): - remote.run(args=[ - 'sudo', 'ceph', 'osd', 'pool', 'create', cache_name, str(pgnum), '--cluster', cluster_name - ]) - remote.run(args=[ - 'sudo', 'ceph', 'osd', 'tier', 'add-cache', base_name, cache_name, - str(size), '--cluster', cluster_name - ]) - -def cmd_erasure_code_profile(profile_name, profile): - """ - Return the shell command to run to create the erasure code profile - described by the profile parameter. - - :param profile_name: a string matching [A-Za-z0-9-_.]+ - :param profile: a map whose semantic depends on the erasure code plugin - :returns: a shell command as an array suitable for Remote.run - - If profile is {}, it is replaced with - - { 'k': '2', 'm': '1', 'crush-failure-domain': 'osd'} - - for backward compatibility. In previous versions of teuthology, - these values were hardcoded as function arguments and some yaml - files were designed with these implicit values. The teuthology - code should not know anything about the erasure code profile - content or semantic. The valid values and parameters are outside - its scope. 
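As the docstring above says, cmd_erasure_code_profile() only assembles an argument vector; callers such as create_ec_pool() prepend 'sudo ceph' and run it on a remote. A quick illustration mirroring the existing unit test: the profile name is made up, and since key order follows dict iteration only membership is checked.

    from tasks.util import rados  # import path as this qa tree lays it out

    cmd = rados.cmd_erasure_code_profile('testprofile', {})
    assert cmd[:3] == ['osd', 'erasure-code-profile', 'set']
    assert 'testprofile' in cmd
    # an empty profile falls back to the documented defaults
    assert 'k=2' in cmd and 'm=1' in cmd and 'crush-failure-domain=osd' in cmd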
- """ - - if profile == {}: - profile = { - 'k': '2', - 'm': '1', - 'crush-failure-domain': 'osd' - } - return [ - 'osd', 'erasure-code-profile', 'set', - profile_name - ] + [ str(key) + '=' + str(value) for key, value in profile.iteritems() ] diff --git a/src/ceph/qa/tasks/util/rgw.py b/src/ceph/qa/tasks/util/rgw.py deleted file mode 100644 index ab76b50..0000000 --- a/src/ceph/qa/tasks/util/rgw.py +++ /dev/null @@ -1,81 +0,0 @@ -from cStringIO import StringIO -import logging -import json -import requests - -from requests.packages.urllib3 import PoolManager -from requests.packages.urllib3.util import Retry -from urlparse import urlparse - -from teuthology.orchestra.connection import split_user -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -def rgwadmin(ctx, client, cmd, stdin=StringIO(), check_status=False, - format='json', decode=True, log_level=logging.DEBUG): - log.info('rgwadmin: {client} : {cmd}'.format(client=client,cmd=cmd)) - testdir = teuthology.get_testdir(ctx) - cluster_name, daemon_type, client_id = teuthology.split_role(client) - client_with_id = daemon_type + '.' + client_id - pre = [ - 'adjust-ulimits', - 'ceph-coverage'.format(tdir=testdir), - '{tdir}/archive/coverage'.format(tdir=testdir), - 'radosgw-admin'.format(tdir=testdir), - '--log-to-stderr', - '--format', format, - '-n', client_with_id, - '--cluster', cluster_name, - ] - pre.extend(cmd) - log.log(log_level, 'rgwadmin: cmd=%s' % pre) - (remote,) = ctx.cluster.only(client).remotes.iterkeys() - proc = remote.run( - args=pre, - check_status=check_status, - stdout=StringIO(), - stderr=StringIO(), - stdin=stdin, - ) - r = proc.exitstatus - out = proc.stdout.getvalue() - if not decode: - return (r, out) - j = None - if not r and out != '': - try: - j = json.loads(out) - log.log(log_level, ' json result: %s' % j) - except ValueError: - j = out - log.log(log_level, ' raw result: %s' % j) - return (r, j) - -def get_user_summary(out, user): - """Extract the summary for a given user""" - user_summary = None - for summary in out['summary']: - if summary.get('user') == user: - user_summary = summary - - if not user_summary: - raise AssertionError('No summary info found for user: %s' % user) - - return user_summary - -def get_user_successful_ops(out, user): - summary = out['summary'] - if len(summary) == 0: - return 0 - return get_user_summary(out, user)['total']['successful_ops'] - -def wait_for_radosgw(url): - """ poll the given url until it starts accepting connections - - add_daemon() doesn't wait until radosgw finishes startup, so this is used - to avoid racing with later tasks that expect radosgw to be up and listening - """ - # use a connection pool with retry/backoff to poll until it starts listening - http = PoolManager(retries=Retry(connect=8, backoff_factor=1)) - http.request('GET', url) diff --git a/src/ceph/qa/tasks/util/test/__init__.py b/src/ceph/qa/tasks/util/test/__init__.py deleted file mode 100644 index e69de29..0000000 --- a/src/ceph/qa/tasks/util/test/__init__.py +++ /dev/null diff --git a/src/ceph/qa/tasks/util/test/test_rados.py b/src/ceph/qa/tasks/util/test/test_rados.py deleted file mode 100644 index ee1cfa6..0000000 --- a/src/ceph/qa/tasks/util/test/test_rados.py +++ /dev/null @@ -1,40 +0,0 @@ -# -# The MIT License -# -# Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com> -# -# Author: Loic Dachary <loic@dachary.org> -# -# Permission is hereby granted, free of charge, to any person -# obtaining a copy of this software and associated documentation -# files 
(the "Software"), to deal in the Software without -# restriction, including without limitation the rights to use, -# copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following -# conditions: -# -# The above copyright notice and this permission notice shall be -# included in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -# OTHER DEALINGS IN THE SOFTWARE. -# -from .. import rados - -class TestRados(object): - - def test_cmd_erasure_code_profile(self): - name = 'NAME' - cmd = rados.cmd_erasure_code_profile(name, {}) - assert 'k=2' in cmd - assert name in cmd - cmd = rados.cmd_erasure_code_profile(name, { 'k': '88' }) - assert 'k=88' in cmd - assert name in cmd diff --git a/src/ceph/qa/tasks/vstart_runner.py b/src/ceph/qa/tasks/vstart_runner.py deleted file mode 100644 index 842e80d..0000000 --- a/src/ceph/qa/tasks/vstart_runner.py +++ /dev/null @@ -1,1079 +0,0 @@ -""" -vstart_runner: override Filesystem and Mount interfaces to run a CephFSTestCase against a vstart -ceph instance instead of a packaged/installed cluster. Use this to turn around test cases -quickly during development. - -Simple usage (assuming teuthology and ceph checked out in ~/git): - - # Activate the teuthology virtualenv - source ~/git/teuthology/virtualenv/bin/activate - # Go into your ceph build directory - cd ~/git/ceph/build - # Invoke a test using this script - python ~/git/ceph/qa/tasks/vstart_runner.py --create tasks.cephfs.test_data_scan - -Alternative usage: - - # Alternatively, if you use different paths, specify them as follows: - LD_LIBRARY_PATH=`pwd`/lib PYTHONPATH=~/git/teuthology:~/git/ceph/qa:`pwd`/../src/pybind:`pwd`/lib/cython_modules/lib.2 python ~/git/ceph/qa/tasks/vstart_runner.py - - # If you wish to drop to a python shell on failures, use --interactive: - python ~/git/ceph/qa/tasks/vstart_runner.py --interactive - - # If you wish to run a named test case, pass it as an argument: - python ~/git/ceph/qa/tasks/vstart_runner.py tasks.cephfs.test_data_scan - -""" - -from StringIO import StringIO -from collections import defaultdict -import getpass -import signal -import tempfile -import threading -import datetime -import shutil -import re -import os -import time -import json -import sys -import errno -from unittest import suite, loader -import unittest -import platform -from teuthology.orchestra.run import Raw, quote -from teuthology.orchestra.daemon import DaemonGroup -from teuthology.config import config as teuth_config - -import logging - -log = logging.getLogger(__name__) - -handler = logging.FileHandler("./vstart_runner.log") -formatter = logging.Formatter( - fmt=u'%(asctime)s.%(msecs)03d %(levelname)s:%(name)s:%(message)s', - datefmt='%Y-%m-%dT%H:%M:%S') -handler.setFormatter(formatter) -log.addHandler(handler) -log.setLevel(logging.INFO) - - -def respawn_in_path(lib_path, python_paths): - execv_cmd = ['python'] - if platform.system() == "Darwin": - lib_path_var = "DYLD_LIBRARY_PATH" - else: - lib_path_var = 
"LD_LIBRARY_PATH" - - py_binary = os.environ.get("PYTHON", "python") - - if lib_path_var in os.environ: - if lib_path not in os.environ[lib_path_var]: - os.environ[lib_path_var] += ':' + lib_path - os.execvp(py_binary, execv_cmd + sys.argv) - else: - os.environ[lib_path_var] = lib_path - os.execvp(py_binary, execv_cmd + sys.argv) - - for p in python_paths: - sys.path.insert(0, p) - - -# Let's use some sensible defaults -if os.path.exists("./CMakeCache.txt") and os.path.exists("./bin"): - - # A list of candidate paths for each package we need - guesses = [ - ["~/git/teuthology", "~/scm/teuthology", "~/teuthology"], - ["lib/cython_modules/lib.2"], - ["../src/pybind"], - ] - - python_paths = [] - - # Up one level so that "tasks.foo.bar" imports work - python_paths.append(os.path.abspath( - os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") - )) - - for package_guesses in guesses: - for g in package_guesses: - g_exp = os.path.abspath(os.path.expanduser(g)) - if os.path.exists(g_exp): - python_paths.append(g_exp) - - ld_path = os.path.join(os.getcwd(), "lib/") - print "Using guessed paths {0} {1}".format(ld_path, python_paths) - respawn_in_path(ld_path, python_paths) - - -try: - from teuthology.exceptions import CommandFailedError - from tasks.ceph_manager import CephManager - from tasks.cephfs.fuse_mount import FuseMount - from tasks.cephfs.filesystem import Filesystem, MDSCluster, CephCluster - from mgr.mgr_test_case import MgrCluster - from teuthology.contextutil import MaxWhileTries - from teuthology.task import interactive -except ImportError: - sys.stderr.write("***\nError importing packages, have you activated your teuthology virtualenv " - "and set PYTHONPATH to point to teuthology and ceph-qa-suite?\n***\n\n") - raise - -# Must import after teuthology because of gevent monkey patching -import subprocess - -if os.path.exists("./CMakeCache.txt"): - # Running in build dir of a cmake build - BIN_PREFIX = "./bin/" - SRC_PREFIX = "../src" -else: - # Running in src/ of an autotools build - BIN_PREFIX = "./" - SRC_PREFIX = "./" - - -class LocalRemoteProcess(object): - def __init__(self, args, subproc, check_status, stdout, stderr): - self.args = args - self.subproc = subproc - if stdout is None: - self.stdout = StringIO() - else: - self.stdout = stdout - - if stderr is None: - self.stderr = StringIO() - else: - self.stderr = stderr - - self.check_status = check_status - self.exitstatus = self.returncode = None - - def wait(self): - if self.finished: - # Avoid calling communicate() on a dead process because it'll - # give you stick about std* already being closed - if self.exitstatus != 0: - raise CommandFailedError(self.args, self.exitstatus) - else: - return - - out, err = self.subproc.communicate() - self.stdout.write(out) - self.stderr.write(err) - - self.exitstatus = self.returncode = self.subproc.returncode - - if self.exitstatus != 0: - sys.stderr.write(out) - sys.stderr.write(err) - - if self.check_status and self.exitstatus != 0: - raise CommandFailedError(self.args, self.exitstatus) - - @property - def finished(self): - if self.exitstatus is not None: - return True - - if self.subproc.poll() is not None: - out, err = self.subproc.communicate() - self.stdout.write(out) - self.stderr.write(err) - self.exitstatus = self.returncode = self.subproc.returncode - return True - else: - return False - - def kill(self): - log.info("kill ") - if self.subproc.pid and not self.finished: - log.info("kill: killing pid {0} ({1})".format( - self.subproc.pid, self.args)) - 
safe_kill(self.subproc.pid) - else: - log.info("kill: already terminated ({0})".format(self.args)) - - @property - def stdin(self): - class FakeStdIn(object): - def __init__(self, mount_daemon): - self.mount_daemon = mount_daemon - - def close(self): - self.mount_daemon.kill() - - return FakeStdIn(self) - - -class LocalRemote(object): - """ - Amusingly named class to present the teuthology RemoteProcess interface when we are really - running things locally for vstart - - Run this inside your src/ dir! - """ - - def __init__(self): - self.name = "local" - self.hostname = "localhost" - self.user = getpass.getuser() - - def get_file(self, path, sudo, dest_dir): - tmpfile = tempfile.NamedTemporaryFile(delete=False).name - shutil.copy(path, tmpfile) - return tmpfile - - def put_file(self, src, dst, sudo=False): - shutil.copy(src, dst) - - def run(self, args, check_status=True, wait=True, - stdout=None, stderr=None, cwd=None, stdin=None, - logger=None, label=None, env=None): - log.info("run args={0}".format(args)) - - # We don't need no stinkin' sudo - args = [a for a in args if a != "sudo"] - - # We have to use shell=True if any run.Raw was present, e.g. && - shell = any([a for a in args if isinstance(a, Raw)]) - - if shell: - filtered = [] - i = 0 - while i < len(args): - if args[i] == 'adjust-ulimits': - i += 1 - elif args[i] == 'ceph-coverage': - i += 2 - elif args[i] == 'timeout': - i += 2 - else: - filtered.append(args[i]) - i += 1 - - args = quote(filtered) - log.info("Running {0}".format(args)) - - subproc = subprocess.Popen(args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - stdin=subprocess.PIPE, - cwd=cwd, - shell=True) - else: - log.info("Running {0}".format(args)) - - for arg in args: - if not isinstance(arg, basestring): - raise RuntimeError("Oops, can't handle arg {0} type {1}".format( - arg, arg.__class__ - )) - - subproc = subprocess.Popen(args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - stdin=subprocess.PIPE, - cwd=cwd, - env=env) - - if stdin: - if not isinstance(stdin, basestring): - raise RuntimeError("Can't handle non-string stdins on a vstart cluster") - - # Hack: writing to stdin is not deadlock-safe, but it "always" works - # as long as the input buffer is "small" - subproc.stdin.write(stdin) - - proc = LocalRemoteProcess( - args, subproc, check_status, - stdout, stderr - ) - - if wait: - proc.wait() - - return proc - - -class LocalDaemon(object): - def __init__(self, daemon_type, daemon_id): - self.daemon_type = daemon_type - self.daemon_id = daemon_id - self.controller = LocalRemote() - self.proc = None - - @property - def remote(self): - return LocalRemote() - - def running(self): - return self._get_pid() is not None - - def _get_pid(self): - """ - Return PID as an integer or None if not found - """ - ps_txt = self.controller.run( - args=["ps", "ww", "-u"+str(os.getuid())] - ).stdout.getvalue().strip() - lines = ps_txt.split("\n")[1:] - - for line in lines: - if line.find("ceph-{0} -i {1}".format(self.daemon_type, self.daemon_id)) != -1: - log.info("Found ps line for daemon: {0}".format(line)) - return int(line.split()[0]) - log.info("No match for {0} {1}: {2}".format( - self.daemon_type, self.daemon_id, ps_txt - )) - return None - - def wait(self, timeout): - waited = 0 - while self._get_pid() is not None: - if waited > timeout: - raise MaxWhileTries("Timed out waiting for daemon {0}.{1}".format(self.daemon_type, self.daemon_id)) - time.sleep(1) - waited += 1 - - def stop(self, timeout=300): - if not self.running(): - log.error('tried to stop a 
non-running daemon') - return - - pid = self._get_pid() - log.info("Killing PID {0} for {1}.{2}".format(pid, self.daemon_type, self.daemon_id)) - os.kill(pid, signal.SIGKILL) - - waited = 0 - while pid is not None: - new_pid = self._get_pid() - if new_pid is not None and new_pid != pid: - log.info("Killing new PID {0}".format(new_pid)) - pid = new_pid - os.kill(pid, signal.SIGKILL) - - if new_pid is None: - break - else: - if waited > timeout: - raise MaxWhileTries( - "Timed out waiting for daemon {0}.{1}".format( - self.daemon_type, self.daemon_id)) - time.sleep(1) - waited += 1 - - self.wait(timeout=timeout) - - def restart(self): - if self._get_pid() is not None: - self.stop() - - self.proc = self.controller.run([os.path.join(BIN_PREFIX, "./ceph-{0}".format(self.daemon_type)), "-i", self.daemon_id]) - - -def safe_kill(pid): - """ - os.kill annoyingly raises exception if process already dead. Ignore it. - """ - try: - return os.kill(pid, signal.SIGKILL) - except OSError as e: - if e.errno == errno.ESRCH: - # Raced with process termination - pass - else: - raise - - -class LocalFuseMount(FuseMount): - def __init__(self, test_dir, client_id): - super(LocalFuseMount, self).__init__(None, test_dir, client_id, LocalRemote()) - - @property - def config_path(self): - return "./ceph.conf" - - def get_keyring_path(self): - # This is going to end up in a config file, so use an absolute path - # to avoid assumptions about daemons' pwd - return os.path.abspath("./client.{0}.keyring".format(self.client_id)) - - def run_shell(self, args, wait=True): - # FIXME maybe should add a pwd arg to teuthology.orchestra so that - # the "cd foo && bar" shenanigans isn't needed to begin with and - # then we wouldn't have to special case this - return self.client_remote.run( - args, wait=wait, cwd=self.mountpoint - ) - - @property - def _prefix(self): - return BIN_PREFIX - - def _asok_path(self): - # In teuthology, the asok is named after the PID of the ceph-fuse process, because it's - # run foreground. When running it daemonized however, the asok is named after - # the PID of the launching process, not the long running ceph-fuse process. Therefore - # we need to give an exact path here as the logic for checking /proc/ for which - # asok is alive does not work. 
- path = "./out/client.{0}.{1}.asok".format(self.client_id, self.fuse_daemon.subproc.pid) - log.info("I think my launching pid was {0}".format(self.fuse_daemon.subproc.pid)) - return path - - def umount(self): - if self.is_mounted(): - super(LocalFuseMount, self).umount() - - def mount(self, mount_path=None, mount_fs_name=None): - self.client_remote.run( - args=[ - 'mkdir', - '--', - self.mountpoint, - ], - ) - - def list_connections(): - self.client_remote.run( - args=["mount", "-t", "fusectl", "/sys/fs/fuse/connections", "/sys/fs/fuse/connections"], - check_status=False - ) - p = self.client_remote.run( - args=["ls", "/sys/fs/fuse/connections"], - check_status=False - ) - if p.exitstatus != 0: - log.warn("ls conns failed with {0}, assuming none".format(p.exitstatus)) - return [] - - ls_str = p.stdout.getvalue().strip() - if ls_str: - return [int(n) for n in ls_str.split("\n")] - else: - return [] - - # Before starting ceph-fuse process, note the contents of - # /sys/fs/fuse/connections - pre_mount_conns = list_connections() - log.info("Pre-mount connections: {0}".format(pre_mount_conns)) - - prefix = [os.path.join(BIN_PREFIX, "ceph-fuse")] - if os.getuid() != 0: - prefix += ["--client-die-on-failed-remount=false"] - - if mount_path is not None: - prefix += ["--client_mountpoint={0}".format(mount_path)] - - if mount_fs_name is not None: - prefix += ["--client_mds_namespace={0}".format(mount_fs_name)] - - self.fuse_daemon = self.client_remote.run(args= - prefix + [ - "-f", - "--name", - "client.{0}".format(self.client_id), - self.mountpoint - ], wait=False) - - log.info("Mounting client.{0} with pid {1}".format(self.client_id, self.fuse_daemon.subproc.pid)) - - # Wait for the connection reference to appear in /sys - waited = 0 - post_mount_conns = list_connections() - while len(post_mount_conns) <= len(pre_mount_conns): - if self.fuse_daemon.finished: - # Did mount fail? Raise the CommandFailedError instead of - # hitting the "failed to populate /sys/" timeout - self.fuse_daemon.wait() - time.sleep(1) - waited += 1 - if waited > 30: - raise RuntimeError("Fuse mount failed to populate /sys/ after {0} seconds".format( - waited - )) - post_mount_conns = list_connections() - - log.info("Post-mount connections: {0}".format(post_mount_conns)) - - # Record our fuse connection number so that we can use it when - # forcing an unmount - new_conns = list(set(post_mount_conns) - set(pre_mount_conns)) - if len(new_conns) == 0: - raise RuntimeError("New fuse connection directory not found ({0})".format(new_conns)) - elif len(new_conns) > 1: - raise RuntimeError("Unexpectedly numerous fuse connections {0}".format(new_conns)) - else: - self._fuse_conn = new_conns[0] - - def _run_python(self, pyscript): - """ - Override this to remove the daemon-helper prefix that is used otherwise - to make the process killable. 
- """ - return self.client_remote.run(args=[ - 'python', '-c', pyscript - ], wait=False) - - -class LocalCephManager(CephManager): - def __init__(self): - # Deliberately skip parent init, only inheriting from it to get - # util methods like osd_dump that sit on top of raw_cluster_cmd - self.controller = LocalRemote() - - # A minority of CephManager fns actually bother locking for when - # certain teuthology tests want to run tasks in parallel - self.lock = threading.RLock() - - self.log = lambda x: log.info(x) - - def find_remote(self, daemon_type, daemon_id): - """ - daemon_type like 'mds', 'osd' - daemon_id like 'a', '0' - """ - return LocalRemote() - - def run_ceph_w(self): - proc = self.controller.run([os.path.join(BIN_PREFIX, "ceph"), "-w"], wait=False, stdout=StringIO()) - return proc - - def raw_cluster_cmd(self, *args): - """ - args like ["osd", "dump"} - return stdout string - """ - proc = self.controller.run([os.path.join(BIN_PREFIX, "ceph")] + list(args)) - return proc.stdout.getvalue() - - def raw_cluster_cmd_result(self, *args): - """ - like raw_cluster_cmd but don't check status, just return rc - """ - proc = self.controller.run([os.path.join(BIN_PREFIX, "ceph")] + list(args), check_status=False) - return proc.exitstatus - - def admin_socket(self, daemon_type, daemon_id, command, check_status=True): - return self.controller.run( - args=[os.path.join(BIN_PREFIX, "ceph"), "daemon", "{0}.{1}".format(daemon_type, daemon_id)] + command, check_status=check_status - ) - - # FIXME: copypasta - def get_mds_status(self, mds): - """ - Run cluster commands for the mds in order to get mds information - """ - out = self.raw_cluster_cmd('mds', 'dump', '--format=json') - j = json.loads(' '.join(out.splitlines()[1:])) - # collate; for dup ids, larger gid wins. - for info in j['info'].itervalues(): - if info['name'] == mds: - return info - return None - - # FIXME: copypasta - def get_mds_status_by_rank(self, rank): - """ - Run cluster commands for the mds in order to get mds information - check rank. - """ - j = self.get_mds_status_all() - # collate; for dup ids, larger gid wins. - for info in j['info'].itervalues(): - if info['rank'] == rank: - return info - return None - - def get_mds_status_all(self): - """ - Run cluster command to extract all the mds status. - """ - out = self.raw_cluster_cmd('mds', 'dump', '--format=json') - j = json.loads(' '.join(out.splitlines()[1:])) - return j - - -class LocalCephCluster(CephCluster): - def __init__(self, ctx): - # Deliberately skip calling parent constructor - self._ctx = ctx - self.mon_manager = LocalCephManager() - self._conf = defaultdict(dict) - - @property - def admin_remote(self): - return LocalRemote() - - def get_config(self, key, service_type=None): - if service_type is None: - service_type = 'mon' - - # FIXME hardcoded vstart service IDs - service_id = { - 'mon': 'a', - 'mds': 'a', - 'osd': '0' - }[service_type] - - return self.json_asok(['config', 'get', key], service_type, service_id)[key] - - def _write_conf(self): - # In teuthology, we have the honour of writing the entire ceph.conf, but - # in vstart land it has mostly already been written and we need to carefully - # append to it. 
- conf_path = "./ceph.conf" - banner = "\n#LOCAL_TEST\n" - existing_str = open(conf_path).read() - - if banner in existing_str: - existing_str = existing_str[0:existing_str.find(banner)] - - existing_str += banner - - for subsys, kvs in self._conf.items(): - existing_str += "\n[{0}]\n".format(subsys) - for key, val in kvs.items(): - # Comment out existing instance if it exists - log.info("Searching for existing instance {0}/{1}".format( - key, subsys - )) - existing_section = re.search("^\[{0}\]$([\n]|[^\[])+".format( - subsys - ), existing_str, re.MULTILINE) - - if existing_section: - section_str = existing_str[existing_section.start():existing_section.end()] - existing_val = re.search("^\s*[^#]({0}) =".format(key), section_str, re.MULTILINE) - if existing_val: - start = existing_section.start() + existing_val.start(1) - log.info("Found string to replace at {0}".format( - start - )) - existing_str = existing_str[0:start] + "#" + existing_str[start:] - - existing_str += "{0} = {1}\n".format(key, val) - - open(conf_path, "w").write(existing_str) - - def set_ceph_conf(self, subsys, key, value): - self._conf[subsys][key] = value - self._write_conf() - - def clear_ceph_conf(self, subsys, key): - del self._conf[subsys][key] - self._write_conf() - - -class LocalMDSCluster(LocalCephCluster, MDSCluster): - def __init__(self, ctx): - super(LocalMDSCluster, self).__init__(ctx) - - self.mds_ids = ctx.daemons.daemons['mds'].keys() - self.mds_daemons = dict([(id_, LocalDaemon("mds", id_)) for id_ in self.mds_ids]) - - def clear_firewall(self): - # FIXME: unimplemented - pass - - def newfs(self, name='cephfs', create=True): - return LocalFilesystem(self._ctx, name=name, create=create) - - -class LocalMgrCluster(LocalCephCluster, MgrCluster): - def __init__(self, ctx): - super(LocalMgrCluster, self).__init__(ctx) - - self.mgr_ids = ctx.daemons.daemons['mgr'].keys() - self.mgr_daemons = dict([(id_, LocalDaemon("mgr", id_)) for id_ in self.mgr_ids]) - - -class LocalFilesystem(Filesystem, LocalMDSCluster): - def __init__(self, ctx, fscid=None, name='cephfs', create=False): - # Deliberately skip calling parent constructor - self._ctx = ctx - - self.id = None - self.name = None - self.metadata_pool_name = None - self.metadata_overlay = False - self.data_pool_name = None - self.data_pools = None - - # Hack: cheeky inspection of ceph.conf to see what MDSs exist - self.mds_ids = set() - for line in open("ceph.conf").readlines(): - match = re.match("^\[mds\.(.+)\]$", line) - if match: - self.mds_ids.add(match.group(1)) - - if not self.mds_ids: - raise RuntimeError("No MDSs found in ceph.conf!") - - self.mds_ids = list(self.mds_ids) - - log.info("Discovered MDS IDs: {0}".format(self.mds_ids)) - - self.mon_manager = LocalCephManager() - - self.mds_daemons = dict([(id_, LocalDaemon("mds", id_)) for id_ in self.mds_ids]) - - self.client_remote = LocalRemote() - - self._conf = defaultdict(dict) - - if name is not None: - if fscid is not None: - raise RuntimeError("cannot specify fscid when creating fs") - if create and not self.legacy_configured(): - self.create() - else: - if fscid is not None: - self.id = fscid - self.getinfo(refresh=True) - - # Stash a reference to the first created filesystem on ctx, so - # that if someone drops to the interactive shell they can easily - # poke our methods. 
- if not hasattr(self._ctx, "filesystem"): - self._ctx.filesystem = self - - @property - def _prefix(self): - return BIN_PREFIX - - def set_clients_block(self, blocked, mds_id=None): - raise NotImplementedError() - - def get_pgs_per_fs_pool(self): - # FIXME: assuming there are 3 OSDs - return 3 * int(self.get_config('mon_pg_warn_min_per_osd')) - - -class InteractiveFailureResult(unittest.TextTestResult): - """ - Specialization that implements interactive-on-error style - behavior. - """ - def addFailure(self, test, err): - super(InteractiveFailureResult, self).addFailure(test, err) - log.error(self._exc_info_to_string(err, test)) - log.error("Failure in test '{0}', going interactive".format( - self.getDescription(test) - )) - interactive.task(ctx=None, config=None) - - def addError(self, test, err): - super(InteractiveFailureResult, self).addError(test, err) - log.error(self._exc_info_to_string(err, test)) - log.error("Error in test '{0}', going interactive".format( - self.getDescription(test) - )) - interactive.task(ctx=None, config=None) - - -def enumerate_methods(s): - log.info("e: {0}".format(s)) - for t in s._tests: - if isinstance(t, suite.BaseTestSuite): - for sub in enumerate_methods(t): - yield sub - else: - yield s, t - - -def load_tests(modules, loader): - if modules: - log.info("Executing modules: {0}".format(modules)) - module_suites = [] - for mod_name in modules: - # Test names like cephfs.test_auto_repair - module_suites.append(loader.loadTestsFromName(mod_name)) - log.info("Loaded: {0}".format(list(module_suites))) - return suite.TestSuite(module_suites) - else: - log.info("Executing all cephfs tests") - return loader.discover( - os.path.join(os.path.dirname(os.path.abspath(__file__)), "cephfs") - ) - - -def scan_tests(modules): - overall_suite = load_tests(modules, loader.TestLoader()) - - max_required_mds = 0 - max_required_clients = 0 - max_required_mgr = 0 - - for suite, case in enumerate_methods(overall_suite): - max_required_mds = max(max_required_mds, - getattr(case, "MDSS_REQUIRED", 0)) - max_required_clients = max(max_required_clients, - getattr(case, "CLIENTS_REQUIRED", 0)) - max_required_mgr = max(max_required_mgr, - getattr(case, "MGRS_REQUIRED", 0)) - - return max_required_mds, max_required_clients, max_required_mgr - - -class LocalCluster(object): - def __init__(self, rolename="placeholder"): - self.remotes = { - LocalRemote(): [rolename] - } - - def only(self, requested): - return self.__class__(rolename=requested) - - -class LocalContext(object): - def __init__(self): - self.config = {} - self.teuthology_config = teuth_config - self.cluster = LocalCluster() - self.daemons = DaemonGroup() - - # Shove some LocalDaemons into the ctx.daemons DaemonGroup instance so that any - # tests that want to look these up via ctx can do so. 
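Both LocalFilesystem above and LocalContext below discover daemons the same way: by matching ceph.conf section headers such as [mds.a] with a regular expression. A standalone sketch of that parsing step, using a made-up vstart-style conf fragment:

    import re

    # Made-up ceph.conf lines, for illustration only.
    sample_conf_lines = [
        "[global]",
        "fsid = 00000000-0000-0000-0000-000000000000",
        "[mon.a]",
        "[mds.a]",
        "[mds.b]",
        "[osd.0]",
    ]

    daemons = {}
    for svc_type in ["mon", "osd", "mds", "mgr"]:
        daemons[svc_type] = []
        for line in sample_conf_lines:
            match = re.match(r"^\[{0}\.(.+)\]$".format(svc_type), line)
            if match:
                daemons[svc_type].append(match.group(1))

    assert daemons == {'mon': ['a'], 'mds': ['a', 'b'], 'osd': ['0'], 'mgr': []}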
- # Inspect ceph.conf to see what roles exist - for conf_line in open("ceph.conf").readlines(): - for svc_type in ["mon", "osd", "mds", "mgr"]: - if svc_type not in self.daemons.daemons: - self.daemons.daemons[svc_type] = {} - match = re.match("^\[{0}\.(.+)\]$".format(svc_type), conf_line) - if match: - svc_id = match.group(1) - self.daemons.daemons[svc_type][svc_id] = LocalDaemon(svc_type, svc_id) - - def __del__(self): - shutil.rmtree(self.teuthology_config['test_path']) - - -def exec_test(): - # Parse arguments - interactive_on_error = False - create_cluster = False - - args = sys.argv[1:] - flags = [a for a in args if a.startswith("-")] - modules = [a for a in args if not a.startswith("-")] - for f in flags: - if f == "--interactive": - interactive_on_error = True - elif f == "--create": - create_cluster = True - else: - log.error("Unknown option '{0}'".format(f)) - sys.exit(-1) - - # Help developers by stopping up-front if their tree isn't built enough for all the - # tools that the tests might want to use (add more here if needed) - require_binaries = ["ceph-dencoder", "cephfs-journal-tool", "cephfs-data-scan", - "cephfs-table-tool", "ceph-fuse", "rados"] - missing_binaries = [b for b in require_binaries if not os.path.exists(os.path.join(BIN_PREFIX, b))] - if missing_binaries: - log.error("Some ceph binaries missing, please build them: {0}".format(" ".join(missing_binaries))) - sys.exit(-1) - - max_required_mds, max_required_clients, max_required_mgr = scan_tests(modules) - - remote = LocalRemote() - - # Tolerate no MDSs or clients running at start - ps_txt = remote.run( - args=["ps", "-u"+str(os.getuid())] - ).stdout.getvalue().strip() - lines = ps_txt.split("\n")[1:] - for line in lines: - if 'ceph-fuse' in line or 'ceph-mds' in line: - pid = int(line.split()[0]) - log.warn("Killing stray process {0}".format(line)) - os.kill(pid, signal.SIGKILL) - - # Fire up the Ceph cluster if the user requested it - if create_cluster: - log.info("Creating cluster with {0} MDS daemons".format( - max_required_mds)) - remote.run([os.path.join(SRC_PREFIX, "stop.sh")], check_status=False) - remote.run(["rm", "-rf", "./out"]) - remote.run(["rm", "-rf", "./dev"]) - vstart_env = os.environ.copy() - vstart_env["FS"] = "0" - vstart_env["MDS"] = max_required_mds.__str__() - vstart_env["OSD"] = "1" - vstart_env["MGR"] = max(max_required_mgr, 1).__str__() - - remote.run([os.path.join(SRC_PREFIX, "vstart.sh"), "-n", "-d", "--nolockdep"], - env=vstart_env) - - # Wait for OSD to come up so that subsequent injectargs etc will - # definitely succeed - LocalCephCluster(LocalContext()).mon_manager.wait_for_all_osds_up(timeout=30) - - # List of client mounts, sufficient to run the selected tests - clients = [i.__str__() for i in range(0, max_required_clients)] - - test_dir = tempfile.mkdtemp() - teuth_config['test_path'] = test_dir - - # Construct Mount classes - mounts = [] - for client_id in clients: - # Populate client keyring (it sucks to use client.admin for test clients - # because it's awkward to find the logs later) - client_name = "client.{0}".format(client_id) - - if client_name not in open("./keyring").read(): - p = remote.run(args=[os.path.join(BIN_PREFIX, "ceph"), "auth", "get-or-create", client_name, - "osd", "allow rw", - "mds", "allow", - "mon", "allow r"]) - - open("./keyring", "a").write(p.stdout.getvalue()) - - mount = LocalFuseMount(test_dir, client_id) - mounts.append(mount) - if mount.is_mounted(): - log.warn("unmounting {0}".format(mount.mountpoint)) - mount.umount_wait() - else: - if 
os.path.exists(mount.mountpoint): - os.rmdir(mount.mountpoint) - - ctx = LocalContext() - ceph_cluster = LocalCephCluster(ctx) - mds_cluster = LocalMDSCluster(ctx) - mgr_cluster = LocalMgrCluster(ctx) - - from tasks.cephfs_test_runner import DecoratingLoader - - class LogStream(object): - def __init__(self): - self.buffer = "" - - def write(self, data): - self.buffer += data - if "\n" in self.buffer: - lines = self.buffer.split("\n") - for line in lines[:-1]: - pass - # sys.stderr.write(line + "\n") - log.info(line) - self.buffer = lines[-1] - - def flush(self): - pass - - decorating_loader = DecoratingLoader({ - "ctx": ctx, - "mounts": mounts, - "ceph_cluster": ceph_cluster, - "mds_cluster": mds_cluster, - "mgr_cluster": mgr_cluster, - }) - - # For the benefit of polling tests like test_full -- in teuthology land we set this - # in a .yaml, here it's just a hardcoded thing for the developer's pleasure. - remote.run(args=[os.path.join(BIN_PREFIX, "ceph"), "tell", "osd.*", "injectargs", "--osd-mon-report-interval-max", "5"]) - ceph_cluster.set_ceph_conf("osd", "osd_mon_report_interval_max", "5") - - # Vstart defaults to two segments, which very easily gets a "behind on trimming" health warning - # from normal IO latency. Increase it for running teests. - ceph_cluster.set_ceph_conf("mds", "mds log max segments", "10") - - # Make sure the filesystem created in tests has uid/gid that will let us talk to - # it after mounting it (without having to go root). Set in 'global' not just 'mds' - # so that cephfs-data-scan will pick it up too. - ceph_cluster.set_ceph_conf("global", "mds root ino uid", "%s" % os.getuid()) - ceph_cluster.set_ceph_conf("global", "mds root ino gid", "%s" % os.getgid()) - - # Monkeypatch get_package_version to avoid having to work out what kind of distro we're on - def _get_package_version(remote, pkg_name): - # Used in cephfs tests to find fuse version. Your development workstation *does* have >=2.9, right? 
- return "2.9" - - import teuthology.packaging - teuthology.packaging.get_package_version = _get_package_version - - overall_suite = load_tests(modules, decorating_loader) - - # Filter out tests that don't lend themselves to interactive running, - victims = [] - for case, method in enumerate_methods(overall_suite): - fn = getattr(method, method._testMethodName) - - drop_test = False - - if hasattr(fn, 'is_for_teuthology') and getattr(fn, 'is_for_teuthology') is True: - drop_test = True - log.warn("Dropping test because long running: ".format(method.id())) - - if getattr(fn, "needs_trimming", False) is True: - drop_test = (os.getuid() != 0) - log.warn("Dropping test because client trim unavailable: ".format(method.id())) - - if drop_test: - # Don't drop the test if it was explicitly requested in arguments - is_named = False - for named in modules: - if named.endswith(method.id()): - is_named = True - break - - if not is_named: - victims.append((case, method)) - - log.info("Disabling {0} tests because of is_for_teuthology or needs_trimming".format(len(victims))) - for s, method in victims: - s._tests.remove(method) - - if interactive_on_error: - result_class = InteractiveFailureResult - else: - result_class = unittest.TextTestResult - fail_on_skip = False - - class LoggingResult(result_class): - def startTest(self, test): - log.info("Starting test: {0}".format(self.getDescription(test))) - test.started_at = datetime.datetime.utcnow() - return super(LoggingResult, self).startTest(test) - - def stopTest(self, test): - log.info("Stopped test: {0} in {1}s".format( - self.getDescription(test), - (datetime.datetime.utcnow() - test.started_at).total_seconds() - )) - - def addSkip(self, test, reason): - if fail_on_skip: - # Don't just call addFailure because that requires a traceback - self.failures.append((test, reason)) - else: - super(LoggingResult, self).addSkip(test, reason) - - # Execute! - result = unittest.TextTestRunner( - stream=LogStream(), - resultclass=LoggingResult, - verbosity=2, - failfast=True).run(overall_suite) - - if not result.wasSuccessful(): - result.printErrors() # duplicate output at end for convenience - - bad_tests = [] - for test, error in result.errors: - bad_tests.append(str(test)) - for test, failure in result.failures: - bad_tests.append(str(test)) - - sys.exit(-1) - else: - sys.exit(0) - - -if __name__ == "__main__": - exec_test() diff --git a/src/ceph/qa/tasks/watch_notify_same_primary.py b/src/ceph/qa/tasks/watch_notify_same_primary.py deleted file mode 100644 index 8f6d33b..0000000 --- a/src/ceph/qa/tasks/watch_notify_same_primary.py +++ /dev/null @@ -1,134 +0,0 @@ - -""" -watch_notify_same_primary task -""" -from cStringIO import StringIO -import contextlib -import logging - -from teuthology.orchestra import run -from teuthology.contextutil import safe_while - -log = logging.getLogger(__name__) - - -@contextlib.contextmanager -def task(ctx, config): - """ - Run watch_notify_same_primary - - The config should be as follows: - - watch_notify_same_primary: - clients: [client list] - - The client list should contain 1 client - - The test requires 3 osds. - - example: - - tasks: - - ceph: - - watch_notify_same_primary: - clients: [client.0] - - interactive: - """ - log.info('Beginning watch_notify_same_primary...') - assert isinstance(config, dict), \ - "please list clients to run on" - - clients = config.get('clients', ['client.0']) - assert len(clients) == 1 - role = clients[0] - assert isinstance(role, basestring) - PREFIX = 'client.' 
- assert role.startswith(PREFIX) - (remote,) = ctx.cluster.only(role).remotes.iterkeys() - manager = ctx.managers['ceph'] - manager.raw_cluster_cmd('osd', 'set', 'noout') - - pool = manager.create_pool_with_unique_name() - def obj(n): return "foo-{num}".format(num=n) - def start_watch(n): - remote.run( - args = [ - "rados", - "-p", pool, - "put", - obj(n), - "/etc/resolv.conf"], - logger=log.getChild('watch.{id}'.format(id=n))) - proc = remote.run( - args = [ - "rados", - "-p", pool, - "watch", - obj(n)], - stdin=run.PIPE, - stdout=StringIO(), - stderr=StringIO(), - wait=False) - return proc - - num = 20 - - watches = [start_watch(i) for i in range(num)] - - # wait for them all to register - for i in range(num): - with safe_while() as proceed: - while proceed(): - proc = remote.run( - args = [ - "rados", - "-p", pool, - "listwatchers", - obj(i)], - stdout=StringIO()) - lines = proc.stdout.getvalue() - num_watchers = lines.count('watcher=') - log.info('i see %d watchers for %s', num_watchers, obj(i)) - if num_watchers >= 1: - break - - def notify(n, msg): - remote.run( - args = [ - "rados", - "-p", pool, - "notify", - obj(n), - msg], - logger=log.getChild('notify.{id}'.format(id=n))) - - [notify(n, 'notify1') for n in range(len(watches))] - - manager.kill_osd(0) - manager.mark_down_osd(0) - - [notify(n, 'notify2') for n in range(len(watches))] - - try: - yield - finally: - log.info('joining watch_notify_stress') - for watch in watches: - watch.stdin.write("\n") - - run.wait(watches) - - for watch in watches: - lines = watch.stdout.getvalue().split("\n") - got1 = False - got2 = False - for l in lines: - if 'notify1' in l: - got1 = True - if 'notify2' in l: - got2 = True - log.info(lines) - assert got1 and got2 - - manager.revive_osd(0) - manager.remove_pool(pool) diff --git a/src/ceph/qa/tasks/watch_notify_stress.py b/src/ceph/qa/tasks/watch_notify_stress.py deleted file mode 100644 index 6db313f..0000000 --- a/src/ceph/qa/tasks/watch_notify_stress.py +++ /dev/null @@ -1,69 +0,0 @@ -""" -test_stress_watch task -""" -import contextlib -import logging -import proc_thrasher - -from teuthology.orchestra import run - -log = logging.getLogger(__name__) - - -@contextlib.contextmanager -def task(ctx, config): - """ - Run test_stress_watch - - The config should be as follows: - - test_stress_watch: - clients: [client list] - - example: - - tasks: - - ceph: - - test_stress_watch: - clients: [client.0] - - interactive: - """ - log.info('Beginning test_stress_watch...') - assert isinstance(config, dict), \ - "please list clients to run on" - testwatch = {} - - remotes = [] - - for role in config.get('clients', ['client.0']): - assert isinstance(role, basestring) - PREFIX = 'client.' 
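        # By contrast with watch_notify_same_primary above, the numeric id is
        # stripped from the role here: it feeds CEPH_CLIENT_ID and the
        # per-client 'testwatch.<id>' logger set up further down.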
- assert role.startswith(PREFIX) - id_ = role[len(PREFIX):] - (remote,) = ctx.cluster.only(role).remotes.iterkeys() - remotes.append(remote) - - args =['CEPH_CLIENT_ID={id_}'.format(id_=id_), - 'CEPH_ARGS="{flags}"'.format(flags=config.get('flags', '')), - 'daemon-helper', - 'kill', - 'multi_stress_watch foo foo' - ] - - log.info("args are %s" % (args,)) - - proc = proc_thrasher.ProcThrasher({}, remote, - args=[run.Raw(i) for i in args], - logger=log.getChild('testwatch.{id}'.format(id=id_)), - stdin=run.PIPE, - wait=False - ) - proc.start() - testwatch[id_] = proc - - try: - yield - finally: - log.info('joining watch_notify_stress') - for i in testwatch.itervalues(): - i.join() diff --git a/src/ceph/qa/tasks/workunit.py b/src/ceph/qa/tasks/workunit.py deleted file mode 100644 index f69b396..0000000 --- a/src/ceph/qa/tasks/workunit.py +++ /dev/null @@ -1,486 +0,0 @@ -""" -Workunit task -- Run ceph on sets of specific clients -""" -import logging -import pipes -import os -import re - -from copy import deepcopy -from util import get_remote_for_role - -from teuthology import misc -from teuthology.config import config as teuth_config -from teuthology.orchestra.run import CommandFailedError -from teuthology.parallel import parallel -from teuthology.orchestra import run - -log = logging.getLogger(__name__) - - -class Refspec: - def __init__(self, refspec): - self.refspec = refspec - - def __str__(self): - return self.refspec - - def _clone(self, git_url, clonedir, opts=None): - if opts is None: - opts = [] - return (['rm', '-rf', clonedir] + - [run.Raw('&&')] + - ['git', 'clone'] + opts + - [git_url, clonedir]) - - def _cd(self, clonedir): - return ['cd', clonedir] - - def _checkout(self): - return ['git', 'checkout', self.refspec] - - def clone(self, git_url, clonedir): - return (self._clone(git_url, clonedir) + - [run.Raw('&&')] + - self._cd(clonedir) + - [run.Raw('&&')] + - self._checkout()) - - -class Branch(Refspec): - def __init__(self, tag): - Refspec.__init__(self, tag) - - def clone(self, git_url, clonedir): - opts = ['--depth', '1', - '--branch', self.refspec] - return (self._clone(git_url, clonedir, opts) + - [run.Raw('&&')] + - self._cd(clonedir)) - - -class Head(Refspec): - def __init__(self): - Refspec.__init__(self, 'HEAD') - - def clone(self, git_url, clonedir): - opts = ['--depth', '1'] - return (self._clone(git_url, clonedir, opts) + - [run.Raw('&&')] + - self._cd(clonedir)) - - -def task(ctx, config): - """ - Run ceph on all workunits found under the specified path. - - For example:: - - tasks: - - ceph: - - ceph-fuse: [client.0] - - workunit: - clients: - client.0: [direct_io, xattrs.sh] - client.1: [snaps] - branch: foo - - You can also run a list of workunits on all clients: - tasks: - - ceph: - - ceph-fuse: - - workunit: - tag: v0.47 - clients: - all: [direct_io, xattrs.sh, snaps] - - If you have an "all" section it will run all the workunits - on each client simultaneously, AFTER running any workunits specified - for individual clients. (This prevents unintended simultaneous runs.) - - To customize tests, you can specify environment variables as a dict. 
You - can also specify a time limit for each work unit (defaults to 3h): - - tasks: - - ceph: - - ceph-fuse: - - workunit: - sha1: 9b28948635b17165d17c1cf83d4a870bd138ddf6 - clients: - all: [snaps] - env: - FOO: bar - BAZ: quux - timeout: 3h - - This task supports roles that include a ceph cluster, e.g.:: - - tasks: - - ceph: - - workunit: - clients: - backup.client.0: [foo] - client.1: [bar] # cluster is implicitly 'ceph' - - You can also specify an alternative top-level dir to 'qa/workunits', like - 'qa/standalone', with:: - - tasks: - - install: - - workunit: - basedir: qa/standalone - clients: - client.0: - - test-ceph-helpers.sh - - :param ctx: Context - :param config: Configuration - """ - assert isinstance(config, dict) - assert isinstance(config.get('clients'), dict), \ - 'configuration must contain a dictionary of clients' - - # mimic the behavior of the "install" task, where the "overrides" are - # actually the defaults of that task. in other words, if none of "sha1", - # "tag", or "branch" is specified by a "workunit" tasks, we will update - # it with the information in the "workunit" sub-task nested in "overrides". - overrides = deepcopy(ctx.config.get('overrides', {}).get('workunit', {})) - refspecs = {'branch': Branch, 'tag': Refspec, 'sha1': Refspec} - if any(map(lambda i: i in config, refspecs.iterkeys())): - for i in refspecs.iterkeys(): - overrides.pop(i, None) - misc.deep_merge(config, overrides) - - for spec, cls in refspecs.iteritems(): - refspec = config.get(spec) - if refspec: - refspec = cls(refspec) - break - if refspec is None: - refspec = Head() - - timeout = config.get('timeout', '3h') - - log.info('Pulling workunits from ref %s', refspec) - - created_mountpoint = {} - - if config.get('env') is not None: - assert isinstance(config['env'], dict), 'env must be a dictionary' - clients = config['clients'] - - # Create scratch dirs for any non-all workunits - log.info('Making a separate scratch dir for every client...') - for role in clients.iterkeys(): - assert isinstance(role, basestring) - if role == "all": - continue - - assert 'client' in role - created_mnt_dir = _make_scratch_dir(ctx, role, config.get('subdir')) - created_mountpoint[role] = created_mnt_dir - - # Execute any non-all workunits - with parallel() as p: - for role, tests in clients.iteritems(): - if role != "all": - p.spawn(_run_tests, ctx, refspec, role, tests, - config.get('env'), - basedir=config.get('basedir','qa/workunits'), - timeout=timeout) - - # Clean up dirs from any non-all workunits - for role, created in created_mountpoint.items(): - _delete_dir(ctx, role, created) - - # Execute any 'all' workunits - if 'all' in clients: - all_tasks = clients["all"] - _spawn_on_all_clients(ctx, refspec, all_tasks, config.get('env'), - config.get('basedir', 'qa/workunits'), - config.get('subdir'), timeout=timeout) - - -def _client_mountpoint(ctx, cluster, id_): - """ - Returns the path to the expected mountpoint for workunits running - on some kind of filesystem. - """ - # for compatibility with tasks like ceph-fuse that aren't cluster-aware yet, - # only include the cluster name in the dir if the cluster is not 'ceph' - if cluster == 'ceph': - dir_ = 'mnt.{0}'.format(id_) - else: - dir_ = 'mnt.{0}.{1}'.format(cluster, id_) - return os.path.join(misc.get_testdir(ctx), dir_) - - -def _delete_dir(ctx, role, created_mountpoint): - """ - Delete file used by this role, and delete the directory that this - role appeared in. - - :param ctx: Context - :param role: "role.#" where # is used for the role id. 
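    Illustrative example (hypothetical paths): for the default 'ceph' cluster,
    role "client.0" resolves to <testdir>/mnt.0/client.0, which is removed
    outright; the mnt.0 mount point itself is only rmdir'd afterwards if it was
    created artificially by the workunit task.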
- """ - cluster, _, id_ = misc.split_role(role) - remote = get_remote_for_role(ctx, role) - mnt = _client_mountpoint(ctx, cluster, id_) - client = os.path.join(mnt, 'client.{id}'.format(id=id_)) - - # Remove the directory inside the mount where the workunit ran - remote.run( - args=[ - 'sudo', - 'rm', - '-rf', - '--', - client, - ], - ) - log.info("Deleted dir {dir}".format(dir=client)) - - # If the mount was an artificially created dir, delete that too - if created_mountpoint: - remote.run( - args=[ - 'rmdir', - '--', - mnt, - ], - ) - log.info("Deleted artificial mount point {dir}".format(dir=client)) - - -def _make_scratch_dir(ctx, role, subdir): - """ - Make scratch directories for this role. This also makes the mount - point if that directory does not exist. - - :param ctx: Context - :param role: "role.#" where # is used for the role id. - :param subdir: use this subdir (False if not used) - """ - created_mountpoint = False - cluster, _, id_ = misc.split_role(role) - remote = get_remote_for_role(ctx, role) - dir_owner = remote.user - mnt = _client_mountpoint(ctx, cluster, id_) - # if neither kclient nor ceph-fuse are required for a workunit, - # mnt may not exist. Stat and create the directory if it doesn't. - try: - remote.run( - args=[ - 'stat', - '--', - mnt, - ], - ) - log.info('Did not need to create dir {dir}'.format(dir=mnt)) - except CommandFailedError: - remote.run( - args=[ - 'mkdir', - '--', - mnt, - ], - ) - log.info('Created dir {dir}'.format(dir=mnt)) - created_mountpoint = True - - if not subdir: - subdir = 'client.{id}'.format(id=id_) - - if created_mountpoint: - remote.run( - args=[ - 'cd', - '--', - mnt, - run.Raw('&&'), - 'mkdir', - '--', - subdir, - ], - ) - else: - remote.run( - args=[ - # cd first so this will fail if the mount point does - # not exist; pure install -d will silently do the - # wrong thing - 'cd', - '--', - mnt, - run.Raw('&&'), - 'sudo', - 'install', - '-d', - '-m', '0755', - '--owner={user}'.format(user=dir_owner), - '--', - subdir, - ], - ) - - return created_mountpoint - - -def _spawn_on_all_clients(ctx, refspec, tests, env, basedir, subdir, timeout=None): - """ - Make a scratch directory for each client in the cluster, and then for each - test spawn _run_tests() for each role. - - See run_tests() for parameter documentation. - """ - is_client = misc.is_type('client') - client_remotes = {} - created_mountpoint = {} - for remote, roles_for_host in ctx.cluster.remotes.items(): - for role in roles_for_host: - if is_client(role): - client_remotes[role] = remote - created_mountpoint[role] = _make_scratch_dir(ctx, role, subdir) - - for unit in tests: - with parallel() as p: - for role, remote in client_remotes.items(): - p.spawn(_run_tests, ctx, refspec, role, [unit], env, - basedir, - subdir, - timeout=timeout) - - # cleanup the generated client directories - for role, _ in client_remotes.items(): - _delete_dir(ctx, role, created_mountpoint[role]) - - -def _run_tests(ctx, refspec, role, tests, env, basedir, - subdir=None, timeout=None): - """ - Run the individual test. Create a scratch directory and then extract the - workunits from git. Make the executables, and then run the tests. - Clean up (remove files created) after the tests are finished. - - :param ctx: Context - :param refspec: branch, sha1, or version tag used to identify this - build - :param tests: specific tests specified. - :param env: environment set in yaml file. Could be None. - :param subdir: subdirectory set in yaml file. 
Could be None - :param timeout: If present, use the 'timeout' command on the remote host - to limit execution time. Must be specified by a number - followed by 's' for seconds, 'm' for minutes, 'h' for - hours, or 'd' for days. If '0' or anything that evaluates - to False is passed, the 'timeout' command is not used. - """ - testdir = misc.get_testdir(ctx) - assert isinstance(role, basestring) - cluster, type_, id_ = misc.split_role(role) - assert type_ == 'client' - remote = get_remote_for_role(ctx, role) - mnt = _client_mountpoint(ctx, cluster, id_) - # subdir so we can remove and recreate this a lot without sudo - if subdir is None: - scratch_tmp = os.path.join(mnt, 'client.{id}'.format(id=id_), 'tmp') - else: - scratch_tmp = os.path.join(mnt, subdir) - clonedir = '{tdir}/clone.{role}'.format(tdir=testdir, role=role) - srcdir = '{cdir}/{basedir}'.format(cdir=clonedir, - basedir=basedir) - - git_url = teuth_config.get_ceph_qa_suite_git_url() - # if we are running an upgrade test, and ceph-ci does not have branches like - # `jewel`, so should use ceph.git as an alternative. - try: - remote.run(logger=log.getChild(role), - args=refspec.clone(git_url, clonedir)) - except CommandFailedError: - if git_url.endswith('/ceph-ci.git'): - alt_git_url = git_url.replace('/ceph-ci.git', '/ceph.git') - elif git_url.endswith('/ceph-ci'): - alt_git_url = re.sub(r'/ceph-ci$', '/ceph.git', git_url) - else: - raise - log.info( - "failed to check out '%s' from %s; will also try in %s", - refspec, - git_url, - alt_git_url, - ) - remote.run(logger=log.getChild(role), - args=refspec.clone(alt_git_url, clonedir)) - remote.run( - logger=log.getChild(role), - args=[ - 'cd', '--', srcdir, - run.Raw('&&'), - 'if', 'test', '-e', 'Makefile', run.Raw(';'), 'then', 'make', run.Raw(';'), 'fi', - run.Raw('&&'), - 'find', '-executable', '-type', 'f', '-printf', r'%P\0'.format(srcdir=srcdir), - run.Raw('>{tdir}/workunits.list.{role}'.format(tdir=testdir, role=role)), - ], - ) - - workunits_file = '{tdir}/workunits.list.{role}'.format(tdir=testdir, role=role) - workunits = sorted(misc.get_file(remote, workunits_file).split('\0')) - assert workunits - - try: - assert isinstance(tests, list) - for spec in tests: - log.info('Running workunits matching %s on %s...', spec, role) - prefix = '{spec}/'.format(spec=spec) - to_run = [w for w in workunits if w == spec or w.startswith(prefix)] - if not to_run: - raise RuntimeError('Spec did not match any workunits: {spec!r}'.format(spec=spec)) - for workunit in to_run: - log.info('Running workunit %s...', workunit) - args = [ - 'mkdir', '-p', '--', scratch_tmp, - run.Raw('&&'), - 'cd', '--', scratch_tmp, - run.Raw('&&'), - run.Raw('CEPH_CLI_TEST_DUP_COMMAND=1'), - run.Raw('CEPH_REF={ref}'.format(ref=refspec)), - run.Raw('TESTDIR="{tdir}"'.format(tdir=testdir)), - run.Raw('CEPH_ARGS="--cluster {0}"'.format(cluster)), - run.Raw('CEPH_ID="{id}"'.format(id=id_)), - run.Raw('PATH=$PATH:/usr/sbin'), - run.Raw('CEPH_BASE={dir}'.format(dir=clonedir)), - run.Raw('CEPH_ROOT={dir}'.format(dir=clonedir)), - ] - if env is not None: - for var, val in env.iteritems(): - quoted_val = pipes.quote(val) - env_arg = '{var}={val}'.format(var=var, val=quoted_val) - args.append(run.Raw(env_arg)) - args.extend([ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir)]) - if timeout and timeout != '0': - args.extend(['timeout', timeout]) - args.extend([ - '{srcdir}/{workunit}'.format( - srcdir=srcdir, - workunit=workunit, - ), - ]) - remote.run( - logger=log.getChild(role), - 
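                    # 'args' was assembled above: mkdir/cd into the scratch dir,
                    # the CEPH_* environment exports, the adjust-ulimits and
                    # ceph-coverage wrappers, an optional 'timeout <t>', then the
                    # workunit script itself; the 'label' kwarg on the next line
                    # appears to be there so a failure can be attributed to the
                    # specific workunit in the logs.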
args=args,
- label="workunit test {workunit}".format(workunit=workunit)
- )
- remote.run(
- logger=log.getChild(role),
- args=['sudo', 'rm', '-rf', '--', scratch_tmp],
- )
- finally:
- log.info('Stopping %s on %s...', tests, role)
- remote.run(
- logger=log.getChild(role),
- args=[
- 'rm', '-rf', '--', workunits_file, clonedir,
- ],
- )
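A note on the spec matching in _run_tests above: a test spec matches either an exact relative path or every executable under that directory prefix, and a spec that matches nothing aborts the run. The following is a minimal standalone sketch of just that matching rule (the function name and the sample listing are illustrative, not taken from the deleted code or a real run):

def select_workunits(specs, workunits):
    """Return the scripts matching each spec: an exact path or a directory prefix."""
    selected = []
    for spec in specs:
        prefix = spec + '/'
        to_run = [w for w in workunits if w == spec or w.startswith(prefix)]
        if not to_run:
            raise RuntimeError('Spec did not match any workunits: %r' % spec)
        selected.extend(to_run)
    return selected

if __name__ == '__main__':
    # Hypothetical 'find -executable' listing from a qa/workunits checkout.
    listing = ['direct_io/test_short_dio_read.sh', 'snaps/snaptest-0.sh', 'xattrs.sh']
    print(select_workunits(['snaps', 'xattrs.sh'], listing))
    # -> ['snaps/snaptest-0.sh', 'xattrs.sh']

The same prefix rule is why the docstring examples earlier in workunit.py can mix bare script names like xattrs.sh with directories like snaps.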